/*
 * kmp_csupport.cpp -- kfront linkage support for OpenMP.
 */

//===----------------------------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//

#include "omp.h" /* extern "C" declarations of user-visible routines */
#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_lock.h"
#include "kmp_stats.h"

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

#define MAX_MESSAGE 512

// flags will be used in future, e.g. to implement openmp_strict library
// restrictions

/*!
 * @ingroup STARTUP_SHUTDOWN
 * @param loc   in   source location information
 * @param flags in   for future use (currently ignored)
 *
 * Initialize the runtime library. This call is optional; if it is not made then
 * it will be implicitly called by attempts to use other library functions.
 */
void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
  // By default __kmpc_begin() is a no-op.
  char *env;
  if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
      __kmp_str_match_true(env)) {
    __kmp_middle_initialize();
    KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
  } else if (__kmp_ignore_mppbeg() == FALSE) {
    // By default __kmp_ignore_mppbeg() returns TRUE.
    __kmp_internal_begin();
    KC_TRACE(10, ("__kmpc_begin: called\n"));
  }
}

/*!
 * @ingroup STARTUP_SHUTDOWN
 * @param loc source location information
 *
 * Shut down the runtime library. This call is also optional; even when made, it
 * does nothing unless the `KMP_IGNORE_MPPEND` environment variable is set to
 * zero.
 */
void __kmpc_end(ident_t *loc) {
  // By default, __kmp_ignore_mppend() returns TRUE, which makes the
  // __kmpc_end() call a no-op. However, this can be overridden with the
  // KMP_IGNORE_MPPEND environment variable. If KMP_IGNORE_MPPEND is 0,
  // __kmp_ignore_mppend() returns FALSE and __kmpc_end() will unregister this
  // root (it can cause library shut down).
  if (__kmp_ignore_mppend() == FALSE) {
    KC_TRACE(10, ("__kmpc_end: called\n"));
    KA_TRACE(30, ("__kmpc_end\n"));

    __kmp_internal_end_thread(-1);
  }
}

/*!
@ingroup THREAD_STATES
@param loc Source location information.
@return The global thread index of the active thread.

This function can be called in any context.

If the runtime has only been entered at the outermost level from a
single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
that which would be returned by omp_get_thread_num() in the outermost
active parallel construct. (Or zero if there is no active parallel
construct, since the master thread is necessarily thread zero).

If multiple non-OpenMP threads all enter an OpenMP construct then this
will be a unique thread identifier among all the threads created by
the OpenMP runtime (but the value cannot be defined in terms of
OpenMP thread ids returned by omp_get_thread_num()).
*/
kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
  kmp_int32 gtid = __kmp_entry_gtid();

  KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));

  return gtid;
}

/*!
@ingroup THREAD_STATES
@param loc Source location information.
104 @return The number of threads under control of the OpenMP<sup>*</sup> runtime 105 106 This function can be called in any context. 107 It returns the total number of threads under the control of the OpenMP runtime. 108 That is not a number that can be determined by any OpenMP standard calls, since 109 the library may be called from more than one non-OpenMP thread, and this 110 reflects the total over all such calls. Similarly the runtime maintains 111 underlying threads even when they are not active (since the cost of creating 112 and destroying OS threads is high), this call counts all such threads even if 113 they are not waiting for work. 114 */ 115 kmp_int32 __kmpc_global_num_threads(ident_t *loc) { 116 KC_TRACE(10, 117 ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth)); 118 119 return TCR_4(__kmp_all_nth); 120 } 121 122 /*! 123 @ingroup THREAD_STATES 124 @param loc Source location information. 125 @return The thread number of the calling thread in the innermost active parallel 126 construct. 127 */ 128 kmp_int32 __kmpc_bound_thread_num(ident_t *loc) { 129 KC_TRACE(10, ("__kmpc_bound_thread_num: called\n")); 130 return __kmp_tid_from_gtid(__kmp_entry_gtid()); 131 } 132 133 /*! 134 @ingroup THREAD_STATES 135 @param loc Source location information. 136 @return The number of threads in the innermost active parallel construct. 137 */ 138 kmp_int32 __kmpc_bound_num_threads(ident_t *loc) { 139 KC_TRACE(10, ("__kmpc_bound_num_threads: called\n")); 140 141 return __kmp_entry_thread()->th.th_team->t.t_nproc; 142 } 143 144 /*! 145 * @ingroup DEPRECATED 146 * @param loc location description 147 * 148 * This function need not be called. It always returns TRUE. 149 */ 150 kmp_int32 __kmpc_ok_to_fork(ident_t *loc) { 151 #ifndef KMP_DEBUG 152 153 return TRUE; 154 155 #else 156 157 const char *semi2; 158 const char *semi3; 159 int line_no; 160 161 if (__kmp_par_range == 0) { 162 return TRUE; 163 } 164 semi2 = loc->psource; 165 if (semi2 == NULL) { 166 return TRUE; 167 } 168 semi2 = strchr(semi2, ';'); 169 if (semi2 == NULL) { 170 return TRUE; 171 } 172 semi2 = strchr(semi2 + 1, ';'); 173 if (semi2 == NULL) { 174 return TRUE; 175 } 176 if (__kmp_par_range_filename[0]) { 177 const char *name = semi2 - 1; 178 while ((name > loc->psource) && (*name != '/') && (*name != ';')) { 179 name--; 180 } 181 if ((*name == '/') || (*name == ';')) { 182 name++; 183 } 184 if (strncmp(__kmp_par_range_filename, name, semi2 - name)) { 185 return __kmp_par_range < 0; 186 } 187 } 188 semi3 = strchr(semi2 + 1, ';'); 189 if (__kmp_par_range_routine[0]) { 190 if ((semi3 != NULL) && (semi3 > semi2) && 191 (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) { 192 return __kmp_par_range < 0; 193 } 194 } 195 if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) { 196 if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) { 197 return __kmp_par_range > 0; 198 } 199 return __kmp_par_range < 0; 200 } 201 return TRUE; 202 203 #endif /* KMP_DEBUG */ 204 } 205 206 /*! 207 @ingroup THREAD_STATES 208 @param loc Source location information. 209 @return 1 if this thread is executing inside an active parallel region, zero if 210 not. 211 */ 212 kmp_int32 __kmpc_in_parallel(ident_t *loc) { 213 return __kmp_entry_thread()->th.th_root->r.r_active; 214 } 215 216 /*! 
217 @ingroup PARALLEL 218 @param loc source location information 219 @param global_tid global thread number 220 @param num_threads number of threads requested for this parallel construct 221 222 Set the number of threads to be used by the next fork spawned by this thread. 223 This call is only required if the parallel construct has a `num_threads` clause. 224 */ 225 void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid, 226 kmp_int32 num_threads) { 227 KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n", 228 global_tid, num_threads)); 229 230 __kmp_push_num_threads(loc, global_tid, num_threads); 231 } 232 233 void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) { 234 KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n")); 235 236 /* the num_threads are automatically popped */ 237 } 238 239 #if OMP_40_ENABLED 240 241 void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid, 242 kmp_int32 proc_bind) { 243 KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid, 244 proc_bind)); 245 246 __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind); 247 } 248 249 #endif /* OMP_40_ENABLED */ 250 251 /*! 252 @ingroup PARALLEL 253 @param loc source location information 254 @param argc total number of arguments in the ellipsis 255 @param microtask pointer to callback routine consisting of outlined parallel 256 construct 257 @param ... pointers to shared variables that aren't global 258 259 Do the actual fork and call the microtask in the relevant number of threads. 260 */ 261 void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) { 262 int gtid = __kmp_entry_gtid(); 263 264 #if (KMP_STATS_ENABLED) 265 // If we were in a serial region, then stop the serial timer, record 266 // the event, and start parallel region timer 267 stats_state_e previous_state = KMP_GET_THREAD_STATE(); 268 if (previous_state == stats_state_e::SERIAL_REGION) { 269 KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead); 270 } else { 271 KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead); 272 } 273 int inParallel = __kmpc_in_parallel(loc); 274 if (inParallel) { 275 KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL); 276 } else { 277 KMP_COUNT_BLOCK(OMP_PARALLEL); 278 } 279 #endif 280 281 // maybe to save thr_state is enough here 282 { 283 va_list ap; 284 va_start(ap, microtask); 285 286 #if OMPT_SUPPORT 287 omp_frame_t *ompt_frame; 288 if (ompt_enabled.enabled) { 289 kmp_info_t *master_th = __kmp_threads[gtid]; 290 kmp_team_t *parent_team = master_th->th.th_team; 291 ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info; 292 if (lwt) 293 ompt_frame = &(lwt->ompt_task_info.frame); 294 else { 295 int tid = __kmp_tid_from_gtid(gtid); 296 ompt_frame = &( 297 parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame); 298 } 299 ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1); 300 OMPT_STORE_RETURN_ADDRESS(gtid); 301 } 302 #endif 303 304 #if INCLUDE_SSC_MARKS 305 SSC_MARK_FORKING(); 306 #endif 307 __kmp_fork_call(loc, gtid, fork_context_intel, argc, 308 VOLATILE_CAST(microtask_t) microtask, // "wrapped" task 309 VOLATILE_CAST(launch_t) __kmp_invoke_task_func, 310 /* TODO: revert workaround for Intel(R) 64 tracker #96 */ 311 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX 312 &ap 313 #else 314 ap 315 #endif 316 ); 317 #if INCLUDE_SSC_MARKS 318 SSC_MARK_JOINING(); 319 #endif 320 __kmp_join_call(loc, gtid 321 #if OMPT_SUPPORT 322 , 323 fork_context_intel 324 #endif 325 ); 326 327 va_end(ap); 328 } 329 330 #if 
KMP_STATS_ENABLED 331 if (previous_state == stats_state_e::SERIAL_REGION) { 332 KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial); 333 } else { 334 KMP_POP_PARTITIONED_TIMER(); 335 } 336 #endif // KMP_STATS_ENABLED 337 } 338 339 #if OMP_40_ENABLED 340 /*! 341 @ingroup PARALLEL 342 @param loc source location information 343 @param global_tid global thread number 344 @param num_teams number of teams requested for the teams construct 345 @param num_threads number of threads per team requested for the teams construct 346 347 Set the number of teams to be used by the teams construct. 348 This call is only required if the teams construct has a `num_teams` clause 349 or a `thread_limit` clause (or both). 350 */ 351 void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid, 352 kmp_int32 num_teams, kmp_int32 num_threads) { 353 KA_TRACE(20, 354 ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n", 355 global_tid, num_teams, num_threads)); 356 357 __kmp_push_num_teams(loc, global_tid, num_teams, num_threads); 358 } 359 360 /*! 361 @ingroup PARALLEL 362 @param loc source location information 363 @param argc total number of arguments in the ellipsis 364 @param microtask pointer to callback routine consisting of outlined teams 365 construct 366 @param ... pointers to shared variables that aren't global 367 368 Do the actual fork and call the microtask in the relevant number of threads. 369 */ 370 void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, 371 ...) { 372 int gtid = __kmp_entry_gtid(); 373 kmp_info_t *this_thr = __kmp_threads[gtid]; 374 va_list ap; 375 va_start(ap, microtask); 376 377 KMP_COUNT_BLOCK(OMP_TEAMS); 378 379 // remember teams entry point and nesting level 380 this_thr->th.th_teams_microtask = microtask; 381 this_thr->th.th_teams_level = 382 this_thr->th.th_team->t.t_level; // AC: can be >0 on host 383 384 #if OMPT_SUPPORT 385 kmp_team_t *parent_team = this_thr->th.th_team; 386 int tid = __kmp_tid_from_gtid(gtid); 387 if (ompt_enabled.enabled) { 388 parent_team->t.t_implicit_task_taskdata[tid] 389 .ompt_task_info.frame.enter_frame = OMPT_GET_FRAME_ADDRESS(1); 390 } 391 OMPT_STORE_RETURN_ADDRESS(gtid); 392 #endif 393 394 // check if __kmpc_push_num_teams called, set default number of teams 395 // otherwise 396 if (this_thr->th.th_teams_size.nteams == 0) { 397 __kmp_push_num_teams(loc, gtid, 0, 0); 398 } 399 KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1); 400 KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1); 401 KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1); 402 403 __kmp_fork_call(loc, gtid, fork_context_intel, argc, 404 VOLATILE_CAST(microtask_t) 405 __kmp_teams_master, // "wrapped" task 406 VOLATILE_CAST(launch_t) __kmp_invoke_teams_master, 407 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX 408 &ap 409 #else 410 ap 411 #endif 412 ); 413 __kmp_join_call(loc, gtid 414 #if OMPT_SUPPORT 415 , 416 fork_context_intel 417 #endif 418 ); 419 420 this_thr->th.th_teams_microtask = NULL; 421 this_thr->th.th_teams_level = 0; 422 *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L; 423 va_end(ap); 424 } 425 #endif /* OMP_40_ENABLED */ 426 427 // I don't think this function should ever have been exported. 428 // The __kmpc_ prefix was misapplied. I'm fairly certain that no generated 429 // openmp code ever called it, but it's been exported from the RTL for so 430 // long that I'm afraid to remove the definition. 431 int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); } 432 433 /*! 
434 @ingroup PARALLEL 435 @param loc source location information 436 @param global_tid global thread number 437 438 Enter a serialized parallel construct. This interface is used to handle a 439 conditional parallel region, like this, 440 @code 441 #pragma omp parallel if (condition) 442 @endcode 443 when the condition is false. 444 */ 445 void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { 446 // The implementation is now in kmp_runtime.cpp so that it can share static 447 // functions with kmp_fork_call since the tasks to be done are similar in 448 // each case. 449 #if OMPT_SUPPORT 450 OMPT_STORE_RETURN_ADDRESS(global_tid); 451 #endif 452 __kmp_serialized_parallel(loc, global_tid); 453 } 454 455 /*! 456 @ingroup PARALLEL 457 @param loc source location information 458 @param global_tid global thread number 459 460 Leave a serialized parallel construct. 461 */ 462 void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { 463 kmp_internal_control_t *top; 464 kmp_info_t *this_thr; 465 kmp_team_t *serial_team; 466 467 KC_TRACE(10, 468 ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid)); 469 470 /* skip all this code for autopar serialized loops since it results in 471 unacceptable overhead */ 472 if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR)) 473 return; 474 475 // Not autopar code 476 if (!TCR_4(__kmp_init_parallel)) 477 __kmp_parallel_initialize(); 478 479 this_thr = __kmp_threads[global_tid]; 480 serial_team = this_thr->th.th_serial_team; 481 482 #if OMP_45_ENABLED 483 kmp_task_team_t *task_team = this_thr->th.th_task_team; 484 485 // we need to wait for the proxy tasks before finishing the thread 486 if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) 487 __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL)); 488 #endif 489 490 KMP_MB(); 491 KMP_DEBUG_ASSERT(serial_team); 492 KMP_ASSERT(serial_team->t.t_serialized); 493 KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team); 494 KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team); 495 KMP_DEBUG_ASSERT(serial_team->t.t_threads); 496 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr); 497 498 #if OMPT_SUPPORT 499 if (ompt_enabled.enabled && 500 this_thr->th.ompt_thread_info.state != omp_state_overhead) { 501 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = NULL; 502 if (ompt_enabled.ompt_callback_implicit_task) { 503 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 504 ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1, 505 OMPT_CUR_TASK_INFO(this_thr)->thread_num); 506 } 507 508 // reset clear the task id only after unlinking the task 509 ompt_data_t *parent_task_data; 510 __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL); 511 512 if (ompt_enabled.ompt_callback_parallel_end) { 513 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 514 &(serial_team->t.ompt_team_info.parallel_data), parent_task_data, 515 ompt_invoker_program, OMPT_LOAD_RETURN_ADDRESS(global_tid)); 516 } 517 __ompt_lw_taskteam_unlink(this_thr); 518 this_thr->th.ompt_thread_info.state = omp_state_overhead; 519 } 520 #endif 521 522 /* If necessary, pop the internal control stack values and replace the team 523 * values */ 524 top = serial_team->t.t_control_stack_top; 525 if (top && top->serial_nesting_level == serial_team->t.t_serialized) { 526 copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top); 527 serial_team->t.t_control_stack_top = top->next; 528 __kmp_free(top); 529 } 530 531 // if( serial_team -> 
t.t_serialized > 1 ) 532 serial_team->t.t_level--; 533 534 /* pop dispatch buffers stack */ 535 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer); 536 { 537 dispatch_private_info_t *disp_buffer = 538 serial_team->t.t_dispatch->th_disp_buffer; 539 serial_team->t.t_dispatch->th_disp_buffer = 540 serial_team->t.t_dispatch->th_disp_buffer->next; 541 __kmp_free(disp_buffer); 542 } 543 544 --serial_team->t.t_serialized; 545 if (serial_team->t.t_serialized == 0) { 546 547 /* return to the parallel section */ 548 549 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 550 if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) { 551 __kmp_clear_x87_fpu_status_word(); 552 __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word); 553 __kmp_load_mxcsr(&serial_team->t.t_mxcsr); 554 } 555 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 556 557 this_thr->th.th_team = serial_team->t.t_parent; 558 this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid; 559 560 /* restore values cached in the thread */ 561 this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /* JPH */ 562 this_thr->th.th_team_master = 563 serial_team->t.t_parent->t.t_threads[0]; /* JPH */ 564 this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized; 565 566 /* TODO the below shouldn't need to be adjusted for serialized teams */ 567 this_thr->th.th_dispatch = 568 &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid]; 569 570 __kmp_pop_current_task_from_thread(this_thr); 571 572 KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0); 573 this_thr->th.th_current_task->td_flags.executing = 1; 574 575 if (__kmp_tasking_mode != tskm_immediate_exec) { 576 // Copy the task team from the new child / old parent team to the thread. 577 this_thr->th.th_task_team = 578 this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]; 579 KA_TRACE(20, 580 ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / " 581 "team %p\n", 582 global_tid, this_thr->th.th_task_team, this_thr->th.th_team)); 583 } 584 } else { 585 if (__kmp_tasking_mode != tskm_immediate_exec) { 586 KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting " 587 "depth of serial team %p to %d\n", 588 global_tid, serial_team, serial_team->t.t_serialized)); 589 } 590 } 591 592 if (__kmp_env_consistency_check) 593 __kmp_pop_parallel(global_tid, NULL); 594 #if OMPT_SUPPORT 595 if (ompt_enabled.enabled) 596 this_thr->th.ompt_thread_info.state = 597 ((this_thr->th.th_team_serialized) ? omp_state_work_serial 598 : omp_state_work_parallel); 599 #endif 600 } 601 602 /*! 603 @ingroup SYNCHRONIZATION 604 @param loc source location information. 605 606 Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though 607 depending on the memory ordering convention obeyed by the compiler 608 even that may not be necessary). 609 */ 610 void __kmpc_flush(ident_t *loc) { 611 KC_TRACE(10, ("__kmpc_flush: called\n")); 612 613 /* need explicit __mf() here since use volatile instead in library */ 614 KMP_MB(); /* Flush all pending memory write invalidates. */ 615 616 #if (KMP_ARCH_X86 || KMP_ARCH_X86_64) 617 #if KMP_MIC 618 // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used. 619 // We shouldn't need it, though, since the ABI rules require that 620 // * If the compiler generates NGO stores it also generates the fence 621 // * If users hand-code NGO stores they should insert the fence 622 // therefore no incomplete unordered stores should be visible. 
#else
  // C74404
  // This is to address non-temporal store instructions (sfence needed).
  // The clflush instruction also needs to be addressed (mfence needed).
  // Probably the non-temporal load movntdqa instruction should also be
  // addressed.
  // mfence is an SSE2 instruction. Do not execute it if the CPU is not SSE2.
  if (!__kmp_cpuinfo.initialized) {
    __kmp_query_cpuid(&__kmp_cpuinfo);
  }
  if (!__kmp_cpuinfo.sse2) {
    // CPU cannot execute SSE2 instructions.
  } else {
#if KMP_COMPILER_ICC
    _mm_mfence();
#elif KMP_COMPILER_MSVC
    MemoryBarrier();
#else
    __sync_synchronize();
#endif // KMP_COMPILER_ICC
  }
#endif // KMP_MIC
#elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64)
// Nothing to see here; move along.
#elif KMP_ARCH_PPC64
// Nothing needed here (we have a real MB above).
#if KMP_OS_CNK
  // The flushing thread needs to yield here; this prevents a
  // busy-waiting thread from saturating the pipeline. flush is
  // often used in loops like this:
  //   while (!flag) {
  //     #pragma omp flush(flag)
  //   }
  // and adding the yield here is good for at least a 10x speedup
  // when running >2 threads per core (on the NAS LU benchmark).
  __kmp_yield(TRUE);
#endif
#else
#error Unknown or unsupported architecture
#endif

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_flush) {
    ompt_callbacks.ompt_callback(ompt_callback_flush)(
        __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
  }
#endif
}

/* -------------------------------------------------------------------------- */
/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid thread id.

Execute a barrier.
*/
void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
  KMP_COUNT_BLOCK(OMP_BARRIER);
  KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  if (__kmp_env_consistency_check) {
    if (loc == 0) {
      KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
    }

    __kmp_check_barrier(global_tid, ct_barrier, loc);
  }

#if OMPT_SUPPORT
  omp_frame_t *ompt_frame;
  if (ompt_enabled.enabled) {
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    if (ompt_frame->enter_frame == NULL)
      ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
    OMPT_STORE_RETURN_ADDRESS(global_tid);
  }
#endif
  __kmp_threads[global_tid]->th.th_ident = loc;
  // TODO: explicit barrier_wait_id:
  // this function is called when the 'barrier' directive is present or at the
  // implicit barrier at the end of a worksharing construct.
  // 1) better to add a per-thread barrier counter to a thread data structure
  // 2) set to 0 when a new team is created
  // 3) no sync is required

  __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    ompt_frame->enter_frame = NULL;
  }
#endif
}

/* The BARRIER for a MASTER section is always explicit */
/*!
@ingroup WORK_SHARING
@param loc source location information.
@param global_tid global thread number.
@return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
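
As a rough illustration (a sketch, not the exact code any particular compiler
emits), a <tt>master</tt> construct is typically lowered into a guarded call
pair:
@code
if (__kmpc_master(&loc, gtid)) {
  // ... body of the master region, executed by the master thread only ...
  __kmpc_end_master(&loc, gtid);
}
@endcode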
726 */ 727 kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) { 728 int status = 0; 729 730 KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid)); 731 732 if (!TCR_4(__kmp_init_parallel)) 733 __kmp_parallel_initialize(); 734 735 if (KMP_MASTER_GTID(global_tid)) { 736 KMP_COUNT_BLOCK(OMP_MASTER); 737 KMP_PUSH_PARTITIONED_TIMER(OMP_master); 738 status = 1; 739 } 740 741 #if OMPT_SUPPORT && OMPT_OPTIONAL 742 if (status) { 743 if (ompt_enabled.ompt_callback_master) { 744 kmp_info_t *this_thr = __kmp_threads[global_tid]; 745 kmp_team_t *team = this_thr->th.th_team; 746 747 int tid = __kmp_tid_from_gtid(global_tid); 748 ompt_callbacks.ompt_callback(ompt_callback_master)( 749 ompt_scope_begin, &(team->t.ompt_team_info.parallel_data), 750 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 751 OMPT_GET_RETURN_ADDRESS(0)); 752 } 753 } 754 #endif 755 756 if (__kmp_env_consistency_check) { 757 #if KMP_USE_DYNAMIC_LOCK 758 if (status) 759 __kmp_push_sync(global_tid, ct_master, loc, NULL, 0); 760 else 761 __kmp_check_sync(global_tid, ct_master, loc, NULL, 0); 762 #else 763 if (status) 764 __kmp_push_sync(global_tid, ct_master, loc, NULL); 765 else 766 __kmp_check_sync(global_tid, ct_master, loc, NULL); 767 #endif 768 } 769 770 return status; 771 } 772 773 /*! 774 @ingroup WORK_SHARING 775 @param loc source location information. 776 @param global_tid global thread number . 777 778 Mark the end of a <tt>master</tt> region. This should only be called by the 779 thread that executes the <tt>master</tt> region. 780 */ 781 void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) { 782 KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid)); 783 784 KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid)); 785 KMP_POP_PARTITIONED_TIMER(); 786 787 #if OMPT_SUPPORT && OMPT_OPTIONAL 788 kmp_info_t *this_thr = __kmp_threads[global_tid]; 789 kmp_team_t *team = this_thr->th.th_team; 790 if (ompt_enabled.ompt_callback_master) { 791 int tid = __kmp_tid_from_gtid(global_tid); 792 ompt_callbacks.ompt_callback(ompt_callback_master)( 793 ompt_scope_end, &(team->t.ompt_team_info.parallel_data), 794 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 795 OMPT_GET_RETURN_ADDRESS(0)); 796 } 797 #endif 798 799 if (__kmp_env_consistency_check) { 800 if (global_tid < 0) 801 KMP_WARNING(ThreadIdentInvalid); 802 803 if (KMP_MASTER_GTID(global_tid)) 804 __kmp_pop_sync(global_tid, ct_master, loc); 805 } 806 } 807 808 /*! 809 @ingroup WORK_SHARING 810 @param loc source location information. 811 @param gtid global thread number. 812 813 Start execution of an <tt>ordered</tt> construct. 
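
For illustration only (assuming the enclosing loop was compiled with an
<tt>ordered</tt> clause), the ordered region is bracketed by a call pair of
this shape:
@code
__kmpc_ordered(&loc, gtid);
// ... statements that must execute in iteration order ...
__kmpc_end_ordered(&loc, gtid);
@endcode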
*/
void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
  int cid = 0;
  kmp_info_t *th;
  KMP_DEBUG_ASSERT(__kmp_init_serial);

  KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

#if USE_ITT_BUILD
  __kmp_itt_ordered_prep(gtid);
// TODO: ordered_wait_id
#endif /* USE_ITT_BUILD */

  th = __kmp_threads[gtid];

#if OMPT_SUPPORT && OMPT_OPTIONAL
  kmp_team_t *team;
  omp_wait_id_t lck;
  void *codeptr_ra;
  if (ompt_enabled.enabled) {
    OMPT_STORE_RETURN_ADDRESS(gtid);
    team = __kmp_team_from_gtid(gtid);
    lck = (omp_wait_id_t)&team->t.t_ordered.dt.t_value;
    /* OMPT state update */
    th->th.ompt_thread_info.wait_id = lck;
    th->th.ompt_thread_info.state = omp_state_wait_ordered;

    /* OMPT event callback */
    codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
    if (ompt_enabled.ompt_callback_mutex_acquire) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
          ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin,
          (omp_wait_id_t)lck, codeptr_ra);
    }
  }
#endif

  if (th->th.th_dispatch->th_deo_fcn != 0)
    (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
  else
    __kmp_parallel_deo(&gtid, &cid, loc);

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    /* OMPT state update */
    th->th.ompt_thread_info.state = omp_state_work_parallel;
    th->th.ompt_thread_info.wait_id = 0;

    /* OMPT event callback */
    if (ompt_enabled.ompt_callback_mutex_acquired) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
          ompt_mutex_ordered, (omp_wait_id_t)lck, codeptr_ra);
    }
  }
#endif

#if USE_ITT_BUILD
  __kmp_itt_ordered_start(gtid);
#endif /* USE_ITT_BUILD */
}

/*!
@ingroup WORK_SHARING
@param loc source location information.
@param gtid global thread number.

End execution of an <tt>ordered</tt> construct.
*/
void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
  int cid = 0;
  kmp_info_t *th;

  KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));

#if USE_ITT_BUILD
  __kmp_itt_ordered_end(gtid);
// TODO: ordered_wait_id
#endif /* USE_ITT_BUILD */

  th = __kmp_threads[gtid];

  if (th->th.th_dispatch->th_dxo_fcn != 0)
    (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
  else
    __kmp_parallel_dxo(&gtid, &cid, loc);

#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
  if (ompt_enabled.ompt_callback_mutex_released) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
        ompt_mutex_ordered,
        (omp_wait_id_t)&__kmp_team_from_gtid(gtid)->t.t_ordered.dt.t_value,
        OMPT_LOAD_RETURN_ADDRESS(gtid));
  }
#endif
}

#if KMP_USE_DYNAMIC_LOCK

static __forceinline void
__kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
                          kmp_int32 gtid, kmp_indirect_locktag_t tag) {
  // Pointer to the allocated indirect lock is written to crit, while indexing
  // is ignored.
921 void *idx; 922 kmp_indirect_lock_t **lck; 923 lck = (kmp_indirect_lock_t **)crit; 924 kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag); 925 KMP_I_LOCK_FUNC(ilk, init)(ilk->lock); 926 KMP_SET_I_LOCK_LOCATION(ilk, loc); 927 KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section); 928 KA_TRACE(20, 929 ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag)); 930 #if USE_ITT_BUILD 931 __kmp_itt_critical_creating(ilk->lock, loc); 932 #endif 933 int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk); 934 if (status == 0) { 935 #if USE_ITT_BUILD 936 __kmp_itt_critical_destroyed(ilk->lock); 937 #endif 938 // We don't really need to destroy the unclaimed lock here since it will be 939 // cleaned up at program exit. 940 // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx); 941 } 942 KMP_DEBUG_ASSERT(*lck != NULL); 943 } 944 945 // Fast-path acquire tas lock 946 #define KMP_ACQUIRE_TAS_LOCK(lock, gtid) \ 947 { \ 948 kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock; \ 949 kmp_int32 tas_free = KMP_LOCK_FREE(tas); \ 950 kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas); \ 951 if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free || \ 952 !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) { \ 953 kmp_uint32 spins; \ 954 KMP_FSYNC_PREPARE(l); \ 955 KMP_INIT_YIELD(spins); \ 956 if (TCR_4(__kmp_nth) > \ 957 (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) { \ 958 KMP_YIELD(TRUE); \ 959 } else { \ 960 KMP_YIELD_SPIN(spins); \ 961 } \ 962 kmp_backoff_t backoff = __kmp_spin_backoff_params; \ 963 while ( \ 964 KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free || \ 965 !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) { \ 966 __kmp_spin_backoff(&backoff); \ 967 if (TCR_4(__kmp_nth) > \ 968 (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) { \ 969 KMP_YIELD(TRUE); \ 970 } else { \ 971 KMP_YIELD_SPIN(spins); \ 972 } \ 973 } \ 974 } \ 975 KMP_FSYNC_ACQUIRED(l); \ 976 } 977 978 // Fast-path test tas lock 979 #define KMP_TEST_TAS_LOCK(lock, gtid, rc) \ 980 { \ 981 kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock; \ 982 kmp_int32 tas_free = KMP_LOCK_FREE(tas); \ 983 kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas); \ 984 rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free && \ 985 __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy); \ 986 } 987 988 // Fast-path release tas lock 989 #define KMP_RELEASE_TAS_LOCK(lock, gtid) \ 990 { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); } 991 992 #if KMP_USE_FUTEX 993 994 #include <sys/syscall.h> 995 #include <unistd.h> 996 #ifndef FUTEX_WAIT 997 #define FUTEX_WAIT 0 998 #endif 999 #ifndef FUTEX_WAKE 1000 #define FUTEX_WAKE 1 1001 #endif 1002 1003 // Fast-path acquire futex lock 1004 #define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid) \ 1005 { \ 1006 kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \ 1007 kmp_int32 gtid_code = (gtid + 1) << 1; \ 1008 KMP_MB(); \ 1009 KMP_FSYNC_PREPARE(ftx); \ 1010 kmp_int32 poll_val; \ 1011 while ((poll_val = KMP_COMPARE_AND_STORE_RET32( \ 1012 &(ftx->lk.poll), KMP_LOCK_FREE(futex), \ 1013 KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) { \ 1014 kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1; \ 1015 if (!cond) { \ 1016 if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val, \ 1017 poll_val | \ 1018 KMP_LOCK_BUSY(1, futex))) { \ 1019 continue; \ 1020 } \ 1021 poll_val |= KMP_LOCK_BUSY(1, futex); \ 1022 } \ 1023 kmp_int32 rc; \ 1024 if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val, \ 1025 NULL, NULL, 0)) != 0) { \ 1026 continue; \ 
1027 } \ 1028 gtid_code |= 1; \ 1029 } \ 1030 KMP_FSYNC_ACQUIRED(ftx); \ 1031 } 1032 1033 // Fast-path test futex lock 1034 #define KMP_TEST_FUTEX_LOCK(lock, gtid, rc) \ 1035 { \ 1036 kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \ 1037 if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex), \ 1038 KMP_LOCK_BUSY(gtid + 1 << 1, futex))) { \ 1039 KMP_FSYNC_ACQUIRED(ftx); \ 1040 rc = TRUE; \ 1041 } else { \ 1042 rc = FALSE; \ 1043 } \ 1044 } 1045 1046 // Fast-path release futex lock 1047 #define KMP_RELEASE_FUTEX_LOCK(lock, gtid) \ 1048 { \ 1049 kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \ 1050 KMP_MB(); \ 1051 KMP_FSYNC_RELEASING(ftx); \ 1052 kmp_int32 poll_val = \ 1053 KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex)); \ 1054 if (KMP_LOCK_STRIP(poll_val) & 1) { \ 1055 syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE, \ 1056 KMP_LOCK_BUSY(1, futex), NULL, NULL, 0); \ 1057 } \ 1058 KMP_MB(); \ 1059 KMP_YIELD(TCR_4(__kmp_nth) > \ 1060 (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)); \ 1061 } 1062 1063 #endif // KMP_USE_FUTEX 1064 1065 #else // KMP_USE_DYNAMIC_LOCK 1066 1067 static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit, 1068 ident_t const *loc, 1069 kmp_int32 gtid) { 1070 kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit; 1071 1072 // Because of the double-check, the following load doesn't need to be volatile 1073 kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp); 1074 1075 if (lck == NULL) { 1076 void *idx; 1077 1078 // Allocate & initialize the lock. 1079 // Remember alloc'ed locks in table in order to free them in __kmp_cleanup() 1080 lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section); 1081 __kmp_init_user_lock_with_checks(lck); 1082 __kmp_set_user_lock_location(lck, loc); 1083 #if USE_ITT_BUILD 1084 __kmp_itt_critical_creating(lck); 1085 // __kmp_itt_critical_creating() should be called *before* the first usage 1086 // of underlying lock. It is the only place where we can guarantee it. There 1087 // are chances the lock will destroyed with no usage, but it is not a 1088 // problem, because this is not real event seen by user but rather setting 1089 // name for object (lock). See more details in kmp_itt.h. 1090 #endif /* USE_ITT_BUILD */ 1091 1092 // Use a cmpxchg instruction to slam the start of the critical section with 1093 // the lock pointer. If another thread beat us to it, deallocate the lock, 1094 // and use the lock that the other thread allocated. 1095 int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck); 1096 1097 if (status == 0) { 1098 // Deallocate the lock and reload the value. 1099 #if USE_ITT_BUILD 1100 __kmp_itt_critical_destroyed(lck); 1101 // Let ITT know the lock is destroyed and the same memory location may be reused 1102 // for another purpose. 1103 #endif /* USE_ITT_BUILD */ 1104 __kmp_destroy_user_lock_with_checks(lck); 1105 __kmp_user_lock_free(&idx, gtid, lck); 1106 lck = (kmp_user_lock_p)TCR_PTR(*lck_pp); 1107 KMP_DEBUG_ASSERT(lck != NULL); 1108 } 1109 } 1110 return lck; 1111 } 1112 1113 #endif // KMP_USE_DYNAMIC_LOCK 1114 1115 /*! 1116 @ingroup WORK_SHARING 1117 @param loc source location information. 1118 @param global_tid global thread number . 1119 @param crit identity of the critical section. This could be a pointer to a lock 1120 associated with the critical section, or some other suitably unique value. 1121 1122 Enter code protected by a `critical` construct. 1123 This function blocks until the executing thread can enter the critical section. 
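
As a sketch of typical usage (the lock variable is normally a compiler-generated,
zero-initialized static; the name below is assumed, for illustration only):
@code
static kmp_critical_name lck_name; // one per named critical section
__kmpc_critical(&loc, gtid, &lck_name);
// ... code protected by the critical construct ...
__kmpc_end_critical(&loc, gtid, &lck_name);
@endcode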
*/
void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
                     kmp_critical_name *crit) {
#if KMP_USE_DYNAMIC_LOCK
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(global_tid);
#endif // OMPT_SUPPORT
  __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
#else
  KMP_COUNT_BLOCK(OMP_CRITICAL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  omp_state_t prev_state = omp_state_undefined;
  ompt_thread_info_t ti;
#endif
  kmp_user_lock_p lck;

  KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));

  // TODO: add THR_OVHD_STATE

  KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
  KMP_CHECK_USER_LOCK_INIT();

  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
    lck = (kmp_user_lock_p)crit;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
    lck = (kmp_user_lock_p)crit;
  }
#endif
  else { // ticket, queuing or drdpa
    lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
  }

  if (__kmp_env_consistency_check)
    __kmp_push_sync(global_tid, ct_critical, loc, lck);

  // Since the critical directive binds to all threads, not just the current
  // team, we have to check this even if we are in a serialized team.
  // Also, even if we are the uber thread, we still have to acquire the lock,
  // as we have to contend with sibling threads.

#if USE_ITT_BUILD
  __kmp_itt_critical_acquiring(lck);
#endif /* USE_ITT_BUILD */
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(global_tid);
  void *codeptr_ra = NULL;
  if (ompt_enabled.enabled) {
    ti = __kmp_threads[global_tid]->th.ompt_thread_info;
    /* OMPT state update */
    prev_state = ti.state;
    ti.wait_id = (omp_wait_id_t)lck;
    ti.state = omp_state_wait_critical;

    /* OMPT event callback */
    codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(global_tid);
    if (ompt_enabled.ompt_callback_mutex_acquire) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
          ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
          (omp_wait_id_t)crit, codeptr_ra);
    }
  }
#endif
  // Value of 'crit' should be good for using as a critical_id of the critical
  // section directive.
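  // The acquire below blocks until this thread owns the lock for this
  // critical section.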
1193 __kmp_acquire_user_lock_with_checks(lck, global_tid); 1194 1195 #if USE_ITT_BUILD 1196 __kmp_itt_critical_acquired(lck); 1197 #endif /* USE_ITT_BUILD */ 1198 #if OMPT_SUPPORT && OMPT_OPTIONAL 1199 if (ompt_enabled.enabled) { 1200 /* OMPT state update */ 1201 ti.state = prev_state; 1202 ti.wait_id = 0; 1203 1204 /* OMPT event callback */ 1205 if (ompt_enabled.ompt_callback_mutex_acquired) { 1206 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 1207 ompt_mutex_critical, (omp_wait_id_t)crit, codeptr_ra); 1208 } 1209 } 1210 #endif 1211 KMP_POP_PARTITIONED_TIMER(); 1212 1213 KMP_PUSH_PARTITIONED_TIMER(OMP_critical); 1214 KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid)); 1215 #endif // KMP_USE_DYNAMIC_LOCK 1216 } 1217 1218 #if KMP_USE_DYNAMIC_LOCK 1219 1220 // Converts the given hint to an internal lock implementation 1221 static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) { 1222 #if KMP_USE_TSX 1223 #define KMP_TSX_LOCK(seq) lockseq_##seq 1224 #else 1225 #define KMP_TSX_LOCK(seq) __kmp_user_lock_seq 1226 #endif 1227 1228 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 1229 #define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm) 1230 #else 1231 #define KMP_CPUINFO_RTM 0 1232 #endif 1233 1234 // Hints that do not require further logic 1235 if (hint & kmp_lock_hint_hle) 1236 return KMP_TSX_LOCK(hle); 1237 if (hint & kmp_lock_hint_rtm) 1238 return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm) : __kmp_user_lock_seq; 1239 if (hint & kmp_lock_hint_adaptive) 1240 return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq; 1241 1242 // Rule out conflicting hints first by returning the default lock 1243 if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended)) 1244 return __kmp_user_lock_seq; 1245 if ((hint & omp_lock_hint_speculative) && 1246 (hint & omp_lock_hint_nonspeculative)) 1247 return __kmp_user_lock_seq; 1248 1249 // Do not even consider speculation when it appears to be contended 1250 if (hint & omp_lock_hint_contended) 1251 return lockseq_queuing; 1252 1253 // Uncontended lock without speculation 1254 if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative)) 1255 return lockseq_tas; 1256 1257 // HLE lock for speculation 1258 if (hint & omp_lock_hint_speculative) 1259 return KMP_TSX_LOCK(hle); 1260 1261 return __kmp_user_lock_seq; 1262 } 1263 1264 #if OMPT_SUPPORT && OMPT_OPTIONAL 1265 #if KMP_USE_DYNAMIC_LOCK 1266 static kmp_mutex_impl_t 1267 __ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) { 1268 if (user_lock) { 1269 switch (KMP_EXTRACT_D_TAG(user_lock)) { 1270 case 0: 1271 break; 1272 #if KMP_USE_FUTEX 1273 case locktag_futex: 1274 return kmp_mutex_impl_queuing; 1275 #endif 1276 case locktag_tas: 1277 return kmp_mutex_impl_spin; 1278 #if KMP_USE_TSX 1279 case locktag_hle: 1280 return kmp_mutex_impl_speculative; 1281 #endif 1282 default: 1283 return ompt_mutex_impl_unknown; 1284 } 1285 ilock = KMP_LOOKUP_I_LOCK(user_lock); 1286 } 1287 KMP_ASSERT(ilock); 1288 switch (ilock->type) { 1289 #if KMP_USE_TSX 1290 case locktag_adaptive: 1291 case locktag_rtm: 1292 return kmp_mutex_impl_speculative; 1293 #endif 1294 case locktag_nested_tas: 1295 return kmp_mutex_impl_spin; 1296 #if KMP_USE_FUTEX 1297 case locktag_nested_futex: 1298 #endif 1299 case locktag_ticket: 1300 case locktag_queuing: 1301 case locktag_drdpa: 1302 case locktag_nested_ticket: 1303 case locktag_nested_queuing: 1304 case locktag_nested_drdpa: 1305 return kmp_mutex_impl_queuing; 1306 default: 1307 return ompt_mutex_impl_unknown; 1308 } 
1309 } 1310 #else 1311 // For locks without dynamic binding 1312 static kmp_mutex_impl_t __ompt_get_mutex_impl_type() { 1313 switch (__kmp_user_lock_kind) { 1314 case lk_tas: 1315 return kmp_mutex_impl_spin; 1316 #if KMP_USE_FUTEX 1317 case lk_futex: 1318 #endif 1319 case lk_ticket: 1320 case lk_queuing: 1321 case lk_drdpa: 1322 return kmp_mutex_impl_queuing; 1323 #if KMP_USE_TSX 1324 case lk_hle: 1325 case lk_rtm: 1326 case lk_adaptive: 1327 return kmp_mutex_impl_speculative; 1328 #endif 1329 default: 1330 return ompt_mutex_impl_unknown; 1331 } 1332 } 1333 #endif // KMP_USE_DYNAMIC_LOCK 1334 #endif // OMPT_SUPPORT && OMPT_OPTIONAL 1335 1336 /*! 1337 @ingroup WORK_SHARING 1338 @param loc source location information. 1339 @param global_tid global thread number. 1340 @param crit identity of the critical section. This could be a pointer to a lock 1341 associated with the critical section, or some other suitably unique value. 1342 @param hint the lock hint. 1343 1344 Enter code protected by a `critical` construct with a hint. The hint value is 1345 used to suggest a lock implementation. This function blocks until the executing 1346 thread can enter the critical section unless the hint suggests use of 1347 speculative execution and the hardware supports it. 1348 */ 1349 void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid, 1350 kmp_critical_name *crit, uintptr_t hint) { 1351 KMP_COUNT_BLOCK(OMP_CRITICAL); 1352 kmp_user_lock_p lck; 1353 #if OMPT_SUPPORT && OMPT_OPTIONAL 1354 omp_state_t prev_state = omp_state_undefined; 1355 ompt_thread_info_t ti; 1356 // This is the case, if called from __kmpc_critical: 1357 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid); 1358 if (!codeptr) 1359 codeptr = OMPT_GET_RETURN_ADDRESS(0); 1360 #endif 1361 1362 KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid)); 1363 1364 kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit; 1365 // Check if it is initialized. 1366 KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait); 1367 if (*lk == 0) { 1368 kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint); 1369 if (KMP_IS_D_LOCK(lckseq)) { 1370 KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0, 1371 KMP_GET_D_TAG(lckseq)); 1372 } else { 1373 __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq)); 1374 } 1375 } 1376 // Branch for accessing the actual lock object and set operation. This 1377 // branching is inevitable since this lock initialization does not follow the 1378 // normal dispatch path (lock table is not used). 
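  // A non-zero direct-lock tag in the low bits means the lock itself (e.g. a
  // TAS or futex lock) is stored in *crit; a zero tag means *crit holds a
  // pointer to a separately allocated indirect lock object.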
1379 if (KMP_EXTRACT_D_TAG(lk) != 0) { 1380 lck = (kmp_user_lock_p)lk; 1381 if (__kmp_env_consistency_check) { 1382 __kmp_push_sync(global_tid, ct_critical, loc, lck, 1383 __kmp_map_hint_to_lock(hint)); 1384 } 1385 #if USE_ITT_BUILD 1386 __kmp_itt_critical_acquiring(lck); 1387 #endif 1388 #if OMPT_SUPPORT && OMPT_OPTIONAL 1389 if (ompt_enabled.enabled) { 1390 ti = __kmp_threads[global_tid]->th.ompt_thread_info; 1391 /* OMPT state update */ 1392 prev_state = ti.state; 1393 ti.wait_id = (omp_wait_id_t)lck; 1394 ti.state = omp_state_wait_critical; 1395 1396 /* OMPT event callback */ 1397 if (ompt_enabled.ompt_callback_mutex_acquire) { 1398 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 1399 ompt_mutex_critical, (unsigned int)hint, 1400 __ompt_get_mutex_impl_type(crit), (omp_wait_id_t)crit, codeptr); 1401 } 1402 } 1403 #endif 1404 #if KMP_USE_INLINED_TAS 1405 if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) { 1406 KMP_ACQUIRE_TAS_LOCK(lck, global_tid); 1407 } else 1408 #elif KMP_USE_INLINED_FUTEX 1409 if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) { 1410 KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid); 1411 } else 1412 #endif 1413 { 1414 KMP_D_LOCK_FUNC(lk, set)(lk, global_tid); 1415 } 1416 } else { 1417 kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk); 1418 lck = ilk->lock; 1419 if (__kmp_env_consistency_check) { 1420 __kmp_push_sync(global_tid, ct_critical, loc, lck, 1421 __kmp_map_hint_to_lock(hint)); 1422 } 1423 #if USE_ITT_BUILD 1424 __kmp_itt_critical_acquiring(lck); 1425 #endif 1426 #if OMPT_SUPPORT && OMPT_OPTIONAL 1427 if (ompt_enabled.enabled) { 1428 ti = __kmp_threads[global_tid]->th.ompt_thread_info; 1429 /* OMPT state update */ 1430 prev_state = ti.state; 1431 ti.wait_id = (omp_wait_id_t)lck; 1432 ti.state = omp_state_wait_critical; 1433 1434 /* OMPT event callback */ 1435 if (ompt_enabled.ompt_callback_mutex_acquire) { 1436 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 1437 ompt_mutex_critical, (unsigned int)hint, 1438 __ompt_get_mutex_impl_type(0, ilk), (omp_wait_id_t)crit, codeptr); 1439 } 1440 } 1441 #endif 1442 KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid); 1443 } 1444 KMP_POP_PARTITIONED_TIMER(); 1445 1446 #if USE_ITT_BUILD 1447 __kmp_itt_critical_acquired(lck); 1448 #endif /* USE_ITT_BUILD */ 1449 #if OMPT_SUPPORT && OMPT_OPTIONAL 1450 if (ompt_enabled.enabled) { 1451 /* OMPT state update */ 1452 ti.state = prev_state; 1453 ti.wait_id = 0; 1454 1455 /* OMPT event callback */ 1456 if (ompt_enabled.ompt_callback_mutex_acquired) { 1457 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 1458 ompt_mutex_critical, (omp_wait_id_t)crit, codeptr); 1459 } 1460 } 1461 #endif 1462 1463 KMP_PUSH_PARTITIONED_TIMER(OMP_critical); 1464 KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid)); 1465 } // __kmpc_critical_with_hint 1466 1467 #endif // KMP_USE_DYNAMIC_LOCK 1468 1469 /*! 1470 @ingroup WORK_SHARING 1471 @param loc source location information. 1472 @param global_tid global thread number . 1473 @param crit identity of the critical section. This could be a pointer to a lock 1474 associated with the critical section, or some other suitably unique value. 1475 1476 Leave a critical section, releasing any lock that was held during its execution. 
1477 */ 1478 void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid, 1479 kmp_critical_name *crit) { 1480 kmp_user_lock_p lck; 1481 1482 KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid)); 1483 1484 #if KMP_USE_DYNAMIC_LOCK 1485 if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) { 1486 lck = (kmp_user_lock_p)crit; 1487 KMP_ASSERT(lck != NULL); 1488 if (__kmp_env_consistency_check) { 1489 __kmp_pop_sync(global_tid, ct_critical, loc); 1490 } 1491 #if USE_ITT_BUILD 1492 __kmp_itt_critical_releasing(lck); 1493 #endif 1494 #if KMP_USE_INLINED_TAS 1495 if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) { 1496 KMP_RELEASE_TAS_LOCK(lck, global_tid); 1497 } else 1498 #elif KMP_USE_INLINED_FUTEX 1499 if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) { 1500 KMP_RELEASE_FUTEX_LOCK(lck, global_tid); 1501 } else 1502 #endif 1503 { 1504 KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid); 1505 } 1506 } else { 1507 kmp_indirect_lock_t *ilk = 1508 (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit)); 1509 KMP_ASSERT(ilk != NULL); 1510 lck = ilk->lock; 1511 if (__kmp_env_consistency_check) { 1512 __kmp_pop_sync(global_tid, ct_critical, loc); 1513 } 1514 #if USE_ITT_BUILD 1515 __kmp_itt_critical_releasing(lck); 1516 #endif 1517 KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid); 1518 } 1519 1520 #else // KMP_USE_DYNAMIC_LOCK 1521 1522 if ((__kmp_user_lock_kind == lk_tas) && 1523 (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) { 1524 lck = (kmp_user_lock_p)crit; 1525 } 1526 #if KMP_USE_FUTEX 1527 else if ((__kmp_user_lock_kind == lk_futex) && 1528 (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) { 1529 lck = (kmp_user_lock_p)crit; 1530 } 1531 #endif 1532 else { // ticket, queuing or drdpa 1533 lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit)); 1534 } 1535 1536 KMP_ASSERT(lck != NULL); 1537 1538 if (__kmp_env_consistency_check) 1539 __kmp_pop_sync(global_tid, ct_critical, loc); 1540 1541 #if USE_ITT_BUILD 1542 __kmp_itt_critical_releasing(lck); 1543 #endif /* USE_ITT_BUILD */ 1544 // Value of 'crit' should be good for using as a critical_id of the critical 1545 // section directive. 1546 __kmp_release_user_lock_with_checks(lck, global_tid); 1547 1548 #endif // KMP_USE_DYNAMIC_LOCK 1549 1550 #if OMPT_SUPPORT && OMPT_OPTIONAL 1551 /* OMPT release event triggers after lock is released; place here to trigger 1552 * for all #if branches */ 1553 OMPT_STORE_RETURN_ADDRESS(global_tid); 1554 if (ompt_enabled.ompt_callback_mutex_released) { 1555 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( 1556 ompt_mutex_critical, (omp_wait_id_t)crit, OMPT_LOAD_RETURN_ADDRESS(0)); 1557 } 1558 #endif 1559 1560 KMP_POP_PARTITIONED_TIMER(); 1561 KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid)); 1562 } 1563 1564 /*! 1565 @ingroup SYNCHRONIZATION 1566 @param loc source location information 1567 @param global_tid thread id. 1568 @return one if the thread should execute the master block, zero otherwise 1569 1570 Start execution of a combined barrier and master. The barrier is executed inside 1571 this function. 
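
A minimal sketch of the expected calling pattern (illustrative only):
@code
if (__kmpc_barrier_master(&loc, gtid)) {
  // ... code executed by the master thread while the other threads wait ...
  __kmpc_end_barrier_master(&loc, gtid); // releases the waiting threads
}
@endcode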
1572 */ 1573 kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) { 1574 int status; 1575 1576 KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid)); 1577 1578 if (!TCR_4(__kmp_init_parallel)) 1579 __kmp_parallel_initialize(); 1580 1581 if (__kmp_env_consistency_check) 1582 __kmp_check_barrier(global_tid, ct_barrier, loc); 1583 1584 #if OMPT_SUPPORT 1585 omp_frame_t *ompt_frame; 1586 if (ompt_enabled.enabled) { 1587 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); 1588 if (ompt_frame->enter_frame == NULL) 1589 ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1); 1590 OMPT_STORE_RETURN_ADDRESS(global_tid); 1591 } 1592 #endif 1593 #if USE_ITT_NOTIFY 1594 __kmp_threads[global_tid]->th.th_ident = loc; 1595 #endif 1596 status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL); 1597 #if OMPT_SUPPORT && OMPT_OPTIONAL 1598 if (ompt_enabled.enabled) { 1599 ompt_frame->enter_frame = NULL; 1600 } 1601 #endif 1602 1603 return (status != 0) ? 0 : 1; 1604 } 1605 1606 /*! 1607 @ingroup SYNCHRONIZATION 1608 @param loc source location information 1609 @param global_tid thread id. 1610 1611 Complete the execution of a combined barrier and master. This function should 1612 only be called at the completion of the <tt>master</tt> code. Other threads will 1613 still be waiting at the barrier and this call releases them. 1614 */ 1615 void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) { 1616 KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid)); 1617 1618 __kmp_end_split_barrier(bs_plain_barrier, global_tid); 1619 } 1620 1621 /*! 1622 @ingroup SYNCHRONIZATION 1623 @param loc source location information 1624 @param global_tid thread id. 1625 @return one if the thread should execute the master block, zero otherwise 1626 1627 Start execution of a combined barrier and master(nowait) construct. 1628 The barrier is executed inside this function. 1629 There is no equivalent "end" function, since the 1630 */ 1631 kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) { 1632 kmp_int32 ret; 1633 1634 KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid)); 1635 1636 if (!TCR_4(__kmp_init_parallel)) 1637 __kmp_parallel_initialize(); 1638 1639 if (__kmp_env_consistency_check) { 1640 if (loc == 0) { 1641 KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user? 
1642 } 1643 __kmp_check_barrier(global_tid, ct_barrier, loc); 1644 } 1645 1646 #if OMPT_SUPPORT 1647 omp_frame_t *ompt_frame; 1648 if (ompt_enabled.enabled) { 1649 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); 1650 if (ompt_frame->enter_frame == NULL) 1651 ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1); 1652 OMPT_STORE_RETURN_ADDRESS(global_tid); 1653 } 1654 #endif 1655 #if USE_ITT_NOTIFY 1656 __kmp_threads[global_tid]->th.th_ident = loc; 1657 #endif 1658 __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL); 1659 #if OMPT_SUPPORT && OMPT_OPTIONAL 1660 if (ompt_enabled.enabled) { 1661 ompt_frame->enter_frame = NULL; 1662 } 1663 #endif 1664 1665 ret = __kmpc_master(loc, global_tid); 1666 1667 if (__kmp_env_consistency_check) { 1668 /* there's no __kmpc_end_master called; so the (stats) */ 1669 /* actions of __kmpc_end_master are done here */ 1670 1671 if (global_tid < 0) { 1672 KMP_WARNING(ThreadIdentInvalid); 1673 } 1674 if (ret) { 1675 /* only one thread should do the pop since only */ 1676 /* one did the push (see __kmpc_master()) */ 1677 1678 __kmp_pop_sync(global_tid, ct_master, loc); 1679 } 1680 } 1681 1682 return (ret); 1683 } 1684 1685 /* The BARRIER for a SINGLE process section is always explicit */ 1686 /*! 1687 @ingroup WORK_SHARING 1688 @param loc source location information 1689 @param global_tid global thread number 1690 @return One if this thread should execute the single construct, zero otherwise. 1691 1692 Test whether to execute a <tt>single</tt> construct. 1693 There are no implicit barriers in the two "single" calls, rather the compiler 1694 should introduce an explicit barrier if it is required. 1695 */ 1696 1697 kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) { 1698 kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE); 1699 1700 if (rc) { 1701 // We are going to execute the single statement, so we should count it. 1702 KMP_COUNT_BLOCK(OMP_SINGLE); 1703 KMP_PUSH_PARTITIONED_TIMER(OMP_single); 1704 } 1705 1706 #if OMPT_SUPPORT && OMPT_OPTIONAL 1707 kmp_info_t *this_thr = __kmp_threads[global_tid]; 1708 kmp_team_t *team = this_thr->th.th_team; 1709 int tid = __kmp_tid_from_gtid(global_tid); 1710 1711 if (ompt_enabled.enabled) { 1712 if (rc) { 1713 if (ompt_enabled.ompt_callback_work) { 1714 ompt_callbacks.ompt_callback(ompt_callback_work)( 1715 ompt_work_single_executor, ompt_scope_begin, 1716 &(team->t.ompt_team_info.parallel_data), 1717 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1718 1, OMPT_GET_RETURN_ADDRESS(0)); 1719 } 1720 } else { 1721 if (ompt_enabled.ompt_callback_work) { 1722 ompt_callbacks.ompt_callback(ompt_callback_work)( 1723 ompt_work_single_other, ompt_scope_begin, 1724 &(team->t.ompt_team_info.parallel_data), 1725 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1726 1, OMPT_GET_RETURN_ADDRESS(0)); 1727 ompt_callbacks.ompt_callback(ompt_callback_work)( 1728 ompt_work_single_other, ompt_scope_end, 1729 &(team->t.ompt_team_info.parallel_data), 1730 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1731 1, OMPT_GET_RETURN_ADDRESS(0)); 1732 } 1733 } 1734 } 1735 #endif 1736 1737 return rc; 1738 } 1739 1740 /*! 1741 @ingroup WORK_SHARING 1742 @param loc source location information 1743 @param global_tid global thread number 1744 1745 Mark the end of a <tt>single</tt> construct. This function should 1746 only be called by the thread that executed the block of code protected 1747 by the `single` construct. 
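
Illustrative lowering of a <tt>single</tt> construct (a sketch, not exact
compiler output):
@code
if (__kmpc_single(&loc, gtid)) {
  // ... body executed by exactly one thread ...
  __kmpc_end_single(&loc, gtid);
}
__kmpc_barrier(&loc, gtid); // added by the compiler unless 'nowait' was given
@endcode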
1748 */ 1749 void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) { 1750 __kmp_exit_single(global_tid); 1751 KMP_POP_PARTITIONED_TIMER(); 1752 1753 #if OMPT_SUPPORT && OMPT_OPTIONAL 1754 kmp_info_t *this_thr = __kmp_threads[global_tid]; 1755 kmp_team_t *team = this_thr->th.th_team; 1756 int tid = __kmp_tid_from_gtid(global_tid); 1757 1758 if (ompt_enabled.ompt_callback_work) { 1759 ompt_callbacks.ompt_callback(ompt_callback_work)( 1760 ompt_work_single_executor, ompt_scope_end, 1761 &(team->t.ompt_team_info.parallel_data), 1762 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1, 1763 OMPT_GET_RETURN_ADDRESS(0)); 1764 } 1765 #endif 1766 } 1767 1768 /*! 1769 @ingroup WORK_SHARING 1770 @param loc Source location 1771 @param global_tid Global thread id 1772 1773 Mark the end of a statically scheduled loop. 1774 */ 1775 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) { 1776 KMP_POP_PARTITIONED_TIMER(); 1777 KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid)); 1778 1779 #if OMPT_SUPPORT && OMPT_OPTIONAL 1780 if (ompt_enabled.ompt_callback_work) { 1781 ompt_work_type_t ompt_work_type = ompt_work_loop; 1782 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); 1783 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 1784 // Determine workshare type 1785 if (loc != NULL) { 1786 if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) { 1787 ompt_work_type = ompt_work_loop; 1788 } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) { 1789 ompt_work_type = ompt_work_sections; 1790 } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) { 1791 ompt_work_type = ompt_work_distribute; 1792 } else { 1793 // use default set above. 1794 // a warning about this case is provided in __kmpc_for_static_init 1795 } 1796 KMP_DEBUG_ASSERT(ompt_work_type); 1797 } 1798 ompt_callbacks.ompt_callback(ompt_callback_work)( 1799 ompt_work_type, ompt_scope_end, &(team_info->parallel_data), 1800 &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0)); 1801 } 1802 #endif 1803 if (__kmp_env_consistency_check) 1804 __kmp_pop_workshare(global_tid, ct_pdo, loc); 1805 } 1806 1807 // User routines which take C-style arguments (call by value) 1808 // different from the Fortran equivalent routines 1809 1810 void ompc_set_num_threads(int arg) { 1811 // !!!!! TODO: check the per-task binding 1812 __kmp_set_num_threads(arg, __kmp_entry_gtid()); 1813 } 1814 1815 void ompc_set_dynamic(int flag) { 1816 kmp_info_t *thread; 1817 1818 /* For the thread-private implementation of the internal controls */ 1819 thread = __kmp_entry_thread(); 1820 1821 __kmp_save_internal_controls(thread); 1822 1823 set__dynamic(thread, flag ? TRUE : FALSE); 1824 } 1825 1826 void ompc_set_nested(int flag) { 1827 kmp_info_t *thread; 1828 1829 /* For the thread-private internal controls implementation */ 1830 thread = __kmp_entry_thread(); 1831 1832 __kmp_save_internal_controls(thread); 1833 1834 set__nested(thread, flag ? TRUE : FALSE); 1835 } 1836 1837 void ompc_set_max_active_levels(int max_active_levels) { 1838 /* TO DO */ 1839 /* we want per-task implementation of this internal control */ 1840 1841 /* For the per-thread internal controls implementation */ 1842 __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels); 1843 } 1844 1845 void ompc_set_schedule(omp_sched_t kind, int modifier) { 1846 // !!!!! 
TODO: check the per-task binding 1847 __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier); 1848 } 1849 1850 int ompc_get_ancestor_thread_num(int level) { 1851 return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level); 1852 } 1853 1854 int ompc_get_team_size(int level) { 1855 return __kmp_get_team_size(__kmp_entry_gtid(), level); 1856 } 1857 1858 void kmpc_set_stacksize(int arg) { 1859 // __kmp_aux_set_stacksize initializes the library if needed 1860 __kmp_aux_set_stacksize(arg); 1861 } 1862 1863 void kmpc_set_stacksize_s(size_t arg) { 1864 // __kmp_aux_set_stacksize initializes the library if needed 1865 __kmp_aux_set_stacksize(arg); 1866 } 1867 1868 void kmpc_set_blocktime(int arg) { 1869 int gtid, tid; 1870 kmp_info_t *thread; 1871 1872 gtid = __kmp_entry_gtid(); 1873 tid = __kmp_tid_from_gtid(gtid); 1874 thread = __kmp_thread_from_gtid(gtid); 1875 1876 __kmp_aux_set_blocktime(arg, thread, tid); 1877 } 1878 1879 void kmpc_set_library(int arg) { 1880 // __kmp_user_set_library initializes the library if needed 1881 __kmp_user_set_library((enum library_type)arg); 1882 } 1883 1884 void kmpc_set_defaults(char const *str) { 1885 // __kmp_aux_set_defaults initializes the library if needed 1886 __kmp_aux_set_defaults(str, KMP_STRLEN(str)); 1887 } 1888 1889 void kmpc_set_disp_num_buffers(int arg) { 1890 // ignore after initialization because some teams have already 1891 // allocated dispatch buffers 1892 if (__kmp_init_serial == 0 && arg > 0) 1893 __kmp_dispatch_num_buffers = arg; 1894 } 1895 1896 int kmpc_set_affinity_mask_proc(int proc, void **mask) { 1897 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED 1898 return -1; 1899 #else 1900 if (!TCR_4(__kmp_init_middle)) { 1901 __kmp_middle_initialize(); 1902 } 1903 return __kmp_aux_set_affinity_mask_proc(proc, mask); 1904 #endif 1905 } 1906 1907 int kmpc_unset_affinity_mask_proc(int proc, void **mask) { 1908 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED 1909 return -1; 1910 #else 1911 if (!TCR_4(__kmp_init_middle)) { 1912 __kmp_middle_initialize(); 1913 } 1914 return __kmp_aux_unset_affinity_mask_proc(proc, mask); 1915 #endif 1916 } 1917 1918 int kmpc_get_affinity_mask_proc(int proc, void **mask) { 1919 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED 1920 return -1; 1921 #else 1922 if (!TCR_4(__kmp_init_middle)) { 1923 __kmp_middle_initialize(); 1924 } 1925 return __kmp_aux_get_affinity_mask_proc(proc, mask); 1926 #endif 1927 } 1928 1929 /* -------------------------------------------------------------------------- */ 1930 /*! 1931 @ingroup THREADPRIVATE 1932 @param loc source location information 1933 @param gtid global thread number 1934 @param cpy_size size of the cpy_data buffer 1935 @param cpy_data pointer to data to be copied 1936 @param cpy_func helper function to call for copying data 1937 @param didit flag variable: 1=single thread; 0=not single thread 1938 1939 __kmpc_copyprivate implements the interface for the private data broadcast 1940 needed for the copyprivate clause associated with a single region in an 1941 OpenMP<sup>*</sup> program (both C and Fortran). 1942 All threads participating in the parallel region call this routine. 1943 One of the threads (called the single thread) should have the <tt>didit</tt> 1944 variable set to 1 and all other threads should have that variable set to 0. 1945 All threads pass a pointer to a data buffer (cpy_data) that they have built. 1946 1947 The OpenMP specification forbids the use of nowait on the single region when a 1948 copyprivate clause is present. 
However, @ref __kmpc_copyprivate implements a 1949 barrier internally to avoid race conditions, so the code generation for the 1950 single region should avoid generating a barrier after the call to @ref 1951 __kmpc_copyprivate. 1952 1953 The <tt>gtid</tt> parameter is the global thread id for the current thread. 1954 The <tt>loc</tt> parameter is a pointer to source location information. 1955 1956 Internal implementation: The single thread will first copy its descriptor 1957 address (cpy_data) to a team-private location, then the other threads will each 1958 call the function pointed to by the parameter cpy_func, which carries out the 1959 copy by copying the data using the cpy_data buffer. 1960 1961 The cpy_func routine used for the copy and the contents of the data area defined 1962 by cpy_data and cpy_size may be built in any fashion that will allow the copy 1963 to be done. For instance, the cpy_data buffer can hold the actual data to be 1964 copied or it may hold a list of pointers to the data. The cpy_func routine must 1965 interpret the cpy_data buffer appropriately. 1966 1967 The interface to cpy_func is as follows: 1968 @code 1969 void cpy_func( void *destination, void *source ) 1970 @endcode 1971 where void *destination is the cpy_data pointer for the thread being copied to 1972 and void *source is the cpy_data pointer for the thread being copied from. 1973 */ 1974 void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size, 1975 void *cpy_data, void (*cpy_func)(void *, void *), 1976 kmp_int32 didit) { 1977 void **data_ptr; 1978 1979 KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid)); 1980 1981 KMP_MB(); 1982 1983 data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data; 1984 1985 if (__kmp_env_consistency_check) { 1986 if (loc == 0) { 1987 KMP_WARNING(ConstructIdentInvalid); 1988 } 1989 } 1990 1991 // ToDo: Optimize the following two barriers into some kind of split barrier 1992 1993 if (didit) 1994 *data_ptr = cpy_data; 1995 1996 #if OMPT_SUPPORT 1997 omp_frame_t *ompt_frame; 1998 if (ompt_enabled.enabled) { 1999 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); 2000 if (ompt_frame->enter_frame == NULL) 2001 ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1); 2002 OMPT_STORE_RETURN_ADDRESS(gtid); 2003 } 2004 #endif 2005 /* This barrier is not a barrier region boundary */ 2006 #if USE_ITT_NOTIFY 2007 __kmp_threads[gtid]->th.th_ident = loc; 2008 #endif 2009 __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); 2010 2011 if (!didit) 2012 (*cpy_func)(cpy_data, *data_ptr); 2013 2014 // Consider next barrier a user-visible barrier for barrier region boundaries 2015 // Nesting checks are already handled by the single construct checks 2016 2017 #if OMPT_SUPPORT 2018 if (ompt_enabled.enabled) { 2019 OMPT_STORE_RETURN_ADDRESS(gtid); 2020 } 2021 #endif 2022 #if USE_ITT_NOTIFY 2023 __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g. 
2024 // tasks can overwrite the location) 2025 #endif 2026 __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); 2027 #if OMPT_SUPPORT && OMPT_OPTIONAL 2028 if (ompt_enabled.enabled) { 2029 ompt_frame->enter_frame = NULL; 2030 } 2031 #endif 2032 } 2033 2034 /* -------------------------------------------------------------------------- */ 2035 2036 #define INIT_LOCK __kmp_init_user_lock_with_checks 2037 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks 2038 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks 2039 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed 2040 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks 2041 #define ACQUIRE_NESTED_LOCK_TIMED \ 2042 __kmp_acquire_nested_user_lock_with_checks_timed 2043 #define RELEASE_LOCK __kmp_release_user_lock_with_checks 2044 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks 2045 #define TEST_LOCK __kmp_test_user_lock_with_checks 2046 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks 2047 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks 2048 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks 2049 2050 // TODO: Make check abort messages use location info & pass it into 2051 // with_checks routines 2052 2053 #if KMP_USE_DYNAMIC_LOCK 2054 2055 // internal lock initializer 2056 static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock, 2057 kmp_dyna_lockseq_t seq) { 2058 if (KMP_IS_D_LOCK(seq)) { 2059 KMP_INIT_D_LOCK(lock, seq); 2060 #if USE_ITT_BUILD 2061 __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL); 2062 #endif 2063 } else { 2064 KMP_INIT_I_LOCK(lock, seq); 2065 #if USE_ITT_BUILD 2066 kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock); 2067 __kmp_itt_lock_creating(ilk->lock, loc); 2068 #endif 2069 } 2070 } 2071 2072 // internal nest lock initializer 2073 static __forceinline void 2074 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock, 2075 kmp_dyna_lockseq_t seq) { 2076 #if KMP_USE_TSX 2077 // Don't have nested lock implementation for speculative locks 2078 if (seq == lockseq_hle || seq == lockseq_rtm || seq == lockseq_adaptive) 2079 seq = __kmp_user_lock_seq; 2080 #endif 2081 switch (seq) { 2082 case lockseq_tas: 2083 seq = lockseq_nested_tas; 2084 break; 2085 #if KMP_USE_FUTEX 2086 case lockseq_futex: 2087 seq = lockseq_nested_futex; 2088 break; 2089 #endif 2090 case lockseq_ticket: 2091 seq = lockseq_nested_ticket; 2092 break; 2093 case lockseq_queuing: 2094 seq = lockseq_nested_queuing; 2095 break; 2096 case lockseq_drdpa: 2097 seq = lockseq_nested_drdpa; 2098 break; 2099 default: 2100 seq = lockseq_nested_queuing; 2101 } 2102 KMP_INIT_I_LOCK(lock, seq); 2103 #if USE_ITT_BUILD 2104 kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock); 2105 __kmp_itt_lock_creating(ilk->lock, loc); 2106 #endif 2107 } 2108 2109 /* initialize the lock with a hint */ 2110 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock, 2111 uintptr_t hint) { 2112 KMP_DEBUG_ASSERT(__kmp_init_serial); 2113 if (__kmp_env_consistency_check && user_lock == NULL) { 2114 KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint"); 2115 } 2116 2117 __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint)); 2118 2119 #if OMPT_SUPPORT && OMPT_OPTIONAL 2120 // This is the case, if called from omp_init_lock_with_hint: 2121 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2122 if (!codeptr) 2123 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2124 if (ompt_enabled.ompt_callback_lock_init) { 2125 
ompt_callbacks.ompt_callback(ompt_callback_lock_init)( 2126 ompt_mutex_lock, (omp_lock_hint_t)hint, 2127 __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock, 2128 codeptr); 2129 } 2130 #endif 2131 } 2132 2133 /* initialize the lock with a hint */ 2134 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid, 2135 void **user_lock, uintptr_t hint) { 2136 KMP_DEBUG_ASSERT(__kmp_init_serial); 2137 if (__kmp_env_consistency_check && user_lock == NULL) { 2138 KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint"); 2139 } 2140 2141 __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint)); 2142 2143 #if OMPT_SUPPORT && OMPT_OPTIONAL 2144 // This is the case, if called from omp_init_lock_with_hint: 2145 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2146 if (!codeptr) 2147 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2148 if (ompt_enabled.ompt_callback_lock_init) { 2149 ompt_callbacks.ompt_callback(ompt_callback_lock_init)( 2150 ompt_mutex_nest_lock, (omp_lock_hint_t)hint, 2151 __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock, 2152 codeptr); 2153 } 2154 #endif 2155 } 2156 2157 #endif // KMP_USE_DYNAMIC_LOCK 2158 2159 /* initialize the lock */ 2160 void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2161 #if KMP_USE_DYNAMIC_LOCK 2162 2163 KMP_DEBUG_ASSERT(__kmp_init_serial); 2164 if (__kmp_env_consistency_check && user_lock == NULL) { 2165 KMP_FATAL(LockIsUninitialized, "omp_init_lock"); 2166 } 2167 __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq); 2168 2169 #if OMPT_SUPPORT && OMPT_OPTIONAL 2170 // This is the case, if called from omp_init_lock_with_hint: 2171 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2172 if (!codeptr) 2173 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2174 if (ompt_enabled.ompt_callback_lock_init) { 2175 ompt_callbacks.ompt_callback(ompt_callback_lock_init)( 2176 ompt_mutex_lock, omp_lock_hint_none, 2177 __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock, 2178 codeptr); 2179 } 2180 #endif 2181 2182 #else // KMP_USE_DYNAMIC_LOCK 2183 2184 static char const *const func = "omp_init_lock"; 2185 kmp_user_lock_p lck; 2186 KMP_DEBUG_ASSERT(__kmp_init_serial); 2187 2188 if (__kmp_env_consistency_check) { 2189 if (user_lock == NULL) { 2190 KMP_FATAL(LockIsUninitialized, func); 2191 } 2192 } 2193 2194 KMP_CHECK_USER_LOCK_INIT(); 2195 2196 if ((__kmp_user_lock_kind == lk_tas) && 2197 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { 2198 lck = (kmp_user_lock_p)user_lock; 2199 } 2200 #if KMP_USE_FUTEX 2201 else if ((__kmp_user_lock_kind == lk_futex) && 2202 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { 2203 lck = (kmp_user_lock_p)user_lock; 2204 } 2205 #endif 2206 else { 2207 lck = __kmp_user_lock_allocate(user_lock, gtid, 0); 2208 } 2209 INIT_LOCK(lck); 2210 __kmp_set_user_lock_location(lck, loc); 2211 2212 #if OMPT_SUPPORT && OMPT_OPTIONAL 2213 // This is the case, if called from omp_init_lock_with_hint: 2214 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2215 if (!codeptr) 2216 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2217 if (ompt_enabled.ompt_callback_lock_init) { 2218 ompt_callbacks.ompt_callback(ompt_callback_lock_init)( 2219 ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(), 2220 (omp_wait_id_t)user_lock, codeptr); 2221 } 2222 #endif 2223 2224 #if USE_ITT_BUILD 2225 __kmp_itt_lock_creating(lck); 2226 #endif /* USE_ITT_BUILD */ 2227 2228 #endif // KMP_USE_DYNAMIC_LOCK 2229 } // __kmpc_init_lock 2230 2231 /* initialize the lock */ 2232 void 
__kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2233 #if KMP_USE_DYNAMIC_LOCK 2234 2235 KMP_DEBUG_ASSERT(__kmp_init_serial); 2236 if (__kmp_env_consistency_check && user_lock == NULL) { 2237 KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock"); 2238 } 2239 __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq); 2240 2241 #if OMPT_SUPPORT && OMPT_OPTIONAL 2242 // This is the case, if called from omp_init_lock_with_hint: 2243 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2244 if (!codeptr) 2245 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2246 if (ompt_enabled.ompt_callback_lock_init) { 2247 ompt_callbacks.ompt_callback(ompt_callback_lock_init)( 2248 ompt_mutex_nest_lock, omp_lock_hint_none, 2249 __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock, 2250 codeptr); 2251 } 2252 #endif 2253 2254 #else // KMP_USE_DYNAMIC_LOCK 2255 2256 static char const *const func = "omp_init_nest_lock"; 2257 kmp_user_lock_p lck; 2258 KMP_DEBUG_ASSERT(__kmp_init_serial); 2259 2260 if (__kmp_env_consistency_check) { 2261 if (user_lock == NULL) { 2262 KMP_FATAL(LockIsUninitialized, func); 2263 } 2264 } 2265 2266 KMP_CHECK_USER_LOCK_INIT(); 2267 2268 if ((__kmp_user_lock_kind == lk_tas) && 2269 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= 2270 OMP_NEST_LOCK_T_SIZE)) { 2271 lck = (kmp_user_lock_p)user_lock; 2272 } 2273 #if KMP_USE_FUTEX 2274 else if ((__kmp_user_lock_kind == lk_futex) && 2275 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= 2276 OMP_NEST_LOCK_T_SIZE)) { 2277 lck = (kmp_user_lock_p)user_lock; 2278 } 2279 #endif 2280 else { 2281 lck = __kmp_user_lock_allocate(user_lock, gtid, 0); 2282 } 2283 2284 INIT_NESTED_LOCK(lck); 2285 __kmp_set_user_lock_location(lck, loc); 2286 2287 #if OMPT_SUPPORT && OMPT_OPTIONAL 2288 // This is the case, if called from omp_init_lock_with_hint: 2289 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2290 if (!codeptr) 2291 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2292 if (ompt_enabled.ompt_callback_lock_init) { 2293 ompt_callbacks.ompt_callback(ompt_callback_lock_init)( 2294 ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(), 2295 (omp_wait_id_t)user_lock, codeptr); 2296 } 2297 #endif 2298 2299 #if USE_ITT_BUILD 2300 __kmp_itt_lock_creating(lck); 2301 #endif /* USE_ITT_BUILD */ 2302 2303 #endif // KMP_USE_DYNAMIC_LOCK 2304 } // __kmpc_init_nest_lock 2305 2306 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2307 #if KMP_USE_DYNAMIC_LOCK 2308 2309 #if USE_ITT_BUILD 2310 kmp_user_lock_p lck; 2311 if (KMP_EXTRACT_D_TAG(user_lock) == 0) { 2312 lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock; 2313 } else { 2314 lck = (kmp_user_lock_p)user_lock; 2315 } 2316 __kmp_itt_lock_destroyed(lck); 2317 #endif 2318 #if OMPT_SUPPORT && OMPT_OPTIONAL 2319 // This is the case, if called from omp_init_lock_with_hint: 2320 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2321 if (!codeptr) 2322 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2323 if (ompt_enabled.ompt_callback_lock_destroy) { 2324 kmp_user_lock_p lck; 2325 if (KMP_EXTRACT_D_TAG(user_lock) == 0) { 2326 lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock; 2327 } else { 2328 lck = (kmp_user_lock_p)user_lock; 2329 } 2330 ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)( 2331 ompt_mutex_lock, (omp_wait_id_t)user_lock, codeptr); 2332 } 2333 #endif 2334 KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock); 2335 #else 2336 kmp_user_lock_p lck; 2337 2338 if 
((__kmp_user_lock_kind == lk_tas) && 2339 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { 2340 lck = (kmp_user_lock_p)user_lock; 2341 } 2342 #if KMP_USE_FUTEX 2343 else if ((__kmp_user_lock_kind == lk_futex) && 2344 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { 2345 lck = (kmp_user_lock_p)user_lock; 2346 } 2347 #endif 2348 else { 2349 lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock"); 2350 } 2351 2352 #if OMPT_SUPPORT && OMPT_OPTIONAL 2353 // This is the case, if called from omp_init_lock_with_hint: 2354 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2355 if (!codeptr) 2356 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2357 if (ompt_enabled.ompt_callback_lock_destroy) { 2358 ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)( 2359 ompt_mutex_lock, (omp_wait_id_t)user_lock, codeptr); 2360 } 2361 #endif 2362 2363 #if USE_ITT_BUILD 2364 __kmp_itt_lock_destroyed(lck); 2365 #endif /* USE_ITT_BUILD */ 2366 DESTROY_LOCK(lck); 2367 2368 if ((__kmp_user_lock_kind == lk_tas) && 2369 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { 2370 ; 2371 } 2372 #if KMP_USE_FUTEX 2373 else if ((__kmp_user_lock_kind == lk_futex) && 2374 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { 2375 ; 2376 } 2377 #endif 2378 else { 2379 __kmp_user_lock_free(user_lock, gtid, lck); 2380 } 2381 #endif // KMP_USE_DYNAMIC_LOCK 2382 } // __kmpc_destroy_lock 2383 2384 /* destroy the lock */ 2385 void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2386 #if KMP_USE_DYNAMIC_LOCK 2387 2388 #if USE_ITT_BUILD 2389 kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock); 2390 __kmp_itt_lock_destroyed(ilk->lock); 2391 #endif 2392 #if OMPT_SUPPORT && OMPT_OPTIONAL 2393 // This is the case, if called from omp_init_lock_with_hint: 2394 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2395 if (!codeptr) 2396 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2397 if (ompt_enabled.ompt_callback_lock_destroy) { 2398 ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)( 2399 ompt_mutex_nest_lock, (omp_wait_id_t)user_lock, codeptr); 2400 } 2401 #endif 2402 KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock); 2403 2404 #else // KMP_USE_DYNAMIC_LOCK 2405 2406 kmp_user_lock_p lck; 2407 2408 if ((__kmp_user_lock_kind == lk_tas) && 2409 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= 2410 OMP_NEST_LOCK_T_SIZE)) { 2411 lck = (kmp_user_lock_p)user_lock; 2412 } 2413 #if KMP_USE_FUTEX 2414 else if ((__kmp_user_lock_kind == lk_futex) && 2415 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= 2416 OMP_NEST_LOCK_T_SIZE)) { 2417 lck = (kmp_user_lock_p)user_lock; 2418 } 2419 #endif 2420 else { 2421 lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock"); 2422 } 2423 2424 #if OMPT_SUPPORT && OMPT_OPTIONAL 2425 // This is the case, if called from omp_init_lock_with_hint: 2426 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2427 if (!codeptr) 2428 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2429 if (ompt_enabled.ompt_callback_lock_destroy) { 2430 ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)( 2431 ompt_mutex_nest_lock, (omp_wait_id_t)user_lock, codeptr); 2432 } 2433 #endif 2434 2435 #if USE_ITT_BUILD 2436 __kmp_itt_lock_destroyed(lck); 2437 #endif /* USE_ITT_BUILD */ 2438 2439 DESTROY_NESTED_LOCK(lck); 2440 2441 if ((__kmp_user_lock_kind == lk_tas) && 2442 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= 2443 OMP_NEST_LOCK_T_SIZE)) { 2444 ; 2445 } 2446 #if KMP_USE_FUTEX 2447 else if ((__kmp_user_lock_kind == lk_futex) && 2448 (sizeof(lck->futex.lk.poll) 
+ sizeof(lck->futex.lk.depth_locked) <= 2449 OMP_NEST_LOCK_T_SIZE)) { 2450 ; 2451 } 2452 #endif 2453 else { 2454 __kmp_user_lock_free(user_lock, gtid, lck); 2455 } 2456 #endif // KMP_USE_DYNAMIC_LOCK 2457 } // __kmpc_destroy_nest_lock 2458 2459 void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2460 KMP_COUNT_BLOCK(OMP_set_lock); 2461 #if KMP_USE_DYNAMIC_LOCK 2462 int tag = KMP_EXTRACT_D_TAG(user_lock); 2463 #if USE_ITT_BUILD 2464 __kmp_itt_lock_acquiring( 2465 (kmp_user_lock_p) 2466 user_lock); // itt function will get to the right lock object. 2467 #endif 2468 #if OMPT_SUPPORT && OMPT_OPTIONAL 2469 // This is the case, if called from omp_init_lock_with_hint: 2470 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2471 if (!codeptr) 2472 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2473 if (ompt_enabled.ompt_callback_mutex_acquire) { 2474 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 2475 ompt_mutex_lock, omp_lock_hint_none, 2476 __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock, 2477 codeptr); 2478 } 2479 #endif 2480 #if KMP_USE_INLINED_TAS 2481 if (tag == locktag_tas && !__kmp_env_consistency_check) { 2482 KMP_ACQUIRE_TAS_LOCK(user_lock, gtid); 2483 } else 2484 #elif KMP_USE_INLINED_FUTEX 2485 if (tag == locktag_futex && !__kmp_env_consistency_check) { 2486 KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid); 2487 } else 2488 #endif 2489 { 2490 __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid); 2491 } 2492 #if USE_ITT_BUILD 2493 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock); 2494 #endif 2495 #if OMPT_SUPPORT && OMPT_OPTIONAL 2496 if (ompt_enabled.ompt_callback_mutex_acquired) { 2497 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 2498 ompt_mutex_lock, (omp_wait_id_t)user_lock, codeptr); 2499 } 2500 #endif 2501 2502 #else // KMP_USE_DYNAMIC_LOCK 2503 2504 kmp_user_lock_p lck; 2505 2506 if ((__kmp_user_lock_kind == lk_tas) && 2507 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { 2508 lck = (kmp_user_lock_p)user_lock; 2509 } 2510 #if KMP_USE_FUTEX 2511 else if ((__kmp_user_lock_kind == lk_futex) && 2512 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { 2513 lck = (kmp_user_lock_p)user_lock; 2514 } 2515 #endif 2516 else { 2517 lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock"); 2518 } 2519 2520 #if USE_ITT_BUILD 2521 __kmp_itt_lock_acquiring(lck); 2522 #endif /* USE_ITT_BUILD */ 2523 #if OMPT_SUPPORT && OMPT_OPTIONAL 2524 // This is the case, if called from omp_init_lock_with_hint: 2525 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2526 if (!codeptr) 2527 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2528 if (ompt_enabled.ompt_callback_mutex_acquire) { 2529 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 2530 ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(), 2531 (omp_wait_id_t)lck, codeptr); 2532 } 2533 #endif 2534 2535 ACQUIRE_LOCK(lck, gtid); 2536 2537 #if USE_ITT_BUILD 2538 __kmp_itt_lock_acquired(lck); 2539 #endif /* USE_ITT_BUILD */ 2540 2541 #if OMPT_SUPPORT && OMPT_OPTIONAL 2542 if (ompt_enabled.ompt_callback_mutex_acquired) { 2543 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 2544 ompt_mutex_lock, (omp_wait_id_t)lck, codeptr); 2545 } 2546 #endif 2547 2548 #endif // KMP_USE_DYNAMIC_LOCK 2549 } 2550 2551 void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2552 #if KMP_USE_DYNAMIC_LOCK 2553 2554 #if USE_ITT_BUILD 2555 __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock); 2556 #endif 2557 #if OMPT_SUPPORT && OMPT_OPTIONAL 2558 // This is the case, if called from 
omp_init_lock_with_hint: 2559 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2560 if (!codeptr) 2561 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2562 if (ompt_enabled.enabled) { 2563 if (ompt_enabled.ompt_callback_mutex_acquire) { 2564 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 2565 ompt_mutex_nest_lock, omp_lock_hint_none, 2566 __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock, 2567 codeptr); 2568 } 2569 } 2570 #endif 2571 int acquire_status = 2572 KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid); 2573 (void) acquire_status; 2574 #if USE_ITT_BUILD 2575 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock); 2576 #endif 2577 2578 #if OMPT_SUPPORT && OMPT_OPTIONAL 2579 if (ompt_enabled.enabled) { 2580 if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) { 2581 if (ompt_enabled.ompt_callback_mutex_acquired) { 2582 // lock_first 2583 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 2584 ompt_mutex_nest_lock, (omp_wait_id_t)user_lock, codeptr); 2585 } 2586 } else { 2587 if (ompt_enabled.ompt_callback_nest_lock) { 2588 // lock_next 2589 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( 2590 ompt_scope_begin, (omp_wait_id_t)user_lock, codeptr); 2591 } 2592 } 2593 } 2594 #endif 2595 2596 #else // KMP_USE_DYNAMIC_LOCK 2597 int acquire_status; 2598 kmp_user_lock_p lck; 2599 2600 if ((__kmp_user_lock_kind == lk_tas) && 2601 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= 2602 OMP_NEST_LOCK_T_SIZE)) { 2603 lck = (kmp_user_lock_p)user_lock; 2604 } 2605 #if KMP_USE_FUTEX 2606 else if ((__kmp_user_lock_kind == lk_futex) && 2607 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= 2608 OMP_NEST_LOCK_T_SIZE)) { 2609 lck = (kmp_user_lock_p)user_lock; 2610 } 2611 #endif 2612 else { 2613 lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock"); 2614 } 2615 2616 #if USE_ITT_BUILD 2617 __kmp_itt_lock_acquiring(lck); 2618 #endif /* USE_ITT_BUILD */ 2619 #if OMPT_SUPPORT && OMPT_OPTIONAL 2620 // This is the case, if called from omp_init_lock_with_hint: 2621 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2622 if (!codeptr) 2623 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2624 if (ompt_enabled.enabled) { 2625 if (ompt_enabled.ompt_callback_mutex_acquire) { 2626 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 2627 ompt_mutex_nest_lock, omp_lock_hint_none, 2628 __ompt_get_mutex_impl_type(), (omp_wait_id_t)lck, codeptr); 2629 } 2630 } 2631 #endif 2632 2633 ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status); 2634 2635 #if USE_ITT_BUILD 2636 __kmp_itt_lock_acquired(lck); 2637 #endif /* USE_ITT_BUILD */ 2638 2639 #if OMPT_SUPPORT && OMPT_OPTIONAL 2640 if (ompt_enabled.enabled) { 2641 if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) { 2642 if (ompt_enabled.ompt_callback_mutex_acquired) { 2643 // lock_first 2644 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 2645 ompt_mutex_nest_lock, (omp_wait_id_t)lck, codeptr); 2646 } 2647 } else { 2648 if (ompt_enabled.ompt_callback_nest_lock) { 2649 // lock_next 2650 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( 2651 ompt_scope_begin, (omp_wait_id_t)lck, codeptr); 2652 } 2653 } 2654 } 2655 #endif 2656 2657 #endif // KMP_USE_DYNAMIC_LOCK 2658 } 2659 2660 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2661 #if KMP_USE_DYNAMIC_LOCK 2662 2663 int tag = KMP_EXTRACT_D_TAG(user_lock); 2664 #if USE_ITT_BUILD 2665 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock); 2666 #endif 2667 #if KMP_USE_INLINED_TAS 2668 if (tag == locktag_tas && 
!__kmp_env_consistency_check) { 2669 KMP_RELEASE_TAS_LOCK(user_lock, gtid); 2670 } else 2671 #elif KMP_USE_INLINED_FUTEX 2672 if (tag == locktag_futex && !__kmp_env_consistency_check) { 2673 KMP_RELEASE_FUTEX_LOCK(user_lock, gtid); 2674 } else 2675 #endif 2676 { 2677 __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid); 2678 } 2679 2680 #if OMPT_SUPPORT && OMPT_OPTIONAL 2681 // This is the case, if called from omp_init_lock_with_hint: 2682 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2683 if (!codeptr) 2684 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2685 if (ompt_enabled.ompt_callback_mutex_released) { 2686 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( 2687 ompt_mutex_lock, (omp_wait_id_t)user_lock, codeptr); 2688 } 2689 #endif 2690 2691 #else // KMP_USE_DYNAMIC_LOCK 2692 2693 kmp_user_lock_p lck; 2694 2695 /* Can't use serial interval since not block structured */ 2696 /* release the lock */ 2697 2698 if ((__kmp_user_lock_kind == lk_tas) && 2699 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { 2700 #if KMP_OS_LINUX && \ 2701 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) 2702 // "fast" path implemented to fix customer performance issue 2703 #if USE_ITT_BUILD 2704 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock); 2705 #endif /* USE_ITT_BUILD */ 2706 TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0); 2707 KMP_MB(); 2708 2709 #if OMPT_SUPPORT && OMPT_OPTIONAL 2710 // This is the case, if called from omp_init_lock_with_hint: 2711 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2712 if (!codeptr) 2713 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2714 if (ompt_enabled.ompt_callback_mutex_released) { 2715 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( 2716 ompt_mutex_lock, (omp_wait_id_t)lck, codeptr); 2717 } 2718 #endif 2719 2720 return; 2721 #else 2722 lck = (kmp_user_lock_p)user_lock; 2723 #endif 2724 } 2725 #if KMP_USE_FUTEX 2726 else if ((__kmp_user_lock_kind == lk_futex) && 2727 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { 2728 lck = (kmp_user_lock_p)user_lock; 2729 } 2730 #endif 2731 else { 2732 lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock"); 2733 } 2734 2735 #if USE_ITT_BUILD 2736 __kmp_itt_lock_releasing(lck); 2737 #endif /* USE_ITT_BUILD */ 2738 2739 RELEASE_LOCK(lck, gtid); 2740 2741 #if OMPT_SUPPORT && OMPT_OPTIONAL 2742 // This is the case, if called from omp_init_lock_with_hint: 2743 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2744 if (!codeptr) 2745 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2746 if (ompt_enabled.ompt_callback_mutex_released) { 2747 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( 2748 ompt_mutex_lock, (omp_wait_id_t)lck, codeptr); 2749 } 2750 #endif 2751 2752 #endif // KMP_USE_DYNAMIC_LOCK 2753 } 2754 2755 /* release the lock */ 2756 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2757 #if KMP_USE_DYNAMIC_LOCK 2758 2759 #if USE_ITT_BUILD 2760 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock); 2761 #endif 2762 int release_status = 2763 KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid); 2764 (void) release_status; 2765 2766 #if OMPT_SUPPORT && OMPT_OPTIONAL 2767 // This is the case, if called from omp_init_lock_with_hint: 2768 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2769 if (!codeptr) 2770 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2771 if (ompt_enabled.enabled) { 2772 if (release_status == KMP_LOCK_RELEASED) { 2773 if (ompt_enabled.ompt_callback_mutex_released) { 2774 // release_lock_last 2775 
ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( 2776 ompt_mutex_nest_lock, (omp_wait_id_t)user_lock, codeptr); 2777 } 2778 } else if (ompt_enabled.ompt_callback_nest_lock) { 2779 // release_lock_prev 2780 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( 2781 ompt_scope_end, (omp_wait_id_t)user_lock, codeptr); 2782 } 2783 } 2784 #endif 2785 2786 #else // KMP_USE_DYNAMIC_LOCK 2787 2788 kmp_user_lock_p lck; 2789 2790 /* Can't use serial interval since not block structured */ 2791 2792 if ((__kmp_user_lock_kind == lk_tas) && 2793 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= 2794 OMP_NEST_LOCK_T_SIZE)) { 2795 #if KMP_OS_LINUX && \ 2796 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) 2797 // "fast" path implemented to fix customer performance issue 2798 kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock; 2799 #if USE_ITT_BUILD 2800 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock); 2801 #endif /* USE_ITT_BUILD */ 2802 2803 #if OMPT_SUPPORT && OMPT_OPTIONAL 2804 int release_status = KMP_LOCK_STILL_HELD; 2805 #endif 2806 2807 if (--(tl->lk.depth_locked) == 0) { 2808 TCW_4(tl->lk.poll, 0); 2809 #if OMPT_SUPPORT && OMPT_OPTIONAL 2810 release_status = KMP_LOCK_RELEASED; 2811 #endif 2812 } 2813 KMP_MB(); 2814 2815 #if OMPT_SUPPORT && OMPT_OPTIONAL 2816 // This is the case, if called from omp_init_lock_with_hint: 2817 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2818 if (!codeptr) 2819 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2820 if (ompt_enabled.enabled) { 2821 if (release_status == KMP_LOCK_RELEASED) { 2822 if (ompt_enabled.ompt_callback_mutex_released) { 2823 // release_lock_last 2824 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( 2825 ompt_mutex_nest_lock, (omp_wait_id_t)lck, codeptr); 2826 } 2827 } else if (ompt_enabled.ompt_callback_nest_lock) { 2828 // release_lock_previous 2829 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( 2830 ompt_mutex_scope_end, (omp_wait_id_t)lck, codeptr); 2831 } 2832 } 2833 #endif 2834 2835 return; 2836 #else 2837 lck = (kmp_user_lock_p)user_lock; 2838 #endif 2839 } 2840 #if KMP_USE_FUTEX 2841 else if ((__kmp_user_lock_kind == lk_futex) && 2842 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= 2843 OMP_NEST_LOCK_T_SIZE)) { 2844 lck = (kmp_user_lock_p)user_lock; 2845 } 2846 #endif 2847 else { 2848 lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock"); 2849 } 2850 2851 #if USE_ITT_BUILD 2852 __kmp_itt_lock_releasing(lck); 2853 #endif /* USE_ITT_BUILD */ 2854 2855 int release_status; 2856 release_status = RELEASE_NESTED_LOCK(lck, gtid); 2857 #if OMPT_SUPPORT && OMPT_OPTIONAL 2858 // This is the case, if called from omp_init_lock_with_hint: 2859 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2860 if (!codeptr) 2861 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2862 if (ompt_enabled.enabled) { 2863 if (release_status == KMP_LOCK_RELEASED) { 2864 if (ompt_enabled.ompt_callback_mutex_released) { 2865 // release_lock_last 2866 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( 2867 ompt_mutex_nest_lock, (omp_wait_id_t)lck, codeptr); 2868 } 2869 } else if (ompt_enabled.ompt_callback_nest_lock) { 2870 // release_lock_previous 2871 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( 2872 ompt_mutex_scope_end, (omp_wait_id_t)lck, codeptr); 2873 } 2874 } 2875 #endif 2876 2877 #endif // KMP_USE_DYNAMIC_LOCK 2878 } 2879 2880 /* try to acquire the lock */ 2881 int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2882 KMP_COUNT_BLOCK(OMP_test_lock); 2883 
2884 #if KMP_USE_DYNAMIC_LOCK 2885 int rc; 2886 int tag = KMP_EXTRACT_D_TAG(user_lock); 2887 #if USE_ITT_BUILD 2888 __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock); 2889 #endif 2890 #if OMPT_SUPPORT && OMPT_OPTIONAL 2891 // This is the case, if called from omp_init_lock_with_hint: 2892 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2893 if (!codeptr) 2894 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2895 if (ompt_enabled.ompt_callback_mutex_acquire) { 2896 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 2897 ompt_mutex_lock, omp_lock_hint_none, 2898 __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock, 2899 codeptr); 2900 } 2901 #endif 2902 #if KMP_USE_INLINED_TAS 2903 if (tag == locktag_tas && !__kmp_env_consistency_check) { 2904 KMP_TEST_TAS_LOCK(user_lock, gtid, rc); 2905 } else 2906 #elif KMP_USE_INLINED_FUTEX 2907 if (tag == locktag_futex && !__kmp_env_consistency_check) { 2908 KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc); 2909 } else 2910 #endif 2911 { 2912 rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid); 2913 } 2914 if (rc) { 2915 #if USE_ITT_BUILD 2916 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock); 2917 #endif 2918 #if OMPT_SUPPORT && OMPT_OPTIONAL 2919 if (ompt_enabled.ompt_callback_mutex_acquired) { 2920 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 2921 ompt_mutex_lock, (omp_wait_id_t)user_lock, codeptr); 2922 } 2923 #endif 2924 return FTN_TRUE; 2925 } else { 2926 #if USE_ITT_BUILD 2927 __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock); 2928 #endif 2929 return FTN_FALSE; 2930 } 2931 2932 #else // KMP_USE_DYNAMIC_LOCK 2933 2934 kmp_user_lock_p lck; 2935 int rc; 2936 2937 if ((__kmp_user_lock_kind == lk_tas) && 2938 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { 2939 lck = (kmp_user_lock_p)user_lock; 2940 } 2941 #if KMP_USE_FUTEX 2942 else if ((__kmp_user_lock_kind == lk_futex) && 2943 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { 2944 lck = (kmp_user_lock_p)user_lock; 2945 } 2946 #endif 2947 else { 2948 lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock"); 2949 } 2950 2951 #if USE_ITT_BUILD 2952 __kmp_itt_lock_acquiring(lck); 2953 #endif /* USE_ITT_BUILD */ 2954 #if OMPT_SUPPORT && OMPT_OPTIONAL 2955 // This is the case, if called from omp_init_lock_with_hint: 2956 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2957 if (!codeptr) 2958 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2959 if (ompt_enabled.ompt_callback_mutex_acquire) { 2960 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 2961 ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(), 2962 (omp_wait_id_t)lck, codeptr); 2963 } 2964 #endif 2965 2966 rc = TEST_LOCK(lck, gtid); 2967 #if USE_ITT_BUILD 2968 if (rc) { 2969 __kmp_itt_lock_acquired(lck); 2970 } else { 2971 __kmp_itt_lock_cancelled(lck); 2972 } 2973 #endif /* USE_ITT_BUILD */ 2974 #if OMPT_SUPPORT && OMPT_OPTIONAL 2975 if (rc && ompt_enabled.ompt_callback_mutex_acquired) { 2976 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 2977 ompt_mutex_lock, (omp_wait_id_t)lck, codeptr); 2978 } 2979 #endif 2980 2981 return (rc ? 
FTN_TRUE : FTN_FALSE); 2982 2983 /* Can't use serial interval since not block structured */ 2984 2985 #endif // KMP_USE_DYNAMIC_LOCK 2986 } 2987 2988 /* try to acquire the lock */ 2989 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2990 #if KMP_USE_DYNAMIC_LOCK 2991 int rc; 2992 #if USE_ITT_BUILD 2993 __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock); 2994 #endif 2995 #if OMPT_SUPPORT && OMPT_OPTIONAL 2996 // This is the case, if called from omp_init_lock_with_hint: 2997 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2998 if (!codeptr) 2999 codeptr = OMPT_GET_RETURN_ADDRESS(0); 3000 if (ompt_enabled.ompt_callback_mutex_acquire) { 3001 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 3002 ompt_mutex_nest_lock, omp_lock_hint_none, 3003 __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock, 3004 codeptr); 3005 } 3006 #endif 3007 rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid); 3008 #if USE_ITT_BUILD 3009 if (rc) { 3010 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock); 3011 } else { 3012 __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock); 3013 } 3014 #endif 3015 #if OMPT_SUPPORT && OMPT_OPTIONAL 3016 if (ompt_enabled.enabled && rc) { 3017 if (rc == 1) { 3018 if (ompt_enabled.ompt_callback_mutex_acquired) { 3019 // lock_first 3020 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 3021 ompt_mutex_nest_lock, (omp_wait_id_t)user_lock, codeptr); 3022 } 3023 } else { 3024 if (ompt_enabled.ompt_callback_nest_lock) { 3025 // lock_next 3026 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( 3027 ompt_scope_begin, (omp_wait_id_t)user_lock, codeptr); 3028 } 3029 } 3030 } 3031 #endif 3032 return rc; 3033 3034 #else // KMP_USE_DYNAMIC_LOCK 3035 3036 kmp_user_lock_p lck; 3037 int rc; 3038 3039 if ((__kmp_user_lock_kind == lk_tas) && 3040 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= 3041 OMP_NEST_LOCK_T_SIZE)) { 3042 lck = (kmp_user_lock_p)user_lock; 3043 } 3044 #if KMP_USE_FUTEX 3045 else if ((__kmp_user_lock_kind == lk_futex) && 3046 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= 3047 OMP_NEST_LOCK_T_SIZE)) { 3048 lck = (kmp_user_lock_p)user_lock; 3049 } 3050 #endif 3051 else { 3052 lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock"); 3053 } 3054 3055 #if USE_ITT_BUILD 3056 __kmp_itt_lock_acquiring(lck); 3057 #endif /* USE_ITT_BUILD */ 3058 3059 #if OMPT_SUPPORT && OMPT_OPTIONAL 3060 // This is the case, if called from omp_init_lock_with_hint: 3061 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 3062 if (!codeptr) 3063 codeptr = OMPT_GET_RETURN_ADDRESS(0); 3064 if (ompt_enabled.enabled && 3065 ompt_enabled.ompt_callback_mutex_acquire) { 3066 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 3067 ompt_mutex_nest_lock, omp_lock_hint_none, 3068 __ompt_get_mutex_impl_type(), (omp_wait_id_t)lck, codeptr); 3069 } 3070 #endif 3071 3072 rc = TEST_NESTED_LOCK(lck, gtid); 3073 #if USE_ITT_BUILD 3074 if (rc) { 3075 __kmp_itt_lock_acquired(lck); 3076 } else { 3077 __kmp_itt_lock_cancelled(lck); 3078 } 3079 #endif /* USE_ITT_BUILD */ 3080 #if OMPT_SUPPORT && OMPT_OPTIONAL 3081 if (ompt_enabled.enabled && rc) { 3082 if (rc == 1) { 3083 if (ompt_enabled.ompt_callback_mutex_acquired) { 3084 // lock_first 3085 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 3086 ompt_mutex_nest_lock, (omp_wait_id_t)lck, codeptr); 3087 } 3088 } else { 3089 if (ompt_enabled.ompt_callback_nest_lock) { 3090 // lock_next 3091
ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( 3092 ompt_mutex_scope_begin, (omp_wait_id_t)lck, codeptr); 3093 } 3094 } 3095 } 3096 #endif 3097 return rc; 3098 3099 /* Can't use serial interval since not block structured */ 3100 3101 #endif // KMP_USE_DYNAMIC_LOCK 3102 } 3103 3104 // Interface to fast scalable reduce methods routines 3105 3106 // keep the selected method in a thread local structure for cross-function 3107 // usage: will be used in __kmpc_end_reduce* functions; 3108 // another solution: to re-determine the method one more time in 3109 // __kmpc_end_reduce* functions (new prototype required then) 3110 // AT: which solution is better? 3111 #define __KMP_SET_REDUCTION_METHOD(gtid, rmethod) \ 3112 ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod)) 3113 3114 #define __KMP_GET_REDUCTION_METHOD(gtid) \ 3115 (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) 3116 3117 // description of the packed_reduction_method variable: look at the macros in 3118 // kmp.h 3119 3120 // used in a critical section reduce block 3121 static __forceinline void 3122 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid, 3123 kmp_critical_name *crit) { 3124 3125 // this lock was visible to a customer and to the threading profile tool as a 3126 // serial overhead span (although it's used for an internal purpose only) 3127 // why was it visible in previous implementation? 3128 // should we keep it visible in new reduce block? 3129 kmp_user_lock_p lck; 3130 3131 #if KMP_USE_DYNAMIC_LOCK 3132 3133 kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit; 3134 // Check if it is initialized. 3135 if (*lk == 0) { 3136 if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) { 3137 KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0, 3138 KMP_GET_D_TAG(__kmp_user_lock_seq)); 3139 } else { 3140 __kmp_init_indirect_csptr(crit, loc, global_tid, 3141 KMP_GET_I_TAG(__kmp_user_lock_seq)); 3142 } 3143 } 3144 // Branch for accessing the actual lock object and set operation. This 3145 // branching is inevitable since this lock initialization does not follow the 3146 // normal dispatch path (lock table is not used). 3147 if (KMP_EXTRACT_D_TAG(lk) != 0) { 3148 lck = (kmp_user_lock_p)lk; 3149 KMP_DEBUG_ASSERT(lck != NULL); 3150 if (__kmp_env_consistency_check) { 3151 __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq); 3152 } 3153 KMP_D_LOCK_FUNC(lk, set)(lk, global_tid); 3154 } else { 3155 kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk); 3156 lck = ilk->lock; 3157 KMP_DEBUG_ASSERT(lck != NULL); 3158 if (__kmp_env_consistency_check) { 3159 __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq); 3160 } 3161 KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid); 3162 } 3163 3164 #else // KMP_USE_DYNAMIC_LOCK 3165 3166 // We know that the fast reduction code is only emitted by Intel compilers 3167 // with 32 byte critical sections. If there isn't enough space, then we 3168 // have to use a pointer. 
3169 if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) { 3170 lck = (kmp_user_lock_p)crit; 3171 } else { 3172 lck = __kmp_get_critical_section_ptr(crit, loc, global_tid); 3173 } 3174 KMP_DEBUG_ASSERT(lck != NULL); 3175 3176 if (__kmp_env_consistency_check) 3177 __kmp_push_sync(global_tid, ct_critical, loc, lck); 3178 3179 __kmp_acquire_user_lock_with_checks(lck, global_tid); 3180 3181 #endif // KMP_USE_DYNAMIC_LOCK 3182 } 3183 3184 // used in a critical section reduce block 3185 static __forceinline void 3186 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid, 3187 kmp_critical_name *crit) { 3188 3189 kmp_user_lock_p lck; 3190 3191 #if KMP_USE_DYNAMIC_LOCK 3192 3193 if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) { 3194 lck = (kmp_user_lock_p)crit; 3195 if (__kmp_env_consistency_check) 3196 __kmp_pop_sync(global_tid, ct_critical, loc); 3197 KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid); 3198 } else { 3199 kmp_indirect_lock_t *ilk = 3200 (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit)); 3201 if (__kmp_env_consistency_check) 3202 __kmp_pop_sync(global_tid, ct_critical, loc); 3203 KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid); 3204 } 3205 3206 #else // KMP_USE_DYNAMIC_LOCK 3207 3208 // We know that the fast reduction code is only emitted by Intel compilers 3209 // with 32 byte critical sections. If there isn't enough space, then we have 3210 // to use a pointer. 3211 if (__kmp_base_user_lock_size > 32) { 3212 lck = *((kmp_user_lock_p *)crit); 3213 KMP_ASSERT(lck != NULL); 3214 } else { 3215 lck = (kmp_user_lock_p)crit; 3216 } 3217 3218 if (__kmp_env_consistency_check) 3219 __kmp_pop_sync(global_tid, ct_critical, loc); 3220 3221 __kmp_release_user_lock_with_checks(lck, global_tid); 3222 3223 #endif // KMP_USE_DYNAMIC_LOCK 3224 } // __kmp_end_critical_section_reduce_block 3225 3226 #if OMP_40_ENABLED 3227 static __forceinline int 3228 __kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p, 3229 int *task_state) { 3230 kmp_team_t *team; 3231 3232 // Check if we are inside the teams construct? 3233 if (th->th.th_teams_microtask) { 3234 *team_p = team = th->th.th_team; 3235 if (team->t.t_level == th->th.th_teams_level) { 3236 // This is reduction at teams construct. 3237 KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0 3238 // Let's swap teams temporarily for the reduction. 3239 th->th.th_info.ds.ds_tid = team->t.t_master_tid; 3240 th->th.th_team = team->t.t_parent; 3241 th->th.th_team_nproc = th->th.th_team->t.t_nproc; 3242 th->th.th_task_team = th->th.th_team->t.t_task_team[0]; 3243 *task_state = th->th.th_task_state; 3244 th->th.th_task_state = 0; 3245 3246 return 1; 3247 } 3248 } 3249 return 0; 3250 } 3251 3252 static __forceinline void 3253 __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) { 3254 // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction. 3255 th->th.th_info.ds.ds_tid = 0; 3256 th->th.th_team = team; 3257 th->th.th_team_nproc = team->t.t_nproc; 3258 th->th.th_task_team = team->t.t_task_team[task_state]; 3259 th->th.th_task_state = task_state; 3260 } 3261 #endif 3262 3263 /* 2.a.i. Reduce Block without a terminating barrier */ 3264 /*! 
3265 @ingroup SYNCHRONIZATION 3266 @param loc source location information 3267 @param global_tid global thread number 3268 @param num_vars number of items (variables) to be reduced 3269 @param reduce_size size of data in bytes to be reduced 3270 @param reduce_data pointer to data to be reduced 3271 @param reduce_func callback function providing reduction operation on two 3272 operands and returning result of reduction in lhs_data 3273 @param lck pointer to the unique lock data structure 3274 @result 1 for the master thread, 0 for all other team threads, 2 for all team 3275 threads if atomic reduction needed 3276 3277 The nowait version is used for a reduce clause with the nowait argument. 3278 */ 3279 kmp_int32 3280 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, 3281 size_t reduce_size, void *reduce_data, 3282 void (*reduce_func)(void *lhs_data, void *rhs_data), 3283 kmp_critical_name *lck) { 3284 3285 KMP_COUNT_BLOCK(REDUCE_nowait); 3286 int retval = 0; 3287 PACKED_REDUCTION_METHOD_T packed_reduction_method; 3288 #if OMP_40_ENABLED 3289 kmp_info_t *th; 3290 kmp_team_t *team; 3291 int teams_swapped = 0, task_state; 3292 #endif 3293 KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid)); 3294 3295 // why do we need this initialization here at all? 3296 // Reduction clause can not be used as a stand-alone directive. 3297 3298 // do not call __kmp_serial_initialize(), it will be called by 3299 // __kmp_parallel_initialize() if needed 3300 // possible detection of false-positive race by the threadchecker ??? 3301 if (!TCR_4(__kmp_init_parallel)) 3302 __kmp_parallel_initialize(); 3303 3304 // check correctness of reduce block nesting 3305 #if KMP_USE_DYNAMIC_LOCK 3306 if (__kmp_env_consistency_check) 3307 __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0); 3308 #else 3309 if (__kmp_env_consistency_check) 3310 __kmp_push_sync(global_tid, ct_reduce, loc, NULL); 3311 #endif 3312 3313 #if OMP_40_ENABLED 3314 th = __kmp_thread_from_gtid(global_tid); 3315 teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state); 3316 #endif // OMP_40_ENABLED 3317 3318 // packed_reduction_method value will be reused by __kmp_end_reduce* function, 3319 // the value should be kept in a variable 3320 // the variable should be either a construct-specific or thread-specific 3321 // property, not a team specific property 3322 // (a thread can reach the next reduce block on the next construct, reduce 3323 // method may differ on the next construct) 3324 // an ident_t "loc" parameter could be used as a construct-specific property 3325 // (what if loc == 0?) 
3326 // (if both construct-specific and team-specific variables were shared, 3327 // then unnecessary extra syncs would be needed) 3328 // a thread-specific variable is better regarding two issues above (next 3329 // construct and extra syncs) 3330 // a thread-specific "th_local.reduction_method" variable is used currently 3331 // each thread executes 'determine' and 'set' lines (no need to execute by one 3332 // thread, to avoid unnecessary extra syncs) 3333 3334 packed_reduction_method = __kmp_determine_reduction_method( 3335 loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck); 3336 __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method); 3337 3338 if (packed_reduction_method == critical_reduce_block) { 3339 3340 __kmp_enter_critical_section_reduce_block(loc, global_tid, lck); 3341 retval = 1; 3342 3343 } else if (packed_reduction_method == empty_reduce_block) { 3344 3345 // usage: if team size == 1, no synchronization is required ( Intel 3346 // platforms only ) 3347 retval = 1; 3348 3349 } else if (packed_reduction_method == atomic_reduce_block) { 3350 3351 retval = 2; 3352 3353 // all threads should do this pop here (because __kmpc_end_reduce_nowait() 3354 // won't be called by the code gen) 3355 // (it's not quite good, because the checking block has been closed by 3356 // this 'pop', 3357 // but the atomic operation has not been executed yet, it will be executed 3358 // slightly later, literally on the next instruction) 3359 if (__kmp_env_consistency_check) 3360 __kmp_pop_sync(global_tid, ct_reduce, loc); 3361 3362 } else if (TEST_REDUCTION_METHOD(packed_reduction_method, 3363 tree_reduce_block)) { 3364 3365 // AT: performance issue: a real barrier here 3366 // AT: (if master goes slow, other threads are blocked here waiting for the 3367 // master to come and release them) 3368 // AT: (it's not what a customer might expect specifying NOWAIT clause) 3369 // AT: (specifying NOWAIT won't result in improvement of performance, it'll 3370 // be confusing to a customer) 3371 // AT: another implementation of *barrier_gather*nowait() (or some other design) 3372 // might go faster and be more in line with the sense of NOWAIT 3373 // AT: TO DO: do epcc test and compare times 3374 3375 // this barrier should be invisible to a customer and to the threading profile 3376 // tool (it's neither a terminating barrier nor customer's code, it's 3377 // used for an internal purpose) 3378 #if OMPT_SUPPORT 3379 // JP: can this barrier potentially lead to task scheduling? 3380 // JP: as long as there is a barrier in the implementation, OMPT should and 3381 // will provide the barrier events 3382 // so we set up the necessary frame/return addresses. 3383 omp_frame_t *ompt_frame; 3384 if (ompt_enabled.enabled) { 3385 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); 3386 if (ompt_frame->enter_frame == NULL) 3387 ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1); 3388 OMPT_STORE_RETURN_ADDRESS(global_tid); 3389 } 3390 #endif 3391 #if USE_ITT_NOTIFY 3392 __kmp_threads[global_tid]->th.th_ident = loc; 3393 #endif 3394 retval = 3395 __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method), 3396 global_tid, FALSE, reduce_size, reduce_data, reduce_func); 3397 retval = (retval != 0) ?
                 (0) : (1);
3398 #if OMPT_SUPPORT && OMPT_OPTIONAL
3399     if (ompt_enabled.enabled) {
3400       ompt_frame->enter_frame = NULL;
3401     }
3402 #endif
3403
3404     // all other workers except master should do this pop here
3405     // (none of the other workers will get to __kmpc_end_reduce_nowait())
3406     if (__kmp_env_consistency_check) {
3407       if (retval == 0) {
3408         __kmp_pop_sync(global_tid, ct_reduce, loc);
3409       }
3410     }
3411
3412   } else {
3413
3414     // should never reach this block
3415     KMP_ASSERT(0); // "unexpected method"
3416   }
3417 #if OMP_40_ENABLED
3418   if (teams_swapped) {
3419     __kmp_restore_swapped_teams(th, team, task_state);
3420   }
3421 #endif
3422   KA_TRACE(
3423       10,
3424       ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
3425        global_tid, packed_reduction_method, retval));
3426
3427   return retval;
3428 }
3429
3430 /*!
3431 @ingroup SYNCHRONIZATION
3432 @param loc source location information
3433 @param global_tid global thread id.
3434 @param lck pointer to the unique lock data structure
3435
3436 Finish the execution of a reduce nowait.
3437 */
3438 void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
3439                               kmp_critical_name *lck) {
3440
3441   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3442
3443   KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
3444
3445   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3446
3447   if (packed_reduction_method == critical_reduce_block) {
3448
3449     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3450
3451   } else if (packed_reduction_method == empty_reduce_block) {
3452
3453     // usage: if team size == 1, no synchronization is required (on Intel
3454     // platforms only)
3455
3456   } else if (packed_reduction_method == atomic_reduce_block) {
3457
3458     // neither master nor other workers should get here
3459     // (code gen does not generate this call in case 2: atomic reduce block)
3460     // actually it would be better to remove this else-if branch entirely;
3461     // after removal this value would be checked by the 'else' and would assert
3462
3463   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3464                                    tree_reduce_block)) {
3465
3466     // only master gets here
3467
3468   } else {
3469
3470     // should never reach this block
3471     KMP_ASSERT(0); // "unexpected method"
3472   }
3473
3474   if (__kmp_env_consistency_check)
3475     __kmp_pop_sync(global_tid, ct_reduce, loc);
3476
3477   KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
3478                 global_tid, packed_reduction_method));
3479
3480   return;
3481 }
3482
3483 /* 2.a.ii. Reduce Block with a terminating barrier */
3484
3485 /*!
3486 @ingroup SYNCHRONIZATION
3487 @param loc source location information
3488 @param global_tid global thread number
3489 @param num_vars number of items (variables) to be reduced
3490 @param reduce_size size of data in bytes to be reduced
3491 @param reduce_data pointer to data to be reduced
3492 @param reduce_func callback function providing reduction operation on two
3493 operands and returning result of reduction in lhs_data
3494 @param lck pointer to the unique lock data structure
3495 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3496 threads if atomic reduction needed
3497
3498 A blocking reduce that includes an implicit barrier.
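
As an illustration only (not the literal code any particular compiler emits),
a compiler might lower <tt>reduction(+:sum)</tt> to this entry point roughly as
sketched below; <tt>sum</tt>, <tt>sum_local</tt>, <tt>add_func</tt> and
<tt>crit_name</tt> are placeholder names, not symbols defined by this runtime:

@code
// Illustrative sketch only; real code gen passes a list of private copies
// as reduce_data and a compiler-generated reduce_func.
static void add_func(void *lhs, void *rhs) { *(int *)lhs += *(int *)rhs; }

int ret = __kmpc_reduce(loc, gtid, 1, sizeof(int), &sum_local, add_func,
                        &crit_name);
if (ret == 1) {        // this thread combines the private copies directly
  sum += sum_local;
  __kmpc_end_reduce(loc, gtid, &crit_name);
} else if (ret == 2) { // every thread combines its own copy atomically
  __atomic_fetch_add(&sum, sum_local, __ATOMIC_RELAXED);
  __kmpc_end_reduce(loc, gtid, &crit_name);
}                      // ret == 0: some other thread performs the combine
@endcode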
3499 */ 3500 kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, 3501 size_t reduce_size, void *reduce_data, 3502 void (*reduce_func)(void *lhs_data, void *rhs_data), 3503 kmp_critical_name *lck) { 3504 KMP_COUNT_BLOCK(REDUCE_wait); 3505 int retval = 0; 3506 PACKED_REDUCTION_METHOD_T packed_reduction_method; 3507 #if OMP_40_ENABLED 3508 kmp_info_t *th; 3509 kmp_team_t *team; 3510 int teams_swapped = 0, task_state; 3511 #endif 3512 3513 KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid)); 3514 3515 // why do we need this initialization here at all? 3516 // Reduction clause can not be a stand-alone directive. 3517 3518 // do not call __kmp_serial_initialize(), it will be called by 3519 // __kmp_parallel_initialize() if needed 3520 // possible detection of false-positive race by the threadchecker ??? 3521 if (!TCR_4(__kmp_init_parallel)) 3522 __kmp_parallel_initialize(); 3523 3524 // check correctness of reduce block nesting 3525 #if KMP_USE_DYNAMIC_LOCK 3526 if (__kmp_env_consistency_check) 3527 __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0); 3528 #else 3529 if (__kmp_env_consistency_check) 3530 __kmp_push_sync(global_tid, ct_reduce, loc, NULL); 3531 #endif 3532 3533 #if OMP_40_ENABLED 3534 th = __kmp_thread_from_gtid(global_tid); 3535 teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state); 3536 #endif // OMP_40_ENABLED 3537 3538 packed_reduction_method = __kmp_determine_reduction_method( 3539 loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck); 3540 __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method); 3541 3542 if (packed_reduction_method == critical_reduce_block) { 3543 3544 __kmp_enter_critical_section_reduce_block(loc, global_tid, lck); 3545 retval = 1; 3546 3547 } else if (packed_reduction_method == empty_reduce_block) { 3548 3549 // usage: if team size == 1, no synchronization is required ( Intel 3550 // platforms only ) 3551 retval = 1; 3552 3553 } else if (packed_reduction_method == atomic_reduce_block) { 3554 3555 retval = 2; 3556 3557 } else if (TEST_REDUCTION_METHOD(packed_reduction_method, 3558 tree_reduce_block)) { 3559 3560 // case tree_reduce_block: 3561 // this barrier should be visible to a customer and to the threading profile 3562 // tool (it's a terminating barrier on constructs if NOWAIT not specified) 3563 #if OMPT_SUPPORT 3564 omp_frame_t *ompt_frame; 3565 if (ompt_enabled.enabled) { 3566 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); 3567 if (ompt_frame->enter_frame == NULL) 3568 ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1); 3569 OMPT_STORE_RETURN_ADDRESS(global_tid); 3570 } 3571 #endif 3572 #if USE_ITT_NOTIFY 3573 __kmp_threads[global_tid]->th.th_ident = 3574 loc; // needed for correct notification of frames 3575 #endif 3576 retval = 3577 __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method), 3578 global_tid, TRUE, reduce_size, reduce_data, reduce_func); 3579 retval = (retval != 0) ? 
(0) : (1); 3580 #if OMPT_SUPPORT && OMPT_OPTIONAL 3581 if (ompt_enabled.enabled) { 3582 ompt_frame->enter_frame = NULL; 3583 } 3584 #endif 3585 3586 // all other workers except master should do this pop here 3587 // ( none of other workers except master will enter __kmpc_end_reduce() ) 3588 if (__kmp_env_consistency_check) { 3589 if (retval == 0) { // 0: all other workers; 1: master 3590 __kmp_pop_sync(global_tid, ct_reduce, loc); 3591 } 3592 } 3593 3594 } else { 3595 3596 // should never reach this block 3597 KMP_ASSERT(0); // "unexpected method" 3598 } 3599 #if OMP_40_ENABLED 3600 if (teams_swapped) { 3601 __kmp_restore_swapped_teams(th, team, task_state); 3602 } 3603 #endif 3604 3605 KA_TRACE(10, 3606 ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n", 3607 global_tid, packed_reduction_method, retval)); 3608 3609 return retval; 3610 } 3611 3612 /*! 3613 @ingroup SYNCHRONIZATION 3614 @param loc source location information 3615 @param global_tid global thread id. 3616 @param lck pointer to the unique lock data structure 3617 3618 Finish the execution of a blocking reduce. 3619 The <tt>lck</tt> pointer must be the same as that used in the corresponding 3620 start function. 3621 */ 3622 void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid, 3623 kmp_critical_name *lck) { 3624 3625 PACKED_REDUCTION_METHOD_T packed_reduction_method; 3626 #if OMP_40_ENABLED 3627 kmp_info_t *th; 3628 kmp_team_t *team; 3629 int teams_swapped = 0, task_state; 3630 #endif 3631 3632 KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid)); 3633 3634 #if OMP_40_ENABLED 3635 th = __kmp_thread_from_gtid(global_tid); 3636 teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state); 3637 #endif // OMP_40_ENABLED 3638 3639 packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid); 3640 3641 // this barrier should be visible to a customer and to the threading profile 3642 // tool (it's a terminating barrier on constructs if NOWAIT not specified) 3643 3644 if (packed_reduction_method == critical_reduce_block) { 3645 3646 __kmp_end_critical_section_reduce_block(loc, global_tid, lck); 3647 3648 // TODO: implicit barrier: should be exposed 3649 #if OMPT_SUPPORT 3650 omp_frame_t *ompt_frame; 3651 if (ompt_enabled.enabled) { 3652 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); 3653 if (ompt_frame->enter_frame == NULL) 3654 ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1); 3655 OMPT_STORE_RETURN_ADDRESS(global_tid); 3656 } 3657 #endif 3658 #if USE_ITT_NOTIFY 3659 __kmp_threads[global_tid]->th.th_ident = loc; 3660 #endif 3661 __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL); 3662 #if OMPT_SUPPORT && OMPT_OPTIONAL 3663 if (ompt_enabled.enabled) { 3664 ompt_frame->enter_frame = NULL; 3665 } 3666 #endif 3667 3668 } else if (packed_reduction_method == empty_reduce_block) { 3669 3670 // usage: if team size==1, no synchronization is required (Intel platforms only) 3671 3672 // TODO: implicit barrier: should be exposed 3673 #if OMPT_SUPPORT 3674 omp_frame_t *ompt_frame; 3675 if (ompt_enabled.enabled) { 3676 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); 3677 if (ompt_frame->enter_frame == NULL) 3678 ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1); 3679 OMPT_STORE_RETURN_ADDRESS(global_tid); 3680 } 3681 #endif 3682 #if USE_ITT_NOTIFY 3683 __kmp_threads[global_tid]->th.th_ident = loc; 3684 #endif 3685 __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL); 3686 #if OMPT_SUPPORT && OMPT_OPTIONAL 3687 if 
(ompt_enabled.enabled) { 3688 ompt_frame->enter_frame = NULL; 3689 } 3690 #endif 3691 3692 } else if (packed_reduction_method == atomic_reduce_block) { 3693 3694 #if OMPT_SUPPORT 3695 omp_frame_t *ompt_frame; 3696 if (ompt_enabled.enabled) { 3697 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); 3698 if (ompt_frame->enter_frame == NULL) 3699 ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1); 3700 OMPT_STORE_RETURN_ADDRESS(global_tid); 3701 } 3702 #endif 3703 // TODO: implicit barrier: should be exposed 3704 #if USE_ITT_NOTIFY 3705 __kmp_threads[global_tid]->th.th_ident = loc; 3706 #endif 3707 __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL); 3708 #if OMPT_SUPPORT && OMPT_OPTIONAL 3709 if (ompt_enabled.enabled) { 3710 ompt_frame->enter_frame = NULL; 3711 } 3712 #endif 3713 3714 } else if (TEST_REDUCTION_METHOD(packed_reduction_method, 3715 tree_reduce_block)) { 3716 3717 // only master executes here (master releases all other workers) 3718 __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method), 3719 global_tid); 3720 3721 } else { 3722 3723 // should never reach this block 3724 KMP_ASSERT(0); // "unexpected method" 3725 } 3726 #if OMP_40_ENABLED 3727 if (teams_swapped) { 3728 __kmp_restore_swapped_teams(th, team, task_state); 3729 } 3730 #endif 3731 3732 if (__kmp_env_consistency_check) 3733 __kmp_pop_sync(global_tid, ct_reduce, loc); 3734 3735 KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n", 3736 global_tid, packed_reduction_method)); 3737 3738 return; 3739 } 3740 3741 #undef __KMP_GET_REDUCTION_METHOD 3742 #undef __KMP_SET_REDUCTION_METHOD 3743 3744 /* end of interface to fast scalable reduce routines */ 3745 3746 kmp_uint64 __kmpc_get_taskid() { 3747 3748 kmp_int32 gtid; 3749 kmp_info_t *thread; 3750 3751 gtid = __kmp_get_gtid(); 3752 if (gtid < 0) { 3753 return 0; 3754 } 3755 thread = __kmp_thread_from_gtid(gtid); 3756 return thread->th.th_current_task->td_task_id; 3757 3758 } // __kmpc_get_taskid 3759 3760 kmp_uint64 __kmpc_get_parent_taskid() { 3761 3762 kmp_int32 gtid; 3763 kmp_info_t *thread; 3764 kmp_taskdata_t *parent_task; 3765 3766 gtid = __kmp_get_gtid(); 3767 if (gtid < 0) { 3768 return 0; 3769 } 3770 thread = __kmp_thread_from_gtid(gtid); 3771 parent_task = thread->th.th_current_task->td_parent; 3772 return (parent_task == NULL ? 0 : parent_task->td_task_id); 3773 3774 } // __kmpc_get_parent_taskid 3775 3776 #if OMP_45_ENABLED 3777 /*! 3778 @ingroup WORK_SHARING 3779 @param loc source location information. 3780 @param gtid global thread number. 3781 @param num_dims number of associated doacross loops. 3782 @param dims info on loops bounds. 3783 3784 Initialize doacross loop information. 3785 Expect compiler send us inclusive bounds, 3786 e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2. 
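
As an illustration only (not a literal reproduction of any compiler's output),
an <tt>ordered(1)</tt> loop with cross-iteration dependences might drive the
doacross entry points roughly as sketched below; <tt>lb</tt> and <tt>ub</tt>
are placeholders for the bounds of the chunk assigned to the calling thread:

@code
struct kmp_dim dim = {2 /*lo*/, 8 /*up*/, 2 /*st*/}; // for (i = 2; i < 9; i += 2)
__kmpc_doacross_init(loc, gtid, 1, &dim);
for (kmp_int64 i = lb; i <= ub; i += 2) {
  kmp_int64 vec = i - 2;
  __kmpc_doacross_wait(loc, gtid, &vec); // depend(sink: i - 2); out-of-range
                                         // sink vectors are simply ignored
  // ... loop body ...
  vec = i;
  __kmpc_doacross_post(loc, gtid, &vec); // depend(source)
}
__kmpc_doacross_fini(loc, gtid);
@endcode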
3787 */ 3788 void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims, 3789 const struct kmp_dim *dims) { 3790 int j, idx; 3791 kmp_int64 last, trace_count; 3792 kmp_info_t *th = __kmp_threads[gtid]; 3793 kmp_team_t *team = th->th.th_team; 3794 kmp_uint32 *flags; 3795 kmp_disp_t *pr_buf = th->th.th_dispatch; 3796 dispatch_shared_info_t *sh_buf; 3797 3798 KA_TRACE( 3799 20, 3800 ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n", 3801 gtid, num_dims, !team->t.t_serialized)); 3802 KMP_DEBUG_ASSERT(dims != NULL); 3803 KMP_DEBUG_ASSERT(num_dims > 0); 3804 3805 if (team->t.t_serialized) { 3806 KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n")); 3807 return; // no dependencies if team is serialized 3808 } 3809 KMP_DEBUG_ASSERT(team->t.t_nproc > 1); 3810 idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for 3811 // the next loop 3812 sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers]; 3813 3814 // Save bounds info into allocated private buffer 3815 KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL); 3816 pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc( 3817 th, sizeof(kmp_int64) * (4 * num_dims + 1)); 3818 KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL); 3819 pr_buf->th_doacross_info[0] = 3820 (kmp_int64)num_dims; // first element is number of dimensions 3821 // Save also address of num_done in order to access it later without knowing 3822 // the buffer index 3823 pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done; 3824 pr_buf->th_doacross_info[2] = dims[0].lo; 3825 pr_buf->th_doacross_info[3] = dims[0].up; 3826 pr_buf->th_doacross_info[4] = dims[0].st; 3827 last = 5; 3828 for (j = 1; j < num_dims; ++j) { 3829 kmp_int64 3830 range_length; // To keep ranges of all dimensions but the first dims[0] 3831 if (dims[j].st == 1) { // most common case 3832 // AC: should we care of ranges bigger than LLONG_MAX? (not for now) 3833 range_length = dims[j].up - dims[j].lo + 1; 3834 } else { 3835 if (dims[j].st > 0) { 3836 KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo); 3837 range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1; 3838 } else { // negative increment 3839 KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up); 3840 range_length = 3841 (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1; 3842 } 3843 } 3844 pr_buf->th_doacross_info[last++] = range_length; 3845 pr_buf->th_doacross_info[last++] = dims[j].lo; 3846 pr_buf->th_doacross_info[last++] = dims[j].up; 3847 pr_buf->th_doacross_info[last++] = dims[j].st; 3848 } 3849 3850 // Compute total trip count. 3851 // Start with range of dims[0] which we don't need to keep in the buffer. 
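  // Worked example with illustrative values (not taken from a real run):
  //   dims[0] = {lo=2, up=8, st=2}  ->  (8 - 2) / 2 + 1 = 4 iterations
  //   dims[1] = {lo=0, up=9, st=1}  ->   9 - 0 + 1      = 10 iterations
  // so trace_count = 4 * 10 = 40 linearized iterations, i.e. 40 flag bits
  // in the shared doacross_flags array allocated below.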
3852 if (dims[0].st == 1) { // most common case 3853 trace_count = dims[0].up - dims[0].lo + 1; 3854 } else if (dims[0].st > 0) { 3855 KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo); 3856 trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1; 3857 } else { // negative increment 3858 KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up); 3859 trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1; 3860 } 3861 for (j = 1; j < num_dims; ++j) { 3862 trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges 3863 } 3864 KMP_DEBUG_ASSERT(trace_count > 0); 3865 3866 // Check if shared buffer is not occupied by other loop (idx - 3867 // __kmp_dispatch_num_buffers) 3868 if (idx != sh_buf->doacross_buf_idx) { 3869 // Shared buffer is occupied, wait for it to be free 3870 __kmp_wait_yield_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx, 3871 __kmp_eq_4, NULL); 3872 } 3873 #if KMP_32_BIT_ARCH 3874 // Check if we are the first thread. After the CAS the first thread gets 0, 3875 // others get 1 if initialization is in progress, allocated pointer otherwise. 3876 // Treat pointer as volatile integer (value 0 or 1) until memory is allocated. 3877 flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32( 3878 (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1); 3879 #else 3880 flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64( 3881 (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL); 3882 #endif 3883 if (flags == NULL) { 3884 // we are the first thread, allocate the array of flags 3885 size_t size = trace_count / 8 + 8; // in bytes, use single bit per iteration 3886 flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1); 3887 KMP_MB(); 3888 sh_buf->doacross_flags = flags; 3889 } else if (flags == (kmp_uint32 *)1) { 3890 #if KMP_32_BIT_ARCH 3891 // initialization is still in progress, need to wait 3892 while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1) 3893 #else 3894 while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL) 3895 #endif 3896 KMP_YIELD(TRUE); 3897 KMP_MB(); 3898 } else { 3899 KMP_MB(); 3900 } 3901 KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value 3902 pr_buf->th_doacross_flags = 3903 sh_buf->doacross_flags; // save private copy in order to not 3904 // touch shared buffer on each iteration 3905 KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid)); 3906 } 3907 3908 void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) { 3909 kmp_int32 shft, num_dims, i; 3910 kmp_uint32 flag; 3911 kmp_int64 iter_number; // iteration number of "collapsed" loop nest 3912 kmp_info_t *th = __kmp_threads[gtid]; 3913 kmp_team_t *team = th->th.th_team; 3914 kmp_disp_t *pr_buf; 3915 kmp_int64 lo, up, st; 3916 3917 KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid)); 3918 if (team->t.t_serialized) { 3919 KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n")); 3920 return; // no dependencies if team is serialized 3921 } 3922 3923 // calculate sequential iteration number and check out-of-bounds condition 3924 pr_buf = th->th.th_dispatch; 3925 KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL); 3926 num_dims = pr_buf->th_doacross_info[0]; 3927 lo = pr_buf->th_doacross_info[2]; 3928 up = pr_buf->th_doacross_info[3]; 3929 st = pr_buf->th_doacross_info[4]; 3930 if (st == 1) { // most common case 3931 if (vec[0] < lo || vec[0] > up) { 3932 KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of " 3933 "bounds [%lld,%lld]\n", 3934 gtid, vec[0], lo, up)); 3935 return; 3936 } 3937 
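    // Normalize the incoming sink value to a zero-based index within dims[0];
    // e.g. with lo=2, st=1 (illustrative values) a sink value of 5 maps to
    // iteration 3. The st>0 and st<0 branches below apply the same
    // normalization scaled by the stride.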
iter_number = vec[0] - lo; 3938 } else if (st > 0) { 3939 if (vec[0] < lo || vec[0] > up) { 3940 KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of " 3941 "bounds [%lld,%lld]\n", 3942 gtid, vec[0], lo, up)); 3943 return; 3944 } 3945 iter_number = (kmp_uint64)(vec[0] - lo) / st; 3946 } else { // negative increment 3947 if (vec[0] > lo || vec[0] < up) { 3948 KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of " 3949 "bounds [%lld,%lld]\n", 3950 gtid, vec[0], lo, up)); 3951 return; 3952 } 3953 iter_number = (kmp_uint64)(lo - vec[0]) / (-st); 3954 } 3955 for (i = 1; i < num_dims; ++i) { 3956 kmp_int64 iter, ln; 3957 kmp_int32 j = i * 4; 3958 ln = pr_buf->th_doacross_info[j + 1]; 3959 lo = pr_buf->th_doacross_info[j + 2]; 3960 up = pr_buf->th_doacross_info[j + 3]; 3961 st = pr_buf->th_doacross_info[j + 4]; 3962 if (st == 1) { 3963 if (vec[i] < lo || vec[i] > up) { 3964 KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of " 3965 "bounds [%lld,%lld]\n", 3966 gtid, vec[i], lo, up)); 3967 return; 3968 } 3969 iter = vec[i] - lo; 3970 } else if (st > 0) { 3971 if (vec[i] < lo || vec[i] > up) { 3972 KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of " 3973 "bounds [%lld,%lld]\n", 3974 gtid, vec[i], lo, up)); 3975 return; 3976 } 3977 iter = (kmp_uint64)(vec[i] - lo) / st; 3978 } else { // st < 0 3979 if (vec[i] > lo || vec[i] < up) { 3980 KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of " 3981 "bounds [%lld,%lld]\n", 3982 gtid, vec[i], lo, up)); 3983 return; 3984 } 3985 iter = (kmp_uint64)(lo - vec[i]) / (-st); 3986 } 3987 iter_number = iter + ln * iter_number; 3988 } 3989 shft = iter_number % 32; // use 32-bit granularity 3990 iter_number >>= 5; // divided by 32 3991 flag = 1 << shft; 3992 while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) { 3993 KMP_YIELD(TRUE); 3994 } 3995 KMP_MB(); 3996 KA_TRACE(20, 3997 ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n", 3998 gtid, (iter_number << 5) + shft)); 3999 } 4000 4001 void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) { 4002 kmp_int32 shft, num_dims, i; 4003 kmp_uint32 flag; 4004 kmp_int64 iter_number; // iteration number of "collapsed" loop nest 4005 kmp_info_t *th = __kmp_threads[gtid]; 4006 kmp_team_t *team = th->th.th_team; 4007 kmp_disp_t *pr_buf; 4008 kmp_int64 lo, st; 4009 4010 KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid)); 4011 if (team->t.t_serialized) { 4012 KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n")); 4013 return; // no dependencies if team is serialized 4014 } 4015 4016 // calculate sequential iteration number (same as in "wait" but no 4017 // out-of-bounds checks) 4018 pr_buf = th->th.th_dispatch; 4019 KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL); 4020 num_dims = pr_buf->th_doacross_info[0]; 4021 lo = pr_buf->th_doacross_info[2]; 4022 st = pr_buf->th_doacross_info[4]; 4023 if (st == 1) { // most common case 4024 iter_number = vec[0] - lo; 4025 } else if (st > 0) { 4026 iter_number = (kmp_uint64)(vec[0] - lo) / st; 4027 } else { // negative increment 4028 iter_number = (kmp_uint64)(lo - vec[0]) / (-st); 4029 } 4030 for (i = 1; i < num_dims; ++i) { 4031 kmp_int64 iter, ln; 4032 kmp_int32 j = i * 4; 4033 ln = pr_buf->th_doacross_info[j + 1]; 4034 lo = pr_buf->th_doacross_info[j + 2]; 4035 st = pr_buf->th_doacross_info[j + 4]; 4036 if (st == 1) { 4037 iter = vec[i] - lo; 4038 } else if (st > 0) { 4039 iter = (kmp_uint64)(vec[i] - lo) / st; 4040 } else { // st < 0 
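      // Negative stride: the distance is measured downward from lo; e.g. with
      // lo=8, st=-2 (illustrative values) a vector value of 4 maps to
      // iteration (8 - 4) / 2 = 2 within this dimension.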
4041 iter = (kmp_uint64)(lo - vec[i]) / (-st); 4042 } 4043 iter_number = iter + ln * iter_number; 4044 } 4045 shft = iter_number % 32; // use 32-bit granularity 4046 iter_number >>= 5; // divided by 32 4047 flag = 1 << shft; 4048 KMP_MB(); 4049 if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) 4050 KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag); 4051 KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid, 4052 (iter_number << 5) + shft)); 4053 } 4054 4055 void __kmpc_doacross_fini(ident_t *loc, int gtid) { 4056 kmp_int32 num_done; 4057 kmp_info_t *th = __kmp_threads[gtid]; 4058 kmp_team_t *team = th->th.th_team; 4059 kmp_disp_t *pr_buf = th->th.th_dispatch; 4060 4061 KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid)); 4062 if (team->t.t_serialized) { 4063 KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team)); 4064 return; // nothing to do 4065 } 4066 num_done = KMP_TEST_THEN_INC32((kmp_int32 *)pr_buf->th_doacross_info[1]) + 1; 4067 if (num_done == th->th.th_team_nproc) { 4068 // we are the last thread, need to free shared resources 4069 int idx = pr_buf->th_doacross_buf_idx - 1; 4070 dispatch_shared_info_t *sh_buf = 4071 &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers]; 4072 KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] == 4073 (kmp_int64)&sh_buf->doacross_num_done); 4074 KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done); 4075 KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx); 4076 __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags)); 4077 sh_buf->doacross_flags = NULL; 4078 sh_buf->doacross_num_done = 0; 4079 sh_buf->doacross_buf_idx += 4080 __kmp_dispatch_num_buffers; // free buffer for future re-use 4081 } 4082 // free private resources (need to keep buffer index forever) 4083 pr_buf->th_doacross_flags = NULL; 4084 __kmp_thread_free(th, (void *)pr_buf->th_doacross_info); 4085 pr_buf->th_doacross_info = NULL; 4086 KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid)); 4087 } 4088 #endif 4089 4090 #if OMP_50_ENABLED 4091 int __kmpc_get_target_offload(void) { 4092 if (!__kmp_init_serial) { 4093 __kmp_serial_initialize(); 4094 } 4095 return __kmp_target_offload; 4096 } 4097 #endif // OMP_50_ENABLED 4098 4099 // end of file // 4100