1 /* 2 * kmp_csupport.cpp -- kfront linkage support for OpenMP. 3 */ 4 5 //===----------------------------------------------------------------------===// 6 // 7 // The LLVM Compiler Infrastructure 8 // 9 // This file is dual licensed under the MIT and the University of Illinois Open 10 // Source Licenses. See LICENSE.txt for details. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "omp.h" /* extern "C" declarations of user-visible routines */ 15 #include "kmp.h" 16 #include "kmp_error.h" 17 #include "kmp_i18n.h" 18 #include "kmp_itt.h" 19 #include "kmp_lock.h" 20 #include "kmp_stats.h" 21 22 #if OMPT_SUPPORT 23 #include "ompt-specific.h" 24 #endif 25 26 #define MAX_MESSAGE 512 27 28 // flags will be used in future, e.g. to implement openmp_strict library 29 // restrictions 30 31 /*! 32 * @ingroup STARTUP_SHUTDOWN 33 * @param loc in source location information 34 * @param flags in for future use (currently ignored) 35 * 36 * Initialize the runtime library. This call is optional; if it is not made then 37 * it will be implicitly called by attempts to use other library functions. 38 */ 39 void __kmpc_begin(ident_t *loc, kmp_int32 flags) { 40 // By default __kmpc_begin() is no-op. 41 char *env; 42 if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL && 43 __kmp_str_match_true(env)) { 44 __kmp_middle_initialize(); 45 KC_TRACE(10, ("__kmpc_begin: middle initialization called\n")); 46 } else if (__kmp_ignore_mppbeg() == FALSE) { 47 // By default __kmp_ignore_mppbeg() returns TRUE. 48 __kmp_internal_begin(); 49 KC_TRACE(10, ("__kmpc_begin: called\n")); 50 } 51 } 52 53 /*! 54 * @ingroup STARTUP_SHUTDOWN 55 * @param loc source location information 56 * 57 * Shutdown the runtime library. This is also optional, and even if called will 58 * not do anything unless the `KMP_IGNORE_MPPEND` environment variable is set to 59 * zero. 
60 */ 61 void __kmpc_end(ident_t *loc) { 62 // By default, __kmp_ignore_mppend() returns TRUE which makes __kmpc_end() 63 // call no-op. However, this can be overridden with KMP_IGNORE_MPPEND 64 // environment variable. If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend() 65 // returns FALSE and __kmpc_end() will unregister this root (it can cause 66 // library shut down). 67 if (__kmp_ignore_mppend() == FALSE) { 68 KC_TRACE(10, ("__kmpc_end: called\n")); 69 KA_TRACE(30, ("__kmpc_end\n")); 70 71 __kmp_internal_end_thread(-1); 72 } 73 } 74 75 /*! 76 @ingroup THREAD_STATES 77 @param loc Source location information. 78 @return The global thread index of the active thread. 79 80 This function can be called in any context. 81 82 If the runtime has ony been entered at the outermost level from a 83 single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is 84 that which would be returned by omp_get_thread_num() in the outermost 85 active parallel construct. (Or zero if there is no active parallel 86 construct, since the master thread is necessarily thread zero). 87 88 If multiple non-OpenMP threads all enter an OpenMP construct then this 89 will be a unique thread identifier among all the threads created by 90 the OpenMP runtime (but the value cannote be defined in terms of 91 OpenMP thread ids returned by omp_get_thread_num()). 92 */ 93 kmp_int32 __kmpc_global_thread_num(ident_t *loc) { 94 kmp_int32 gtid = __kmp_entry_gtid(); 95 96 KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid)); 97 98 return gtid; 99 } 100 101 /*! 102 @ingroup THREAD_STATES 103 @param loc Source location information. 104 @return The number of threads under control of the OpenMP<sup>*</sup> runtime 105 106 This function can be called in any context. 107 It returns the total number of threads under the control of the OpenMP runtime. 
108 That is not a number that can be determined by any OpenMP standard calls, since 109 the library may be called from more than one non-OpenMP thread, and this 110 reflects the total over all such calls. Similarly the runtime maintains 111 underlying threads even when they are not active (since the cost of creating 112 and destroying OS threads is high), this call counts all such threads even if 113 they are not waiting for work. 114 */ 115 kmp_int32 __kmpc_global_num_threads(ident_t *loc) { 116 KC_TRACE(10, 117 ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth)); 118 119 return TCR_4(__kmp_all_nth); 120 } 121 122 /*! 123 @ingroup THREAD_STATES 124 @param loc Source location information. 125 @return The thread number of the calling thread in the innermost active parallel 126 construct. 127 */ 128 kmp_int32 __kmpc_bound_thread_num(ident_t *loc) { 129 KC_TRACE(10, ("__kmpc_bound_thread_num: called\n")); 130 return __kmp_tid_from_gtid(__kmp_entry_gtid()); 131 } 132 133 /*! 134 @ingroup THREAD_STATES 135 @param loc Source location information. 136 @return The number of threads in the innermost active parallel construct. 137 */ 138 kmp_int32 __kmpc_bound_num_threads(ident_t *loc) { 139 KC_TRACE(10, ("__kmpc_bound_num_threads: called\n")); 140 141 return __kmp_entry_thread()->th.th_team->t.t_nproc; 142 } 143 144 /*! 145 * @ingroup DEPRECATED 146 * @param loc location description 147 * 148 * This function need not be called. It always returns TRUE. 
149 */ 150 kmp_int32 __kmpc_ok_to_fork(ident_t *loc) { 151 #ifndef KMP_DEBUG 152 153 return TRUE; 154 155 #else 156 157 const char *semi2; 158 const char *semi3; 159 int line_no; 160 161 if (__kmp_par_range == 0) { 162 return TRUE; 163 } 164 semi2 = loc->psource; 165 if (semi2 == NULL) { 166 return TRUE; 167 } 168 semi2 = strchr(semi2, ';'); 169 if (semi2 == NULL) { 170 return TRUE; 171 } 172 semi2 = strchr(semi2 + 1, ';'); 173 if (semi2 == NULL) { 174 return TRUE; 175 } 176 if (__kmp_par_range_filename[0]) { 177 const char *name = semi2 - 1; 178 while ((name > loc->psource) && (*name != '/') && (*name != ';')) { 179 name--; 180 } 181 if ((*name == '/') || (*name == ';')) { 182 name++; 183 } 184 if (strncmp(__kmp_par_range_filename, name, semi2 - name)) { 185 return __kmp_par_range < 0; 186 } 187 } 188 semi3 = strchr(semi2 + 1, ';'); 189 if (__kmp_par_range_routine[0]) { 190 if ((semi3 != NULL) && (semi3 > semi2) && 191 (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) { 192 return __kmp_par_range < 0; 193 } 194 } 195 if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) { 196 if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) { 197 return __kmp_par_range > 0; 198 } 199 return __kmp_par_range < 0; 200 } 201 return TRUE; 202 203 #endif /* KMP_DEBUG */ 204 } 205 206 /*! 207 @ingroup THREAD_STATES 208 @param loc Source location information. 209 @return 1 if this thread is executing inside an active parallel region, zero if 210 not. 211 */ 212 kmp_int32 __kmpc_in_parallel(ident_t *loc) { 213 return __kmp_entry_thread()->th.th_root->r.r_active; 214 } 215 216 /*! 217 @ingroup PARALLEL 218 @param loc source location information 219 @param global_tid global thread number 220 @param num_threads number of threads requested for this parallel construct 221 222 Set the number of threads to be used by the next fork spawned by this thread. 223 This call is only required if the parallel construct has a `num_threads` clause. 
*/
void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
                             kmp_int32 num_threads) {
  KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
                global_tid, num_threads));

  // Thin wrapper: the request is recorded by __kmp_push_num_threads().
  __kmp_push_num_threads(loc, global_tid, num_threads);
}

/*!
@ingroup PARALLEL
@param loc source location information (unused)
@param global_tid global thread number (unused)

Counterpart of __kmpc_push_num_threads(). It only emits a trace message; no
state is changed here.
*/
void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
  KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));

  /* the num_threads are automatically popped */
}

#if OMP_40_ENABLED

/*!
@ingroup PARALLEL
@param loc source location information
@param global_tid global thread number
@param proc_bind binding policy, passed on as a kmp_proc_bind_t value

Forward the requested binding policy to __kmp_push_proc_bind().
*/
void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
                           kmp_int32 proc_bind) {
  KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
                proc_bind));

  __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
}

#endif /* OMP_40_ENABLED */

/*!
@ingroup PARALLEL
@param loc  source location information
@param argc  total number of arguments in the ellipsis
@param microtask  pointer to callback routine consisting of outlined parallel
construct
@param ...  pointers to shared variables that aren't global

Do the actual fork and call the microtask in the relevant number of threads.
*/
void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
  int gtid = __kmp_entry_gtid();

#if (KMP_STATS_ENABLED)
  // If we were in a serial region, then stop the serial timer, record
  // the event, and start parallel region timer
  stats_state_e previous_state = KMP_GET_THREAD_STATE();
  if (previous_state == stats_state_e::SERIAL_REGION) {
    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
  } else {
    KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
  }
  // Count nested and outermost parallel regions separately.
  int inParallel = __kmpc_in_parallel(loc);
  if (inParallel) {
    KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
  } else {
    KMP_COUNT_BLOCK(OMP_PARALLEL);
  }
#endif

  // maybe to save thr_state is enough here
  {
    va_list ap;
    va_start(ap, microtask);

#if OMPT_SUPPORT
    omp_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      // Pick the frame record of the encountering task: the serialized-team
      // record when one exists, otherwise the implicit task of this thread in
      // its current team.
      kmp_info_t *master_th = __kmp_threads[gtid];
      kmp_team_t *parent_team = master_th->th.th_team;
      ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
      if (lwt)
        ompt_frame = &(lwt->ompt_task_info.frame);
      else {
        int tid = __kmp_tid_from_gtid(gtid);
        ompt_frame = &(
            parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame);
      }
      ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
      OMPT_STORE_RETURN_ADDRESS(gtid);
    }
#endif

#if INCLUDE_SSC_MARKS
    SSC_MARK_FORKING();
#endif
    // The last argument is the va_list: passed by address on Linux for
    // x86_64/ARM/AArch64, by value everywhere else.
    __kmp_fork_call(loc, gtid, fork_context_intel, argc,
                    VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
                    VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
/* TODO: revert workaround for Intel(R) 64 tracker #96 */
#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
                    &ap
#else
                    ap
#endif
                    );
#if INCLUDE_SSC_MARKS
    SSC_MARK_JOINING();
#endif
    __kmp_join_call(loc, gtid
#if OMPT_SUPPORT
                    ,
                    fork_context_intel
#endif
                    );

    va_end(ap);
  }

#if KMP_STATS_ENABLED
  // Undo the timer change made on entry.
  if (previous_state == stats_state_e::SERIAL_REGION) {
    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
  } else {
    KMP_POP_PARTITIONED_TIMER();
  }
#endif // KMP_STATS_ENABLED
}

#if OMP_40_ENABLED
/*!
@ingroup PARALLEL
@param loc source location information
@param global_tid global thread number
@param num_teams number of teams requested for the teams construct
@param num_threads number of threads per team requested for the teams construct

Set the number of teams to be used by the teams construct.
This call is only required if the teams construct has a `num_teams` clause
or a `thread_limit` clause (or both).
*/
void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
                           kmp_int32 num_teams, kmp_int32 num_threads) {
  KA_TRACE(20,
           ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
            global_tid, num_teams, num_threads));

  __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
}

/*!
@ingroup PARALLEL
@param loc  source location information
@param argc  total number of arguments in the ellipsis
@param microtask  pointer to callback routine consisting of outlined teams
construct
@param ...  pointers to shared variables that aren't global

Do the actual fork and call the microtask in the relevant number of threads.
*/
void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
                       ...) {
  int gtid = __kmp_entry_gtid();
  kmp_info_t *this_thr = __kmp_threads[gtid];
  va_list ap;
  va_start(ap, microtask);

  KMP_COUNT_BLOCK(OMP_TEAMS);

  // remember teams entry point and nesting level
  this_thr->th.th_teams_microtask = microtask;
  this_thr->th.th_teams_level =
      this_thr->th.th_team->t.t_level; // AC: can be >0 on host

#if OMPT_SUPPORT
  kmp_team_t *parent_team = this_thr->th.th_team;
  int tid = __kmp_tid_from_gtid(gtid);
  if (ompt_enabled.enabled) {
    parent_team->t.t_implicit_task_taskdata[tid]
        .ompt_task_info.frame.enter_frame = OMPT_GET_FRAME_ADDRESS(1);
  }
  // Note: the return address is stored whether or not a tool is enabled.
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif

  // check if __kmpc_push_num_teams called, set default number of teams
  // otherwise
  if (this_thr->th.th_teams_size.nteams == 0) {
    __kmp_push_num_teams(loc, gtid, 0, 0);
  }
  KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
  KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
  KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);

  // Unlike __kmpc_fork_call(), the forked task is __kmp_teams_master; the
  // user's microtask was saved in th_teams_microtask above.
  __kmp_fork_call(loc, gtid, fork_context_intel, argc,
                  VOLATILE_CAST(microtask_t)
                      __kmp_teams_master, // "wrapped" task
                  VOLATILE_CAST(launch_t) __kmp_invoke_teams_master,
#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
                  &ap
#else
                  ap
#endif
                  );
  __kmp_join_call(loc, gtid
#if OMPT_SUPPORT
                  ,
                  fork_context_intel
#endif
                  );

  // Reset the per-thread teams bookkeeping set up above.
  this_thr->th.th_teams_microtask = NULL;
  this_thr->th.th_teams_level = 0;
  // NOTE(review): clears th_teams_size through a single 64-bit store;
  // presumably the struct is exactly the two 32-bit fields nteams and nth --
  // confirm against its declaration.
  *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
  va_end(ap);
}
#endif /* OMP_40_ENABLED */

// I don't think this function should ever have been exported.
// The __kmpc_ prefix was misapplied.  I'm fairly certain that no generated
// openmp code ever called it, but it's been exported from the RTL for so
// long that I'm afraid to remove the definition.
// Exported alias that simply forwards to __kmp_invoke_task_func().
int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }

/*!
@ingroup PARALLEL
@param loc  source location information
@param global_tid  global thread number

Enter a serialized parallel construct. This interface is used to handle a
conditional parallel region, like this,
@code
#pragma omp parallel if (condition)
@endcode
when the condition is false.
*/
void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
// The implementation is now in kmp_runtime.cpp so that it can share static
// functions with kmp_fork_call since the tasks to be done are similar in
// each case.
#if OMPT_SUPPORT
  OMPT_STORE_RETURN_ADDRESS(global_tid);
#endif
  __kmp_serialized_parallel(loc, global_tid);
}

/*!
@ingroup PARALLEL
@param loc  source location information
@param global_tid  global thread number

Leave a serialized parallel construct.
*/
void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
  kmp_internal_control_t *top;
  kmp_info_t *this_thr;
  kmp_team_t *serial_team;

  KC_TRACE(10,
           ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));

  /* skip all this code for autopar serialized loops since it results in
     unacceptable overhead */
  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
    return;

  // Not autopar code
  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  this_thr = __kmp_threads[global_tid];
  serial_team = this_thr->th.th_serial_team;

#if OMP_45_ENABLED
  kmp_task_team_t *task_team = this_thr->th.th_task_team;

  // we need to wait for the proxy tasks before finishing the thread
  if (task_team != NULL && task_team->tt.tt_found_proxy_tasks)
    __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
#endif

  KMP_MB();
  // Sanity checks: the thread must currently be running its own serial team,
  // that team must be serialized, and it must not be the root team.
  KMP_DEBUG_ASSERT(serial_team);
  KMP_ASSERT(serial_team->t.t_serialized);
  KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
  KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);

#if OMPT_SUPPORT
  if (ompt_enabled.enabled &&
      this_thr->th.ompt_thread_info.state != omp_state_overhead) {
    // Report the end of the implicit task, then the end of the (serialized)
    // parallel region, and finally unlink the lightweight task team.
    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = NULL;
    if (ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
          OMPT_CUR_TASK_INFO(this_thr)->thread_num);
    }

    // reset clear the task id only after unlinking the task
    ompt_data_t *parent_task_data;
    __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);

    if (ompt_enabled.ompt_callback_parallel_end) {
      ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
          &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
          ompt_invoker_program, OMPT_LOAD_RETURN_ADDRESS(global_tid));
    }
    __ompt_lw_taskteam_unlink(this_thr);
    this_thr->th.ompt_thread_info.state = omp_state_overhead;
  }
#endif

  /* If necessary, pop the internal control stack values and replace the team
   * values */
  top = serial_team->t.t_control_stack_top;
  if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
    copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
    serial_team->t.t_control_stack_top = top->next;
    __kmp_free(top);
  }

  // if( serial_team -> t.t_serialized > 1 )
  serial_team->t.t_level--;

  /* pop dispatch buffers stack */
  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
  {
    dispatch_private_info_t *disp_buffer =
        serial_team->t.t_dispatch->th_disp_buffer;
    serial_team->t.t_dispatch->th_disp_buffer =
        serial_team->t.t_dispatch->th_disp_buffer->next;
    __kmp_free(disp_buffer);
  }

  // One nesting level of serialization is finished. When the count reaches
  // zero the thread goes back to the parent team.
  --serial_team->t.t_serialized;
  if (serial_team->t.t_serialized == 0) {

/* return to the parallel section */

#if KMP_ARCH_X86 || KMP_ARCH_X86_64
    // Reload the floating-point control state saved in the team, if
    // inheritance is on and a state was saved.
    if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
      __kmp_clear_x87_fpu_status_word();
      __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
      __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
    }
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

    this_thr->th.th_team = serial_team->t.t_parent;
    this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;

    /* restore values cached in the thread */
    this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /*  JPH */
    this_thr->th.th_team_master =
        serial_team->t.t_parent->t.t_threads[0]; /* JPH */
    this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;

    /* TODO the below shouldn't need to be adjusted for serialized teams */
    this_thr->th.th_dispatch =
        &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];

    __kmp_pop_current_task_from_thread(this_thr);

    // The task that is current again after the pop must not be marked as
    // executing yet; mark it now.
    KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
    this_thr->th.th_current_task->td_flags.executing = 1;

    if (__kmp_tasking_mode != tskm_immediate_exec) {
      // Copy the task team from the new child / old parent team to the thread.
      this_thr->th.th_task_team =
          this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
      KA_TRACE(20,
               ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
                "team %p\n",
                global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
    }
  } else {
    // Still inside an enclosing serialized region: only trace the new depth.
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
                    "depth of serial team %p to %d\n",
                    global_tid, serial_team, serial_team->t.t_serialized));
    }
  }

  if (__kmp_env_consistency_check)
    __kmp_pop_parallel(global_tid, NULL);
#if OMPT_SUPPORT
  if (ompt_enabled.enabled)
    this_thr->th.ompt_thread_info.state =
        ((this_thr->th.th_team_serialized) ? omp_state_work_serial
                                           : omp_state_work_parallel);
#endif
}

/*!
@ingroup SYNCHRONIZATION
@param loc  source location information.

Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
depending on the memory ordering convention obeyed by the compiler
even that may not be necessary).
*/
void __kmpc_flush(ident_t *loc) {
  KC_TRACE(10, ("__kmpc_flush: called\n"));

  /* need explicit __mf() here since use volatile instead in library */
  KMP_MB(); /* Flush all pending memory write invalidates.  */

#if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
#if KMP_MIC
// fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
// We shouldn't need it, though, since the ABI rules require that
// * If the compiler generates NGO stores it also generates the fence
// * If users hand-code NGO stores they should insert the fence
// therefore no incomplete unordered stores should be visible.
#else
  // C74404
  // This is to address non-temporal store instructions (sfence needed).
  // The clflush instruction is addressed as well (mfence needed).
  // Probably the non-temporal load movntdqa instruction should also be
  // addressed.
  // mfence is a SSE2 instruction. Do not execute it if CPU is not SSE2.
  if (!__kmp_cpuinfo.initialized) {
    __kmp_query_cpuid(&__kmp_cpuinfo);
  }
  if (!__kmp_cpuinfo.sse2) {
    // CPU cannot execute SSE2 instructions.
  } else {
    // Issue a full fence with whatever the compiler in use provides.
#if KMP_COMPILER_ICC
    _mm_mfence();
#elif KMP_COMPILER_MSVC
    MemoryBarrier();
#else
    __sync_synchronize();
#endif // KMP_COMPILER_ICC
  }
#endif // KMP_MIC
#elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64)
// Nothing to see here move along
#elif KMP_ARCH_PPC64
// Nothing needed here (we have a real MB above).
#if KMP_OS_CNK
  // The flushing thread needs to yield here; this prevents a
  // busy-waiting thread from saturating the pipeline. flush is
  // often used in loops like this:
  // while (!flag) {
  //   #pragma omp flush(flag)
  // }
  // and adding the yield here is good for at least a 10x speedup
  // when running >2 threads per core (on the NAS LU benchmark).
  __kmp_yield(TRUE);
#endif
#else
#error Unknown or unsupported architecture
#endif

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Tell a registered tool about the flush.
  if (ompt_enabled.ompt_callback_flush) {
    ompt_callbacks.ompt_callback(ompt_callback_flush)(
        __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
  }
#endif
}

/* -------------------------------------------------------------------------- */
/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid thread id.

Execute a barrier.
679 */ 680 void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) { 681 KMP_COUNT_BLOCK(OMP_BARRIER); 682 KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid)); 683 684 if (!TCR_4(__kmp_init_parallel)) 685 __kmp_parallel_initialize(); 686 687 if (__kmp_env_consistency_check) { 688 if (loc == 0) { 689 KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user? 690 } 691 692 __kmp_check_barrier(global_tid, ct_barrier, loc); 693 } 694 695 #if OMPT_SUPPORT 696 omp_frame_t *ompt_frame; 697 if (ompt_enabled.enabled) { 698 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); 699 if (ompt_frame->enter_frame == NULL) 700 ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1); 701 OMPT_STORE_RETURN_ADDRESS(global_tid); 702 } 703 #endif 704 __kmp_threads[global_tid]->th.th_ident = loc; 705 // TODO: explicit barrier_wait_id: 706 // this function is called when 'barrier' directive is present or 707 // implicit barrier at the end of a worksharing construct. 708 // 1) better to add a per-thread barrier counter to a thread data structure 709 // 2) set to 0 when a new team is created 710 // 4) no sync is required 711 712 __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL); 713 #if OMPT_SUPPORT && OMPT_OPTIONAL 714 if (ompt_enabled.enabled) { 715 ompt_frame->enter_frame = NULL; 716 } 717 #endif 718 } 719 720 /* The BARRIER for a MASTER section is always explicit */ 721 /*! 722 @ingroup WORK_SHARING 723 @param loc source location information. 724 @param global_tid global thread number . 725 @return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise. 
726 */ 727 kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) { 728 int status = 0; 729 730 KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid)); 731 732 if (!TCR_4(__kmp_init_parallel)) 733 __kmp_parallel_initialize(); 734 735 if (KMP_MASTER_GTID(global_tid)) { 736 KMP_COUNT_BLOCK(OMP_MASTER); 737 KMP_PUSH_PARTITIONED_TIMER(OMP_master); 738 status = 1; 739 } 740 741 #if OMPT_SUPPORT && OMPT_OPTIONAL 742 if (status) { 743 if (ompt_enabled.ompt_callback_master) { 744 kmp_info_t *this_thr = __kmp_threads[global_tid]; 745 kmp_team_t *team = this_thr->th.th_team; 746 747 int tid = __kmp_tid_from_gtid(global_tid); 748 ompt_callbacks.ompt_callback(ompt_callback_master)( 749 ompt_scope_begin, &(team->t.ompt_team_info.parallel_data), 750 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 751 OMPT_GET_RETURN_ADDRESS(0)); 752 } 753 } 754 #endif 755 756 if (__kmp_env_consistency_check) { 757 #if KMP_USE_DYNAMIC_LOCK 758 if (status) 759 __kmp_push_sync(global_tid, ct_master, loc, NULL, 0); 760 else 761 __kmp_check_sync(global_tid, ct_master, loc, NULL, 0); 762 #else 763 if (status) 764 __kmp_push_sync(global_tid, ct_master, loc, NULL); 765 else 766 __kmp_check_sync(global_tid, ct_master, loc, NULL); 767 #endif 768 } 769 770 return status; 771 } 772 773 /*! 774 @ingroup WORK_SHARING 775 @param loc source location information. 776 @param global_tid global thread number . 777 778 Mark the end of a <tt>master</tt> region. This should only be called by the 779 thread that executes the <tt>master</tt> region. 
780 */ 781 void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) { 782 KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid)); 783 784 KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid)); 785 KMP_POP_PARTITIONED_TIMER(); 786 787 #if OMPT_SUPPORT && OMPT_OPTIONAL 788 kmp_info_t *this_thr = __kmp_threads[global_tid]; 789 kmp_team_t *team = this_thr->th.th_team; 790 if (ompt_enabled.ompt_callback_master) { 791 int tid = __kmp_tid_from_gtid(global_tid); 792 ompt_callbacks.ompt_callback(ompt_callback_master)( 793 ompt_scope_end, &(team->t.ompt_team_info.parallel_data), 794 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 795 OMPT_GET_RETURN_ADDRESS(0)); 796 } 797 #endif 798 799 if (__kmp_env_consistency_check) { 800 if (global_tid < 0) 801 KMP_WARNING(ThreadIdentInvalid); 802 803 if (KMP_MASTER_GTID(global_tid)) 804 __kmp_pop_sync(global_tid, ct_master, loc); 805 } 806 } 807 808 /*! 809 @ingroup WORK_SHARING 810 @param loc source location information. 811 @param gtid global thread number. 812 813 Start execution of an <tt>ordered</tt> construct. 
814 */ 815 void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) { 816 int cid = 0; 817 kmp_info_t *th; 818 KMP_DEBUG_ASSERT(__kmp_init_serial); 819 820 KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid)); 821 822 if (!TCR_4(__kmp_init_parallel)) 823 __kmp_parallel_initialize(); 824 825 #if USE_ITT_BUILD 826 __kmp_itt_ordered_prep(gtid); 827 // TODO: ordered_wait_id 828 #endif /* USE_ITT_BUILD */ 829 830 th = __kmp_threads[gtid]; 831 832 #if OMPT_SUPPORT && OMPT_OPTIONAL 833 kmp_team_t *team; 834 omp_wait_id_t lck; 835 void *codeptr_ra; 836 if (ompt_enabled.enabled) { 837 OMPT_STORE_RETURN_ADDRESS(gtid); 838 team = __kmp_team_from_gtid(gtid); 839 lck = (omp_wait_id_t)&team->t.t_ordered.dt.t_value; 840 /* OMPT state update */ 841 th->th.ompt_thread_info.wait_id = lck; 842 th->th.ompt_thread_info.state = omp_state_wait_ordered; 843 844 /* OMPT event callback */ 845 codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid); 846 if (ompt_enabled.ompt_callback_mutex_acquire) { 847 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 848 ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin, 849 (omp_wait_id_t)lck, codeptr_ra); 850 } 851 } 852 #endif 853 854 if (th->th.th_dispatch->th_deo_fcn != 0) 855 (*th->th.th_dispatch->th_deo_fcn)(>id, &cid, loc); 856 else 857 __kmp_parallel_deo(>id, &cid, loc); 858 859 #if OMPT_SUPPORT && OMPT_OPTIONAL 860 if (ompt_enabled.enabled) { 861 /* OMPT state update */ 862 th->th.ompt_thread_info.state = omp_state_work_parallel; 863 th->th.ompt_thread_info.wait_id = 0; 864 865 /* OMPT event callback */ 866 if (ompt_enabled.ompt_callback_mutex_acquired) { 867 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 868 ompt_mutex_ordered, (omp_wait_id_t)lck, codeptr_ra); 869 } 870 } 871 #endif 872 873 #if USE_ITT_BUILD 874 __kmp_itt_ordered_start(gtid); 875 #endif /* USE_ITT_BUILD */ 876 } 877 878 /*! 879 @ingroup WORK_SHARING 880 @param loc source location information. 881 @param gtid global thread number. 
882 883 End execution of an <tt>ordered</tt> construct. 884 */ 885 void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) { 886 int cid = 0; 887 kmp_info_t *th; 888 889 KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid)); 890 891 #if USE_ITT_BUILD 892 __kmp_itt_ordered_end(gtid); 893 // TODO: ordered_wait_id 894 #endif /* USE_ITT_BUILD */ 895 896 th = __kmp_threads[gtid]; 897 898 if (th->th.th_dispatch->th_dxo_fcn != 0) 899 (*th->th.th_dispatch->th_dxo_fcn)(>id, &cid, loc); 900 else 901 __kmp_parallel_dxo(>id, &cid, loc); 902 903 #if OMPT_SUPPORT && OMPT_OPTIONAL 904 OMPT_STORE_RETURN_ADDRESS(gtid); 905 if (ompt_enabled.ompt_callback_mutex_released) { 906 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( 907 ompt_mutex_ordered, 908 (omp_wait_id_t)&__kmp_team_from_gtid(gtid)->t.t_ordered.dt.t_value, 909 OMPT_LOAD_RETURN_ADDRESS(gtid)); 910 } 911 #endif 912 } 913 914 #if KMP_USE_DYNAMIC_LOCK 915 916 static __forceinline void 917 __kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc, 918 kmp_int32 gtid, kmp_indirect_locktag_t tag) { 919 // Pointer to the allocated indirect lock is written to crit, while indexing 920 // is ignored. 921 void *idx; 922 kmp_indirect_lock_t **lck; 923 lck = (kmp_indirect_lock_t **)crit; 924 kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag); 925 KMP_I_LOCK_FUNC(ilk, init)(ilk->lock); 926 KMP_SET_I_LOCK_LOCATION(ilk, loc); 927 KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section); 928 KA_TRACE(20, 929 ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag)); 930 #if USE_ITT_BUILD 931 __kmp_itt_critical_creating(ilk->lock, loc); 932 #endif 933 int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk); 934 if (status == 0) { 935 #if USE_ITT_BUILD 936 __kmp_itt_critical_destroyed(ilk->lock); 937 #endif 938 // We don't really need to destroy the unclaimed lock here since it will be 939 // cleaned up at program exit. 
940 // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx); 941 } 942 KMP_DEBUG_ASSERT(*lck != NULL); 943 } 944 945 // Fast-path acquire tas lock 946 #define KMP_ACQUIRE_TAS_LOCK(lock, gtid) \ 947 { \ 948 kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock; \ 949 kmp_int32 tas_free = KMP_LOCK_FREE(tas); \ 950 kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas); \ 951 if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free || \ 952 !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) { \ 953 kmp_uint32 spins; \ 954 KMP_FSYNC_PREPARE(l); \ 955 KMP_INIT_YIELD(spins); \ 956 if (TCR_4(__kmp_nth) > \ 957 (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) { \ 958 KMP_YIELD(TRUE); \ 959 } else { \ 960 KMP_YIELD_SPIN(spins); \ 961 } \ 962 kmp_backoff_t backoff = __kmp_spin_backoff_params; \ 963 while ( \ 964 KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free || \ 965 !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) { \ 966 __kmp_spin_backoff(&backoff); \ 967 if (TCR_4(__kmp_nth) > \ 968 (__kmp_avail_proc ? 
__kmp_avail_proc : __kmp_xproc)) { \ 969 KMP_YIELD(TRUE); \ 970 } else { \ 971 KMP_YIELD_SPIN(spins); \ 972 } \ 973 } \ 974 } \ 975 KMP_FSYNC_ACQUIRED(l); \ 976 } 977 978 // Fast-path test tas lock 979 #define KMP_TEST_TAS_LOCK(lock, gtid, rc) \ 980 { \ 981 kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock; \ 982 kmp_int32 tas_free = KMP_LOCK_FREE(tas); \ 983 kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas); \ 984 rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free && \ 985 __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy); \ 986 } 987 988 // Fast-path release tas lock 989 #define KMP_RELEASE_TAS_LOCK(lock, gtid) \ 990 { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); } 991 992 #if KMP_USE_FUTEX 993 994 #include <sys/syscall.h> 995 #include <unistd.h> 996 #ifndef FUTEX_WAIT 997 #define FUTEX_WAIT 0 998 #endif 999 #ifndef FUTEX_WAKE 1000 #define FUTEX_WAKE 1 1001 #endif 1002 1003 // Fast-path acquire futex lock 1004 #define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid) \ 1005 { \ 1006 kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \ 1007 kmp_int32 gtid_code = (gtid + 1) << 1; \ 1008 KMP_MB(); \ 1009 KMP_FSYNC_PREPARE(ftx); \ 1010 kmp_int32 poll_val; \ 1011 while ((poll_val = KMP_COMPARE_AND_STORE_RET32( \ 1012 &(ftx->lk.poll), KMP_LOCK_FREE(futex), \ 1013 KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) { \ 1014 kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1; \ 1015 if (!cond) { \ 1016 if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val, \ 1017 poll_val | \ 1018 KMP_LOCK_BUSY(1, futex))) { \ 1019 continue; \ 1020 } \ 1021 poll_val |= KMP_LOCK_BUSY(1, futex); \ 1022 } \ 1023 kmp_int32 rc; \ 1024 if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val, \ 1025 NULL, NULL, 0)) != 0) { \ 1026 continue; \ 1027 } \ 1028 gtid_code |= 1; \ 1029 } \ 1030 KMP_FSYNC_ACQUIRED(ftx); \ 1031 } 1032 1033 // Fast-path test futex lock 1034 #define KMP_TEST_FUTEX_LOCK(lock, gtid, rc) \ 1035 { \ 1036 kmp_futex_lock_t *ftx = 
(kmp_futex_lock_t *)lock; \ 1037 if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex), \ 1038 KMP_LOCK_BUSY(gtid + 1 << 1, futex))) { \ 1039 KMP_FSYNC_ACQUIRED(ftx); \ 1040 rc = TRUE; \ 1041 } else { \ 1042 rc = FALSE; \ 1043 } \ 1044 } 1045 1046 // Fast-path release futex lock 1047 #define KMP_RELEASE_FUTEX_LOCK(lock, gtid) \ 1048 { \ 1049 kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \ 1050 KMP_MB(); \ 1051 KMP_FSYNC_RELEASING(ftx); \ 1052 kmp_int32 poll_val = \ 1053 KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex)); \ 1054 if (KMP_LOCK_STRIP(poll_val) & 1) { \ 1055 syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE, \ 1056 KMP_LOCK_BUSY(1, futex), NULL, NULL, 0); \ 1057 } \ 1058 KMP_MB(); \ 1059 KMP_YIELD(TCR_4(__kmp_nth) > \ 1060 (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)); \ 1061 } 1062 1063 #endif // KMP_USE_FUTEX 1064 1065 #else // KMP_USE_DYNAMIC_LOCK 1066 1067 static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit, 1068 ident_t const *loc, 1069 kmp_int32 gtid) { 1070 kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit; 1071 1072 // Because of the double-check, the following load doesn't need to be volatile 1073 kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp); 1074 1075 if (lck == NULL) { 1076 void *idx; 1077 1078 // Allocate & initialize the lock. 1079 // Remember alloc'ed locks in table in order to free them in __kmp_cleanup() 1080 lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section); 1081 __kmp_init_user_lock_with_checks(lck); 1082 __kmp_set_user_lock_location(lck, loc); 1083 #if USE_ITT_BUILD 1084 __kmp_itt_critical_creating(lck); 1085 // __kmp_itt_critical_creating() should be called *before* the first usage 1086 // of underlying lock. It is the only place where we can guarantee it. There 1087 // are chances the lock will destroyed with no usage, but it is not a 1088 // problem, because this is not real event seen by user but rather setting 1089 // name for object (lock). 
See more details in kmp_itt.h. 1090 #endif /* USE_ITT_BUILD */ 1091 1092 // Use a cmpxchg instruction to slam the start of the critical section with 1093 // the lock pointer. If another thread beat us to it, deallocate the lock, 1094 // and use the lock that the other thread allocated. 1095 int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck); 1096 1097 if (status == 0) { 1098 // Deallocate the lock and reload the value. 1099 #if USE_ITT_BUILD 1100 __kmp_itt_critical_destroyed(lck); 1101 // Let ITT know the lock is destroyed and the same memory location may be reused 1102 // for another purpose. 1103 #endif /* USE_ITT_BUILD */ 1104 __kmp_destroy_user_lock_with_checks(lck); 1105 __kmp_user_lock_free(&idx, gtid, lck); 1106 lck = (kmp_user_lock_p)TCR_PTR(*lck_pp); 1107 KMP_DEBUG_ASSERT(lck != NULL); 1108 } 1109 } 1110 return lck; 1111 } 1112 1113 #endif // KMP_USE_DYNAMIC_LOCK 1114 1115 /*! 1116 @ingroup WORK_SHARING 1117 @param loc source location information. 1118 @param global_tid global thread number . 1119 @param crit identity of the critical section. This could be a pointer to a lock 1120 associated with the critical section, or some other suitably unique value. 1121 1122 Enter code protected by a `critical` construct. 1123 This function blocks until the executing thread can enter the critical section. 
1124 */ 1125 void __kmpc_critical(ident_t *loc, kmp_int32 global_tid, 1126 kmp_critical_name *crit) { 1127 #if KMP_USE_DYNAMIC_LOCK 1128 #if OMPT_SUPPORT && OMPT_OPTIONAL 1129 OMPT_STORE_RETURN_ADDRESS(global_tid); 1130 #endif // OMPT_SUPPORT 1131 __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none); 1132 #else 1133 KMP_COUNT_BLOCK(OMP_CRITICAL); 1134 #if OMPT_SUPPORT && OMPT_OPTIONAL 1135 omp_state_t prev_state = omp_state_undefined; 1136 ompt_thread_info_t ti; 1137 #endif 1138 kmp_user_lock_p lck; 1139 1140 KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid)); 1141 1142 // TODO: add THR_OVHD_STATE 1143 1144 KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait); 1145 KMP_CHECK_USER_LOCK_INIT(); 1146 1147 if ((__kmp_user_lock_kind == lk_tas) && 1148 (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) { 1149 lck = (kmp_user_lock_p)crit; 1150 } 1151 #if KMP_USE_FUTEX 1152 else if ((__kmp_user_lock_kind == lk_futex) && 1153 (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) { 1154 lck = (kmp_user_lock_p)crit; 1155 } 1156 #endif 1157 else { // ticket, queuing or drdpa 1158 lck = __kmp_get_critical_section_ptr(crit, loc, global_tid); 1159 } 1160 1161 if (__kmp_env_consistency_check) 1162 __kmp_push_sync(global_tid, ct_critical, loc, lck); 1163 1164 // since the critical directive binds to all threads, not just the current 1165 // team we have to check this even if we are in a serialized team. 1166 // also, even if we are the uber thread, we still have to conduct the lock, 1167 // as we have to contend with sibling threads. 
1168 1169 #if USE_ITT_BUILD 1170 __kmp_itt_critical_acquiring(lck); 1171 #endif /* USE_ITT_BUILD */ 1172 #if OMPT_SUPPORT && OMPT_OPTIONAL 1173 OMPT_STORE_RETURN_ADDRESS(gtid); 1174 void *codeptr_ra = NULL; 1175 if (ompt_enabled.enabled) { 1176 ti = __kmp_threads[global_tid]->th.ompt_thread_info; 1177 /* OMPT state update */ 1178 prev_state = ti.state; 1179 ti.wait_id = (omp_wait_id_t)lck; 1180 ti.state = omp_state_wait_critical; 1181 1182 /* OMPT event callback */ 1183 codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid); 1184 if (ompt_enabled.ompt_callback_mutex_acquire) { 1185 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 1186 ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(), 1187 (omp_wait_id_t)crit, codeptr_ra); 1188 } 1189 } 1190 #endif 1191 // Value of 'crit' should be good for using as a critical_id of the critical 1192 // section directive. 1193 __kmp_acquire_user_lock_with_checks(lck, global_tid); 1194 1195 #if USE_ITT_BUILD 1196 __kmp_itt_critical_acquired(lck); 1197 #endif /* USE_ITT_BUILD */ 1198 #if OMPT_SUPPORT && OMPT_OPTIONAL 1199 if (ompt_enabled.enabled) { 1200 /* OMPT state update */ 1201 ti.state = prev_state; 1202 ti.wait_id = 0; 1203 1204 /* OMPT event callback */ 1205 if (ompt_enabled.ompt_callback_mutex_acquired) { 1206 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 1207 ompt_mutex_critical, (omp_wait_id_t)crit, codeptr_ra); 1208 } 1209 } 1210 #endif 1211 KMP_POP_PARTITIONED_TIMER(); 1212 1213 KMP_PUSH_PARTITIONED_TIMER(OMP_critical); 1214 KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid)); 1215 #endif // KMP_USE_DYNAMIC_LOCK 1216 } 1217 1218 #if KMP_USE_DYNAMIC_LOCK 1219 1220 // Converts the given hint to an internal lock implementation 1221 static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) { 1222 #if KMP_USE_TSX 1223 #define KMP_TSX_LOCK(seq) lockseq_##seq 1224 #else 1225 #define KMP_TSX_LOCK(seq) __kmp_user_lock_seq 1226 #endif 1227 1228 #if KMP_ARCH_X86 || 
KMP_ARCH_X86_64 1229 #define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm) 1230 #else 1231 #define KMP_CPUINFO_RTM 0 1232 #endif 1233 1234 // Hints that do not require further logic 1235 if (hint & kmp_lock_hint_hle) 1236 return KMP_TSX_LOCK(hle); 1237 if (hint & kmp_lock_hint_rtm) 1238 return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm) : __kmp_user_lock_seq; 1239 if (hint & kmp_lock_hint_adaptive) 1240 return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq; 1241 1242 // Rule out conflicting hints first by returning the default lock 1243 if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended)) 1244 return __kmp_user_lock_seq; 1245 if ((hint & omp_lock_hint_speculative) && 1246 (hint & omp_lock_hint_nonspeculative)) 1247 return __kmp_user_lock_seq; 1248 1249 // Do not even consider speculation when it appears to be contended 1250 if (hint & omp_lock_hint_contended) 1251 return lockseq_queuing; 1252 1253 // Uncontended lock without speculation 1254 if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative)) 1255 return lockseq_tas; 1256 1257 // HLE lock for speculation 1258 if (hint & omp_lock_hint_speculative) 1259 return KMP_TSX_LOCK(hle); 1260 1261 return __kmp_user_lock_seq; 1262 } 1263 1264 #if OMPT_SUPPORT && OMPT_OPTIONAL 1265 static kmp_mutex_impl_t 1266 __ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) { 1267 if (user_lock) { 1268 switch (KMP_EXTRACT_D_TAG(user_lock)) { 1269 case 0: 1270 break; 1271 #if KMP_USE_FUTEX 1272 case locktag_futex: 1273 return kmp_mutex_impl_queuing; 1274 #endif 1275 case locktag_tas: 1276 return kmp_mutex_impl_spin; 1277 #if KMP_USE_TSX 1278 case locktag_hle: 1279 return kmp_mutex_impl_speculative; 1280 #endif 1281 default: 1282 return ompt_mutex_impl_unknown; 1283 } 1284 ilock = KMP_LOOKUP_I_LOCK(user_lock); 1285 } 1286 KMP_ASSERT(ilock); 1287 switch (ilock->type) { 1288 #if KMP_USE_TSX 1289 case locktag_adaptive: 1290 case locktag_rtm: 1291 return 
kmp_mutex_impl_speculative; 1292 #endif 1293 case locktag_nested_tas: 1294 return kmp_mutex_impl_spin; 1295 #if KMP_USE_FUTEX 1296 case locktag_nested_futex: 1297 #endif 1298 case locktag_ticket: 1299 case locktag_queuing: 1300 case locktag_drdpa: 1301 case locktag_nested_ticket: 1302 case locktag_nested_queuing: 1303 case locktag_nested_drdpa: 1304 return kmp_mutex_impl_queuing; 1305 default: 1306 return ompt_mutex_impl_unknown; 1307 } 1308 } 1309 1310 // For locks without dynamic binding 1311 static kmp_mutex_impl_t __ompt_get_mutex_impl_type() { 1312 switch (__kmp_user_lock_kind) { 1313 case lk_tas: 1314 return kmp_mutex_impl_spin; 1315 #if KMP_USE_FUTEX 1316 case lk_futex: 1317 #endif 1318 case lk_ticket: 1319 case lk_queuing: 1320 case lk_drdpa: 1321 return kmp_mutex_impl_queuing; 1322 #if KMP_USE_TSX 1323 case lk_hle: 1324 case lk_rtm: 1325 case lk_adaptive: 1326 return kmp_mutex_impl_speculative; 1327 #endif 1328 default: 1329 return ompt_mutex_impl_unknown; 1330 } 1331 } 1332 #endif 1333 1334 /*! 1335 @ingroup WORK_SHARING 1336 @param loc source location information. 1337 @param global_tid global thread number. 1338 @param crit identity of the critical section. This could be a pointer to a lock 1339 associated with the critical section, or some other suitably unique value. 1340 @param hint the lock hint. 1341 1342 Enter code protected by a `critical` construct with a hint. The hint value is 1343 used to suggest a lock implementation. This function blocks until the executing 1344 thread can enter the critical section unless the hint suggests use of 1345 speculative execution and the hardware supports it. 
1346 */ 1347 void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid, 1348 kmp_critical_name *crit, uintptr_t hint) { 1349 KMP_COUNT_BLOCK(OMP_CRITICAL); 1350 kmp_user_lock_p lck; 1351 #if OMPT_SUPPORT && OMPT_OPTIONAL 1352 omp_state_t prev_state = omp_state_undefined; 1353 ompt_thread_info_t ti; 1354 // This is the case, if called from __kmpc_critical: 1355 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid); 1356 if (!codeptr) 1357 codeptr = OMPT_GET_RETURN_ADDRESS(0); 1358 #endif 1359 1360 KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid)); 1361 1362 kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit; 1363 // Check if it is initialized. 1364 KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait); 1365 if (*lk == 0) { 1366 kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint); 1367 if (KMP_IS_D_LOCK(lckseq)) { 1368 KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0, 1369 KMP_GET_D_TAG(lckseq)); 1370 } else { 1371 __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq)); 1372 } 1373 } 1374 // Branch for accessing the actual lock object and set operation. This 1375 // branching is inevitable since this lock initialization does not follow the 1376 // normal dispatch path (lock table is not used). 
1377 if (KMP_EXTRACT_D_TAG(lk) != 0) { 1378 lck = (kmp_user_lock_p)lk; 1379 if (__kmp_env_consistency_check) { 1380 __kmp_push_sync(global_tid, ct_critical, loc, lck, 1381 __kmp_map_hint_to_lock(hint)); 1382 } 1383 #if USE_ITT_BUILD 1384 __kmp_itt_critical_acquiring(lck); 1385 #endif 1386 #if OMPT_SUPPORT && OMPT_OPTIONAL 1387 if (ompt_enabled.enabled) { 1388 ti = __kmp_threads[global_tid]->th.ompt_thread_info; 1389 /* OMPT state update */ 1390 prev_state = ti.state; 1391 ti.wait_id = (omp_wait_id_t)lck; 1392 ti.state = omp_state_wait_critical; 1393 1394 /* OMPT event callback */ 1395 if (ompt_enabled.ompt_callback_mutex_acquire) { 1396 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 1397 ompt_mutex_critical, (unsigned int)hint, 1398 __ompt_get_mutex_impl_type(crit), (omp_wait_id_t)crit, codeptr); 1399 } 1400 } 1401 #endif 1402 #if KMP_USE_INLINED_TAS 1403 if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) { 1404 KMP_ACQUIRE_TAS_LOCK(lck, global_tid); 1405 } else 1406 #elif KMP_USE_INLINED_FUTEX 1407 if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) { 1408 KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid); 1409 } else 1410 #endif 1411 { 1412 KMP_D_LOCK_FUNC(lk, set)(lk, global_tid); 1413 } 1414 } else { 1415 kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk); 1416 lck = ilk->lock; 1417 if (__kmp_env_consistency_check) { 1418 __kmp_push_sync(global_tid, ct_critical, loc, lck, 1419 __kmp_map_hint_to_lock(hint)); 1420 } 1421 #if USE_ITT_BUILD 1422 __kmp_itt_critical_acquiring(lck); 1423 #endif 1424 #if OMPT_SUPPORT && OMPT_OPTIONAL 1425 if (ompt_enabled.enabled) { 1426 ti = __kmp_threads[global_tid]->th.ompt_thread_info; 1427 /* OMPT state update */ 1428 prev_state = ti.state; 1429 ti.wait_id = (omp_wait_id_t)lck; 1430 ti.state = omp_state_wait_critical; 1431 1432 /* OMPT event callback */ 1433 if (ompt_enabled.ompt_callback_mutex_acquire) { 1434 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 1435 
ompt_mutex_critical, (unsigned int)hint, 1436 __ompt_get_mutex_impl_type(0, ilk), (omp_wait_id_t)crit, codeptr); 1437 } 1438 } 1439 #endif 1440 KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid); 1441 } 1442 KMP_POP_PARTITIONED_TIMER(); 1443 1444 #if USE_ITT_BUILD 1445 __kmp_itt_critical_acquired(lck); 1446 #endif /* USE_ITT_BUILD */ 1447 #if OMPT_SUPPORT && OMPT_OPTIONAL 1448 if (ompt_enabled.enabled) { 1449 /* OMPT state update */ 1450 ti.state = prev_state; 1451 ti.wait_id = 0; 1452 1453 /* OMPT event callback */ 1454 if (ompt_enabled.ompt_callback_mutex_acquired) { 1455 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 1456 ompt_mutex_critical, (omp_wait_id_t)crit, codeptr); 1457 } 1458 } 1459 #endif 1460 1461 KMP_PUSH_PARTITIONED_TIMER(OMP_critical); 1462 KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid)); 1463 } // __kmpc_critical_with_hint 1464 1465 #endif // KMP_USE_DYNAMIC_LOCK 1466 1467 /*! 1468 @ingroup WORK_SHARING 1469 @param loc source location information. 1470 @param global_tid global thread number . 1471 @param crit identity of the critical section. This could be a pointer to a lock 1472 associated with the critical section, or some other suitably unique value. 1473 1474 Leave a critical section, releasing any lock that was held during its execution. 
1475 */ 1476 void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid, 1477 kmp_critical_name *crit) { 1478 kmp_user_lock_p lck; 1479 1480 KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid)); 1481 1482 #if KMP_USE_DYNAMIC_LOCK 1483 if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) { 1484 lck = (kmp_user_lock_p)crit; 1485 KMP_ASSERT(lck != NULL); 1486 if (__kmp_env_consistency_check) { 1487 __kmp_pop_sync(global_tid, ct_critical, loc); 1488 } 1489 #if USE_ITT_BUILD 1490 __kmp_itt_critical_releasing(lck); 1491 #endif 1492 #if KMP_USE_INLINED_TAS 1493 if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) { 1494 KMP_RELEASE_TAS_LOCK(lck, global_tid); 1495 } else 1496 #elif KMP_USE_INLINED_FUTEX 1497 if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) { 1498 KMP_RELEASE_FUTEX_LOCK(lck, global_tid); 1499 } else 1500 #endif 1501 { 1502 KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid); 1503 } 1504 } else { 1505 kmp_indirect_lock_t *ilk = 1506 (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit)); 1507 KMP_ASSERT(ilk != NULL); 1508 lck = ilk->lock; 1509 if (__kmp_env_consistency_check) { 1510 __kmp_pop_sync(global_tid, ct_critical, loc); 1511 } 1512 #if USE_ITT_BUILD 1513 __kmp_itt_critical_releasing(lck); 1514 #endif 1515 KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid); 1516 } 1517 1518 #else // KMP_USE_DYNAMIC_LOCK 1519 1520 if ((__kmp_user_lock_kind == lk_tas) && 1521 (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) { 1522 lck = (kmp_user_lock_p)crit; 1523 } 1524 #if KMP_USE_FUTEX 1525 else if ((__kmp_user_lock_kind == lk_futex) && 1526 (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) { 1527 lck = (kmp_user_lock_p)crit; 1528 } 1529 #endif 1530 else { // ticket, queuing or drdpa 1531 lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit)); 1532 } 1533 1534 KMP_ASSERT(lck != NULL); 1535 1536 if (__kmp_env_consistency_check) 1537 __kmp_pop_sync(global_tid, ct_critical, loc); 1538 1539 #if USE_ITT_BUILD 
1540 __kmp_itt_critical_releasing(lck); 1541 #endif /* USE_ITT_BUILD */ 1542 // Value of 'crit' should be good for using as a critical_id of the critical 1543 // section directive. 1544 __kmp_release_user_lock_with_checks(lck, global_tid); 1545 1546 #endif // KMP_USE_DYNAMIC_LOCK 1547 1548 #if OMPT_SUPPORT && OMPT_OPTIONAL 1549 /* OMPT release event triggers after lock is released; place here to trigger 1550 * for all #if branches */ 1551 OMPT_STORE_RETURN_ADDRESS(global_tid); 1552 if (ompt_enabled.ompt_callback_mutex_released) { 1553 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( 1554 ompt_mutex_critical, (omp_wait_id_t)crit, OMPT_LOAD_RETURN_ADDRESS(0)); 1555 } 1556 #endif 1557 1558 KMP_POP_PARTITIONED_TIMER(); 1559 KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid)); 1560 } 1561 1562 /*! 1563 @ingroup SYNCHRONIZATION 1564 @param loc source location information 1565 @param global_tid thread id. 1566 @return one if the thread should execute the master block, zero otherwise 1567 1568 Start execution of a combined barrier and master. The barrier is executed inside 1569 this function. 
1570 */ 1571 kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) { 1572 int status; 1573 1574 KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid)); 1575 1576 if (!TCR_4(__kmp_init_parallel)) 1577 __kmp_parallel_initialize(); 1578 1579 if (__kmp_env_consistency_check) 1580 __kmp_check_barrier(global_tid, ct_barrier, loc); 1581 1582 #if OMPT_SUPPORT 1583 omp_frame_t *ompt_frame; 1584 if (ompt_enabled.enabled) { 1585 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); 1586 if (ompt_frame->enter_frame == NULL) 1587 ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1); 1588 OMPT_STORE_RETURN_ADDRESS(global_tid); 1589 } 1590 #endif 1591 #if USE_ITT_NOTIFY 1592 __kmp_threads[global_tid]->th.th_ident = loc; 1593 #endif 1594 status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL); 1595 #if OMPT_SUPPORT && OMPT_OPTIONAL 1596 if (ompt_enabled.enabled) { 1597 ompt_frame->enter_frame = NULL; 1598 } 1599 #endif 1600 1601 return (status != 0) ? 0 : 1; 1602 } 1603 1604 /*! 1605 @ingroup SYNCHRONIZATION 1606 @param loc source location information 1607 @param global_tid thread id. 1608 1609 Complete the execution of a combined barrier and master. This function should 1610 only be called at the completion of the <tt>master</tt> code. Other threads will 1611 still be waiting at the barrier and this call releases them. 1612 */ 1613 void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) { 1614 KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid)); 1615 1616 __kmp_end_split_barrier(bs_plain_barrier, global_tid); 1617 } 1618 1619 /*! 1620 @ingroup SYNCHRONIZATION 1621 @param loc source location information 1622 @param global_tid thread id. 1623 @return one if the thread should execute the master block, zero otherwise 1624 1625 Start execution of a combined barrier and master(nowait) construct. 1626 The barrier is executed inside this function. 
1627 There is no equivalent "end" function, since the 1628 */ 1629 kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) { 1630 kmp_int32 ret; 1631 1632 KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid)); 1633 1634 if (!TCR_4(__kmp_init_parallel)) 1635 __kmp_parallel_initialize(); 1636 1637 if (__kmp_env_consistency_check) { 1638 if (loc == 0) { 1639 KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user? 1640 } 1641 __kmp_check_barrier(global_tid, ct_barrier, loc); 1642 } 1643 1644 #if OMPT_SUPPORT 1645 omp_frame_t *ompt_frame; 1646 if (ompt_enabled.enabled) { 1647 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); 1648 if (ompt_frame->enter_frame == NULL) 1649 ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1); 1650 OMPT_STORE_RETURN_ADDRESS(global_tid); 1651 } 1652 #endif 1653 #if USE_ITT_NOTIFY 1654 __kmp_threads[global_tid]->th.th_ident = loc; 1655 #endif 1656 __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL); 1657 #if OMPT_SUPPORT && OMPT_OPTIONAL 1658 if (ompt_enabled.enabled) { 1659 ompt_frame->enter_frame = NULL; 1660 } 1661 #endif 1662 1663 ret = __kmpc_master(loc, global_tid); 1664 1665 if (__kmp_env_consistency_check) { 1666 /* there's no __kmpc_end_master called; so the (stats) */ 1667 /* actions of __kmpc_end_master are done here */ 1668 1669 if (global_tid < 0) { 1670 KMP_WARNING(ThreadIdentInvalid); 1671 } 1672 if (ret) { 1673 /* only one thread should do the pop since only */ 1674 /* one did the push (see __kmpc_master()) */ 1675 1676 __kmp_pop_sync(global_tid, ct_master, loc); 1677 } 1678 } 1679 1680 return (ret); 1681 } 1682 1683 /* The BARRIER for a SINGLE process section is always explicit */ 1684 /*! 1685 @ingroup WORK_SHARING 1686 @param loc source location information 1687 @param global_tid global thread number 1688 @return One if this thread should execute the single construct, zero otherwise. 
1689 1690 Test whether to execute a <tt>single</tt> construct. 1691 There are no implicit barriers in the two "single" calls, rather the compiler 1692 should introduce an explicit barrier if it is required. 1693 */ 1694 1695 kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) { 1696 kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE); 1697 1698 if (rc) { 1699 // We are going to execute the single statement, so we should count it. 1700 KMP_COUNT_BLOCK(OMP_SINGLE); 1701 KMP_PUSH_PARTITIONED_TIMER(OMP_single); 1702 } 1703 1704 #if OMPT_SUPPORT && OMPT_OPTIONAL 1705 kmp_info_t *this_thr = __kmp_threads[global_tid]; 1706 kmp_team_t *team = this_thr->th.th_team; 1707 int tid = __kmp_tid_from_gtid(global_tid); 1708 1709 if (ompt_enabled.enabled) { 1710 if (rc) { 1711 if (ompt_enabled.ompt_callback_work) { 1712 ompt_callbacks.ompt_callback(ompt_callback_work)( 1713 ompt_work_single_executor, ompt_scope_begin, 1714 &(team->t.ompt_team_info.parallel_data), 1715 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1716 1, OMPT_GET_RETURN_ADDRESS(0)); 1717 } 1718 } else { 1719 if (ompt_enabled.ompt_callback_work) { 1720 ompt_callbacks.ompt_callback(ompt_callback_work)( 1721 ompt_work_single_other, ompt_scope_begin, 1722 &(team->t.ompt_team_info.parallel_data), 1723 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1724 1, OMPT_GET_RETURN_ADDRESS(0)); 1725 ompt_callbacks.ompt_callback(ompt_callback_work)( 1726 ompt_work_single_other, ompt_scope_end, 1727 &(team->t.ompt_team_info.parallel_data), 1728 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1729 1, OMPT_GET_RETURN_ADDRESS(0)); 1730 } 1731 } 1732 } 1733 #endif 1734 1735 return rc; 1736 } 1737 1738 /*! 1739 @ingroup WORK_SHARING 1740 @param loc source location information 1741 @param global_tid global thread number 1742 1743 Mark the end of a <tt>single</tt> construct. 
This function should 1744 only be called by the thread that executed the block of code protected 1745 by the `single` construct. 1746 */ 1747 void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) { 1748 __kmp_exit_single(global_tid); 1749 KMP_POP_PARTITIONED_TIMER(); 1750 1751 #if OMPT_SUPPORT && OMPT_OPTIONAL 1752 kmp_info_t *this_thr = __kmp_threads[global_tid]; 1753 kmp_team_t *team = this_thr->th.th_team; 1754 int tid = __kmp_tid_from_gtid(global_tid); 1755 1756 if (ompt_enabled.ompt_callback_work) { 1757 ompt_callbacks.ompt_callback(ompt_callback_work)( 1758 ompt_work_single_executor, ompt_scope_end, 1759 &(team->t.ompt_team_info.parallel_data), 1760 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1, 1761 OMPT_GET_RETURN_ADDRESS(0)); 1762 } 1763 #endif 1764 } 1765 1766 /*! 1767 @ingroup WORK_SHARING 1768 @param loc Source location 1769 @param global_tid Global thread id 1770 1771 Mark the end of a statically scheduled loop. 1772 */ 1773 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) { 1774 KMP_POP_PARTITIONED_TIMER(); 1775 KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid)); 1776 1777 #if OMPT_SUPPORT && OMPT_OPTIONAL 1778 if (ompt_enabled.ompt_callback_work) { 1779 ompt_work_type_t ompt_work_type = ompt_work_loop; 1780 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); 1781 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 1782 // Determine workshare type 1783 if (loc != NULL) { 1784 if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) { 1785 ompt_work_type = ompt_work_loop; 1786 } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) { 1787 ompt_work_type = ompt_work_sections; 1788 } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) { 1789 ompt_work_type = ompt_work_distribute; 1790 } else { 1791 // use default set above. 
1792 // a warning about this case is provided in __kmpc_for_static_init 1793 } 1794 KMP_DEBUG_ASSERT(ompt_work_type); 1795 } 1796 ompt_callbacks.ompt_callback(ompt_callback_work)( 1797 ompt_work_type, ompt_scope_end, &(team_info->parallel_data), 1798 &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0)); 1799 } 1800 #endif 1801 if (__kmp_env_consistency_check) 1802 __kmp_pop_workshare(global_tid, ct_pdo, loc); 1803 } 1804 1805 // User routines which take C-style arguments (call by value) 1806 // different from the Fortran equivalent routines 1807 1808 void ompc_set_num_threads(int arg) { 1809 // !!!!! TODO: check the per-task binding 1810 __kmp_set_num_threads(arg, __kmp_entry_gtid()); 1811 } 1812 1813 void ompc_set_dynamic(int flag) { 1814 kmp_info_t *thread; 1815 1816 /* For the thread-private implementation of the internal controls */ 1817 thread = __kmp_entry_thread(); 1818 1819 __kmp_save_internal_controls(thread); 1820 1821 set__dynamic(thread, flag ? TRUE : FALSE); 1822 } 1823 1824 void ompc_set_nested(int flag) { 1825 kmp_info_t *thread; 1826 1827 /* For the thread-private internal controls implementation */ 1828 thread = __kmp_entry_thread(); 1829 1830 __kmp_save_internal_controls(thread); 1831 1832 set__nested(thread, flag ? TRUE : FALSE); 1833 } 1834 1835 void ompc_set_max_active_levels(int max_active_levels) { 1836 /* TO DO */ 1837 /* we want per-task implementation of this internal control */ 1838 1839 /* For the per-thread internal controls implementation */ 1840 __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels); 1841 } 1842 1843 void ompc_set_schedule(omp_sched_t kind, int modifier) { 1844 // !!!!! 
TODO: check the per-task binding 1845 __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier); 1846 } 1847 1848 int ompc_get_ancestor_thread_num(int level) { 1849 return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level); 1850 } 1851 1852 int ompc_get_team_size(int level) { 1853 return __kmp_get_team_size(__kmp_entry_gtid(), level); 1854 } 1855 1856 void kmpc_set_stacksize(int arg) { 1857 // __kmp_aux_set_stacksize initializes the library if needed 1858 __kmp_aux_set_stacksize(arg); 1859 } 1860 1861 void kmpc_set_stacksize_s(size_t arg) { 1862 // __kmp_aux_set_stacksize initializes the library if needed 1863 __kmp_aux_set_stacksize(arg); 1864 } 1865 1866 void kmpc_set_blocktime(int arg) { 1867 int gtid, tid; 1868 kmp_info_t *thread; 1869 1870 gtid = __kmp_entry_gtid(); 1871 tid = __kmp_tid_from_gtid(gtid); 1872 thread = __kmp_thread_from_gtid(gtid); 1873 1874 __kmp_aux_set_blocktime(arg, thread, tid); 1875 } 1876 1877 void kmpc_set_library(int arg) { 1878 // __kmp_user_set_library initializes the library if needed 1879 __kmp_user_set_library((enum library_type)arg); 1880 } 1881 1882 void kmpc_set_defaults(char const *str) { 1883 // __kmp_aux_set_defaults initializes the library if needed 1884 __kmp_aux_set_defaults(str, KMP_STRLEN(str)); 1885 } 1886 1887 void kmpc_set_disp_num_buffers(int arg) { 1888 // ignore after initialization because some teams have already 1889 // allocated dispatch buffers 1890 if (__kmp_init_serial == 0 && arg > 0) 1891 __kmp_dispatch_num_buffers = arg; 1892 } 1893 1894 int kmpc_set_affinity_mask_proc(int proc, void **mask) { 1895 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED 1896 return -1; 1897 #else 1898 if (!TCR_4(__kmp_init_middle)) { 1899 __kmp_middle_initialize(); 1900 } 1901 return __kmp_aux_set_affinity_mask_proc(proc, mask); 1902 #endif 1903 } 1904 1905 int kmpc_unset_affinity_mask_proc(int proc, void **mask) { 1906 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED 1907 return -1; 1908 #else 1909 if 
(!TCR_4(__kmp_init_middle)) { 1910 __kmp_middle_initialize(); 1911 } 1912 return __kmp_aux_unset_affinity_mask_proc(proc, mask); 1913 #endif 1914 } 1915 1916 int kmpc_get_affinity_mask_proc(int proc, void **mask) { 1917 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED 1918 return -1; 1919 #else 1920 if (!TCR_4(__kmp_init_middle)) { 1921 __kmp_middle_initialize(); 1922 } 1923 return __kmp_aux_get_affinity_mask_proc(proc, mask); 1924 #endif 1925 } 1926 1927 /* -------------------------------------------------------------------------- */ 1928 /*! 1929 @ingroup THREADPRIVATE 1930 @param loc source location information 1931 @param gtid global thread number 1932 @param cpy_size size of the cpy_data buffer 1933 @param cpy_data pointer to data to be copied 1934 @param cpy_func helper function to call for copying data 1935 @param didit flag variable: 1=single thread; 0=not single thread 1936 1937 __kmpc_copyprivate implements the interface for the private data broadcast 1938 needed for the copyprivate clause associated with a single region in an 1939 OpenMP<sup>*</sup> program (both C and Fortran). 1940 All threads participating in the parallel region call this routine. 1941 One of the threads (called the single thread) should have the <tt>didit</tt> 1942 variable set to 1 and all other threads should have that variable set to 0. 1943 All threads pass a pointer to a data buffer (cpy_data) that they have built. 1944 1945 The OpenMP specification forbids the use of nowait on the single region when a 1946 copyprivate clause is present. However, @ref __kmpc_copyprivate implements a 1947 barrier internally to avoid race conditions, so the code generation for the 1948 single region should avoid generating a barrier after the call to @ref 1949 __kmpc_copyprivate. 1950 1951 The <tt>gtid</tt> parameter is the global thread id for the current thread. 1952 The <tt>loc</tt> parameter is a pointer to source location information. 
1953 1954 Internal implementation: The single thread will first copy its descriptor 1955 address (cpy_data) to a team-private location, then the other threads will each 1956 call the function pointed to by the parameter cpy_func, which carries out the 1957 copy by copying the data using the cpy_data buffer. 1958 1959 The cpy_func routine used for the copy and the contents of the data area defined 1960 by cpy_data and cpy_size may be built in any fashion that will allow the copy 1961 to be done. For instance, the cpy_data buffer can hold the actual data to be 1962 copied or it may hold a list of pointers to the data. The cpy_func routine must 1963 interpret the cpy_data buffer appropriately. 1964 1965 The interface to cpy_func is as follows: 1966 @code 1967 void cpy_func( void *destination, void *source ) 1968 @endcode 1969 where void *destination is the cpy_data pointer for the thread being copied to 1970 and void *source is the cpy_data pointer for the thread being copied from. 1971 */ 1972 void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size, 1973 void *cpy_data, void (*cpy_func)(void *, void *), 1974 kmp_int32 didit) { 1975 void **data_ptr; 1976 1977 KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid)); 1978 1979 KMP_MB(); 1980 1981 data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data; 1982 1983 if (__kmp_env_consistency_check) { 1984 if (loc == 0) { 1985 KMP_WARNING(ConstructIdentInvalid); 1986 } 1987 } 1988 1989 // ToDo: Optimize the following two barriers into some kind of split barrier 1990 1991 if (didit) 1992 *data_ptr = cpy_data; 1993 1994 #if OMPT_SUPPORT 1995 omp_frame_t *ompt_frame; 1996 if (ompt_enabled.enabled) { 1997 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); 1998 if (ompt_frame->enter_frame == NULL) 1999 ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1); 2000 OMPT_STORE_RETURN_ADDRESS(gtid); 2001 } 2002 #endif 2003 /* This barrier is not a barrier region boundary */ 2004 #if USE_ITT_NOTIFY 
2005 __kmp_threads[gtid]->th.th_ident = loc; 2006 #endif 2007 __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); 2008 2009 if (!didit) 2010 (*cpy_func)(cpy_data, *data_ptr); 2011 2012 // Consider next barrier a user-visible barrier for barrier region boundaries 2013 // Nesting checks are already handled by the single construct checks 2014 2015 #if OMPT_SUPPORT 2016 if (ompt_enabled.enabled) { 2017 OMPT_STORE_RETURN_ADDRESS(gtid); 2018 } 2019 #endif 2020 #if USE_ITT_NOTIFY 2021 __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g. 2022 // tasks can overwrite the location) 2023 #endif 2024 __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); 2025 #if OMPT_SUPPORT && OMPT_OPTIONAL 2026 if (ompt_enabled.enabled) { 2027 ompt_frame->enter_frame = NULL; 2028 } 2029 #endif 2030 } 2031 2032 /* -------------------------------------------------------------------------- */ 2033 2034 #define INIT_LOCK __kmp_init_user_lock_with_checks 2035 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks 2036 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks 2037 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed 2038 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks 2039 #define ACQUIRE_NESTED_LOCK_TIMED \ 2040 __kmp_acquire_nested_user_lock_with_checks_timed 2041 #define RELEASE_LOCK __kmp_release_user_lock_with_checks 2042 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks 2043 #define TEST_LOCK __kmp_test_user_lock_with_checks 2044 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks 2045 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks 2046 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks 2047 2048 // TODO: Make check abort messages use location info & pass it into 2049 // with_checks routines 2050 2051 #if KMP_USE_DYNAMIC_LOCK 2052 2053 // internal lock initializer 2054 static __forceinline void 
__kmp_init_lock_with_hint(ident_t *loc, void **lock, 2055 kmp_dyna_lockseq_t seq) { 2056 if (KMP_IS_D_LOCK(seq)) { 2057 KMP_INIT_D_LOCK(lock, seq); 2058 #if USE_ITT_BUILD 2059 __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL); 2060 #endif 2061 } else { 2062 KMP_INIT_I_LOCK(lock, seq); 2063 #if USE_ITT_BUILD 2064 kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock); 2065 __kmp_itt_lock_creating(ilk->lock, loc); 2066 #endif 2067 } 2068 } 2069 2070 // internal nest lock initializer 2071 static __forceinline void 2072 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock, 2073 kmp_dyna_lockseq_t seq) { 2074 #if KMP_USE_TSX 2075 // Don't have nested lock implementation for speculative locks 2076 if (seq == lockseq_hle || seq == lockseq_rtm || seq == lockseq_adaptive) 2077 seq = __kmp_user_lock_seq; 2078 #endif 2079 switch (seq) { 2080 case lockseq_tas: 2081 seq = lockseq_nested_tas; 2082 break; 2083 #if KMP_USE_FUTEX 2084 case lockseq_futex: 2085 seq = lockseq_nested_futex; 2086 break; 2087 #endif 2088 case lockseq_ticket: 2089 seq = lockseq_nested_ticket; 2090 break; 2091 case lockseq_queuing: 2092 seq = lockseq_nested_queuing; 2093 break; 2094 case lockseq_drdpa: 2095 seq = lockseq_nested_drdpa; 2096 break; 2097 default: 2098 seq = lockseq_nested_queuing; 2099 } 2100 KMP_INIT_I_LOCK(lock, seq); 2101 #if USE_ITT_BUILD 2102 kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock); 2103 __kmp_itt_lock_creating(ilk->lock, loc); 2104 #endif 2105 } 2106 2107 /* initialize the lock with a hint */ 2108 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock, 2109 uintptr_t hint) { 2110 KMP_DEBUG_ASSERT(__kmp_init_serial); 2111 if (__kmp_env_consistency_check && user_lock == NULL) { 2112 KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint"); 2113 } 2114 2115 __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint)); 2116 2117 #if OMPT_SUPPORT && OMPT_OPTIONAL 2118 // This is the case, if called from omp_init_lock_with_hint: 2119 void 
*codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2120 if (!codeptr) 2121 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2122 if (ompt_enabled.ompt_callback_lock_init) { 2123 ompt_callbacks.ompt_callback(ompt_callback_lock_init)( 2124 ompt_mutex_lock, (omp_lock_hint_t)hint, 2125 __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock, 2126 codeptr); 2127 } 2128 #endif 2129 } 2130 2131 /* initialize the lock with a hint */ 2132 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid, 2133 void **user_lock, uintptr_t hint) { 2134 KMP_DEBUG_ASSERT(__kmp_init_serial); 2135 if (__kmp_env_consistency_check && user_lock == NULL) { 2136 KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint"); 2137 } 2138 2139 __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint)); 2140 2141 #if OMPT_SUPPORT && OMPT_OPTIONAL 2142 // This is the case, if called from omp_init_lock_with_hint: 2143 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2144 if (!codeptr) 2145 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2146 if (ompt_enabled.ompt_callback_lock_init) { 2147 ompt_callbacks.ompt_callback(ompt_callback_lock_init)( 2148 ompt_mutex_nest_lock, (omp_lock_hint_t)hint, 2149 __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock, 2150 codeptr); 2151 } 2152 #endif 2153 } 2154 2155 #endif // KMP_USE_DYNAMIC_LOCK 2156 2157 /* initialize the lock */ 2158 void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2159 #if KMP_USE_DYNAMIC_LOCK 2160 2161 KMP_DEBUG_ASSERT(__kmp_init_serial); 2162 if (__kmp_env_consistency_check && user_lock == NULL) { 2163 KMP_FATAL(LockIsUninitialized, "omp_init_lock"); 2164 } 2165 __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq); 2166 2167 #if OMPT_SUPPORT && OMPT_OPTIONAL 2168 // This is the case, if called from omp_init_lock_with_hint: 2169 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2170 if (!codeptr) 2171 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2172 if (ompt_enabled.ompt_callback_lock_init) { 2173 
ompt_callbacks.ompt_callback(ompt_callback_lock_init)( 2174 ompt_mutex_lock, omp_lock_hint_none, 2175 __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock, 2176 codeptr); 2177 } 2178 #endif 2179 2180 #else // KMP_USE_DYNAMIC_LOCK 2181 2182 static char const *const func = "omp_init_lock"; 2183 kmp_user_lock_p lck; 2184 KMP_DEBUG_ASSERT(__kmp_init_serial); 2185 2186 if (__kmp_env_consistency_check) { 2187 if (user_lock == NULL) { 2188 KMP_FATAL(LockIsUninitialized, func); 2189 } 2190 } 2191 2192 KMP_CHECK_USER_LOCK_INIT(); 2193 2194 if ((__kmp_user_lock_kind == lk_tas) && 2195 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { 2196 lck = (kmp_user_lock_p)user_lock; 2197 } 2198 #if KMP_USE_FUTEX 2199 else if ((__kmp_user_lock_kind == lk_futex) && 2200 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { 2201 lck = (kmp_user_lock_p)user_lock; 2202 } 2203 #endif 2204 else { 2205 lck = __kmp_user_lock_allocate(user_lock, gtid, 0); 2206 } 2207 INIT_LOCK(lck); 2208 __kmp_set_user_lock_location(lck, loc); 2209 2210 #if OMPT_SUPPORT && OMPT_OPTIONAL 2211 // This is the case, if called from omp_init_lock_with_hint: 2212 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2213 if (!codeptr) 2214 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2215 if (ompt_enabled.ompt_callback_lock_init) { 2216 ompt_callbacks.ompt_callback(ompt_callback_lock_init)( 2217 ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(), 2218 (omp_wait_id_t)user_lock, codeptr); 2219 } 2220 #endif 2221 2222 #if USE_ITT_BUILD 2223 __kmp_itt_lock_creating(lck); 2224 #endif /* USE_ITT_BUILD */ 2225 2226 #endif // KMP_USE_DYNAMIC_LOCK 2227 } // __kmpc_init_lock 2228 2229 /* initialize the lock */ 2230 void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2231 #if KMP_USE_DYNAMIC_LOCK 2232 2233 KMP_DEBUG_ASSERT(__kmp_init_serial); 2234 if (__kmp_env_consistency_check && user_lock == NULL) { 2235 KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock"); 2236 } 2237 
__kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq); 2238 2239 #if OMPT_SUPPORT && OMPT_OPTIONAL 2240 // This is the case, if called from omp_init_lock_with_hint: 2241 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2242 if (!codeptr) 2243 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2244 if (ompt_enabled.ompt_callback_lock_init) { 2245 ompt_callbacks.ompt_callback(ompt_callback_lock_init)( 2246 ompt_mutex_nest_lock, omp_lock_hint_none, 2247 __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock, 2248 codeptr); 2249 } 2250 #endif 2251 2252 #else // KMP_USE_DYNAMIC_LOCK 2253 2254 static char const *const func = "omp_init_nest_lock"; 2255 kmp_user_lock_p lck; 2256 KMP_DEBUG_ASSERT(__kmp_init_serial); 2257 2258 if (__kmp_env_consistency_check) { 2259 if (user_lock == NULL) { 2260 KMP_FATAL(LockIsUninitialized, func); 2261 } 2262 } 2263 2264 KMP_CHECK_USER_LOCK_INIT(); 2265 2266 if ((__kmp_user_lock_kind == lk_tas) && 2267 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= 2268 OMP_NEST_LOCK_T_SIZE)) { 2269 lck = (kmp_user_lock_p)user_lock; 2270 } 2271 #if KMP_USE_FUTEX 2272 else if ((__kmp_user_lock_kind == lk_futex) && 2273 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= 2274 OMP_NEST_LOCK_T_SIZE)) { 2275 lck = (kmp_user_lock_p)user_lock; 2276 } 2277 #endif 2278 else { 2279 lck = __kmp_user_lock_allocate(user_lock, gtid, 0); 2280 } 2281 2282 INIT_NESTED_LOCK(lck); 2283 __kmp_set_user_lock_location(lck, loc); 2284 2285 #if OMPT_SUPPORT && OMPT_OPTIONAL 2286 // This is the case, if called from omp_init_lock_with_hint: 2287 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2288 if (!codeptr) 2289 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2290 if (ompt_enabled.ompt_callback_lock_init) { 2291 ompt_callbacks.ompt_callback(ompt_callback_lock_init)( 2292 ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(), 2293 (omp_wait_id_t)user_lock, codeptr); 2294 } 2295 #endif 2296 2297 #if USE_ITT_BUILD 2298 
__kmp_itt_lock_creating(lck); 2299 #endif /* USE_ITT_BUILD */ 2300 2301 #endif // KMP_USE_DYNAMIC_LOCK 2302 } // __kmpc_init_nest_lock 2303 2304 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2305 #if KMP_USE_DYNAMIC_LOCK 2306 2307 #if USE_ITT_BUILD 2308 kmp_user_lock_p lck; 2309 if (KMP_EXTRACT_D_TAG(user_lock) == 0) { 2310 lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock; 2311 } else { 2312 lck = (kmp_user_lock_p)user_lock; 2313 } 2314 __kmp_itt_lock_destroyed(lck); 2315 #endif 2316 #if OMPT_SUPPORT && OMPT_OPTIONAL 2317 // This is the case, if called from omp_init_lock_with_hint: 2318 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2319 if (!codeptr) 2320 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2321 if (ompt_enabled.ompt_callback_lock_destroy) { 2322 kmp_user_lock_p lck; 2323 if (KMP_EXTRACT_D_TAG(user_lock) == 0) { 2324 lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock; 2325 } else { 2326 lck = (kmp_user_lock_p)user_lock; 2327 } 2328 ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)( 2329 ompt_mutex_lock, (omp_wait_id_t)user_lock, codeptr); 2330 } 2331 #endif 2332 KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock); 2333 #else 2334 kmp_user_lock_p lck; 2335 2336 if ((__kmp_user_lock_kind == lk_tas) && 2337 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { 2338 lck = (kmp_user_lock_p)user_lock; 2339 } 2340 #if KMP_USE_FUTEX 2341 else if ((__kmp_user_lock_kind == lk_futex) && 2342 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { 2343 lck = (kmp_user_lock_p)user_lock; 2344 } 2345 #endif 2346 else { 2347 lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock"); 2348 } 2349 2350 #if OMPT_SUPPORT && OMPT_OPTIONAL 2351 // This is the case, if called from omp_init_lock_with_hint: 2352 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2353 if (!codeptr) 2354 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2355 if (ompt_enabled.ompt_callback_lock_destroy) { 2356 
ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)( 2357 ompt_mutex_lock, (omp_wait_id_t)user_lock, codeptr); 2358 } 2359 #endif 2360 2361 #if USE_ITT_BUILD 2362 __kmp_itt_lock_destroyed(lck); 2363 #endif /* USE_ITT_BUILD */ 2364 DESTROY_LOCK(lck); 2365 2366 if ((__kmp_user_lock_kind == lk_tas) && 2367 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { 2368 ; 2369 } 2370 #if KMP_USE_FUTEX 2371 else if ((__kmp_user_lock_kind == lk_futex) && 2372 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { 2373 ; 2374 } 2375 #endif 2376 else { 2377 __kmp_user_lock_free(user_lock, gtid, lck); 2378 } 2379 #endif // KMP_USE_DYNAMIC_LOCK 2380 } // __kmpc_destroy_lock 2381 2382 /* destroy the lock */ 2383 void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2384 #if KMP_USE_DYNAMIC_LOCK 2385 2386 #if USE_ITT_BUILD 2387 kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock); 2388 __kmp_itt_lock_destroyed(ilk->lock); 2389 #endif 2390 #if OMPT_SUPPORT && OMPT_OPTIONAL 2391 // This is the case, if called from omp_init_lock_with_hint: 2392 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2393 if (!codeptr) 2394 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2395 if (ompt_enabled.ompt_callback_lock_destroy) { 2396 ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)( 2397 ompt_mutex_nest_lock, (omp_wait_id_t)user_lock, codeptr); 2398 } 2399 #endif 2400 KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock); 2401 2402 #else // KMP_USE_DYNAMIC_LOCK 2403 2404 kmp_user_lock_p lck; 2405 2406 if ((__kmp_user_lock_kind == lk_tas) && 2407 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= 2408 OMP_NEST_LOCK_T_SIZE)) { 2409 lck = (kmp_user_lock_p)user_lock; 2410 } 2411 #if KMP_USE_FUTEX 2412 else if ((__kmp_user_lock_kind == lk_futex) && 2413 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= 2414 OMP_NEST_LOCK_T_SIZE)) { 2415 lck = (kmp_user_lock_p)user_lock; 2416 } 2417 #endif 2418 else { 2419 lck = 
__kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock"); 2420 } 2421 2422 #if OMPT_SUPPORT && OMPT_OPTIONAL 2423 // This is the case, if called from omp_init_lock_with_hint: 2424 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2425 if (!codeptr) 2426 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2427 if (ompt_enabled.ompt_callback_lock_destroy) { 2428 ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)( 2429 ompt_mutex_nest_lock, (omp_wait_id_t)user_lock, codeptr); 2430 } 2431 #endif 2432 2433 #if USE_ITT_BUILD 2434 __kmp_itt_lock_destroyed(lck); 2435 #endif /* USE_ITT_BUILD */ 2436 2437 DESTROY_NESTED_LOCK(lck); 2438 2439 if ((__kmp_user_lock_kind == lk_tas) && 2440 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= 2441 OMP_NEST_LOCK_T_SIZE)) { 2442 ; 2443 } 2444 #if KMP_USE_FUTEX 2445 else if ((__kmp_user_lock_kind == lk_futex) && 2446 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= 2447 OMP_NEST_LOCK_T_SIZE)) { 2448 ; 2449 } 2450 #endif 2451 else { 2452 __kmp_user_lock_free(user_lock, gtid, lck); 2453 } 2454 #endif // KMP_USE_DYNAMIC_LOCK 2455 } // __kmpc_destroy_nest_lock 2456 2457 void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2458 KMP_COUNT_BLOCK(OMP_set_lock); 2459 #if KMP_USE_DYNAMIC_LOCK 2460 int tag = KMP_EXTRACT_D_TAG(user_lock); 2461 #if USE_ITT_BUILD 2462 __kmp_itt_lock_acquiring( 2463 (kmp_user_lock_p) 2464 user_lock); // itt function will get to the right lock object. 
2465 #endif 2466 #if OMPT_SUPPORT && OMPT_OPTIONAL 2467 // This is the case, if called from omp_init_lock_with_hint: 2468 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2469 if (!codeptr) 2470 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2471 if (ompt_enabled.ompt_callback_mutex_acquire) { 2472 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 2473 ompt_mutex_lock, omp_lock_hint_none, 2474 __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock, 2475 codeptr); 2476 } 2477 #endif 2478 #if KMP_USE_INLINED_TAS 2479 if (tag == locktag_tas && !__kmp_env_consistency_check) { 2480 KMP_ACQUIRE_TAS_LOCK(user_lock, gtid); 2481 } else 2482 #elif KMP_USE_INLINED_FUTEX 2483 if (tag == locktag_futex && !__kmp_env_consistency_check) { 2484 KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid); 2485 } else 2486 #endif 2487 { 2488 __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid); 2489 } 2490 #if USE_ITT_BUILD 2491 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock); 2492 #endif 2493 #if OMPT_SUPPORT && OMPT_OPTIONAL 2494 if (ompt_enabled.ompt_callback_mutex_acquired) { 2495 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 2496 ompt_mutex_lock, (omp_wait_id_t)user_lock, codeptr); 2497 } 2498 #endif 2499 2500 #else // KMP_USE_DYNAMIC_LOCK 2501 2502 kmp_user_lock_p lck; 2503 2504 if ((__kmp_user_lock_kind == lk_tas) && 2505 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { 2506 lck = (kmp_user_lock_p)user_lock; 2507 } 2508 #if KMP_USE_FUTEX 2509 else if ((__kmp_user_lock_kind == lk_futex) && 2510 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { 2511 lck = (kmp_user_lock_p)user_lock; 2512 } 2513 #endif 2514 else { 2515 lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock"); 2516 } 2517 2518 #if USE_ITT_BUILD 2519 __kmp_itt_lock_acquiring(lck); 2520 #endif /* USE_ITT_BUILD */ 2521 #if OMPT_SUPPORT && OMPT_OPTIONAL 2522 // This is the case, if called from omp_init_lock_with_hint: 2523 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2524 if (!codeptr) 2525 codeptr = 
OMPT_GET_RETURN_ADDRESS(0); 2526 if (ompt_enabled.ompt_callback_mutex_acquire) { 2527 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 2528 ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(), 2529 (omp_wait_id_t)lck, codeptr); 2530 } 2531 #endif 2532 2533 ACQUIRE_LOCK(lck, gtid); 2534 2535 #if USE_ITT_BUILD 2536 __kmp_itt_lock_acquired(lck); 2537 #endif /* USE_ITT_BUILD */ 2538 2539 #if OMPT_SUPPORT && OMPT_OPTIONAL 2540 if (ompt_enabled.ompt_callback_mutex_acquired) { 2541 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 2542 ompt_mutex_lock, (omp_wait_id_t)lck, codeptr); 2543 } 2544 #endif 2545 2546 #endif // KMP_USE_DYNAMIC_LOCK 2547 } 2548 2549 void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2550 #if KMP_USE_DYNAMIC_LOCK 2551 2552 #if USE_ITT_BUILD 2553 __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock); 2554 #endif 2555 #if OMPT_SUPPORT && OMPT_OPTIONAL 2556 // This is the case, if called from omp_init_lock_with_hint: 2557 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2558 if (!codeptr) 2559 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2560 if (ompt_enabled.enabled) { 2561 if (ompt_enabled.ompt_callback_mutex_acquire) { 2562 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 2563 ompt_mutex_nest_lock, omp_lock_hint_none, 2564 __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock, 2565 codeptr); 2566 } 2567 } 2568 #endif 2569 int acquire_status = 2570 KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid); 2571 #if USE_ITT_BUILD 2572 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock); 2573 #endif 2574 2575 #if OMPT_SUPPORT && OMPT_OPTIONAL 2576 if (ompt_enabled.enabled) { 2577 if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) { 2578 if (ompt_enabled.ompt_callback_mutex_acquired) { 2579 // lock_first 2580 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 2581 ompt_mutex_nest_lock, (omp_wait_id_t)user_lock, codeptr); 2582 } 2583 } else { 2584 if 
(ompt_enabled.ompt_callback_nest_lock) { 2585 // lock_next 2586 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( 2587 ompt_scope_begin, (omp_wait_id_t)user_lock, codeptr); 2588 } 2589 } 2590 } 2591 #endif 2592 2593 #else // KMP_USE_DYNAMIC_LOCK 2594 int acquire_status; 2595 kmp_user_lock_p lck; 2596 2597 if ((__kmp_user_lock_kind == lk_tas) && 2598 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= 2599 OMP_NEST_LOCK_T_SIZE)) { 2600 lck = (kmp_user_lock_p)user_lock; 2601 } 2602 #if KMP_USE_FUTEX 2603 else if ((__kmp_user_lock_kind == lk_futex) && 2604 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= 2605 OMP_NEST_LOCK_T_SIZE)) { 2606 lck = (kmp_user_lock_p)user_lock; 2607 } 2608 #endif 2609 else { 2610 lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock"); 2611 } 2612 2613 #if USE_ITT_BUILD 2614 __kmp_itt_lock_acquiring(lck); 2615 #endif /* USE_ITT_BUILD */ 2616 #if OMPT_SUPPORT && OMPT_OPTIONAL 2617 // This is the case, if called from omp_init_lock_with_hint: 2618 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2619 if (!codeptr) 2620 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2621 if (ompt_enabled.enabled) { 2622 if (ompt_enabled.ompt_callback_mutex_acquire) { 2623 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 2624 ompt_mutex_nest_lock, omp_lock_hint_none, 2625 __ompt_get_mutex_impl_type(), (omp_wait_id_t)lck, codeptr); 2626 } 2627 } 2628 #endif 2629 2630 ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status); 2631 2632 #if USE_ITT_BUILD 2633 __kmp_itt_lock_acquired(lck); 2634 #endif /* USE_ITT_BUILD */ 2635 2636 #if OMPT_SUPPORT && OMPT_OPTIONAL 2637 if (ompt_enabled.enabled) { 2638 if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) { 2639 if (ompt_enabled.ompt_callback_mutex_acquired) { 2640 // lock_first 2641 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 2642 ompt_mutex_nest_lock, (omp_wait_id_t)lck, codeptr); 2643 } 2644 } else { 2645 if (ompt_enabled.ompt_callback_nest_lock) { 2646 // lock_next 2647 
ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( 2648 ompt_scope_begin, (omp_wait_id_t)lck, codeptr); 2649 } 2650 } 2651 } 2652 #endif 2653 2654 #endif // KMP_USE_DYNAMIC_LOCK 2655 } 2656 2657 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2658 #if KMP_USE_DYNAMIC_LOCK 2659 2660 int tag = KMP_EXTRACT_D_TAG(user_lock); 2661 #if USE_ITT_BUILD 2662 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock); 2663 #endif 2664 #if KMP_USE_INLINED_TAS 2665 if (tag == locktag_tas && !__kmp_env_consistency_check) { 2666 KMP_RELEASE_TAS_LOCK(user_lock, gtid); 2667 } else 2668 #elif KMP_USE_INLINED_FUTEX 2669 if (tag == locktag_futex && !__kmp_env_consistency_check) { 2670 KMP_RELEASE_FUTEX_LOCK(user_lock, gtid); 2671 } else 2672 #endif 2673 { 2674 __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid); 2675 } 2676 2677 #if OMPT_SUPPORT && OMPT_OPTIONAL 2678 // This is the case, if called from omp_init_lock_with_hint: 2679 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2680 if (!codeptr) 2681 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2682 if (ompt_enabled.ompt_callback_mutex_released) { 2683 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( 2684 ompt_mutex_lock, (omp_wait_id_t)user_lock, codeptr); 2685 } 2686 #endif 2687 2688 #else // KMP_USE_DYNAMIC_LOCK 2689 2690 kmp_user_lock_p lck; 2691 2692 /* Can't use serial interval since not block structured */ 2693 /* release the lock */ 2694 2695 if ((__kmp_user_lock_kind == lk_tas) && 2696 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { 2697 #if KMP_OS_LINUX && \ 2698 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) 2699 // "fast" path implemented to fix customer performance issue 2700 #if USE_ITT_BUILD 2701 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock); 2702 #endif /* USE_ITT_BUILD */ 2703 TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0); 2704 KMP_MB(); 2705 2706 #if OMPT_SUPPORT && OMPT_OPTIONAL 2707 // This is the case, if called from omp_init_lock_with_hint: 
2708 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2709 if (!codeptr) 2710 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2711 if (ompt_enabled.ompt_callback_mutex_released) { 2712 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( 2713 ompt_mutex_lock, (omp_wait_id_t)lck, codeptr); 2714 } 2715 #endif 2716 2717 return; 2718 #else 2719 lck = (kmp_user_lock_p)user_lock; 2720 #endif 2721 } 2722 #if KMP_USE_FUTEX 2723 else if ((__kmp_user_lock_kind == lk_futex) && 2724 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { 2725 lck = (kmp_user_lock_p)user_lock; 2726 } 2727 #endif 2728 else { 2729 lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock"); 2730 } 2731 2732 #if USE_ITT_BUILD 2733 __kmp_itt_lock_releasing(lck); 2734 #endif /* USE_ITT_BUILD */ 2735 2736 RELEASE_LOCK(lck, gtid); 2737 2738 #if OMPT_SUPPORT && OMPT_OPTIONAL 2739 // This is the case, if called from omp_init_lock_with_hint: 2740 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2741 if (!codeptr) 2742 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2743 if (ompt_enabled.ompt_callback_mutex_released) { 2744 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( 2745 ompt_mutex_lock, (omp_wait_id_t)lck, codeptr); 2746 } 2747 #endif 2748 2749 #endif // KMP_USE_DYNAMIC_LOCK 2750 } 2751 2752 /* release the lock */ 2753 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2754 #if KMP_USE_DYNAMIC_LOCK 2755 2756 #if USE_ITT_BUILD 2757 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock); 2758 #endif 2759 int release_status = 2760 KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid); 2761 2762 #if OMPT_SUPPORT && OMPT_OPTIONAL 2763 // This is the case, if called from omp_init_lock_with_hint: 2764 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2765 if (!codeptr) 2766 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2767 if (ompt_enabled.enabled) { 2768 if (release_status == KMP_LOCK_RELEASED) { 2769 if (ompt_enabled.ompt_callback_mutex_released) { 2770 // release_lock_last 2771 
ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( 2772 ompt_mutex_nest_lock, (omp_wait_id_t)user_lock, codeptr); 2773 } 2774 } else if (ompt_enabled.ompt_callback_nest_lock) { 2775 // release_lock_prev 2776 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( 2777 ompt_scope_end, (omp_wait_id_t)user_lock, codeptr); 2778 } 2779 } 2780 #endif 2781 2782 #else // KMP_USE_DYNAMIC_LOCK 2783 2784 kmp_user_lock_p lck; 2785 2786 /* Can't use serial interval since not block structured */ 2787 2788 if ((__kmp_user_lock_kind == lk_tas) && 2789 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= 2790 OMP_NEST_LOCK_T_SIZE)) { 2791 #if KMP_OS_LINUX && \ 2792 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) 2793 // "fast" path implemented to fix customer performance issue 2794 kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock; 2795 #if USE_ITT_BUILD 2796 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock); 2797 #endif /* USE_ITT_BUILD */ 2798 2799 #if OMPT_SUPPORT && OMPT_OPTIONAL 2800 int release_status = KMP_LOCK_STILL_HELD; 2801 #endif 2802 2803 if (--(tl->lk.depth_locked) == 0) { 2804 TCW_4(tl->lk.poll, 0); 2805 #if OMPT_SUPPORT && OMPT_OPTIONAL 2806 release_status = KMP_LOCK_RELEASED; 2807 #endif 2808 } 2809 KMP_MB(); 2810 2811 #if OMPT_SUPPORT && OMPT_OPTIONAL 2812 // This is the case, if called from omp_init_lock_with_hint: 2813 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2814 if (!codeptr) 2815 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2816 if (ompt_enabled.enabled) { 2817 if (release_status == KMP_LOCK_RELEASED) { 2818 if (ompt_enabled.ompt_callback_mutex_released) { 2819 // release_lock_last 2820 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( 2821 ompt_mutex_nest_lock, (omp_wait_id_t)lck, codeptr); 2822 } 2823 } else if (ompt_enabled.ompt_callback_nest_lock) { 2824 // release_lock_previous 2825 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( 2826 ompt_mutex_scope_end, (omp_wait_id_t)lck, codeptr); 2827 } 
2828 } 2829 #endif 2830 2831 return; 2832 #else 2833 lck = (kmp_user_lock_p)user_lock; 2834 #endif 2835 } 2836 #if KMP_USE_FUTEX 2837 else if ((__kmp_user_lock_kind == lk_futex) && 2838 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= 2839 OMP_NEST_LOCK_T_SIZE)) { 2840 lck = (kmp_user_lock_p)user_lock; 2841 } 2842 #endif 2843 else { 2844 lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock"); 2845 } 2846 2847 #if USE_ITT_BUILD 2848 __kmp_itt_lock_releasing(lck); 2849 #endif /* USE_ITT_BUILD */ 2850 2851 int release_status; 2852 release_status = RELEASE_NESTED_LOCK(lck, gtid); 2853 #if OMPT_SUPPORT && OMPT_OPTIONAL 2854 // This is the case, if called from omp_init_lock_with_hint: 2855 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2856 if (!codeptr) 2857 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2858 if (ompt_enabled.enabled) { 2859 if (release_status == KMP_LOCK_RELEASED) { 2860 if (ompt_enabled.ompt_callback_mutex_released) { 2861 // release_lock_last 2862 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( 2863 ompt_mutex_nest_lock, (omp_wait_id_t)lck, codeptr); 2864 } 2865 } else if (ompt_enabled.ompt_callback_nest_lock) { 2866 // release_lock_previous 2867 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( 2868 ompt_mutex_scope_end, (omp_wait_id_t)lck, codeptr); 2869 } 2870 } 2871 #endif 2872 2873 #endif // KMP_USE_DYNAMIC_LOCK 2874 } 2875 2876 /* try to acquire the lock */ 2877 int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2878 KMP_COUNT_BLOCK(OMP_test_lock); 2879 2880 #if KMP_USE_DYNAMIC_LOCK 2881 int rc; 2882 int tag = KMP_EXTRACT_D_TAG(user_lock); 2883 #if USE_ITT_BUILD 2884 __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock); 2885 #endif 2886 #if OMPT_SUPPORT && OMPT_OPTIONAL 2887 // This is the case, if called from omp_init_lock_with_hint: 2888 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2889 if (!codeptr) 2890 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2891 if 
(ompt_enabled.ompt_callback_mutex_acquire) { 2892 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 2893 ompt_mutex_lock, omp_lock_hint_none, 2894 __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock, 2895 codeptr); 2896 } 2897 #endif 2898 #if KMP_USE_INLINED_TAS 2899 if (tag == locktag_tas && !__kmp_env_consistency_check) { 2900 KMP_TEST_TAS_LOCK(user_lock, gtid, rc); 2901 } else 2902 #elif KMP_USE_INLINED_FUTEX 2903 if (tag == locktag_futex && !__kmp_env_consistency_check) { 2904 KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc); 2905 } else 2906 #endif 2907 { 2908 rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid); 2909 } 2910 if (rc) { 2911 #if USE_ITT_BUILD 2912 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock); 2913 #endif 2914 #if OMPT_SUPPORT && OMPT_OPTIONAL 2915 if (ompt_enabled.ompt_callback_mutex_acquired) { 2916 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 2917 ompt_mutex_lock, (omp_wait_id_t)user_lock, codeptr); 2918 } 2919 #endif 2920 return FTN_TRUE; 2921 } else { 2922 #if USE_ITT_BUILD 2923 __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock); 2924 #endif 2925 return FTN_FALSE; 2926 } 2927 2928 #else // KMP_USE_DYNAMIC_LOCK 2929 2930 kmp_user_lock_p lck; 2931 int rc; 2932 2933 if ((__kmp_user_lock_kind == lk_tas) && 2934 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { 2935 lck = (kmp_user_lock_p)user_lock; 2936 } 2937 #if KMP_USE_FUTEX 2938 else if ((__kmp_user_lock_kind == lk_futex) && 2939 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { 2940 lck = (kmp_user_lock_p)user_lock; 2941 } 2942 #endif 2943 else { 2944 lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock"); 2945 } 2946 2947 #if USE_ITT_BUILD 2948 __kmp_itt_lock_acquiring(lck); 2949 #endif /* USE_ITT_BUILD */ 2950 #if OMPT_SUPPORT && OMPT_OPTIONAL 2951 // This is the case, if called from omp_init_lock_with_hint: 2952 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2953 if (!codeptr) 2954 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2955 if 
(ompt_enabled.ompt_callback_mutex_acquire) { 2956 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 2957 ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(), 2958 (omp_wait_id_t)lck, codeptr); 2959 } 2960 #endif 2961 2962 rc = TEST_LOCK(lck, gtid); 2963 #if USE_ITT_BUILD 2964 if (rc) { 2965 __kmp_itt_lock_acquired(lck); 2966 } else { 2967 __kmp_itt_lock_cancelled(lck); 2968 } 2969 #endif /* USE_ITT_BUILD */ 2970 #if OMPT_SUPPORT && OMPT_OPTIONAL 2971 if (rc && ompt_enabled.ompt_callback_mutex_acquired) { 2972 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 2973 ompt_mutex_lock, (omp_wait_id_t)lck, codeptr); 2974 } 2975 #endif 2976 2977 return (rc ? FTN_TRUE : FTN_FALSE); 2978 2979 /* Can't use serial interval since not block structured */ 2980 2981 #endif // KMP_USE_DYNAMIC_LOCK 2982 } 2983 2984 /* try to acquire the lock */ 2985 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2986 #if KMP_USE_DYNAMIC_LOCK 2987 int rc; 2988 #if USE_ITT_BUILD 2989 __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock); 2990 #endif 2991 #if OMPT_SUPPORT && OMPT_OPTIONAL 2992 // This is the case, if called from omp_init_lock_with_hint: 2993 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2994 if (!codeptr) 2995 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2996 if (ompt_enabled.ompt_callback_mutex_acquire) { 2997 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 2998 ompt_mutex_nest_lock, omp_lock_hint_none, 2999 __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock, 3000 codeptr); 3001 } 3002 #endif 3003 rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid); 3004 #if USE_ITT_BUILD 3005 if (rc) { 3006 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock); 3007 } else { 3008 __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock); 3009 } 3010 #endif 3011 #if OMPT_SUPPORT && OMPT_OPTIONAL 3012 if (ompt_enabled.enabled && rc) { 3013 if (rc == 1) { 3014 if (ompt_enabled.ompt_callback_mutex_acquired) { 
3015 // lock_first 3016 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 3017 ompt_mutex_nest_lock, (omp_wait_id_t)user_lock, codeptr); 3018 } 3019 } else { 3020 if (ompt_enabled.ompt_callback_nest_lock) { 3021 // lock_next 3022 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( 3023 ompt_scope_begin, (omp_wait_id_t)user_lock, codeptr); 3024 } 3025 } 3026 } 3027 #endif 3028 return rc; 3029 3030 #else // KMP_USE_DYNAMIC_LOCK 3031 3032 kmp_user_lock_p lck; 3033 int rc; 3034 3035 if ((__kmp_user_lock_kind == lk_tas) && 3036 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= 3037 OMP_NEST_LOCK_T_SIZE)) { 3038 lck = (kmp_user_lock_p)user_lock; 3039 } 3040 #if KMP_USE_FUTEX 3041 else if ((__kmp_user_lock_kind == lk_futex) && 3042 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= 3043 OMP_NEST_LOCK_T_SIZE)) { 3044 lck = (kmp_user_lock_p)user_lock; 3045 } 3046 #endif 3047 else { 3048 lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock"); 3049 } 3050 3051 #if USE_ITT_BUILD 3052 __kmp_itt_lock_acquiring(lck); 3053 #endif /* USE_ITT_BUILD */ 3054 3055 #if OMPT_SUPPORT && OMPT_OPTIONAL 3056 // This is the case, if called from omp_init_lock_with_hint: 3057 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 3058 if (!codeptr) 3059 codeptr = OMPT_GET_RETURN_ADDRESS(0); 3060 if (ompt_enabled.enabled) && 3061 ompt_enabled.ompt_callback_mutex_acquire) { 3062 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 3063 ompt_mutex_nest_lock, omp_lock_hint_none, 3064 __ompt_get_mutex_impl_type(), (omp_wait_id_t)lck, codeptr); 3065 } 3066 #endif 3067 3068 rc = TEST_NESTED_LOCK(lck, gtid); 3069 #if USE_ITT_BUILD 3070 if (rc) { 3071 __kmp_itt_lock_acquired(lck); 3072 } else { 3073 __kmp_itt_lock_cancelled(lck); 3074 } 3075 #endif /* USE_ITT_BUILD */ 3076 #if OMPT_SUPPORT && OMPT_OPTIONAL 3077 if (ompt_enabled.enabled && rc) { 3078 if (rc == 1) { 3079 if (ompt_enabled.ompt_callback_mutex_acquired) { 3080 // lock_first 3081 
ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 3082 ompt_mutex_nest_lock, (omp_wait_id_t)lck, codeptr); 3083 } 3084 } else { 3085 if (ompt_enabled.ompt_callback_nest_lock) { 3086 // lock_next 3087 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( 3088 ompt_mutex_scope_begin, (omp_wait_id_t)lck, codeptr); 3089 } 3090 } 3091 } 3092 #endif 3093 return rc; 3094 3095 /* Can't use serial interval since not block structured */ 3096 3097 #endif // KMP_USE_DYNAMIC_LOCK 3098 } 3099 3100 // Interface to fast scalable reduce methods routines 3101 3102 // keep the selected method in a thread local structure for cross-function 3103 // usage: will be used in __kmpc_end_reduce* functions; 3104 // another solution: to re-determine the method one more time in 3105 // __kmpc_end_reduce* functions (new prototype required then) 3106 // AT: which solution is better? 3107 #define __KMP_SET_REDUCTION_METHOD(gtid, rmethod) \ 3108 ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod)) 3109 3110 #define __KMP_GET_REDUCTION_METHOD(gtid) \ 3111 (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) 3112 3113 // description of the packed_reduction_method variable: look at the macros in 3114 // kmp.h 3115 3116 // used in a critical section reduce block 3117 static __forceinline void 3118 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid, 3119 kmp_critical_name *crit) { 3120 3121 // this lock was visible to a customer and to the threading profile tool as a 3122 // serial overhead span (although it's used for an internal purpose only) 3123 // why was it visible in previous implementation? 3124 // should we keep it visible in new reduce block? 3125 kmp_user_lock_p lck; 3126 3127 #if KMP_USE_DYNAMIC_LOCK 3128 3129 kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit; 3130 // Check if it is initialized. 
3131 if (*lk == 0) { 3132 if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) { 3133 KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0, 3134 KMP_GET_D_TAG(__kmp_user_lock_seq)); 3135 } else { 3136 __kmp_init_indirect_csptr(crit, loc, global_tid, 3137 KMP_GET_I_TAG(__kmp_user_lock_seq)); 3138 } 3139 } 3140 // Branch for accessing the actual lock object and set operation. This 3141 // branching is inevitable since this lock initialization does not follow the 3142 // normal dispatch path (lock table is not used). 3143 if (KMP_EXTRACT_D_TAG(lk) != 0) { 3144 lck = (kmp_user_lock_p)lk; 3145 KMP_DEBUG_ASSERT(lck != NULL); 3146 if (__kmp_env_consistency_check) { 3147 __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq); 3148 } 3149 KMP_D_LOCK_FUNC(lk, set)(lk, global_tid); 3150 } else { 3151 kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk); 3152 lck = ilk->lock; 3153 KMP_DEBUG_ASSERT(lck != NULL); 3154 if (__kmp_env_consistency_check) { 3155 __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq); 3156 } 3157 KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid); 3158 } 3159 3160 #else // KMP_USE_DYNAMIC_LOCK 3161 3162 // We know that the fast reduction code is only emitted by Intel compilers 3163 // with 32 byte critical sections. If there isn't enough space, then we 3164 // have to use a pointer. 
3165 if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) { 3166 lck = (kmp_user_lock_p)crit; 3167 } else { 3168 lck = __kmp_get_critical_section_ptr(crit, loc, global_tid); 3169 } 3170 KMP_DEBUG_ASSERT(lck != NULL); 3171 3172 if (__kmp_env_consistency_check) 3173 __kmp_push_sync(global_tid, ct_critical, loc, lck); 3174 3175 __kmp_acquire_user_lock_with_checks(lck, global_tid); 3176 3177 #endif // KMP_USE_DYNAMIC_LOCK 3178 } 3179 3180 // used in a critical section reduce block 3181 static __forceinline void 3182 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid, 3183 kmp_critical_name *crit) { 3184 3185 kmp_user_lock_p lck; 3186 3187 #if KMP_USE_DYNAMIC_LOCK 3188 3189 if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) { 3190 lck = (kmp_user_lock_p)crit; 3191 if (__kmp_env_consistency_check) 3192 __kmp_pop_sync(global_tid, ct_critical, loc); 3193 KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid); 3194 } else { 3195 kmp_indirect_lock_t *ilk = 3196 (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit)); 3197 if (__kmp_env_consistency_check) 3198 __kmp_pop_sync(global_tid, ct_critical, loc); 3199 KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid); 3200 } 3201 3202 #else // KMP_USE_DYNAMIC_LOCK 3203 3204 // We know that the fast reduction code is only emitted by Intel compilers 3205 // with 32 byte critical sections. If there isn't enough space, then we have 3206 // to use a pointer. 
3207 if (__kmp_base_user_lock_size > 32) { 3208 lck = *((kmp_user_lock_p *)crit); 3209 KMP_ASSERT(lck != NULL); 3210 } else { 3211 lck = (kmp_user_lock_p)crit; 3212 } 3213 3214 if (__kmp_env_consistency_check) 3215 __kmp_pop_sync(global_tid, ct_critical, loc); 3216 3217 __kmp_release_user_lock_with_checks(lck, global_tid); 3218 3219 #endif // KMP_USE_DYNAMIC_LOCK 3220 } // __kmp_end_critical_section_reduce_block 3221 3222 #if OMP_40_ENABLED 3223 static __forceinline int 3224 __kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p, 3225 int *task_state) { 3226 kmp_team_t *team; 3227 3228 // Check if we are inside the teams construct? 3229 if (th->th.th_teams_microtask) { 3230 *team_p = team = th->th.th_team; 3231 if (team->t.t_level == th->th.th_teams_level) { 3232 // This is reduction at teams construct. 3233 KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0 3234 // Let's swap teams temporarily for the reduction. 3235 th->th.th_info.ds.ds_tid = team->t.t_master_tid; 3236 th->th.th_team = team->t.t_parent; 3237 th->th.th_team_nproc = th->th.th_team->t.t_nproc; 3238 th->th.th_task_team = th->th.th_team->t.t_task_team[0]; 3239 *task_state = th->th.th_task_state; 3240 th->th.th_task_state = 0; 3241 3242 return 1; 3243 } 3244 } 3245 return 0; 3246 } 3247 3248 static __forceinline void 3249 __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) { 3250 // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction. 3251 th->th.th_info.ds.ds_tid = 0; 3252 th->th.th_team = team; 3253 th->th.th_team_nproc = team->t.t_nproc; 3254 th->th.th_task_team = team->t.t_task_team[task_state]; 3255 th->th.th_task_state = task_state; 3256 } 3257 #endif 3258 3259 /* 2.a.i. Reduce Block without a terminating barrier */ 3260 /*! 
3261 @ingroup SYNCHRONIZATION 3262 @param loc source location information 3263 @param global_tid global thread number 3264 @param num_vars number of items (variables) to be reduced 3265 @param reduce_size size of data in bytes to be reduced 3266 @param reduce_data pointer to data to be reduced 3267 @param reduce_func callback function providing reduction operation on two 3268 operands and returning result of reduction in lhs_data 3269 @param lck pointer to the unique lock data structure 3270 @result 1 for the master thread, 0 for all other team threads, 2 for all team 3271 threads if atomic reduction needed 3272 3273 The nowait version is used for a reduce clause with the nowait argument. 3274 */ 3275 kmp_int32 3276 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, 3277 size_t reduce_size, void *reduce_data, 3278 void (*reduce_func)(void *lhs_data, void *rhs_data), 3279 kmp_critical_name *lck) { 3280 3281 KMP_COUNT_BLOCK(REDUCE_nowait); 3282 int retval = 0; 3283 PACKED_REDUCTION_METHOD_T packed_reduction_method; 3284 #if OMP_40_ENABLED 3285 kmp_info_t *th; 3286 kmp_team_t *team; 3287 int teams_swapped = 0, task_state; 3288 #endif 3289 KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid)); 3290 3291 // why do we need this initialization here at all? 3292 // Reduction clause can not be used as a stand-alone directive. 3293 3294 // do not call __kmp_serial_initialize(), it will be called by 3295 // __kmp_parallel_initialize() if needed 3296 // possible detection of false-positive race by the threadchecker ??? 
3297 if (!TCR_4(__kmp_init_parallel)) 3298 __kmp_parallel_initialize(); 3299 3300 // check correctness of reduce block nesting 3301 #if KMP_USE_DYNAMIC_LOCK 3302 if (__kmp_env_consistency_check) 3303 __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0); 3304 #else 3305 if (__kmp_env_consistency_check) 3306 __kmp_push_sync(global_tid, ct_reduce, loc, NULL); 3307 #endif 3308 3309 #if OMP_40_ENABLED 3310 th = __kmp_thread_from_gtid(global_tid); 3311 teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state); 3312 #endif // OMP_40_ENABLED 3313 3314 // packed_reduction_method value will be reused by __kmp_end_reduce* function, 3315 // the value should be kept in a variable 3316 // the variable should be either a construct-specific or thread-specific 3317 // property, not a team specific property 3318 // (a thread can reach the next reduce block on the next construct, reduce 3319 // method may differ on the next construct) 3320 // an ident_t "loc" parameter could be used as a construct-specific property 3321 // (what if loc == 0?) 
3322 // (if both construct-specific and team-specific variables were shared, 3323 // then unness extra syncs should be needed) 3324 // a thread-specific variable is better regarding two issues above (next 3325 // construct and extra syncs) 3326 // a thread-specific "th_local.reduction_method" variable is used currently 3327 // each thread executes 'determine' and 'set' lines (no need to execute by one 3328 // thread, to avoid unness extra syncs) 3329 3330 packed_reduction_method = __kmp_determine_reduction_method( 3331 loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck); 3332 __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method); 3333 3334 if (packed_reduction_method == critical_reduce_block) { 3335 3336 __kmp_enter_critical_section_reduce_block(loc, global_tid, lck); 3337 retval = 1; 3338 3339 } else if (packed_reduction_method == empty_reduce_block) { 3340 3341 // usage: if team size == 1, no synchronization is required ( Intel 3342 // platforms only ) 3343 retval = 1; 3344 3345 } else if (packed_reduction_method == atomic_reduce_block) { 3346 3347 retval = 2; 3348 3349 // all threads should do this pop here (because __kmpc_end_reduce_nowait() 3350 // won't be called by the code gen) 3351 // (it's not quite good, because the checking block has been closed by 3352 // this 'pop', 3353 // but atomic operation has not been executed yet, will be executed 3354 // slightly later, literally on next instruction) 3355 if (__kmp_env_consistency_check) 3356 __kmp_pop_sync(global_tid, ct_reduce, loc); 3357 3358 } else if (TEST_REDUCTION_METHOD(packed_reduction_method, 3359 tree_reduce_block)) { 3360 3361 // AT: performance issue: a real barrier here 3362 // AT: (if master goes slow, other threads are blocked here waiting for the 3363 // master to come and release them) 3364 // AT: (it's not what a customer might expect specifying NOWAIT clause) 3365 // AT: (specifying NOWAIT won't result in improvement of performance, it'll 3366 // be confusing to 
a customer) 3367 // AT: another implementation of *barrier_gather*nowait() (or some other design) 3368 // might go faster and be more in line with sense of NOWAIT 3369 // AT: TO DO: do epcc test and compare times 3370 3371 // this barrier should be invisible to a customer and to the threading profile 3372 // tool (it's neither a terminating barrier nor customer's code, it's 3373 // used for an internal purpose) 3374 #if OMPT_SUPPORT 3375 // JP: can this barrier potentially leed to task scheduling? 3376 // JP: as long as there is a barrier in the implementation, OMPT should and 3377 // will provide the barrier events 3378 // so we set-up the necessary frame/return addresses. 3379 omp_frame_t *ompt_frame; 3380 if (ompt_enabled.enabled) { 3381 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); 3382 if (ompt_frame->enter_frame == NULL) 3383 ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1); 3384 OMPT_STORE_RETURN_ADDRESS(global_tid); 3385 } 3386 #endif 3387 #if USE_ITT_NOTIFY 3388 __kmp_threads[global_tid]->th.th_ident = loc; 3389 #endif 3390 retval = 3391 __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method), 3392 global_tid, FALSE, reduce_size, reduce_data, reduce_func); 3393 retval = (retval != 0) ? 
(0) : (1); 3394 #if OMPT_SUPPORT && OMPT_OPTIONAL 3395 if (ompt_enabled.enabled) { 3396 ompt_frame->enter_frame = NULL; 3397 } 3398 #endif 3399 3400 // all other workers except master should do this pop here 3401 // ( none of other workers will get to __kmpc_end_reduce_nowait() ) 3402 if (__kmp_env_consistency_check) { 3403 if (retval == 0) { 3404 __kmp_pop_sync(global_tid, ct_reduce, loc); 3405 } 3406 } 3407 3408 } else { 3409 3410 // should never reach this block 3411 KMP_ASSERT(0); // "unexpected method" 3412 } 3413 #if OMP_40_ENABLED 3414 if (teams_swapped) { 3415 __kmp_restore_swapped_teams(th, team, task_state); 3416 } 3417 #endif 3418 KA_TRACE( 3419 10, 3420 ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n", 3421 global_tid, packed_reduction_method, retval)); 3422 3423 return retval; 3424 } 3425 3426 /*! 3427 @ingroup SYNCHRONIZATION 3428 @param loc source location information 3429 @param global_tid global thread id. 3430 @param lck pointer to the unique lock data structure 3431 3432 Finish the execution of a reduce nowait. 
3433 */ 3434 void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid, 3435 kmp_critical_name *lck) { 3436 3437 PACKED_REDUCTION_METHOD_T packed_reduction_method; 3438 3439 KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid)); 3440 3441 packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid); 3442 3443 if (packed_reduction_method == critical_reduce_block) { 3444 3445 __kmp_end_critical_section_reduce_block(loc, global_tid, lck); 3446 3447 } else if (packed_reduction_method == empty_reduce_block) { 3448 3449 // usage: if team size == 1, no synchronization is required ( on Intel 3450 // platforms only ) 3451 3452 } else if (packed_reduction_method == atomic_reduce_block) { 3453 3454 // neither master nor other workers should get here 3455 // (code gen does not generate this call in case 2: atomic reduce block) 3456 // actually it's better to remove this elseif at all; 3457 // after removal this value will checked by the 'else' and will assert 3458 3459 } else if (TEST_REDUCTION_METHOD(packed_reduction_method, 3460 tree_reduce_block)) { 3461 3462 // only master gets here 3463 3464 } else { 3465 3466 // should never reach this block 3467 KMP_ASSERT(0); // "unexpected method" 3468 } 3469 3470 if (__kmp_env_consistency_check) 3471 __kmp_pop_sync(global_tid, ct_reduce, loc); 3472 3473 KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n", 3474 global_tid, packed_reduction_method)); 3475 3476 return; 3477 } 3478 3479 /* 2.a.ii. Reduce Block with a terminating barrier */ 3480 3481 /*! 
3482 @ingroup SYNCHRONIZATION 3483 @param loc source location information 3484 @param global_tid global thread number 3485 @param num_vars number of items (variables) to be reduced 3486 @param reduce_size size of data in bytes to be reduced 3487 @param reduce_data pointer to data to be reduced 3488 @param reduce_func callback function providing reduction operation on two 3489 operands and returning result of reduction in lhs_data 3490 @param lck pointer to the unique lock data structure 3491 @result 1 for the master thread, 0 for all other team threads, 2 for all team 3492 threads if atomic reduction needed 3493 3494 A blocking reduce that includes an implicit barrier. 3495 */ 3496 kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, 3497 size_t reduce_size, void *reduce_data, 3498 void (*reduce_func)(void *lhs_data, void *rhs_data), 3499 kmp_critical_name *lck) { 3500 KMP_COUNT_BLOCK(REDUCE_wait); 3501 int retval = 0; 3502 PACKED_REDUCTION_METHOD_T packed_reduction_method; 3503 #if OMP_40_ENABLED 3504 kmp_info_t *th; 3505 kmp_team_t *team; 3506 int teams_swapped = 0, task_state; 3507 #endif 3508 3509 KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid)); 3510 3511 // why do we need this initialization here at all? 3512 // Reduction clause can not be a stand-alone directive. 3513 3514 // do not call __kmp_serial_initialize(), it will be called by 3515 // __kmp_parallel_initialize() if needed 3516 // possible detection of false-positive race by the threadchecker ??? 
3517 if (!TCR_4(__kmp_init_parallel)) 3518 __kmp_parallel_initialize(); 3519 3520 // check correctness of reduce block nesting 3521 #if KMP_USE_DYNAMIC_LOCK 3522 if (__kmp_env_consistency_check) 3523 __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0); 3524 #else 3525 if (__kmp_env_consistency_check) 3526 __kmp_push_sync(global_tid, ct_reduce, loc, NULL); 3527 #endif 3528 3529 #if OMP_40_ENABLED 3530 th = __kmp_thread_from_gtid(global_tid); 3531 teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state); 3532 #endif // OMP_40_ENABLED 3533 3534 packed_reduction_method = __kmp_determine_reduction_method( 3535 loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck); 3536 __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method); 3537 3538 if (packed_reduction_method == critical_reduce_block) { 3539 3540 __kmp_enter_critical_section_reduce_block(loc, global_tid, lck); 3541 retval = 1; 3542 3543 } else if (packed_reduction_method == empty_reduce_block) { 3544 3545 // usage: if team size == 1, no synchronization is required ( Intel 3546 // platforms only ) 3547 retval = 1; 3548 3549 } else if (packed_reduction_method == atomic_reduce_block) { 3550 3551 retval = 2; 3552 3553 } else if (TEST_REDUCTION_METHOD(packed_reduction_method, 3554 tree_reduce_block)) { 3555 3556 // case tree_reduce_block: 3557 // this barrier should be visible to a customer and to the threading profile 3558 // tool (it's a terminating barrier on constructs if NOWAIT not specified) 3559 #if OMPT_SUPPORT 3560 omp_frame_t *ompt_frame; 3561 if (ompt_enabled.enabled) { 3562 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); 3563 if (ompt_frame->enter_frame == NULL) 3564 ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1); 3565 OMPT_STORE_RETURN_ADDRESS(global_tid); 3566 } 3567 #endif 3568 #if USE_ITT_NOTIFY 3569 __kmp_threads[global_tid]->th.th_ident = 3570 loc; // needed for correct notification of frames 3571 #endif 3572 retval = 3573 
__kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method), 3574 global_tid, TRUE, reduce_size, reduce_data, reduce_func); 3575 retval = (retval != 0) ? (0) : (1); 3576 #if OMPT_SUPPORT && OMPT_OPTIONAL 3577 if (ompt_enabled.enabled) { 3578 ompt_frame->enter_frame = NULL; 3579 } 3580 #endif 3581 3582 // all other workers except master should do this pop here 3583 // ( none of other workers except master will enter __kmpc_end_reduce() ) 3584 if (__kmp_env_consistency_check) { 3585 if (retval == 0) { // 0: all other workers; 1: master 3586 __kmp_pop_sync(global_tid, ct_reduce, loc); 3587 } 3588 } 3589 3590 } else { 3591 3592 // should never reach this block 3593 KMP_ASSERT(0); // "unexpected method" 3594 } 3595 #if OMP_40_ENABLED 3596 if (teams_swapped) { 3597 __kmp_restore_swapped_teams(th, team, task_state); 3598 } 3599 #endif 3600 3601 KA_TRACE(10, 3602 ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n", 3603 global_tid, packed_reduction_method, retval)); 3604 3605 return retval; 3606 } 3607 3608 /*! 3609 @ingroup SYNCHRONIZATION 3610 @param loc source location information 3611 @param global_tid global thread id. 3612 @param lck pointer to the unique lock data structure 3613 3614 Finish the execution of a blocking reduce. 3615 The <tt>lck</tt> pointer must be the same as that used in the corresponding 3616 start function. 
3617 */ 3618 void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid, 3619 kmp_critical_name *lck) { 3620 3621 PACKED_REDUCTION_METHOD_T packed_reduction_method; 3622 #if OMP_40_ENABLED 3623 kmp_info_t *th; 3624 kmp_team_t *team; 3625 int teams_swapped = 0, task_state; 3626 #endif 3627 3628 KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid)); 3629 3630 #if OMP_40_ENABLED 3631 th = __kmp_thread_from_gtid(global_tid); 3632 teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state); 3633 #endif // OMP_40_ENABLED 3634 3635 packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid); 3636 3637 // this barrier should be visible to a customer and to the threading profile 3638 // tool (it's a terminating barrier on constructs if NOWAIT not specified) 3639 3640 if (packed_reduction_method == critical_reduce_block) { 3641 3642 __kmp_end_critical_section_reduce_block(loc, global_tid, lck); 3643 3644 // TODO: implicit barrier: should be exposed 3645 #if OMPT_SUPPORT 3646 omp_frame_t *ompt_frame; 3647 if (ompt_enabled.enabled) { 3648 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); 3649 if (ompt_frame->enter_frame == NULL) 3650 ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1); 3651 OMPT_STORE_RETURN_ADDRESS(global_tid); 3652 } 3653 #endif 3654 #if USE_ITT_NOTIFY 3655 __kmp_threads[global_tid]->th.th_ident = loc; 3656 #endif 3657 __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL); 3658 #if OMPT_SUPPORT && OMPT_OPTIONAL 3659 if (ompt_enabled.enabled) { 3660 ompt_frame->enter_frame = NULL; 3661 } 3662 #endif 3663 3664 } else if (packed_reduction_method == empty_reduce_block) { 3665 3666 // usage: if team size==1, no synchronization is required (Intel platforms only) 3667 3668 // TODO: implicit barrier: should be exposed 3669 #if OMPT_SUPPORT 3670 omp_frame_t *ompt_frame; 3671 if (ompt_enabled.enabled) { 3672 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); 3673 if 
(ompt_frame->enter_frame == NULL) 3674 ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1); 3675 OMPT_STORE_RETURN_ADDRESS(global_tid); 3676 } 3677 #endif 3678 #if USE_ITT_NOTIFY 3679 __kmp_threads[global_tid]->th.th_ident = loc; 3680 #endif 3681 __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL); 3682 #if OMPT_SUPPORT && OMPT_OPTIONAL 3683 if (ompt_enabled.enabled) { 3684 ompt_frame->enter_frame = NULL; 3685 } 3686 #endif 3687 3688 } else if (packed_reduction_method == atomic_reduce_block) { 3689 3690 #if OMPT_SUPPORT 3691 omp_frame_t *ompt_frame; 3692 if (ompt_enabled.enabled) { 3693 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); 3694 if (ompt_frame->enter_frame == NULL) 3695 ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1); 3696 OMPT_STORE_RETURN_ADDRESS(global_tid); 3697 } 3698 #endif 3699 // TODO: implicit barrier: should be exposed 3700 #if USE_ITT_NOTIFY 3701 __kmp_threads[global_tid]->th.th_ident = loc; 3702 #endif 3703 __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL); 3704 #if OMPT_SUPPORT && OMPT_OPTIONAL 3705 if (ompt_enabled.enabled) { 3706 ompt_frame->enter_frame = NULL; 3707 } 3708 #endif 3709 3710 } else if (TEST_REDUCTION_METHOD(packed_reduction_method, 3711 tree_reduce_block)) { 3712 3713 // only master executes here (master releases all other workers) 3714 __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method), 3715 global_tid); 3716 3717 } else { 3718 3719 // should never reach this block 3720 KMP_ASSERT(0); // "unexpected method" 3721 } 3722 #if OMP_40_ENABLED 3723 if (teams_swapped) { 3724 __kmp_restore_swapped_teams(th, team, task_state); 3725 } 3726 #endif 3727 3728 if (__kmp_env_consistency_check) 3729 __kmp_pop_sync(global_tid, ct_reduce, loc); 3730 3731 KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n", 3732 global_tid, packed_reduction_method)); 3733 3734 return; 3735 } 3736 3737 #undef __KMP_GET_REDUCTION_METHOD 3738 #undef 
__KMP_SET_REDUCTION_METHOD 3739 3740 /* end of interface to fast scalable reduce routines */ 3741 3742 kmp_uint64 __kmpc_get_taskid() { 3743 3744 kmp_int32 gtid; 3745 kmp_info_t *thread; 3746 3747 gtid = __kmp_get_gtid(); 3748 if (gtid < 0) { 3749 return 0; 3750 } 3751 thread = __kmp_thread_from_gtid(gtid); 3752 return thread->th.th_current_task->td_task_id; 3753 3754 } // __kmpc_get_taskid 3755 3756 kmp_uint64 __kmpc_get_parent_taskid() { 3757 3758 kmp_int32 gtid; 3759 kmp_info_t *thread; 3760 kmp_taskdata_t *parent_task; 3761 3762 gtid = __kmp_get_gtid(); 3763 if (gtid < 0) { 3764 return 0; 3765 } 3766 thread = __kmp_thread_from_gtid(gtid); 3767 parent_task = thread->th.th_current_task->td_parent; 3768 return (parent_task == NULL ? 0 : parent_task->td_task_id); 3769 3770 } // __kmpc_get_parent_taskid 3771 3772 #if OMP_45_ENABLED 3773 /*! 3774 @ingroup WORK_SHARING 3775 @param loc source location information. 3776 @param gtid global thread number. 3777 @param num_dims number of associated doacross loops. 3778 @param dims info on loops bounds. 3779 3780 Initialize doacross loop information. 3781 Expect compiler send us inclusive bounds, 3782 e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2. 
3783 */ 3784 void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims, 3785 const struct kmp_dim *dims) { 3786 int j, idx; 3787 kmp_int64 last, trace_count; 3788 kmp_info_t *th = __kmp_threads[gtid]; 3789 kmp_team_t *team = th->th.th_team; 3790 kmp_uint32 *flags; 3791 kmp_disp_t *pr_buf = th->th.th_dispatch; 3792 dispatch_shared_info_t *sh_buf; 3793 3794 KA_TRACE( 3795 20, 3796 ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n", 3797 gtid, num_dims, !team->t.t_serialized)); 3798 KMP_DEBUG_ASSERT(dims != NULL); 3799 KMP_DEBUG_ASSERT(num_dims > 0); 3800 3801 if (team->t.t_serialized) { 3802 KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n")); 3803 return; // no dependencies if team is serialized 3804 } 3805 KMP_DEBUG_ASSERT(team->t.t_nproc > 1); 3806 idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for 3807 // the next loop 3808 sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers]; 3809 3810 // Save bounds info into allocated private buffer 3811 KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL); 3812 pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc( 3813 th, sizeof(kmp_int64) * (4 * num_dims + 1)); 3814 KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL); 3815 pr_buf->th_doacross_info[0] = 3816 (kmp_int64)num_dims; // first element is number of dimensions 3817 // Save also address of num_done in order to access it later without knowing 3818 // the buffer index 3819 pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done; 3820 pr_buf->th_doacross_info[2] = dims[0].lo; 3821 pr_buf->th_doacross_info[3] = dims[0].up; 3822 pr_buf->th_doacross_info[4] = dims[0].st; 3823 last = 5; 3824 for (j = 1; j < num_dims; ++j) { 3825 kmp_int64 3826 range_length; // To keep ranges of all dimensions but the first dims[0] 3827 if (dims[j].st == 1) { // most common case 3828 // AC: should we care of ranges bigger than LLONG_MAX? 
(not for now) 3829 range_length = dims[j].up - dims[j].lo + 1; 3830 } else { 3831 if (dims[j].st > 0) { 3832 KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo); 3833 range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1; 3834 } else { // negative increment 3835 KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up); 3836 range_length = 3837 (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1; 3838 } 3839 } 3840 pr_buf->th_doacross_info[last++] = range_length; 3841 pr_buf->th_doacross_info[last++] = dims[j].lo; 3842 pr_buf->th_doacross_info[last++] = dims[j].up; 3843 pr_buf->th_doacross_info[last++] = dims[j].st; 3844 } 3845 3846 // Compute total trip count. 3847 // Start with range of dims[0] which we don't need to keep in the buffer. 3848 if (dims[0].st == 1) { // most common case 3849 trace_count = dims[0].up - dims[0].lo + 1; 3850 } else if (dims[0].st > 0) { 3851 KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo); 3852 trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1; 3853 } else { // negative increment 3854 KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up); 3855 trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1; 3856 } 3857 for (j = 1; j < num_dims; ++j) { 3858 trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges 3859 } 3860 KMP_DEBUG_ASSERT(trace_count > 0); 3861 3862 // Check if shared buffer is not occupied by other loop (idx - 3863 // __kmp_dispatch_num_buffers) 3864 if (idx != sh_buf->doacross_buf_idx) { 3865 // Shared buffer is occupied, wait for it to be free 3866 __kmp_wait_yield_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx, 3867 __kmp_eq_4, NULL); 3868 } 3869 #if KMP_32_BIT_ARCH 3870 // Check if we are the first thread. After the CAS the first thread gets 0, 3871 // others get 1 if initialization is in progress, allocated pointer otherwise. 3872 // Treat pointer as volatile integer (value 0 or 1) until memory is allocated. 
3873 flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32( 3874 (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1); 3875 #else 3876 flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64( 3877 (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL); 3878 #endif 3879 if (flags == NULL) { 3880 // we are the first thread, allocate the array of flags 3881 size_t size = trace_count / 8 + 8; // in bytes, use single bit per iteration 3882 flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1); 3883 KMP_MB(); 3884 sh_buf->doacross_flags = flags; 3885 } else if (flags == (kmp_uint32 *)1) { 3886 #if KMP_32_BIT_ARCH 3887 // initialization is still in progress, need to wait 3888 while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1) 3889 #else 3890 while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL) 3891 #endif 3892 KMP_YIELD(TRUE); 3893 KMP_MB(); 3894 } else { 3895 KMP_MB(); 3896 } 3897 KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value 3898 pr_buf->th_doacross_flags = 3899 sh_buf->doacross_flags; // save private copy in order to not 3900 // touch shared buffer on each iteration 3901 KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid)); 3902 } 3903 3904 void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) { 3905 kmp_int32 shft, num_dims, i; 3906 kmp_uint32 flag; 3907 kmp_int64 iter_number; // iteration number of "collapsed" loop nest 3908 kmp_info_t *th = __kmp_threads[gtid]; 3909 kmp_team_t *team = th->th.th_team; 3910 kmp_disp_t *pr_buf; 3911 kmp_int64 lo, up, st; 3912 3913 KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid)); 3914 if (team->t.t_serialized) { 3915 KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n")); 3916 return; // no dependencies if team is serialized 3917 } 3918 3919 // calculate sequential iteration number and check out-of-bounds condition 3920 pr_buf = th->th.th_dispatch; 3921 KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL); 3922 num_dims = 
pr_buf->th_doacross_info[0]; 3923 lo = pr_buf->th_doacross_info[2]; 3924 up = pr_buf->th_doacross_info[3]; 3925 st = pr_buf->th_doacross_info[4]; 3926 if (st == 1) { // most common case 3927 if (vec[0] < lo || vec[0] > up) { 3928 KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of " 3929 "bounds [%lld,%lld]\n", 3930 gtid, vec[0], lo, up)); 3931 return; 3932 } 3933 iter_number = vec[0] - lo; 3934 } else if (st > 0) { 3935 if (vec[0] < lo || vec[0] > up) { 3936 KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of " 3937 "bounds [%lld,%lld]\n", 3938 gtid, vec[0], lo, up)); 3939 return; 3940 } 3941 iter_number = (kmp_uint64)(vec[0] - lo) / st; 3942 } else { // negative increment 3943 if (vec[0] > lo || vec[0] < up) { 3944 KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of " 3945 "bounds [%lld,%lld]\n", 3946 gtid, vec[0], lo, up)); 3947 return; 3948 } 3949 iter_number = (kmp_uint64)(lo - vec[0]) / (-st); 3950 } 3951 for (i = 1; i < num_dims; ++i) { 3952 kmp_int64 iter, ln; 3953 kmp_int32 j = i * 4; 3954 ln = pr_buf->th_doacross_info[j + 1]; 3955 lo = pr_buf->th_doacross_info[j + 2]; 3956 up = pr_buf->th_doacross_info[j + 3]; 3957 st = pr_buf->th_doacross_info[j + 4]; 3958 if (st == 1) { 3959 if (vec[i] < lo || vec[i] > up) { 3960 KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of " 3961 "bounds [%lld,%lld]\n", 3962 gtid, vec[i], lo, up)); 3963 return; 3964 } 3965 iter = vec[i] - lo; 3966 } else if (st > 0) { 3967 if (vec[i] < lo || vec[i] > up) { 3968 KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of " 3969 "bounds [%lld,%lld]\n", 3970 gtid, vec[i], lo, up)); 3971 return; 3972 } 3973 iter = (kmp_uint64)(vec[i] - lo) / st; 3974 } else { // st < 0 3975 if (vec[i] > lo || vec[i] < up) { 3976 KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of " 3977 "bounds [%lld,%lld]\n", 3978 gtid, vec[i], lo, up)); 3979 return; 3980 } 3981 iter = (kmp_uint64)(lo - vec[i]) / (-st); 
3982 } 3983 iter_number = iter + ln * iter_number; 3984 } 3985 shft = iter_number % 32; // use 32-bit granularity 3986 iter_number >>= 5; // divided by 32 3987 flag = 1 << shft; 3988 while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) { 3989 KMP_YIELD(TRUE); 3990 } 3991 KMP_MB(); 3992 KA_TRACE(20, 3993 ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n", 3994 gtid, (iter_number << 5) + shft)); 3995 } 3996 3997 void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) { 3998 kmp_int32 shft, num_dims, i; 3999 kmp_uint32 flag; 4000 kmp_int64 iter_number; // iteration number of "collapsed" loop nest 4001 kmp_info_t *th = __kmp_threads[gtid]; 4002 kmp_team_t *team = th->th.th_team; 4003 kmp_disp_t *pr_buf; 4004 kmp_int64 lo, st; 4005 4006 KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid)); 4007 if (team->t.t_serialized) { 4008 KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n")); 4009 return; // no dependencies if team is serialized 4010 } 4011 4012 // calculate sequential iteration number (same as in "wait" but no 4013 // out-of-bounds checks) 4014 pr_buf = th->th.th_dispatch; 4015 KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL); 4016 num_dims = pr_buf->th_doacross_info[0]; 4017 lo = pr_buf->th_doacross_info[2]; 4018 st = pr_buf->th_doacross_info[4]; 4019 if (st == 1) { // most common case 4020 iter_number = vec[0] - lo; 4021 } else if (st > 0) { 4022 iter_number = (kmp_uint64)(vec[0] - lo) / st; 4023 } else { // negative increment 4024 iter_number = (kmp_uint64)(lo - vec[0]) / (-st); 4025 } 4026 for (i = 1; i < num_dims; ++i) { 4027 kmp_int64 iter, ln; 4028 kmp_int32 j = i * 4; 4029 ln = pr_buf->th_doacross_info[j + 1]; 4030 lo = pr_buf->th_doacross_info[j + 2]; 4031 st = pr_buf->th_doacross_info[j + 4]; 4032 if (st == 1) { 4033 iter = vec[i] - lo; 4034 } else if (st > 0) { 4035 iter = (kmp_uint64)(vec[i] - lo) / st; 4036 } else { // st < 0 4037 iter = (kmp_uint64)(lo - vec[i]) / (-st); 4038 } 4039 
iter_number = iter + ln * iter_number; 4040 } 4041 shft = iter_number % 32; // use 32-bit granularity 4042 iter_number >>= 5; // divided by 32 4043 flag = 1 << shft; 4044 KMP_MB(); 4045 if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) 4046 KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag); 4047 KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid, 4048 (iter_number << 5) + shft)); 4049 } 4050 4051 void __kmpc_doacross_fini(ident_t *loc, int gtid) { 4052 kmp_int32 num_done; 4053 kmp_info_t *th = __kmp_threads[gtid]; 4054 kmp_team_t *team = th->th.th_team; 4055 kmp_disp_t *pr_buf = th->th.th_dispatch; 4056 4057 KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid)); 4058 if (team->t.t_serialized) { 4059 KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team)); 4060 return; // nothing to do 4061 } 4062 num_done = KMP_TEST_THEN_INC32((kmp_int32 *)pr_buf->th_doacross_info[1]) + 1; 4063 if (num_done == th->th.th_team_nproc) { 4064 // we are the last thread, need to free shared resources 4065 int idx = pr_buf->th_doacross_buf_idx - 1; 4066 dispatch_shared_info_t *sh_buf = 4067 &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers]; 4068 KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] == 4069 (kmp_int64)&sh_buf->doacross_num_done); 4070 KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done); 4071 KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx); 4072 __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags)); 4073 sh_buf->doacross_flags = NULL; 4074 sh_buf->doacross_num_done = 0; 4075 sh_buf->doacross_buf_idx += 4076 __kmp_dispatch_num_buffers; // free buffer for future re-use 4077 } 4078 // free private resources (need to keep buffer index forever) 4079 pr_buf->th_doacross_flags = NULL; 4080 __kmp_thread_free(th, (void *)pr_buf->th_doacross_info); 4081 pr_buf->th_doacross_info = NULL; 4082 KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid)); 4083 } 4084 #endif 4085 4086 #if 
OMP_50_ENABLED 4087 int __kmpc_get_target_offload(void) { 4088 if (!__kmp_init_serial) { 4089 __kmp_serial_initialize(); 4090 } 4091 return __kmp_target_offload; 4092 } 4093 #endif // OMP_50_ENABLED 4094 4095 // end of file // 4096