/*
 * kmp_csupport.cpp -- kfront linkage support for OpenMP.
 */

//===----------------------------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//

#define __KMP_IMP
#include "omp.h" /* extern "C" declarations of user-visible routines */
#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_lock.h"
#include "kmp_stats.h"

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

#define MAX_MESSAGE 512

// flags will be used in future, e.g. to implement openmp_strict library
// restrictions

/*!
 * @ingroup STARTUP_SHUTDOWN
 * @param loc   in   source location information
 * @param flags in   for future use (currently ignored)
 *
 * Initialize the runtime library. This call is optional; if it is not made then
 * it will be implicitly called by attempts to use other library functions.
 */
void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
  // By default __kmpc_begin() is no-op.
  char *env;
  if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
      __kmp_str_match_true(env)) {
    __kmp_middle_initialize();
    KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
  } else if (__kmp_ignore_mppbeg() == FALSE) {
    // By default __kmp_ignore_mppbeg() returns TRUE.
    __kmp_internal_begin();
    KC_TRACE(10, ("__kmpc_begin: called\n"));
  }
}

/*!
 * @ingroup STARTUP_SHUTDOWN
 * @param loc source location information
 *
 * Shutdown the runtime library. This is also optional, and even if called will
 * not do anything unless the `KMP_IGNORE_MPPEND` environment variable is set to
 * zero.
 */
void __kmpc_end(ident_t *loc) {
  // By default, __kmp_ignore_mppend() returns TRUE which makes __kmpc_end()
  // call no-op. However, this can be overridden with KMP_IGNORE_MPPEND
  // environment variable. If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend()
  // returns FALSE and __kmpc_end() will unregister this root (it can cause
  // library shut down).
  if (__kmp_ignore_mppend() == FALSE) {
    KC_TRACE(10, ("__kmpc_end: called\n"));
    KA_TRACE(30, ("__kmpc_end\n"));

    __kmp_internal_end_thread(-1);
  }
}

/*!
@ingroup THREAD_STATES
@param loc Source location information.
@return The global thread index of the active thread.

This function can be called in any context.

If the runtime has only been entered at the outermost level from a
single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
that which would be returned by omp_get_thread_num() in the outermost
active parallel construct. (Or zero if there is no active parallel
construct, since the master thread is necessarily thread zero).

If multiple non-OpenMP threads all enter an OpenMP construct then this
will be a unique thread identifier among all the threads created by
the OpenMP runtime (but the value cannot be defined in terms of
OpenMP thread ids returned by omp_get_thread_num()).
*/
kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
  kmp_int32 gtid = __kmp_entry_gtid();

  KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));

  return gtid;
}

/*!
103 @ingroup THREAD_STATES 104 @param loc Source location information. 105 @return The number of threads under control of the OpenMP<sup>*</sup> runtime 106 107 This function can be called in any context. 108 It returns the total number of threads under the control of the OpenMP runtime. 109 That is not a number that can be determined by any OpenMP standard calls, since 110 the library may be called from more than one non-OpenMP thread, and this 111 reflects the total over all such calls. Similarly the runtime maintains 112 underlying threads even when they are not active (since the cost of creating 113 and destroying OS threads is high), this call counts all such threads even if 114 they are not waiting for work. 115 */ 116 kmp_int32 __kmpc_global_num_threads(ident_t *loc) { 117 KC_TRACE(10, 118 ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth)); 119 120 return TCR_4(__kmp_all_nth); 121 } 122 123 /*! 124 @ingroup THREAD_STATES 125 @param loc Source location information. 126 @return The thread number of the calling thread in the innermost active parallel 127 construct. 128 */ 129 kmp_int32 __kmpc_bound_thread_num(ident_t *loc) { 130 KC_TRACE(10, ("__kmpc_bound_thread_num: called\n")); 131 return __kmp_tid_from_gtid(__kmp_entry_gtid()); 132 } 133 134 /*! 135 @ingroup THREAD_STATES 136 @param loc Source location information. 137 @return The number of threads in the innermost active parallel construct. 138 */ 139 kmp_int32 __kmpc_bound_num_threads(ident_t *loc) { 140 KC_TRACE(10, ("__kmpc_bound_num_threads: called\n")); 141 142 return __kmp_entry_thread()->th.th_team->t.t_nproc; 143 } 144 145 /*! 146 * @ingroup DEPRECATED 147 * @param loc location description 148 * 149 * This function need not be called. It always returns TRUE. 150 */ 151 kmp_int32 __kmpc_ok_to_fork(ident_t *loc) { 152 #ifndef KMP_DEBUG 153 154 return TRUE; 155 156 #else 157 158 const char *semi2; 159 const char *semi3; 160 int line_no; 161 162 if (__kmp_par_range == 0) { 163 return TRUE; 164 } 165 semi2 = loc->psource; 166 if (semi2 == NULL) { 167 return TRUE; 168 } 169 semi2 = strchr(semi2, ';'); 170 if (semi2 == NULL) { 171 return TRUE; 172 } 173 semi2 = strchr(semi2 + 1, ';'); 174 if (semi2 == NULL) { 175 return TRUE; 176 } 177 if (__kmp_par_range_filename[0]) { 178 const char *name = semi2 - 1; 179 while ((name > loc->psource) && (*name != '/') && (*name != ';')) { 180 name--; 181 } 182 if ((*name == '/') || (*name == ';')) { 183 name++; 184 } 185 if (strncmp(__kmp_par_range_filename, name, semi2 - name)) { 186 return __kmp_par_range < 0; 187 } 188 } 189 semi3 = strchr(semi2 + 1, ';'); 190 if (__kmp_par_range_routine[0]) { 191 if ((semi3 != NULL) && (semi3 > semi2) && 192 (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) { 193 return __kmp_par_range < 0; 194 } 195 } 196 if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) { 197 if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) { 198 return __kmp_par_range > 0; 199 } 200 return __kmp_par_range < 0; 201 } 202 return TRUE; 203 204 #endif /* KMP_DEBUG */ 205 } 206 207 /*! 208 @ingroup THREAD_STATES 209 @param loc Source location information. 210 @return 1 if this thread is executing inside an active parallel region, zero if 211 not. 212 */ 213 kmp_int32 __kmpc_in_parallel(ident_t *loc) { 214 return __kmp_entry_thread()->th.th_root->r.r_active; 215 } 216 217 /*! 
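The entry point documented below is normally emitted together with
__kmpc_fork_call(). As an illustrative sketch only (the exact lowering is
compiler dependent, and `__body_outlined` stands for a hypothetical
compiler-generated outlined routine, with `loc` assumed to be a suitably
initialized ident_t), a parallel construct with a `num_threads` clause such as
@code
#pragma omp parallel num_threads(4)
{ body(); }
@endcode
might be translated into calls of roughly this shape:
@code
kmp_int32 gtid = __kmpc_global_thread_num(&loc);
__kmpc_push_num_threads(&loc, gtid, 4);
__kmpc_fork_call(&loc, 0, (kmpc_micro)__body_outlined);
@endcode
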
218 @ingroup PARALLEL 219 @param loc source location information 220 @param global_tid global thread number 221 @param num_threads number of threads requested for this parallel construct 222 223 Set the number of threads to be used by the next fork spawned by this thread. 224 This call is only required if the parallel construct has a `num_threads` clause. 225 */ 226 void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid, 227 kmp_int32 num_threads) { 228 KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n", 229 global_tid, num_threads)); 230 231 __kmp_push_num_threads(loc, global_tid, num_threads); 232 } 233 234 void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) { 235 KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n")); 236 237 /* the num_threads are automatically popped */ 238 } 239 240 #if OMP_40_ENABLED 241 242 void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid, 243 kmp_int32 proc_bind) { 244 KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid, 245 proc_bind)); 246 247 __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind); 248 } 249 250 #endif /* OMP_40_ENABLED */ 251 252 /*! 253 @ingroup PARALLEL 254 @param loc source location information 255 @param argc total number of arguments in the ellipsis 256 @param microtask pointer to callback routine consisting of outlined parallel 257 construct 258 @param ... pointers to shared variables that aren't global 259 260 Do the actual fork and call the microtask in the relevant number of threads. 261 */ 262 void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) { 263 int gtid = __kmp_entry_gtid(); 264 265 #if (KMP_STATS_ENABLED) 266 // If we were in a serial region, then stop the serial timer, record 267 // the event, and start parallel region timer 268 stats_state_e previous_state = KMP_GET_THREAD_STATE(); 269 if (previous_state == stats_state_e::SERIAL_REGION) { 270 KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead); 271 } else { 272 KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead); 273 } 274 int inParallel = __kmpc_in_parallel(loc); 275 if (inParallel) { 276 KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL); 277 } else { 278 KMP_COUNT_BLOCK(OMP_PARALLEL); 279 } 280 #endif 281 282 // maybe to save thr_state is enough here 283 { 284 va_list ap; 285 va_start(ap, microtask); 286 287 #if OMPT_SUPPORT 288 omp_frame_t *ompt_frame; 289 if (ompt_enabled.enabled) { 290 kmp_info_t *master_th = __kmp_threads[gtid]; 291 kmp_team_t *parent_team = master_th->th.th_team; 292 ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info; 293 if (lwt) 294 ompt_frame = &(lwt->ompt_task_info.frame); 295 else { 296 int tid = __kmp_tid_from_gtid(gtid); 297 ompt_frame = &( 298 parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame); 299 } 300 ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1); 301 OMPT_STORE_RETURN_ADDRESS(gtid); 302 } 303 #endif 304 305 #if INCLUDE_SSC_MARKS 306 SSC_MARK_FORKING(); 307 #endif 308 __kmp_fork_call(loc, gtid, fork_context_intel, argc, 309 VOLATILE_CAST(microtask_t) microtask, // "wrapped" task 310 VOLATILE_CAST(launch_t) __kmp_invoke_task_func, 311 /* TODO: revert workaround for Intel(R) 64 tracker #96 */ 312 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX 313 &ap 314 #else 315 ap 316 #endif 317 ); 318 #if INCLUDE_SSC_MARKS 319 SSC_MARK_JOINING(); 320 #endif 321 __kmp_join_call(loc, gtid 322 #if OMPT_SUPPORT 323 , 324 fork_context_intel 325 #endif 326 ); 327 328 va_end(ap); 329 } 330 331 #if 
KMP_STATS_ENABLED 332 if (previous_state == stats_state_e::SERIAL_REGION) { 333 KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial); 334 } else { 335 KMP_POP_PARTITIONED_TIMER(); 336 } 337 #endif // KMP_STATS_ENABLED 338 } 339 340 #if OMP_40_ENABLED 341 /*! 342 @ingroup PARALLEL 343 @param loc source location information 344 @param global_tid global thread number 345 @param num_teams number of teams requested for the teams construct 346 @param num_threads number of threads per team requested for the teams construct 347 348 Set the number of teams to be used by the teams construct. 349 This call is only required if the teams construct has a `num_teams` clause 350 or a `thread_limit` clause (or both). 351 */ 352 void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid, 353 kmp_int32 num_teams, kmp_int32 num_threads) { 354 KA_TRACE(20, 355 ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n", 356 global_tid, num_teams, num_threads)); 357 358 __kmp_push_num_teams(loc, global_tid, num_teams, num_threads); 359 } 360 361 /*! 362 @ingroup PARALLEL 363 @param loc source location information 364 @param argc total number of arguments in the ellipsis 365 @param microtask pointer to callback routine consisting of outlined teams 366 construct 367 @param ... pointers to shared variables that aren't global 368 369 Do the actual fork and call the microtask in the relevant number of threads. 370 */ 371 void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, 372 ...) { 373 int gtid = __kmp_entry_gtid(); 374 kmp_info_t *this_thr = __kmp_threads[gtid]; 375 va_list ap; 376 va_start(ap, microtask); 377 378 KMP_COUNT_BLOCK(OMP_TEAMS); 379 380 // remember teams entry point and nesting level 381 this_thr->th.th_teams_microtask = microtask; 382 this_thr->th.th_teams_level = 383 this_thr->th.th_team->t.t_level; // AC: can be >0 on host 384 385 #if OMPT_SUPPORT 386 kmp_team_t *parent_team = this_thr->th.th_team; 387 int tid = __kmp_tid_from_gtid(gtid); 388 if (ompt_enabled.enabled) { 389 parent_team->t.t_implicit_task_taskdata[tid] 390 .ompt_task_info.frame.enter_frame = OMPT_GET_FRAME_ADDRESS(1); 391 } 392 OMPT_STORE_RETURN_ADDRESS(gtid); 393 #endif 394 395 // check if __kmpc_push_num_teams called, set default number of teams 396 // otherwise 397 if (this_thr->th.th_teams_size.nteams == 0) { 398 __kmp_push_num_teams(loc, gtid, 0, 0); 399 } 400 KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1); 401 KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1); 402 KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1); 403 404 __kmp_fork_call(loc, gtid, fork_context_intel, argc, 405 VOLATILE_CAST(microtask_t) 406 __kmp_teams_master, // "wrapped" task 407 VOLATILE_CAST(launch_t) __kmp_invoke_teams_master, 408 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX 409 &ap 410 #else 411 ap 412 #endif 413 ); 414 __kmp_join_call(loc, gtid 415 #if OMPT_SUPPORT 416 , 417 fork_context_intel 418 #endif 419 ); 420 421 this_thr->th.th_teams_microtask = NULL; 422 this_thr->th.th_teams_level = 0; 423 *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L; 424 va_end(ap); 425 } 426 #endif /* OMP_40_ENABLED */ 427 428 // I don't think this function should ever have been exported. 429 // The __kmpc_ prefix was misapplied. I'm fairly certain that no generated 430 // openmp code ever called it, but it's been exported from the RTL for so 431 // long that I'm afraid to remove the definition. 432 int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); } 433 434 /*! 
435 @ingroup PARALLEL 436 @param loc source location information 437 @param global_tid global thread number 438 439 Enter a serialized parallel construct. This interface is used to handle a 440 conditional parallel region, like this, 441 @code 442 #pragma omp parallel if (condition) 443 @endcode 444 when the condition is false. 445 */ 446 void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { 447 // The implementation is now in kmp_runtime.cpp so that it can share static 448 // functions with kmp_fork_call since the tasks to be done are similar in 449 // each case. 450 #if OMPT_SUPPORT 451 OMPT_STORE_RETURN_ADDRESS(global_tid); 452 #endif 453 __kmp_serialized_parallel(loc, global_tid); 454 } 455 456 /*! 457 @ingroup PARALLEL 458 @param loc source location information 459 @param global_tid global thread number 460 461 Leave a serialized parallel construct. 462 */ 463 void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { 464 kmp_internal_control_t *top; 465 kmp_info_t *this_thr; 466 kmp_team_t *serial_team; 467 468 KC_TRACE(10, 469 ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid)); 470 471 /* skip all this code for autopar serialized loops since it results in 472 unacceptable overhead */ 473 if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR)) 474 return; 475 476 // Not autopar code 477 if (!TCR_4(__kmp_init_parallel)) 478 __kmp_parallel_initialize(); 479 480 this_thr = __kmp_threads[global_tid]; 481 serial_team = this_thr->th.th_serial_team; 482 483 #if OMP_45_ENABLED 484 kmp_task_team_t *task_team = this_thr->th.th_task_team; 485 486 // we need to wait for the proxy tasks before finishing the thread 487 if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) 488 __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL)); 489 #endif 490 491 KMP_MB(); 492 KMP_DEBUG_ASSERT(serial_team); 493 KMP_ASSERT(serial_team->t.t_serialized); 494 KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team); 495 KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team); 496 KMP_DEBUG_ASSERT(serial_team->t.t_threads); 497 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr); 498 499 #if OMPT_SUPPORT 500 if (ompt_enabled.enabled && 501 this_thr->th.ompt_thread_info.state != omp_state_overhead) { 502 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = NULL; 503 if (ompt_enabled.ompt_callback_implicit_task) { 504 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 505 ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1, 506 OMPT_CUR_TASK_INFO(this_thr)->thread_num); 507 } 508 509 // reset clear the task id only after unlinking the task 510 ompt_data_t *parent_task_data; 511 __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL); 512 513 if (ompt_enabled.ompt_callback_parallel_end) { 514 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 515 &(serial_team->t.ompt_team_info.parallel_data), parent_task_data, 516 ompt_parallel_invoker_program, OMPT_LOAD_RETURN_ADDRESS(global_tid)); 517 } 518 __ompt_lw_taskteam_unlink(this_thr); 519 this_thr->th.ompt_thread_info.state = omp_state_overhead; 520 } 521 #endif 522 523 /* If necessary, pop the internal control stack values and replace the team 524 * values */ 525 top = serial_team->t.t_control_stack_top; 526 if (top && top->serial_nesting_level == serial_team->t.t_serialized) { 527 copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top); 528 serial_team->t.t_control_stack_top = top->next; 529 __kmp_free(top); 530 } 531 532 // if( serial_team -> 
t.t_serialized > 1 ) 533 serial_team->t.t_level--; 534 535 /* pop dispatch buffers stack */ 536 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer); 537 { 538 dispatch_private_info_t *disp_buffer = 539 serial_team->t.t_dispatch->th_disp_buffer; 540 serial_team->t.t_dispatch->th_disp_buffer = 541 serial_team->t.t_dispatch->th_disp_buffer->next; 542 __kmp_free(disp_buffer); 543 } 544 #if OMP_50_ENABLED 545 this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore 546 #endif 547 548 --serial_team->t.t_serialized; 549 if (serial_team->t.t_serialized == 0) { 550 551 /* return to the parallel section */ 552 553 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 554 if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) { 555 __kmp_clear_x87_fpu_status_word(); 556 __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word); 557 __kmp_load_mxcsr(&serial_team->t.t_mxcsr); 558 } 559 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 560 561 this_thr->th.th_team = serial_team->t.t_parent; 562 this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid; 563 564 /* restore values cached in the thread */ 565 this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /* JPH */ 566 this_thr->th.th_team_master = 567 serial_team->t.t_parent->t.t_threads[0]; /* JPH */ 568 this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized; 569 570 /* TODO the below shouldn't need to be adjusted for serialized teams */ 571 this_thr->th.th_dispatch = 572 &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid]; 573 574 __kmp_pop_current_task_from_thread(this_thr); 575 576 KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0); 577 this_thr->th.th_current_task->td_flags.executing = 1; 578 579 if (__kmp_tasking_mode != tskm_immediate_exec) { 580 // Copy the task team from the new child / old parent team to the thread. 581 this_thr->th.th_task_team = 582 this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]; 583 KA_TRACE(20, 584 ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / " 585 "team %p\n", 586 global_tid, this_thr->th.th_task_team, this_thr->th.th_team)); 587 } 588 } else { 589 if (__kmp_tasking_mode != tskm_immediate_exec) { 590 KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting " 591 "depth of serial team %p to %d\n", 592 global_tid, serial_team, serial_team->t.t_serialized)); 593 } 594 } 595 596 if (__kmp_env_consistency_check) 597 __kmp_pop_parallel(global_tid, NULL); 598 #if OMPT_SUPPORT 599 if (ompt_enabled.enabled) 600 this_thr->th.ompt_thread_info.state = 601 ((this_thr->th.th_team_serialized) ? omp_state_work_serial 602 : omp_state_work_parallel); 603 #endif 604 } 605 606 /*! 607 @ingroup SYNCHRONIZATION 608 @param loc source location information. 609 610 Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though 611 depending on the memory ordering convention obeyed by the compiler 612 even that may not be necessary). 613 */ 614 void __kmpc_flush(ident_t *loc) { 615 KC_TRACE(10, ("__kmpc_flush: called\n")); 616 617 /* need explicit __mf() here since use volatile instead in library */ 618 KMP_MB(); /* Flush all pending memory write invalidates. */ 619 620 #if (KMP_ARCH_X86 || KMP_ARCH_X86_64) 621 #if KMP_MIC 622 // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used. 
// We shouldn't need it, though, since the ABI rules require that
// * If the compiler generates NGO stores it also generates the fence
// * If users hand-code NGO stores they should insert the fence
// therefore no incomplete unordered stores should be visible.
#else
// C74404
// This is to address non-temporal store instructions (sfence needed).
// The clflush instruction is addressed as well (mfence needed).
// Probably the non-temporal load movntdqa instruction should also be
// addressed.
// mfence is a SSE2 instruction. Do not execute it if CPU is not SSE2.
if (!__kmp_cpuinfo.initialized) {
  __kmp_query_cpuid(&__kmp_cpuinfo);
}
if (!__kmp_cpuinfo.sse2) {
  // CPU cannot execute SSE2 instructions.
} else {
#if KMP_COMPILER_ICC
  _mm_mfence();
#elif KMP_COMPILER_MSVC
  MemoryBarrier();
#else
  __sync_synchronize();
#endif // KMP_COMPILER_ICC
}
#endif // KMP_MIC
#elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64)
// Nothing to see here move along
#elif KMP_ARCH_PPC64
// Nothing needed here (we have a real MB above).
#if KMP_OS_CNK
  // The flushing thread needs to yield here; this prevents a
  // busy-waiting thread from saturating the pipeline. flush is
  // often used in loops like this:
  //   while (!flag) {
  //     #pragma omp flush(flag)
  //   }
  // and adding the yield here is good for at least a 10x speedup
  // when running >2 threads per core (on the NAS LU benchmark).
  __kmp_yield(TRUE);
#endif
#else
#error Unknown or unsupported architecture
#endif

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_flush) {
    ompt_callbacks.ompt_callback(ompt_callback_flush)(
        __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
  }
#endif
}

/* -------------------------------------------------------------------------- */
/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid thread id.

Execute a barrier.
*/
void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
  KMP_COUNT_BLOCK(OMP_BARRIER);
  KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  if (__kmp_env_consistency_check) {
    if (loc == 0) {
      KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
    }

    __kmp_check_barrier(global_tid, ct_barrier, loc);
  }

#if OMPT_SUPPORT
  omp_frame_t *ompt_frame;
  if (ompt_enabled.enabled) {
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    if (ompt_frame->enter_frame == NULL)
      ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
    OMPT_STORE_RETURN_ADDRESS(global_tid);
  }
#endif
  __kmp_threads[global_tid]->th.th_ident = loc;
  // TODO: explicit barrier_wait_id:
  // this function is called when 'barrier' directive is present or
  // implicit barrier at the end of a worksharing construct.
  // 1) better to add a per-thread barrier counter to a thread data structure
  // 2) set to 0 when a new team is created
  // 3) no sync is required

  __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    ompt_frame->enter_frame = NULL;
  }
#endif
}

/* The BARRIER for a MASTER section is always explicit */
/*!
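As an illustrative sketch only (actual compiler output differs in detail, and
body() stands for the guarded user code), a region such as
@code
#pragma omp master
{ body(); }
@endcode
might be lowered so that the return value of the routine documented below
guards the block:
@code
if (__kmpc_master(&loc, gtid)) {
  body();
  __kmpc_end_master(&loc, gtid);
}
@endcode
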
726 @ingroup WORK_SHARING 727 @param loc source location information. 728 @param global_tid global thread number . 729 @return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise. 730 */ 731 kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) { 732 int status = 0; 733 734 KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid)); 735 736 if (!TCR_4(__kmp_init_parallel)) 737 __kmp_parallel_initialize(); 738 739 if (KMP_MASTER_GTID(global_tid)) { 740 KMP_COUNT_BLOCK(OMP_MASTER); 741 KMP_PUSH_PARTITIONED_TIMER(OMP_master); 742 status = 1; 743 } 744 745 #if OMPT_SUPPORT && OMPT_OPTIONAL 746 if (status) { 747 if (ompt_enabled.ompt_callback_master) { 748 kmp_info_t *this_thr = __kmp_threads[global_tid]; 749 kmp_team_t *team = this_thr->th.th_team; 750 751 int tid = __kmp_tid_from_gtid(global_tid); 752 ompt_callbacks.ompt_callback(ompt_callback_master)( 753 ompt_scope_begin, &(team->t.ompt_team_info.parallel_data), 754 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 755 OMPT_GET_RETURN_ADDRESS(0)); 756 } 757 } 758 #endif 759 760 if (__kmp_env_consistency_check) { 761 #if KMP_USE_DYNAMIC_LOCK 762 if (status) 763 __kmp_push_sync(global_tid, ct_master, loc, NULL, 0); 764 else 765 __kmp_check_sync(global_tid, ct_master, loc, NULL, 0); 766 #else 767 if (status) 768 __kmp_push_sync(global_tid, ct_master, loc, NULL); 769 else 770 __kmp_check_sync(global_tid, ct_master, loc, NULL); 771 #endif 772 } 773 774 return status; 775 } 776 777 /*! 778 @ingroup WORK_SHARING 779 @param loc source location information. 780 @param global_tid global thread number . 781 782 Mark the end of a <tt>master</tt> region. This should only be called by the 783 thread that executes the <tt>master</tt> region. 784 */ 785 void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) { 786 KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid)); 787 788 KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid)); 789 KMP_POP_PARTITIONED_TIMER(); 790 791 #if OMPT_SUPPORT && OMPT_OPTIONAL 792 kmp_info_t *this_thr = __kmp_threads[global_tid]; 793 kmp_team_t *team = this_thr->th.th_team; 794 if (ompt_enabled.ompt_callback_master) { 795 int tid = __kmp_tid_from_gtid(global_tid); 796 ompt_callbacks.ompt_callback(ompt_callback_master)( 797 ompt_scope_end, &(team->t.ompt_team_info.parallel_data), 798 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 799 OMPT_GET_RETURN_ADDRESS(0)); 800 } 801 #endif 802 803 if (__kmp_env_consistency_check) { 804 if (global_tid < 0) 805 KMP_WARNING(ThreadIdentInvalid); 806 807 if (KMP_MASTER_GTID(global_tid)) 808 __kmp_pop_sync(global_tid, ct_master, loc); 809 } 810 } 811 812 /*! 813 @ingroup WORK_SHARING 814 @param loc source location information. 815 @param gtid global thread number. 816 817 Start execution of an <tt>ordered</tt> construct. 
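
As an illustrative sketch only (the surrounding loop lowering is omitted and
body() is a stand-in for the user code), an ordered region inside an ordered
loop such as
@code
#pragma omp for ordered
for (i = lb; i < ub; i++) {
  #pragma omp ordered
  { body(i); }
}
@endcode
might bracket the ordered block with calls of roughly this shape:
@code
__kmpc_ordered(&loc, gtid);
body(i);
__kmpc_end_ordered(&loc, gtid);
@endcode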
818 */ 819 void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) { 820 int cid = 0; 821 kmp_info_t *th; 822 KMP_DEBUG_ASSERT(__kmp_init_serial); 823 824 KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid)); 825 826 if (!TCR_4(__kmp_init_parallel)) 827 __kmp_parallel_initialize(); 828 829 #if USE_ITT_BUILD 830 __kmp_itt_ordered_prep(gtid); 831 // TODO: ordered_wait_id 832 #endif /* USE_ITT_BUILD */ 833 834 th = __kmp_threads[gtid]; 835 836 #if OMPT_SUPPORT && OMPT_OPTIONAL 837 kmp_team_t *team; 838 omp_wait_id_t lck; 839 void *codeptr_ra; 840 if (ompt_enabled.enabled) { 841 OMPT_STORE_RETURN_ADDRESS(gtid); 842 team = __kmp_team_from_gtid(gtid); 843 lck = (omp_wait_id_t)&team->t.t_ordered.dt.t_value; 844 /* OMPT state update */ 845 th->th.ompt_thread_info.wait_id = lck; 846 th->th.ompt_thread_info.state = omp_state_wait_ordered; 847 848 /* OMPT event callback */ 849 codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid); 850 if (ompt_enabled.ompt_callback_mutex_acquire) { 851 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 852 ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin, 853 (omp_wait_id_t)lck, codeptr_ra); 854 } 855 } 856 #endif 857 858 if (th->th.th_dispatch->th_deo_fcn != 0) 859 (*th->th.th_dispatch->th_deo_fcn)(>id, &cid, loc); 860 else 861 __kmp_parallel_deo(>id, &cid, loc); 862 863 #if OMPT_SUPPORT && OMPT_OPTIONAL 864 if (ompt_enabled.enabled) { 865 /* OMPT state update */ 866 th->th.ompt_thread_info.state = omp_state_work_parallel; 867 th->th.ompt_thread_info.wait_id = 0; 868 869 /* OMPT event callback */ 870 if (ompt_enabled.ompt_callback_mutex_acquired) { 871 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 872 ompt_mutex_ordered, (omp_wait_id_t)lck, codeptr_ra); 873 } 874 } 875 #endif 876 877 #if USE_ITT_BUILD 878 __kmp_itt_ordered_start(gtid); 879 #endif /* USE_ITT_BUILD */ 880 } 881 882 /*! 883 @ingroup WORK_SHARING 884 @param loc source location information. 885 @param gtid global thread number. 886 887 End execution of an <tt>ordered</tt> construct. 888 */ 889 void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) { 890 int cid = 0; 891 kmp_info_t *th; 892 893 KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid)); 894 895 #if USE_ITT_BUILD 896 __kmp_itt_ordered_end(gtid); 897 // TODO: ordered_wait_id 898 #endif /* USE_ITT_BUILD */ 899 900 th = __kmp_threads[gtid]; 901 902 if (th->th.th_dispatch->th_dxo_fcn != 0) 903 (*th->th.th_dispatch->th_dxo_fcn)(>id, &cid, loc); 904 else 905 __kmp_parallel_dxo(>id, &cid, loc); 906 907 #if OMPT_SUPPORT && OMPT_OPTIONAL 908 OMPT_STORE_RETURN_ADDRESS(gtid); 909 if (ompt_enabled.ompt_callback_mutex_released) { 910 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( 911 ompt_mutex_ordered, 912 (omp_wait_id_t)&__kmp_team_from_gtid(gtid)->t.t_ordered.dt.t_value, 913 OMPT_LOAD_RETURN_ADDRESS(gtid)); 914 } 915 #endif 916 } 917 918 #if KMP_USE_DYNAMIC_LOCK 919 920 static __forceinline void 921 __kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc, 922 kmp_int32 gtid, kmp_indirect_locktag_t tag) { 923 // Pointer to the allocated indirect lock is written to crit, while indexing 924 // is ignored. 
925 void *idx; 926 kmp_indirect_lock_t **lck; 927 lck = (kmp_indirect_lock_t **)crit; 928 kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag); 929 KMP_I_LOCK_FUNC(ilk, init)(ilk->lock); 930 KMP_SET_I_LOCK_LOCATION(ilk, loc); 931 KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section); 932 KA_TRACE(20, 933 ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag)); 934 #if USE_ITT_BUILD 935 __kmp_itt_critical_creating(ilk->lock, loc); 936 #endif 937 int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk); 938 if (status == 0) { 939 #if USE_ITT_BUILD 940 __kmp_itt_critical_destroyed(ilk->lock); 941 #endif 942 // We don't really need to destroy the unclaimed lock here since it will be 943 // cleaned up at program exit. 944 // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx); 945 } 946 KMP_DEBUG_ASSERT(*lck != NULL); 947 } 948 949 // Fast-path acquire tas lock 950 #define KMP_ACQUIRE_TAS_LOCK(lock, gtid) \ 951 { \ 952 kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock; \ 953 kmp_int32 tas_free = KMP_LOCK_FREE(tas); \ 954 kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas); \ 955 if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free || \ 956 !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) { \ 957 kmp_uint32 spins; \ 958 KMP_FSYNC_PREPARE(l); \ 959 KMP_INIT_YIELD(spins); \ 960 if (TCR_4(__kmp_nth) > \ 961 (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) { \ 962 KMP_YIELD(TRUE); \ 963 } else { \ 964 KMP_YIELD_SPIN(spins); \ 965 } \ 966 kmp_backoff_t backoff = __kmp_spin_backoff_params; \ 967 while ( \ 968 KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free || \ 969 !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) { \ 970 __kmp_spin_backoff(&backoff); \ 971 if (TCR_4(__kmp_nth) > \ 972 (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) { \ 973 KMP_YIELD(TRUE); \ 974 } else { \ 975 KMP_YIELD_SPIN(spins); \ 976 } \ 977 } \ 978 } \ 979 KMP_FSYNC_ACQUIRED(l); \ 980 } 981 982 // Fast-path test tas lock 983 #define KMP_TEST_TAS_LOCK(lock, gtid, rc) \ 984 { \ 985 kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock; \ 986 kmp_int32 tas_free = KMP_LOCK_FREE(tas); \ 987 kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas); \ 988 rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free && \ 989 __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy); \ 990 } 991 992 // Fast-path release tas lock 993 #define KMP_RELEASE_TAS_LOCK(lock, gtid) \ 994 { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); } 995 996 #if KMP_USE_FUTEX 997 998 #include <sys/syscall.h> 999 #include <unistd.h> 1000 #ifndef FUTEX_WAIT 1001 #define FUTEX_WAIT 0 1002 #endif 1003 #ifndef FUTEX_WAKE 1004 #define FUTEX_WAKE 1 1005 #endif 1006 1007 // Fast-path acquire futex lock 1008 #define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid) \ 1009 { \ 1010 kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \ 1011 kmp_int32 gtid_code = (gtid + 1) << 1; \ 1012 KMP_MB(); \ 1013 KMP_FSYNC_PREPARE(ftx); \ 1014 kmp_int32 poll_val; \ 1015 while ((poll_val = KMP_COMPARE_AND_STORE_RET32( \ 1016 &(ftx->lk.poll), KMP_LOCK_FREE(futex), \ 1017 KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) { \ 1018 kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1; \ 1019 if (!cond) { \ 1020 if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val, \ 1021 poll_val | \ 1022 KMP_LOCK_BUSY(1, futex))) { \ 1023 continue; \ 1024 } \ 1025 poll_val |= KMP_LOCK_BUSY(1, futex); \ 1026 } \ 1027 kmp_int32 rc; \ 1028 if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val, \ 1029 NULL, NULL, 0)) != 0) { \ 1030 
continue; \ 1031 } \ 1032 gtid_code |= 1; \ 1033 } \ 1034 KMP_FSYNC_ACQUIRED(ftx); \ 1035 } 1036 1037 // Fast-path test futex lock 1038 #define KMP_TEST_FUTEX_LOCK(lock, gtid, rc) \ 1039 { \ 1040 kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \ 1041 if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex), \ 1042 KMP_LOCK_BUSY(gtid + 1 << 1, futex))) { \ 1043 KMP_FSYNC_ACQUIRED(ftx); \ 1044 rc = TRUE; \ 1045 } else { \ 1046 rc = FALSE; \ 1047 } \ 1048 } 1049 1050 // Fast-path release futex lock 1051 #define KMP_RELEASE_FUTEX_LOCK(lock, gtid) \ 1052 { \ 1053 kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \ 1054 KMP_MB(); \ 1055 KMP_FSYNC_RELEASING(ftx); \ 1056 kmp_int32 poll_val = \ 1057 KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex)); \ 1058 if (KMP_LOCK_STRIP(poll_val) & 1) { \ 1059 syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE, \ 1060 KMP_LOCK_BUSY(1, futex), NULL, NULL, 0); \ 1061 } \ 1062 KMP_MB(); \ 1063 KMP_YIELD(TCR_4(__kmp_nth) > \ 1064 (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)); \ 1065 } 1066 1067 #endif // KMP_USE_FUTEX 1068 1069 #else // KMP_USE_DYNAMIC_LOCK 1070 1071 static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit, 1072 ident_t const *loc, 1073 kmp_int32 gtid) { 1074 kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit; 1075 1076 // Because of the double-check, the following load doesn't need to be volatile 1077 kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp); 1078 1079 if (lck == NULL) { 1080 void *idx; 1081 1082 // Allocate & initialize the lock. 1083 // Remember alloc'ed locks in table in order to free them in __kmp_cleanup() 1084 lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section); 1085 __kmp_init_user_lock_with_checks(lck); 1086 __kmp_set_user_lock_location(lck, loc); 1087 #if USE_ITT_BUILD 1088 __kmp_itt_critical_creating(lck); 1089 // __kmp_itt_critical_creating() should be called *before* the first usage 1090 // of underlying lock. It is the only place where we can guarantee it. There 1091 // are chances the lock will destroyed with no usage, but it is not a 1092 // problem, because this is not real event seen by user but rather setting 1093 // name for object (lock). See more details in kmp_itt.h. 1094 #endif /* USE_ITT_BUILD */ 1095 1096 // Use a cmpxchg instruction to slam the start of the critical section with 1097 // the lock pointer. If another thread beat us to it, deallocate the lock, 1098 // and use the lock that the other thread allocated. 1099 int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck); 1100 1101 if (status == 0) { 1102 // Deallocate the lock and reload the value. 1103 #if USE_ITT_BUILD 1104 __kmp_itt_critical_destroyed(lck); 1105 // Let ITT know the lock is destroyed and the same memory location may be reused 1106 // for another purpose. 1107 #endif /* USE_ITT_BUILD */ 1108 __kmp_destroy_user_lock_with_checks(lck); 1109 __kmp_user_lock_free(&idx, gtid, lck); 1110 lck = (kmp_user_lock_p)TCR_PTR(*lck_pp); 1111 KMP_DEBUG_ASSERT(lck != NULL); 1112 } 1113 } 1114 return lck; 1115 } 1116 1117 #endif // KMP_USE_DYNAMIC_LOCK 1118 1119 /*! 1120 @ingroup WORK_SHARING 1121 @param loc source location information. 1122 @param global_tid global thread number . 1123 @param crit identity of the critical section. This could be a pointer to a lock 1124 associated with the critical section, or some other suitably unique value. 1125 1126 Enter code protected by a `critical` construct. 1127 This function blocks until the executing thread can enter the critical section. 
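
As an illustrative sketch only (the exact lowering is compiler dependent;
`crit_name` is a hypothetical zero-initialized kmp_critical_name shared by all
uses of the same critical name), a named critical region such as
@code
#pragma omp critical(name)
{ body(); }
@endcode
might be bracketed with calls of roughly this shape:
@code
static kmp_critical_name crit_name = {0};
__kmpc_critical(&loc, gtid, &crit_name);
body();
__kmpc_end_critical(&loc, gtid, &crit_name);
@endcode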
1128 */ 1129 void __kmpc_critical(ident_t *loc, kmp_int32 global_tid, 1130 kmp_critical_name *crit) { 1131 #if KMP_USE_DYNAMIC_LOCK 1132 #if OMPT_SUPPORT && OMPT_OPTIONAL 1133 OMPT_STORE_RETURN_ADDRESS(global_tid); 1134 #endif // OMPT_SUPPORT 1135 __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none); 1136 #else 1137 KMP_COUNT_BLOCK(OMP_CRITICAL); 1138 #if OMPT_SUPPORT && OMPT_OPTIONAL 1139 omp_state_t prev_state = omp_state_undefined; 1140 ompt_thread_info_t ti; 1141 #endif 1142 kmp_user_lock_p lck; 1143 1144 KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid)); 1145 1146 // TODO: add THR_OVHD_STATE 1147 1148 KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait); 1149 KMP_CHECK_USER_LOCK_INIT(); 1150 1151 if ((__kmp_user_lock_kind == lk_tas) && 1152 (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) { 1153 lck = (kmp_user_lock_p)crit; 1154 } 1155 #if KMP_USE_FUTEX 1156 else if ((__kmp_user_lock_kind == lk_futex) && 1157 (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) { 1158 lck = (kmp_user_lock_p)crit; 1159 } 1160 #endif 1161 else { // ticket, queuing or drdpa 1162 lck = __kmp_get_critical_section_ptr(crit, loc, global_tid); 1163 } 1164 1165 if (__kmp_env_consistency_check) 1166 __kmp_push_sync(global_tid, ct_critical, loc, lck); 1167 1168 // since the critical directive binds to all threads, not just the current 1169 // team we have to check this even if we are in a serialized team. 1170 // also, even if we are the uber thread, we still have to conduct the lock, 1171 // as we have to contend with sibling threads. 1172 1173 #if USE_ITT_BUILD 1174 __kmp_itt_critical_acquiring(lck); 1175 #endif /* USE_ITT_BUILD */ 1176 #if OMPT_SUPPORT && OMPT_OPTIONAL 1177 OMPT_STORE_RETURN_ADDRESS(gtid); 1178 void *codeptr_ra = NULL; 1179 if (ompt_enabled.enabled) { 1180 ti = __kmp_threads[global_tid]->th.ompt_thread_info; 1181 /* OMPT state update */ 1182 prev_state = ti.state; 1183 ti.wait_id = (omp_wait_id_t)lck; 1184 ti.state = omp_state_wait_critical; 1185 1186 /* OMPT event callback */ 1187 codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid); 1188 if (ompt_enabled.ompt_callback_mutex_acquire) { 1189 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 1190 ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(), 1191 (omp_wait_id_t)crit, codeptr_ra); 1192 } 1193 } 1194 #endif 1195 // Value of 'crit' should be good for using as a critical_id of the critical 1196 // section directive. 
1197 __kmp_acquire_user_lock_with_checks(lck, global_tid); 1198 1199 #if USE_ITT_BUILD 1200 __kmp_itt_critical_acquired(lck); 1201 #endif /* USE_ITT_BUILD */ 1202 #if OMPT_SUPPORT && OMPT_OPTIONAL 1203 if (ompt_enabled.enabled) { 1204 /* OMPT state update */ 1205 ti.state = prev_state; 1206 ti.wait_id = 0; 1207 1208 /* OMPT event callback */ 1209 if (ompt_enabled.ompt_callback_mutex_acquired) { 1210 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 1211 ompt_mutex_critical, (omp_wait_id_t)crit, codeptr_ra); 1212 } 1213 } 1214 #endif 1215 KMP_POP_PARTITIONED_TIMER(); 1216 1217 KMP_PUSH_PARTITIONED_TIMER(OMP_critical); 1218 KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid)); 1219 #endif // KMP_USE_DYNAMIC_LOCK 1220 } 1221 1222 #if KMP_USE_DYNAMIC_LOCK 1223 1224 // Converts the given hint to an internal lock implementation 1225 static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) { 1226 #if KMP_USE_TSX 1227 #define KMP_TSX_LOCK(seq) lockseq_##seq 1228 #else 1229 #define KMP_TSX_LOCK(seq) __kmp_user_lock_seq 1230 #endif 1231 1232 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 1233 #define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm) 1234 #else 1235 #define KMP_CPUINFO_RTM 0 1236 #endif 1237 1238 // Hints that do not require further logic 1239 if (hint & kmp_lock_hint_hle) 1240 return KMP_TSX_LOCK(hle); 1241 if (hint & kmp_lock_hint_rtm) 1242 return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm) : __kmp_user_lock_seq; 1243 if (hint & kmp_lock_hint_adaptive) 1244 return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq; 1245 1246 // Rule out conflicting hints first by returning the default lock 1247 if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended)) 1248 return __kmp_user_lock_seq; 1249 if ((hint & omp_lock_hint_speculative) && 1250 (hint & omp_lock_hint_nonspeculative)) 1251 return __kmp_user_lock_seq; 1252 1253 // Do not even consider speculation when it appears to be contended 1254 if (hint & omp_lock_hint_contended) 1255 return lockseq_queuing; 1256 1257 // Uncontended lock without speculation 1258 if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative)) 1259 return lockseq_tas; 1260 1261 // HLE lock for speculation 1262 if (hint & omp_lock_hint_speculative) 1263 return KMP_TSX_LOCK(hle); 1264 1265 return __kmp_user_lock_seq; 1266 } 1267 1268 #if OMPT_SUPPORT && OMPT_OPTIONAL 1269 #if KMP_USE_DYNAMIC_LOCK 1270 static kmp_mutex_impl_t 1271 __ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) { 1272 if (user_lock) { 1273 switch (KMP_EXTRACT_D_TAG(user_lock)) { 1274 case 0: 1275 break; 1276 #if KMP_USE_FUTEX 1277 case locktag_futex: 1278 return kmp_mutex_impl_queuing; 1279 #endif 1280 case locktag_tas: 1281 return kmp_mutex_impl_spin; 1282 #if KMP_USE_TSX 1283 case locktag_hle: 1284 return kmp_mutex_impl_speculative; 1285 #endif 1286 default: 1287 return ompt_mutex_impl_unknown; 1288 } 1289 ilock = KMP_LOOKUP_I_LOCK(user_lock); 1290 } 1291 KMP_ASSERT(ilock); 1292 switch (ilock->type) { 1293 #if KMP_USE_TSX 1294 case locktag_adaptive: 1295 case locktag_rtm: 1296 return kmp_mutex_impl_speculative; 1297 #endif 1298 case locktag_nested_tas: 1299 return kmp_mutex_impl_spin; 1300 #if KMP_USE_FUTEX 1301 case locktag_nested_futex: 1302 #endif 1303 case locktag_ticket: 1304 case locktag_queuing: 1305 case locktag_drdpa: 1306 case locktag_nested_ticket: 1307 case locktag_nested_queuing: 1308 case locktag_nested_drdpa: 1309 return kmp_mutex_impl_queuing; 1310 default: 1311 return ompt_mutex_impl_unknown; 1312 } 
1313 } 1314 #else 1315 // For locks without dynamic binding 1316 static kmp_mutex_impl_t __ompt_get_mutex_impl_type() { 1317 switch (__kmp_user_lock_kind) { 1318 case lk_tas: 1319 return kmp_mutex_impl_spin; 1320 #if KMP_USE_FUTEX 1321 case lk_futex: 1322 #endif 1323 case lk_ticket: 1324 case lk_queuing: 1325 case lk_drdpa: 1326 return kmp_mutex_impl_queuing; 1327 #if KMP_USE_TSX 1328 case lk_hle: 1329 case lk_rtm: 1330 case lk_adaptive: 1331 return kmp_mutex_impl_speculative; 1332 #endif 1333 default: 1334 return ompt_mutex_impl_unknown; 1335 } 1336 } 1337 #endif // KMP_USE_DYNAMIC_LOCK 1338 #endif // OMPT_SUPPORT && OMPT_OPTIONAL 1339 1340 /*! 1341 @ingroup WORK_SHARING 1342 @param loc source location information. 1343 @param global_tid global thread number. 1344 @param crit identity of the critical section. This could be a pointer to a lock 1345 associated with the critical section, or some other suitably unique value. 1346 @param hint the lock hint. 1347 1348 Enter code protected by a `critical` construct with a hint. The hint value is 1349 used to suggest a lock implementation. This function blocks until the executing 1350 thread can enter the critical section unless the hint suggests use of 1351 speculative execution and the hardware supports it. 1352 */ 1353 void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid, 1354 kmp_critical_name *crit, uint32_t hint) { 1355 KMP_COUNT_BLOCK(OMP_CRITICAL); 1356 kmp_user_lock_p lck; 1357 #if OMPT_SUPPORT && OMPT_OPTIONAL 1358 omp_state_t prev_state = omp_state_undefined; 1359 ompt_thread_info_t ti; 1360 // This is the case, if called from __kmpc_critical: 1361 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid); 1362 if (!codeptr) 1363 codeptr = OMPT_GET_RETURN_ADDRESS(0); 1364 #endif 1365 1366 KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid)); 1367 1368 kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit; 1369 // Check if it is initialized. 1370 KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait); 1371 if (*lk == 0) { 1372 kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint); 1373 if (KMP_IS_D_LOCK(lckseq)) { 1374 KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0, 1375 KMP_GET_D_TAG(lckseq)); 1376 } else { 1377 __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq)); 1378 } 1379 } 1380 // Branch for accessing the actual lock object and set operation. This 1381 // branching is inevitable since this lock initialization does not follow the 1382 // normal dispatch path (lock table is not used). 
1383 if (KMP_EXTRACT_D_TAG(lk) != 0) { 1384 lck = (kmp_user_lock_p)lk; 1385 if (__kmp_env_consistency_check) { 1386 __kmp_push_sync(global_tid, ct_critical, loc, lck, 1387 __kmp_map_hint_to_lock(hint)); 1388 } 1389 #if USE_ITT_BUILD 1390 __kmp_itt_critical_acquiring(lck); 1391 #endif 1392 #if OMPT_SUPPORT && OMPT_OPTIONAL 1393 if (ompt_enabled.enabled) { 1394 ti = __kmp_threads[global_tid]->th.ompt_thread_info; 1395 /* OMPT state update */ 1396 prev_state = ti.state; 1397 ti.wait_id = (omp_wait_id_t)lck; 1398 ti.state = omp_state_wait_critical; 1399 1400 /* OMPT event callback */ 1401 if (ompt_enabled.ompt_callback_mutex_acquire) { 1402 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 1403 ompt_mutex_critical, (unsigned int)hint, 1404 __ompt_get_mutex_impl_type(crit), (omp_wait_id_t)crit, codeptr); 1405 } 1406 } 1407 #endif 1408 #if KMP_USE_INLINED_TAS 1409 if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) { 1410 KMP_ACQUIRE_TAS_LOCK(lck, global_tid); 1411 } else 1412 #elif KMP_USE_INLINED_FUTEX 1413 if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) { 1414 KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid); 1415 } else 1416 #endif 1417 { 1418 KMP_D_LOCK_FUNC(lk, set)(lk, global_tid); 1419 } 1420 } else { 1421 kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk); 1422 lck = ilk->lock; 1423 if (__kmp_env_consistency_check) { 1424 __kmp_push_sync(global_tid, ct_critical, loc, lck, 1425 __kmp_map_hint_to_lock(hint)); 1426 } 1427 #if USE_ITT_BUILD 1428 __kmp_itt_critical_acquiring(lck); 1429 #endif 1430 #if OMPT_SUPPORT && OMPT_OPTIONAL 1431 if (ompt_enabled.enabled) { 1432 ti = __kmp_threads[global_tid]->th.ompt_thread_info; 1433 /* OMPT state update */ 1434 prev_state = ti.state; 1435 ti.wait_id = (omp_wait_id_t)lck; 1436 ti.state = omp_state_wait_critical; 1437 1438 /* OMPT event callback */ 1439 if (ompt_enabled.ompt_callback_mutex_acquire) { 1440 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 1441 ompt_mutex_critical, (unsigned int)hint, 1442 __ompt_get_mutex_impl_type(0, ilk), (omp_wait_id_t)crit, codeptr); 1443 } 1444 } 1445 #endif 1446 KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid); 1447 } 1448 KMP_POP_PARTITIONED_TIMER(); 1449 1450 #if USE_ITT_BUILD 1451 __kmp_itt_critical_acquired(lck); 1452 #endif /* USE_ITT_BUILD */ 1453 #if OMPT_SUPPORT && OMPT_OPTIONAL 1454 if (ompt_enabled.enabled) { 1455 /* OMPT state update */ 1456 ti.state = prev_state; 1457 ti.wait_id = 0; 1458 1459 /* OMPT event callback */ 1460 if (ompt_enabled.ompt_callback_mutex_acquired) { 1461 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 1462 ompt_mutex_critical, (omp_wait_id_t)crit, codeptr); 1463 } 1464 } 1465 #endif 1466 1467 KMP_PUSH_PARTITIONED_TIMER(OMP_critical); 1468 KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid)); 1469 } // __kmpc_critical_with_hint 1470 1471 #endif // KMP_USE_DYNAMIC_LOCK 1472 1473 /*! 1474 @ingroup WORK_SHARING 1475 @param loc source location information. 1476 @param global_tid global thread number . 1477 @param crit identity of the critical section. This could be a pointer to a lock 1478 associated with the critical section, or some other suitably unique value. 1479 1480 Leave a critical section, releasing any lock that was held during its execution. 
1481 */ 1482 void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid, 1483 kmp_critical_name *crit) { 1484 kmp_user_lock_p lck; 1485 1486 KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid)); 1487 1488 #if KMP_USE_DYNAMIC_LOCK 1489 if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) { 1490 lck = (kmp_user_lock_p)crit; 1491 KMP_ASSERT(lck != NULL); 1492 if (__kmp_env_consistency_check) { 1493 __kmp_pop_sync(global_tid, ct_critical, loc); 1494 } 1495 #if USE_ITT_BUILD 1496 __kmp_itt_critical_releasing(lck); 1497 #endif 1498 #if KMP_USE_INLINED_TAS 1499 if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) { 1500 KMP_RELEASE_TAS_LOCK(lck, global_tid); 1501 } else 1502 #elif KMP_USE_INLINED_FUTEX 1503 if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) { 1504 KMP_RELEASE_FUTEX_LOCK(lck, global_tid); 1505 } else 1506 #endif 1507 { 1508 KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid); 1509 } 1510 } else { 1511 kmp_indirect_lock_t *ilk = 1512 (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit)); 1513 KMP_ASSERT(ilk != NULL); 1514 lck = ilk->lock; 1515 if (__kmp_env_consistency_check) { 1516 __kmp_pop_sync(global_tid, ct_critical, loc); 1517 } 1518 #if USE_ITT_BUILD 1519 __kmp_itt_critical_releasing(lck); 1520 #endif 1521 KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid); 1522 } 1523 1524 #else // KMP_USE_DYNAMIC_LOCK 1525 1526 if ((__kmp_user_lock_kind == lk_tas) && 1527 (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) { 1528 lck = (kmp_user_lock_p)crit; 1529 } 1530 #if KMP_USE_FUTEX 1531 else if ((__kmp_user_lock_kind == lk_futex) && 1532 (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) { 1533 lck = (kmp_user_lock_p)crit; 1534 } 1535 #endif 1536 else { // ticket, queuing or drdpa 1537 lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit)); 1538 } 1539 1540 KMP_ASSERT(lck != NULL); 1541 1542 if (__kmp_env_consistency_check) 1543 __kmp_pop_sync(global_tid, ct_critical, loc); 1544 1545 #if USE_ITT_BUILD 1546 __kmp_itt_critical_releasing(lck); 1547 #endif /* USE_ITT_BUILD */ 1548 // Value of 'crit' should be good for using as a critical_id of the critical 1549 // section directive. 1550 __kmp_release_user_lock_with_checks(lck, global_tid); 1551 1552 #endif // KMP_USE_DYNAMIC_LOCK 1553 1554 #if OMPT_SUPPORT && OMPT_OPTIONAL 1555 /* OMPT release event triggers after lock is released; place here to trigger 1556 * for all #if branches */ 1557 OMPT_STORE_RETURN_ADDRESS(global_tid); 1558 if (ompt_enabled.ompt_callback_mutex_released) { 1559 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( 1560 ompt_mutex_critical, (omp_wait_id_t)crit, OMPT_LOAD_RETURN_ADDRESS(0)); 1561 } 1562 #endif 1563 1564 KMP_POP_PARTITIONED_TIMER(); 1565 KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid)); 1566 } 1567 1568 /*! 1569 @ingroup SYNCHRONIZATION 1570 @param loc source location information 1571 @param global_tid thread id. 1572 @return one if the thread should execute the master block, zero otherwise 1573 1574 Start execution of a combined barrier and master. The barrier is executed inside 1575 this function. 
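
A minimal usage sketch, based only on the semantics described here and in
__kmpc_end_barrier_master() (not a claim about any particular compiler's
output; master_body() stands for the guarded code):
@code
if (__kmpc_barrier_master(&loc, gtid)) {
  // executed only by the thread chosen to run the master block
  master_body();
  __kmpc_end_barrier_master(&loc, gtid); // releases the threads still waiting
}
@endcode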
*/
kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
  int status;

  KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  if (__kmp_env_consistency_check)
    __kmp_check_barrier(global_tid, ct_barrier, loc);

#if OMPT_SUPPORT
  omp_frame_t *ompt_frame;
  if (ompt_enabled.enabled) {
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    if (ompt_frame->enter_frame == NULL)
      ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
    OMPT_STORE_RETURN_ADDRESS(global_tid);
  }
#endif
#if USE_ITT_NOTIFY
  __kmp_threads[global_tid]->th.th_ident = loc;
#endif
  status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    ompt_frame->enter_frame = NULL;
  }
#endif

  return (status != 0) ? 0 : 1;
}

/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid thread id.

Complete the execution of a combined barrier and master. This function should
only be called at the completion of the <tt>master</tt> code. Other threads will
still be waiting at the barrier and this call releases them.
*/
void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
  KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));

  __kmp_end_split_barrier(bs_plain_barrier, global_tid);
}

/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid thread id.
@return one if the thread should execute the master block, zero otherwise

Start execution of a combined barrier and master(nowait) construct.
The barrier is executed inside this function.
There is no equivalent "end" function, since the other threads are not left
waiting for the master block to complete, so there is nothing to release.
*/
kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
  kmp_int32 ret;

  KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  if (__kmp_env_consistency_check) {
    if (loc == 0) {
      KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
1646 } 1647 __kmp_check_barrier(global_tid, ct_barrier, loc); 1648 } 1649 1650 #if OMPT_SUPPORT 1651 omp_frame_t *ompt_frame; 1652 if (ompt_enabled.enabled) { 1653 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); 1654 if (ompt_frame->enter_frame == NULL) 1655 ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1); 1656 OMPT_STORE_RETURN_ADDRESS(global_tid); 1657 } 1658 #endif 1659 #if USE_ITT_NOTIFY 1660 __kmp_threads[global_tid]->th.th_ident = loc; 1661 #endif 1662 __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL); 1663 #if OMPT_SUPPORT && OMPT_OPTIONAL 1664 if (ompt_enabled.enabled) { 1665 ompt_frame->enter_frame = NULL; 1666 } 1667 #endif 1668 1669 ret = __kmpc_master(loc, global_tid); 1670 1671 if (__kmp_env_consistency_check) { 1672 /* there's no __kmpc_end_master called; so the (stats) */ 1673 /* actions of __kmpc_end_master are done here */ 1674 1675 if (global_tid < 0) { 1676 KMP_WARNING(ThreadIdentInvalid); 1677 } 1678 if (ret) { 1679 /* only one thread should do the pop since only */ 1680 /* one did the push (see __kmpc_master()) */ 1681 1682 __kmp_pop_sync(global_tid, ct_master, loc); 1683 } 1684 } 1685 1686 return (ret); 1687 } 1688 1689 /* The BARRIER for a SINGLE process section is always explicit */ 1690 /*! 1691 @ingroup WORK_SHARING 1692 @param loc source location information 1693 @param global_tid global thread number 1694 @return One if this thread should execute the single construct, zero otherwise. 1695 1696 Test whether to execute a <tt>single</tt> construct. 1697 There are no implicit barriers in the two "single" calls, rather the compiler 1698 should introduce an explicit barrier if it is required. 1699 */ 1700 1701 kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) { 1702 kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE); 1703 1704 if (rc) { 1705 // We are going to execute the single statement, so we should count it. 1706 KMP_COUNT_BLOCK(OMP_SINGLE); 1707 KMP_PUSH_PARTITIONED_TIMER(OMP_single); 1708 } 1709 1710 #if OMPT_SUPPORT && OMPT_OPTIONAL 1711 kmp_info_t *this_thr = __kmp_threads[global_tid]; 1712 kmp_team_t *team = this_thr->th.th_team; 1713 int tid = __kmp_tid_from_gtid(global_tid); 1714 1715 if (ompt_enabled.enabled) { 1716 if (rc) { 1717 if (ompt_enabled.ompt_callback_work) { 1718 ompt_callbacks.ompt_callback(ompt_callback_work)( 1719 ompt_work_single_executor, ompt_scope_begin, 1720 &(team->t.ompt_team_info.parallel_data), 1721 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1722 1, OMPT_GET_RETURN_ADDRESS(0)); 1723 } 1724 } else { 1725 if (ompt_enabled.ompt_callback_work) { 1726 ompt_callbacks.ompt_callback(ompt_callback_work)( 1727 ompt_work_single_other, ompt_scope_begin, 1728 &(team->t.ompt_team_info.parallel_data), 1729 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1730 1, OMPT_GET_RETURN_ADDRESS(0)); 1731 ompt_callbacks.ompt_callback(ompt_callback_work)( 1732 ompt_work_single_other, ompt_scope_end, 1733 &(team->t.ompt_team_info.parallel_data), 1734 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1735 1, OMPT_GET_RETURN_ADDRESS(0)); 1736 } 1737 } 1738 } 1739 #endif 1740 1741 return rc; 1742 } 1743 1744 /*! 1745 @ingroup WORK_SHARING 1746 @param loc source location information 1747 @param global_tid global thread number 1748 1749 Mark the end of a <tt>single</tt> construct. This function should 1750 only be called by the thread that executed the block of code protected 1751 by the `single` construct. 
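
As an illustrative sketch only (body() stands for the guarded user code), a
@code
#pragma omp single
{ body(); }
@endcode
construct might be lowered along these lines; since neither __kmpc_single()
nor __kmpc_end_single() contains an implicit barrier, the compiler introduces
an explicit one when the construct has no `nowait` clause:
@code
if (__kmpc_single(&loc, gtid)) {
  body();
  __kmpc_end_single(&loc, gtid);
}
__kmpc_barrier(&loc, gtid); // omitted when `nowait` is specified
@endcode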
1752 */ 1753 void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) { 1754 __kmp_exit_single(global_tid); 1755 KMP_POP_PARTITIONED_TIMER(); 1756 1757 #if OMPT_SUPPORT && OMPT_OPTIONAL 1758 kmp_info_t *this_thr = __kmp_threads[global_tid]; 1759 kmp_team_t *team = this_thr->th.th_team; 1760 int tid = __kmp_tid_from_gtid(global_tid); 1761 1762 if (ompt_enabled.ompt_callback_work) { 1763 ompt_callbacks.ompt_callback(ompt_callback_work)( 1764 ompt_work_single_executor, ompt_scope_end, 1765 &(team->t.ompt_team_info.parallel_data), 1766 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1, 1767 OMPT_GET_RETURN_ADDRESS(0)); 1768 } 1769 #endif 1770 } 1771 1772 /*! 1773 @ingroup WORK_SHARING 1774 @param loc Source location 1775 @param global_tid Global thread id 1776 1777 Mark the end of a statically scheduled loop. 1778 */ 1779 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) { 1780 KMP_POP_PARTITIONED_TIMER(); 1781 KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid)); 1782 1783 #if OMPT_SUPPORT && OMPT_OPTIONAL 1784 if (ompt_enabled.ompt_callback_work) { 1785 ompt_work_t ompt_work_type = ompt_work_loop; 1786 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); 1787 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 1788 // Determine workshare type 1789 if (loc != NULL) { 1790 if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) { 1791 ompt_work_type = ompt_work_loop; 1792 } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) { 1793 ompt_work_type = ompt_work_sections; 1794 } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) { 1795 ompt_work_type = ompt_work_distribute; 1796 } else { 1797 // use default set above. 1798 // a warning about this case is provided in __kmpc_for_static_init 1799 } 1800 KMP_DEBUG_ASSERT(ompt_work_type); 1801 } 1802 ompt_callbacks.ompt_callback(ompt_callback_work)( 1803 ompt_work_type, ompt_scope_end, &(team_info->parallel_data), 1804 &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0)); 1805 } 1806 #endif 1807 if (__kmp_env_consistency_check) 1808 __kmp_pop_workshare(global_tid, ct_pdo, loc); 1809 } 1810 1811 // User routines which take C-style arguments (call by value) 1812 // different from the Fortran equivalent routines 1813 1814 void ompc_set_num_threads(int arg) { 1815 // !!!!! TODO: check the per-task binding 1816 __kmp_set_num_threads(arg, __kmp_entry_gtid()); 1817 } 1818 1819 void ompc_set_dynamic(int flag) { 1820 kmp_info_t *thread; 1821 1822 /* For the thread-private implementation of the internal controls */ 1823 thread = __kmp_entry_thread(); 1824 1825 __kmp_save_internal_controls(thread); 1826 1827 set__dynamic(thread, flag ? TRUE : FALSE); 1828 } 1829 1830 void ompc_set_nested(int flag) { 1831 kmp_info_t *thread; 1832 1833 /* For the thread-private internal controls implementation */ 1834 thread = __kmp_entry_thread(); 1835 1836 __kmp_save_internal_controls(thread); 1837 1838 set__nested(thread, flag ? TRUE : FALSE); 1839 } 1840 1841 void ompc_set_max_active_levels(int max_active_levels) { 1842 /* TO DO */ 1843 /* we want per-task implementation of this internal control */ 1844 1845 /* For the per-thread internal controls implementation */ 1846 __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels); 1847 } 1848 1849 void ompc_set_schedule(omp_sched_t kind, int modifier) { 1850 // !!!!! 
TODO: check the per-task binding 1851 __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier); 1852 } 1853 1854 int ompc_get_ancestor_thread_num(int level) { 1855 return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level); 1856 } 1857 1858 int ompc_get_team_size(int level) { 1859 return __kmp_get_team_size(__kmp_entry_gtid(), level); 1860 } 1861 1862 void kmpc_set_stacksize(int arg) { 1863 // __kmp_aux_set_stacksize initializes the library if needed 1864 __kmp_aux_set_stacksize(arg); 1865 } 1866 1867 void kmpc_set_stacksize_s(size_t arg) { 1868 // __kmp_aux_set_stacksize initializes the library if needed 1869 __kmp_aux_set_stacksize(arg); 1870 } 1871 1872 void kmpc_set_blocktime(int arg) { 1873 int gtid, tid; 1874 kmp_info_t *thread; 1875 1876 gtid = __kmp_entry_gtid(); 1877 tid = __kmp_tid_from_gtid(gtid); 1878 thread = __kmp_thread_from_gtid(gtid); 1879 1880 __kmp_aux_set_blocktime(arg, thread, tid); 1881 } 1882 1883 void kmpc_set_library(int arg) { 1884 // __kmp_user_set_library initializes the library if needed 1885 __kmp_user_set_library((enum library_type)arg); 1886 } 1887 1888 void kmpc_set_defaults(char const *str) { 1889 // __kmp_aux_set_defaults initializes the library if needed 1890 __kmp_aux_set_defaults(str, KMP_STRLEN(str)); 1891 } 1892 1893 void kmpc_set_disp_num_buffers(int arg) { 1894 // ignore after initialization because some teams have already 1895 // allocated dispatch buffers 1896 if (__kmp_init_serial == 0 && arg > 0) 1897 __kmp_dispatch_num_buffers = arg; 1898 } 1899 1900 int kmpc_set_affinity_mask_proc(int proc, void **mask) { 1901 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED 1902 return -1; 1903 #else 1904 if (!TCR_4(__kmp_init_middle)) { 1905 __kmp_middle_initialize(); 1906 } 1907 return __kmp_aux_set_affinity_mask_proc(proc, mask); 1908 #endif 1909 } 1910 1911 int kmpc_unset_affinity_mask_proc(int proc, void **mask) { 1912 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED 1913 return -1; 1914 #else 1915 if (!TCR_4(__kmp_init_middle)) { 1916 __kmp_middle_initialize(); 1917 } 1918 return __kmp_aux_unset_affinity_mask_proc(proc, mask); 1919 #endif 1920 } 1921 1922 int kmpc_get_affinity_mask_proc(int proc, void **mask) { 1923 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED 1924 return -1; 1925 #else 1926 if (!TCR_4(__kmp_init_middle)) { 1927 __kmp_middle_initialize(); 1928 } 1929 return __kmp_aux_get_affinity_mask_proc(proc, mask); 1930 #endif 1931 } 1932 1933 /* -------------------------------------------------------------------------- */ 1934 /*! 1935 @ingroup THREADPRIVATE 1936 @param loc source location information 1937 @param gtid global thread number 1938 @param cpy_size size of the cpy_data buffer 1939 @param cpy_data pointer to data to be copied 1940 @param cpy_func helper function to call for copying data 1941 @param didit flag variable: 1=single thread; 0=not single thread 1942 1943 __kmpc_copyprivate implements the interface for the private data broadcast 1944 needed for the copyprivate clause associated with a single region in an 1945 OpenMP<sup>*</sup> program (both C and Fortran). 1946 All threads participating in the parallel region call this routine. 1947 One of the threads (called the single thread) should have the <tt>didit</tt> 1948 variable set to 1 and all other threads should have that variable set to 0. 1949 All threads pass a pointer to a data buffer (cpy_data) that they have built. 1950 1951 The OpenMP specification forbids the use of nowait on the single region when a 1952 copyprivate clause is present. 
However, @ref __kmpc_copyprivate implements a 1953 barrier internally to avoid race conditions, so the code generation for the 1954 single region should avoid generating a barrier after the call to @ref 1955 __kmpc_copyprivate. 1956 1957 The <tt>gtid</tt> parameter is the global thread id for the current thread. 1958 The <tt>loc</tt> parameter is a pointer to source location information. 1959 1960 Internal implementation: The single thread will first copy its descriptor 1961 address (cpy_data) to a team-private location, then the other threads will each 1962 call the function pointed to by the parameter cpy_func, which carries out the 1963 copy by copying the data using the cpy_data buffer. 1964 1965 The cpy_func routine used for the copy and the contents of the data area defined 1966 by cpy_data and cpy_size may be built in any fashion that will allow the copy 1967 to be done. For instance, the cpy_data buffer can hold the actual data to be 1968 copied or it may hold a list of pointers to the data. The cpy_func routine must 1969 interpret the cpy_data buffer appropriately. 1970 1971 The interface to cpy_func is as follows: 1972 @code 1973 void cpy_func( void *destination, void *source ) 1974 @endcode 1975 where void *destination is the cpy_data pointer for the thread being copied to 1976 and void *source is the cpy_data pointer for the thread being copied from. 1977 */ 1978 void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size, 1979 void *cpy_data, void (*cpy_func)(void *, void *), 1980 kmp_int32 didit) { 1981 void **data_ptr; 1982 1983 KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid)); 1984 1985 KMP_MB(); 1986 1987 data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data; 1988 1989 if (__kmp_env_consistency_check) { 1990 if (loc == 0) { 1991 KMP_WARNING(ConstructIdentInvalid); 1992 } 1993 } 1994 1995 // ToDo: Optimize the following two barriers into some kind of split barrier 1996 1997 if (didit) 1998 *data_ptr = cpy_data; 1999 2000 #if OMPT_SUPPORT 2001 omp_frame_t *ompt_frame; 2002 if (ompt_enabled.enabled) { 2003 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); 2004 if (ompt_frame->enter_frame == NULL) 2005 ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1); 2006 OMPT_STORE_RETURN_ADDRESS(gtid); 2007 } 2008 #endif 2009 /* This barrier is not a barrier region boundary */ 2010 #if USE_ITT_NOTIFY 2011 __kmp_threads[gtid]->th.th_ident = loc; 2012 #endif 2013 __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); 2014 2015 if (!didit) 2016 (*cpy_func)(cpy_data, *data_ptr); 2017 2018 // Consider next barrier a user-visible barrier for barrier region boundaries 2019 // Nesting checks are already handled by the single construct checks 2020 2021 #if OMPT_SUPPORT 2022 if (ompt_enabled.enabled) { 2023 OMPT_STORE_RETURN_ADDRESS(gtid); 2024 } 2025 #endif 2026 #if USE_ITT_NOTIFY 2027 __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g. 
2028 // tasks can overwrite the location) 2029 #endif 2030 __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); 2031 #if OMPT_SUPPORT && OMPT_OPTIONAL 2032 if (ompt_enabled.enabled) { 2033 ompt_frame->enter_frame = NULL; 2034 } 2035 #endif 2036 } 2037 2038 /* -------------------------------------------------------------------------- */ 2039 2040 #define INIT_LOCK __kmp_init_user_lock_with_checks 2041 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks 2042 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks 2043 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed 2044 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks 2045 #define ACQUIRE_NESTED_LOCK_TIMED \ 2046 __kmp_acquire_nested_user_lock_with_checks_timed 2047 #define RELEASE_LOCK __kmp_release_user_lock_with_checks 2048 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks 2049 #define TEST_LOCK __kmp_test_user_lock_with_checks 2050 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks 2051 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks 2052 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks 2053 2054 // TODO: Make check abort messages use location info & pass it into 2055 // with_checks routines 2056 2057 #if KMP_USE_DYNAMIC_LOCK 2058 2059 // internal lock initializer 2060 static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock, 2061 kmp_dyna_lockseq_t seq) { 2062 if (KMP_IS_D_LOCK(seq)) { 2063 KMP_INIT_D_LOCK(lock, seq); 2064 #if USE_ITT_BUILD 2065 __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL); 2066 #endif 2067 } else { 2068 KMP_INIT_I_LOCK(lock, seq); 2069 #if USE_ITT_BUILD 2070 kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock); 2071 __kmp_itt_lock_creating(ilk->lock, loc); 2072 #endif 2073 } 2074 } 2075 2076 // internal nest lock initializer 2077 static __forceinline void 2078 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock, 2079 kmp_dyna_lockseq_t seq) { 2080 #if KMP_USE_TSX 2081 // Don't have nested lock implementation for speculative locks 2082 if (seq == lockseq_hle || seq == lockseq_rtm || seq == lockseq_adaptive) 2083 seq = __kmp_user_lock_seq; 2084 #endif 2085 switch (seq) { 2086 case lockseq_tas: 2087 seq = lockseq_nested_tas; 2088 break; 2089 #if KMP_USE_FUTEX 2090 case lockseq_futex: 2091 seq = lockseq_nested_futex; 2092 break; 2093 #endif 2094 case lockseq_ticket: 2095 seq = lockseq_nested_ticket; 2096 break; 2097 case lockseq_queuing: 2098 seq = lockseq_nested_queuing; 2099 break; 2100 case lockseq_drdpa: 2101 seq = lockseq_nested_drdpa; 2102 break; 2103 default: 2104 seq = lockseq_nested_queuing; 2105 } 2106 KMP_INIT_I_LOCK(lock, seq); 2107 #if USE_ITT_BUILD 2108 kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock); 2109 __kmp_itt_lock_creating(ilk->lock, loc); 2110 #endif 2111 } 2112 2113 /* initialize the lock with a hint */ 2114 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock, 2115 uintptr_t hint) { 2116 KMP_DEBUG_ASSERT(__kmp_init_serial); 2117 if (__kmp_env_consistency_check && user_lock == NULL) { 2118 KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint"); 2119 } 2120 2121 __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint)); 2122 2123 #if OMPT_SUPPORT && OMPT_OPTIONAL 2124 // This is the case, if called from omp_init_lock_with_hint: 2125 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2126 if (!codeptr) 2127 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2128 if (ompt_enabled.ompt_callback_lock_init) { 2129 
ompt_callbacks.ompt_callback(ompt_callback_lock_init)( 2130 ompt_mutex_lock, (omp_lock_hint_t)hint, 2131 __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock, 2132 codeptr); 2133 } 2134 #endif 2135 } 2136 2137 /* initialize the lock with a hint */ 2138 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid, 2139 void **user_lock, uintptr_t hint) { 2140 KMP_DEBUG_ASSERT(__kmp_init_serial); 2141 if (__kmp_env_consistency_check && user_lock == NULL) { 2142 KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint"); 2143 } 2144 2145 __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint)); 2146 2147 #if OMPT_SUPPORT && OMPT_OPTIONAL 2148 // This is the case, if called from omp_init_lock_with_hint: 2149 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2150 if (!codeptr) 2151 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2152 if (ompt_enabled.ompt_callback_lock_init) { 2153 ompt_callbacks.ompt_callback(ompt_callback_lock_init)( 2154 ompt_mutex_nest_lock, (omp_lock_hint_t)hint, 2155 __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock, 2156 codeptr); 2157 } 2158 #endif 2159 } 2160 2161 #endif // KMP_USE_DYNAMIC_LOCK 2162 2163 /* initialize the lock */ 2164 void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2165 #if KMP_USE_DYNAMIC_LOCK 2166 2167 KMP_DEBUG_ASSERT(__kmp_init_serial); 2168 if (__kmp_env_consistency_check && user_lock == NULL) { 2169 KMP_FATAL(LockIsUninitialized, "omp_init_lock"); 2170 } 2171 __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq); 2172 2173 #if OMPT_SUPPORT && OMPT_OPTIONAL 2174 // This is the case, if called from omp_init_lock_with_hint: 2175 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2176 if (!codeptr) 2177 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2178 if (ompt_enabled.ompt_callback_lock_init) { 2179 ompt_callbacks.ompt_callback(ompt_callback_lock_init)( 2180 ompt_mutex_lock, omp_lock_hint_none, 2181 __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock, 2182 codeptr); 2183 } 2184 #endif 2185 2186 #else // KMP_USE_DYNAMIC_LOCK 2187 2188 static char const *const func = "omp_init_lock"; 2189 kmp_user_lock_p lck; 2190 KMP_DEBUG_ASSERT(__kmp_init_serial); 2191 2192 if (__kmp_env_consistency_check) { 2193 if (user_lock == NULL) { 2194 KMP_FATAL(LockIsUninitialized, func); 2195 } 2196 } 2197 2198 KMP_CHECK_USER_LOCK_INIT(); 2199 2200 if ((__kmp_user_lock_kind == lk_tas) && 2201 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { 2202 lck = (kmp_user_lock_p)user_lock; 2203 } 2204 #if KMP_USE_FUTEX 2205 else if ((__kmp_user_lock_kind == lk_futex) && 2206 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { 2207 lck = (kmp_user_lock_p)user_lock; 2208 } 2209 #endif 2210 else { 2211 lck = __kmp_user_lock_allocate(user_lock, gtid, 0); 2212 } 2213 INIT_LOCK(lck); 2214 __kmp_set_user_lock_location(lck, loc); 2215 2216 #if OMPT_SUPPORT && OMPT_OPTIONAL 2217 // This is the case, if called from omp_init_lock_with_hint: 2218 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2219 if (!codeptr) 2220 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2221 if (ompt_enabled.ompt_callback_lock_init) { 2222 ompt_callbacks.ompt_callback(ompt_callback_lock_init)( 2223 ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(), 2224 (omp_wait_id_t)user_lock, codeptr); 2225 } 2226 #endif 2227 2228 #if USE_ITT_BUILD 2229 __kmp_itt_lock_creating(lck); 2230 #endif /* USE_ITT_BUILD */ 2231 2232 #endif // KMP_USE_DYNAMIC_LOCK 2233 } // __kmpc_init_lock 2234 2235 /* initialize the lock */ 2236 void 
__kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2237 #if KMP_USE_DYNAMIC_LOCK 2238 2239 KMP_DEBUG_ASSERT(__kmp_init_serial); 2240 if (__kmp_env_consistency_check && user_lock == NULL) { 2241 KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock"); 2242 } 2243 __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq); 2244 2245 #if OMPT_SUPPORT && OMPT_OPTIONAL 2246 // This is the case, if called from omp_init_lock_with_hint: 2247 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2248 if (!codeptr) 2249 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2250 if (ompt_enabled.ompt_callback_lock_init) { 2251 ompt_callbacks.ompt_callback(ompt_callback_lock_init)( 2252 ompt_mutex_nest_lock, omp_lock_hint_none, 2253 __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock, 2254 codeptr); 2255 } 2256 #endif 2257 2258 #else // KMP_USE_DYNAMIC_LOCK 2259 2260 static char const *const func = "omp_init_nest_lock"; 2261 kmp_user_lock_p lck; 2262 KMP_DEBUG_ASSERT(__kmp_init_serial); 2263 2264 if (__kmp_env_consistency_check) { 2265 if (user_lock == NULL) { 2266 KMP_FATAL(LockIsUninitialized, func); 2267 } 2268 } 2269 2270 KMP_CHECK_USER_LOCK_INIT(); 2271 2272 if ((__kmp_user_lock_kind == lk_tas) && 2273 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= 2274 OMP_NEST_LOCK_T_SIZE)) { 2275 lck = (kmp_user_lock_p)user_lock; 2276 } 2277 #if KMP_USE_FUTEX 2278 else if ((__kmp_user_lock_kind == lk_futex) && 2279 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= 2280 OMP_NEST_LOCK_T_SIZE)) { 2281 lck = (kmp_user_lock_p)user_lock; 2282 } 2283 #endif 2284 else { 2285 lck = __kmp_user_lock_allocate(user_lock, gtid, 0); 2286 } 2287 2288 INIT_NESTED_LOCK(lck); 2289 __kmp_set_user_lock_location(lck, loc); 2290 2291 #if OMPT_SUPPORT && OMPT_OPTIONAL 2292 // This is the case, if called from omp_init_lock_with_hint: 2293 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2294 if (!codeptr) 2295 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2296 if (ompt_enabled.ompt_callback_lock_init) { 2297 ompt_callbacks.ompt_callback(ompt_callback_lock_init)( 2298 ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(), 2299 (omp_wait_id_t)user_lock, codeptr); 2300 } 2301 #endif 2302 2303 #if USE_ITT_BUILD 2304 __kmp_itt_lock_creating(lck); 2305 #endif /* USE_ITT_BUILD */ 2306 2307 #endif // KMP_USE_DYNAMIC_LOCK 2308 } // __kmpc_init_nest_lock 2309 2310 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2311 #if KMP_USE_DYNAMIC_LOCK 2312 2313 #if USE_ITT_BUILD 2314 kmp_user_lock_p lck; 2315 if (KMP_EXTRACT_D_TAG(user_lock) == 0) { 2316 lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock; 2317 } else { 2318 lck = (kmp_user_lock_p)user_lock; 2319 } 2320 __kmp_itt_lock_destroyed(lck); 2321 #endif 2322 #if OMPT_SUPPORT && OMPT_OPTIONAL 2323 // This is the case, if called from omp_init_lock_with_hint: 2324 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2325 if (!codeptr) 2326 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2327 if (ompt_enabled.ompt_callback_lock_destroy) { 2328 kmp_user_lock_p lck; 2329 if (KMP_EXTRACT_D_TAG(user_lock) == 0) { 2330 lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock; 2331 } else { 2332 lck = (kmp_user_lock_p)user_lock; 2333 } 2334 ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)( 2335 ompt_mutex_lock, (omp_wait_id_t)user_lock, codeptr); 2336 } 2337 #endif 2338 KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock); 2339 #else 2340 kmp_user_lock_p lck; 2341 2342 if 
((__kmp_user_lock_kind == lk_tas) && 2343 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { 2344 lck = (kmp_user_lock_p)user_lock; 2345 } 2346 #if KMP_USE_FUTEX 2347 else if ((__kmp_user_lock_kind == lk_futex) && 2348 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { 2349 lck = (kmp_user_lock_p)user_lock; 2350 } 2351 #endif 2352 else { 2353 lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock"); 2354 } 2355 2356 #if OMPT_SUPPORT && OMPT_OPTIONAL 2357 // This is the case, if called from omp_init_lock_with_hint: 2358 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2359 if (!codeptr) 2360 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2361 if (ompt_enabled.ompt_callback_lock_destroy) { 2362 ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)( 2363 ompt_mutex_lock, (omp_wait_id_t)user_lock, codeptr); 2364 } 2365 #endif 2366 2367 #if USE_ITT_BUILD 2368 __kmp_itt_lock_destroyed(lck); 2369 #endif /* USE_ITT_BUILD */ 2370 DESTROY_LOCK(lck); 2371 2372 if ((__kmp_user_lock_kind == lk_tas) && 2373 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { 2374 ; 2375 } 2376 #if KMP_USE_FUTEX 2377 else if ((__kmp_user_lock_kind == lk_futex) && 2378 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { 2379 ; 2380 } 2381 #endif 2382 else { 2383 __kmp_user_lock_free(user_lock, gtid, lck); 2384 } 2385 #endif // KMP_USE_DYNAMIC_LOCK 2386 } // __kmpc_destroy_lock 2387 2388 /* destroy the lock */ 2389 void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2390 #if KMP_USE_DYNAMIC_LOCK 2391 2392 #if USE_ITT_BUILD 2393 kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock); 2394 __kmp_itt_lock_destroyed(ilk->lock); 2395 #endif 2396 #if OMPT_SUPPORT && OMPT_OPTIONAL 2397 // This is the case, if called from omp_init_lock_with_hint: 2398 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2399 if (!codeptr) 2400 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2401 if (ompt_enabled.ompt_callback_lock_destroy) { 2402 ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)( 2403 ompt_mutex_nest_lock, (omp_wait_id_t)user_lock, codeptr); 2404 } 2405 #endif 2406 KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock); 2407 2408 #else // KMP_USE_DYNAMIC_LOCK 2409 2410 kmp_user_lock_p lck; 2411 2412 if ((__kmp_user_lock_kind == lk_tas) && 2413 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= 2414 OMP_NEST_LOCK_T_SIZE)) { 2415 lck = (kmp_user_lock_p)user_lock; 2416 } 2417 #if KMP_USE_FUTEX 2418 else if ((__kmp_user_lock_kind == lk_futex) && 2419 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= 2420 OMP_NEST_LOCK_T_SIZE)) { 2421 lck = (kmp_user_lock_p)user_lock; 2422 } 2423 #endif 2424 else { 2425 lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock"); 2426 } 2427 2428 #if OMPT_SUPPORT && OMPT_OPTIONAL 2429 // This is the case, if called from omp_init_lock_with_hint: 2430 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2431 if (!codeptr) 2432 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2433 if (ompt_enabled.ompt_callback_lock_destroy) { 2434 ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)( 2435 ompt_mutex_nest_lock, (omp_wait_id_t)user_lock, codeptr); 2436 } 2437 #endif 2438 2439 #if USE_ITT_BUILD 2440 __kmp_itt_lock_destroyed(lck); 2441 #endif /* USE_ITT_BUILD */ 2442 2443 DESTROY_NESTED_LOCK(lck); 2444 2445 if ((__kmp_user_lock_kind == lk_tas) && 2446 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= 2447 OMP_NEST_LOCK_T_SIZE)) { 2448 ; 2449 } 2450 #if KMP_USE_FUTEX 2451 else if ((__kmp_user_lock_kind == lk_futex) && 2452 (sizeof(lck->futex.lk.poll) 
+ sizeof(lck->futex.lk.depth_locked) <= 2453 OMP_NEST_LOCK_T_SIZE)) { 2454 ; 2455 } 2456 #endif 2457 else { 2458 __kmp_user_lock_free(user_lock, gtid, lck); 2459 } 2460 #endif // KMP_USE_DYNAMIC_LOCK 2461 } // __kmpc_destroy_nest_lock 2462 2463 void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2464 KMP_COUNT_BLOCK(OMP_set_lock); 2465 #if KMP_USE_DYNAMIC_LOCK 2466 int tag = KMP_EXTRACT_D_TAG(user_lock); 2467 #if USE_ITT_BUILD 2468 __kmp_itt_lock_acquiring( 2469 (kmp_user_lock_p) 2470 user_lock); // itt function will get to the right lock object. 2471 #endif 2472 #if OMPT_SUPPORT && OMPT_OPTIONAL 2473 // This is the case, if called from omp_init_lock_with_hint: 2474 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2475 if (!codeptr) 2476 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2477 if (ompt_enabled.ompt_callback_mutex_acquire) { 2478 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 2479 ompt_mutex_lock, omp_lock_hint_none, 2480 __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock, 2481 codeptr); 2482 } 2483 #endif 2484 #if KMP_USE_INLINED_TAS 2485 if (tag == locktag_tas && !__kmp_env_consistency_check) { 2486 KMP_ACQUIRE_TAS_LOCK(user_lock, gtid); 2487 } else 2488 #elif KMP_USE_INLINED_FUTEX 2489 if (tag == locktag_futex && !__kmp_env_consistency_check) { 2490 KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid); 2491 } else 2492 #endif 2493 { 2494 __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid); 2495 } 2496 #if USE_ITT_BUILD 2497 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock); 2498 #endif 2499 #if OMPT_SUPPORT && OMPT_OPTIONAL 2500 if (ompt_enabled.ompt_callback_mutex_acquired) { 2501 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 2502 ompt_mutex_lock, (omp_wait_id_t)user_lock, codeptr); 2503 } 2504 #endif 2505 2506 #else // KMP_USE_DYNAMIC_LOCK 2507 2508 kmp_user_lock_p lck; 2509 2510 if ((__kmp_user_lock_kind == lk_tas) && 2511 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { 2512 lck = (kmp_user_lock_p)user_lock; 2513 } 2514 #if KMP_USE_FUTEX 2515 else if ((__kmp_user_lock_kind == lk_futex) && 2516 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { 2517 lck = (kmp_user_lock_p)user_lock; 2518 } 2519 #endif 2520 else { 2521 lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock"); 2522 } 2523 2524 #if USE_ITT_BUILD 2525 __kmp_itt_lock_acquiring(lck); 2526 #endif /* USE_ITT_BUILD */ 2527 #if OMPT_SUPPORT && OMPT_OPTIONAL 2528 // This is the case, if called from omp_init_lock_with_hint: 2529 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2530 if (!codeptr) 2531 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2532 if (ompt_enabled.ompt_callback_mutex_acquire) { 2533 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 2534 ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(), 2535 (omp_wait_id_t)lck, codeptr); 2536 } 2537 #endif 2538 2539 ACQUIRE_LOCK(lck, gtid); 2540 2541 #if USE_ITT_BUILD 2542 __kmp_itt_lock_acquired(lck); 2543 #endif /* USE_ITT_BUILD */ 2544 2545 #if OMPT_SUPPORT && OMPT_OPTIONAL 2546 if (ompt_enabled.ompt_callback_mutex_acquired) { 2547 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 2548 ompt_mutex_lock, (omp_wait_id_t)lck, codeptr); 2549 } 2550 #endif 2551 2552 #endif // KMP_USE_DYNAMIC_LOCK 2553 } 2554 2555 void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2556 #if KMP_USE_DYNAMIC_LOCK 2557 2558 #if USE_ITT_BUILD 2559 __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock); 2560 #endif 2561 #if OMPT_SUPPORT && OMPT_OPTIONAL 2562 // This is the case, if called from 
omp_init_lock_with_hint: 2563 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2564 if (!codeptr) 2565 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2566 if (ompt_enabled.enabled) { 2567 if (ompt_enabled.ompt_callback_mutex_acquire) { 2568 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 2569 ompt_mutex_nest_lock, omp_lock_hint_none, 2570 __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock, 2571 codeptr); 2572 } 2573 } 2574 #endif 2575 int acquire_status = 2576 KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid); 2577 (void) acquire_status; 2578 #if USE_ITT_BUILD 2579 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock); 2580 #endif 2581 2582 #if OMPT_SUPPORT && OMPT_OPTIONAL 2583 if (ompt_enabled.enabled) { 2584 if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) { 2585 if (ompt_enabled.ompt_callback_mutex_acquired) { 2586 // lock_first 2587 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 2588 ompt_mutex_nest_lock, (omp_wait_id_t)user_lock, codeptr); 2589 } 2590 } else { 2591 if (ompt_enabled.ompt_callback_nest_lock) { 2592 // lock_next 2593 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( 2594 ompt_scope_begin, (omp_wait_id_t)user_lock, codeptr); 2595 } 2596 } 2597 } 2598 #endif 2599 2600 #else // KMP_USE_DYNAMIC_LOCK 2601 int acquire_status; 2602 kmp_user_lock_p lck; 2603 2604 if ((__kmp_user_lock_kind == lk_tas) && 2605 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= 2606 OMP_NEST_LOCK_T_SIZE)) { 2607 lck = (kmp_user_lock_p)user_lock; 2608 } 2609 #if KMP_USE_FUTEX 2610 else if ((__kmp_user_lock_kind == lk_futex) && 2611 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= 2612 OMP_NEST_LOCK_T_SIZE)) { 2613 lck = (kmp_user_lock_p)user_lock; 2614 } 2615 #endif 2616 else { 2617 lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock"); 2618 } 2619 2620 #if USE_ITT_BUILD 2621 __kmp_itt_lock_acquiring(lck); 2622 #endif /* USE_ITT_BUILD */ 2623 #if OMPT_SUPPORT && OMPT_OPTIONAL 2624 // This is the case, if called from omp_init_lock_with_hint: 2625 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2626 if (!codeptr) 2627 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2628 if (ompt_enabled.enabled) { 2629 if (ompt_enabled.ompt_callback_mutex_acquire) { 2630 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 2631 ompt_mutex_nest_lock, omp_lock_hint_none, 2632 __ompt_get_mutex_impl_type(), (omp_wait_id_t)lck, codeptr); 2633 } 2634 } 2635 #endif 2636 2637 ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status); 2638 2639 #if USE_ITT_BUILD 2640 __kmp_itt_lock_acquired(lck); 2641 #endif /* USE_ITT_BUILD */ 2642 2643 #if OMPT_SUPPORT && OMPT_OPTIONAL 2644 if (ompt_enabled.enabled) { 2645 if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) { 2646 if (ompt_enabled.ompt_callback_mutex_acquired) { 2647 // lock_first 2648 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 2649 ompt_mutex_nest_lock, (omp_wait_id_t)lck, codeptr); 2650 } 2651 } else { 2652 if (ompt_enabled.ompt_callback_nest_lock) { 2653 // lock_next 2654 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( 2655 ompt_scope_begin, (omp_wait_id_t)lck, codeptr); 2656 } 2657 } 2658 } 2659 #endif 2660 2661 #endif // KMP_USE_DYNAMIC_LOCK 2662 } 2663 2664 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2665 #if KMP_USE_DYNAMIC_LOCK 2666 2667 int tag = KMP_EXTRACT_D_TAG(user_lock); 2668 #if USE_ITT_BUILD 2669 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock); 2670 #endif 2671 #if KMP_USE_INLINED_TAS 2672 if (tag == locktag_tas && 
!__kmp_env_consistency_check) { 2673 KMP_RELEASE_TAS_LOCK(user_lock, gtid); 2674 } else 2675 #elif KMP_USE_INLINED_FUTEX 2676 if (tag == locktag_futex && !__kmp_env_consistency_check) { 2677 KMP_RELEASE_FUTEX_LOCK(user_lock, gtid); 2678 } else 2679 #endif 2680 { 2681 __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid); 2682 } 2683 2684 #if OMPT_SUPPORT && OMPT_OPTIONAL 2685 // This is the case, if called from omp_init_lock_with_hint: 2686 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2687 if (!codeptr) 2688 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2689 if (ompt_enabled.ompt_callback_mutex_released) { 2690 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( 2691 ompt_mutex_lock, (omp_wait_id_t)user_lock, codeptr); 2692 } 2693 #endif 2694 2695 #else // KMP_USE_DYNAMIC_LOCK 2696 2697 kmp_user_lock_p lck; 2698 2699 /* Can't use serial interval since not block structured */ 2700 /* release the lock */ 2701 2702 if ((__kmp_user_lock_kind == lk_tas) && 2703 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { 2704 #if KMP_OS_LINUX && \ 2705 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) 2706 // "fast" path implemented to fix customer performance issue 2707 #if USE_ITT_BUILD 2708 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock); 2709 #endif /* USE_ITT_BUILD */ 2710 TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0); 2711 KMP_MB(); 2712 2713 #if OMPT_SUPPORT && OMPT_OPTIONAL 2714 // This is the case, if called from omp_init_lock_with_hint: 2715 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2716 if (!codeptr) 2717 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2718 if (ompt_enabled.ompt_callback_mutex_released) { 2719 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( 2720 ompt_mutex_lock, (omp_wait_id_t)lck, codeptr); 2721 } 2722 #endif 2723 2724 return; 2725 #else 2726 lck = (kmp_user_lock_p)user_lock; 2727 #endif 2728 } 2729 #if KMP_USE_FUTEX 2730 else if ((__kmp_user_lock_kind == lk_futex) && 2731 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { 2732 lck = (kmp_user_lock_p)user_lock; 2733 } 2734 #endif 2735 else { 2736 lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock"); 2737 } 2738 2739 #if USE_ITT_BUILD 2740 __kmp_itt_lock_releasing(lck); 2741 #endif /* USE_ITT_BUILD */ 2742 2743 RELEASE_LOCK(lck, gtid); 2744 2745 #if OMPT_SUPPORT && OMPT_OPTIONAL 2746 // This is the case, if called from omp_init_lock_with_hint: 2747 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2748 if (!codeptr) 2749 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2750 if (ompt_enabled.ompt_callback_mutex_released) { 2751 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( 2752 ompt_mutex_lock, (omp_wait_id_t)lck, codeptr); 2753 } 2754 #endif 2755 2756 #endif // KMP_USE_DYNAMIC_LOCK 2757 } 2758 2759 /* release the lock */ 2760 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2761 #if KMP_USE_DYNAMIC_LOCK 2762 2763 #if USE_ITT_BUILD 2764 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock); 2765 #endif 2766 int release_status = 2767 KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid); 2768 (void) release_status; 2769 2770 #if OMPT_SUPPORT && OMPT_OPTIONAL 2771 // This is the case, if called from omp_init_lock_with_hint: 2772 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2773 if (!codeptr) 2774 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2775 if (ompt_enabled.enabled) { 2776 if (release_status == KMP_LOCK_RELEASED) { 2777 if (ompt_enabled.ompt_callback_mutex_released) { 2778 // release_lock_last 2779 
ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( 2780 ompt_mutex_nest_lock, (omp_wait_id_t)user_lock, codeptr); 2781 } 2782 } else if (ompt_enabled.ompt_callback_nest_lock) { 2783 // release_lock_prev 2784 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( 2785 ompt_scope_end, (omp_wait_id_t)user_lock, codeptr); 2786 } 2787 } 2788 #endif 2789 2790 #else // KMP_USE_DYNAMIC_LOCK 2791 2792 kmp_user_lock_p lck; 2793 2794 /* Can't use serial interval since not block structured */ 2795 2796 if ((__kmp_user_lock_kind == lk_tas) && 2797 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= 2798 OMP_NEST_LOCK_T_SIZE)) { 2799 #if KMP_OS_LINUX && \ 2800 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) 2801 // "fast" path implemented to fix customer performance issue 2802 kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock; 2803 #if USE_ITT_BUILD 2804 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock); 2805 #endif /* USE_ITT_BUILD */ 2806 2807 #if OMPT_SUPPORT && OMPT_OPTIONAL 2808 int release_status = KMP_LOCK_STILL_HELD; 2809 #endif 2810 2811 if (--(tl->lk.depth_locked) == 0) { 2812 TCW_4(tl->lk.poll, 0); 2813 #if OMPT_SUPPORT && OMPT_OPTIONAL 2814 release_status = KMP_LOCK_RELEASED; 2815 #endif 2816 } 2817 KMP_MB(); 2818 2819 #if OMPT_SUPPORT && OMPT_OPTIONAL 2820 // This is the case, if called from omp_init_lock_with_hint: 2821 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2822 if (!codeptr) 2823 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2824 if (ompt_enabled.enabled) { 2825 if (release_status == KMP_LOCK_RELEASED) { 2826 if (ompt_enabled.ompt_callback_mutex_released) { 2827 // release_lock_last 2828 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( 2829 ompt_mutex_nest_lock, (omp_wait_id_t)lck, codeptr); 2830 } 2831 } else if (ompt_enabled.ompt_callback_nest_lock) { 2832 // release_lock_previous 2833 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( 2834 ompt_mutex_scope_end, (omp_wait_id_t)lck, codeptr); 2835 } 2836 } 2837 #endif 2838 2839 return; 2840 #else 2841 lck = (kmp_user_lock_p)user_lock; 2842 #endif 2843 } 2844 #if KMP_USE_FUTEX 2845 else if ((__kmp_user_lock_kind == lk_futex) && 2846 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= 2847 OMP_NEST_LOCK_T_SIZE)) { 2848 lck = (kmp_user_lock_p)user_lock; 2849 } 2850 #endif 2851 else { 2852 lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock"); 2853 } 2854 2855 #if USE_ITT_BUILD 2856 __kmp_itt_lock_releasing(lck); 2857 #endif /* USE_ITT_BUILD */ 2858 2859 int release_status; 2860 release_status = RELEASE_NESTED_LOCK(lck, gtid); 2861 #if OMPT_SUPPORT && OMPT_OPTIONAL 2862 // This is the case, if called from omp_init_lock_with_hint: 2863 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2864 if (!codeptr) 2865 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2866 if (ompt_enabled.enabled) { 2867 if (release_status == KMP_LOCK_RELEASED) { 2868 if (ompt_enabled.ompt_callback_mutex_released) { 2869 // release_lock_last 2870 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( 2871 ompt_mutex_nest_lock, (omp_wait_id_t)lck, codeptr); 2872 } 2873 } else if (ompt_enabled.ompt_callback_nest_lock) { 2874 // release_lock_previous 2875 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( 2876 ompt_mutex_scope_end, (omp_wait_id_t)lck, codeptr); 2877 } 2878 } 2879 #endif 2880 2881 #endif // KMP_USE_DYNAMIC_LOCK 2882 } 2883 2884 /* try to acquire the lock */ 2885 int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2886 KMP_COUNT_BLOCK(OMP_test_lock); 2887 
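  // Non-blocking: returns FTN_TRUE if the lock is acquired here and FTN_FALSE
  // if it could not be taken without waiting.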
2888 #if KMP_USE_DYNAMIC_LOCK 2889 int rc; 2890 int tag = KMP_EXTRACT_D_TAG(user_lock); 2891 #if USE_ITT_BUILD 2892 __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock); 2893 #endif 2894 #if OMPT_SUPPORT && OMPT_OPTIONAL 2895 // This is the case, if called from omp_init_lock_with_hint: 2896 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2897 if (!codeptr) 2898 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2899 if (ompt_enabled.ompt_callback_mutex_acquire) { 2900 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 2901 ompt_mutex_lock, omp_lock_hint_none, 2902 __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock, 2903 codeptr); 2904 } 2905 #endif 2906 #if KMP_USE_INLINED_TAS 2907 if (tag == locktag_tas && !__kmp_env_consistency_check) { 2908 KMP_TEST_TAS_LOCK(user_lock, gtid, rc); 2909 } else 2910 #elif KMP_USE_INLINED_FUTEX 2911 if (tag == locktag_futex && !__kmp_env_consistency_check) { 2912 KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc); 2913 } else 2914 #endif 2915 { 2916 rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid); 2917 } 2918 if (rc) { 2919 #if USE_ITT_BUILD 2920 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock); 2921 #endif 2922 #if OMPT_SUPPORT && OMPT_OPTIONAL 2923 if (ompt_enabled.ompt_callback_mutex_acquired) { 2924 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 2925 ompt_mutex_lock, (omp_wait_id_t)user_lock, codeptr); 2926 } 2927 #endif 2928 return FTN_TRUE; 2929 } else { 2930 #if USE_ITT_BUILD 2931 __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock); 2932 #endif 2933 return FTN_FALSE; 2934 } 2935 2936 #else // KMP_USE_DYNAMIC_LOCK 2937 2938 kmp_user_lock_p lck; 2939 int rc; 2940 2941 if ((__kmp_user_lock_kind == lk_tas) && 2942 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { 2943 lck = (kmp_user_lock_p)user_lock; 2944 } 2945 #if KMP_USE_FUTEX 2946 else if ((__kmp_user_lock_kind == lk_futex) && 2947 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { 2948 lck = (kmp_user_lock_p)user_lock; 2949 } 2950 #endif 2951 else { 2952 lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock"); 2953 } 2954 2955 #if USE_ITT_BUILD 2956 __kmp_itt_lock_acquiring(lck); 2957 #endif /* USE_ITT_BUILD */ 2958 #if OMPT_SUPPORT && OMPT_OPTIONAL 2959 // This is the case, if called from omp_init_lock_with_hint: 2960 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2961 if (!codeptr) 2962 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2963 if (ompt_enabled.ompt_callback_mutex_acquire) { 2964 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 2965 ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(), 2966 (omp_wait_id_t)lck, codeptr); 2967 } 2968 #endif 2969 2970 rc = TEST_LOCK(lck, gtid); 2971 #if USE_ITT_BUILD 2972 if (rc) { 2973 __kmp_itt_lock_acquired(lck); 2974 } else { 2975 __kmp_itt_lock_cancelled(lck); 2976 } 2977 #endif /* USE_ITT_BUILD */ 2978 #if OMPT_SUPPORT && OMPT_OPTIONAL 2979 if (rc && ompt_enabled.ompt_callback_mutex_acquired) { 2980 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 2981 ompt_mutex_lock, (omp_wait_id_t)lck, codeptr); 2982 } 2983 #endif 2984 2985 return (rc ? 
FTN_TRUE : FTN_FALSE);
2986
2987 /* Can't use serial interval since not block structured */
2988
2989 #endif // KMP_USE_DYNAMIC_LOCK
2990 }
2991
2992 /* try to acquire the lock */
2993 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2994 #if KMP_USE_DYNAMIC_LOCK
2995 int rc;
2996 #if USE_ITT_BUILD
2997 __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2998 #endif
2999 #if OMPT_SUPPORT && OMPT_OPTIONAL
3000 // This is the case, if called from omp_init_lock_with_hint:
3001 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3002 if (!codeptr)
3003 codeptr = OMPT_GET_RETURN_ADDRESS(0);
3004 if (ompt_enabled.ompt_callback_mutex_acquire) {
3005 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3006 ompt_mutex_nest_lock, omp_lock_hint_none,
3007 __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock,
3008 codeptr);
3009 }
3010 #endif
3011 rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
3012 #if USE_ITT_BUILD
3013 if (rc) {
3014 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3015 } else {
3016 __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3017 }
3018 #endif
3019 #if OMPT_SUPPORT && OMPT_OPTIONAL
3020 if (ompt_enabled.enabled && rc) {
3021 if (rc == 1) {
3022 if (ompt_enabled.ompt_callback_mutex_acquired) {
3023 // lock_first
3024 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3025 ompt_mutex_nest_lock, (omp_wait_id_t)user_lock, codeptr);
3026 }
3027 } else {
3028 if (ompt_enabled.ompt_callback_nest_lock) {
3029 // lock_next
3030 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3031 ompt_scope_begin, (omp_wait_id_t)user_lock, codeptr);
3032 }
3033 }
3034 }
3035 #endif
3036 return rc;
3037
3038 #else // KMP_USE_DYNAMIC_LOCK
3039
3040 kmp_user_lock_p lck;
3041 int rc;
3042
3043 if ((__kmp_user_lock_kind == lk_tas) &&
3044 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3045 OMP_NEST_LOCK_T_SIZE)) {
3046 lck = (kmp_user_lock_p)user_lock;
3047 }
3048 #if KMP_USE_FUTEX
3049 else if ((__kmp_user_lock_kind == lk_futex) &&
3050 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3051 OMP_NEST_LOCK_T_SIZE)) {
3052 lck = (kmp_user_lock_p)user_lock;
3053 }
3054 #endif
3055 else {
3056 lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
3057 }
3058
3059 #if USE_ITT_BUILD
3060 __kmp_itt_lock_acquiring(lck);
3061 #endif /* USE_ITT_BUILD */
3062
3063 #if OMPT_SUPPORT && OMPT_OPTIONAL
3064 // This is the case, if called from omp_init_lock_with_hint:
3065 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3066 if (!codeptr)
3067 codeptr = OMPT_GET_RETURN_ADDRESS(0);
3068 if (ompt_enabled.enabled &&
3069 ompt_enabled.ompt_callback_mutex_acquire) {
3070 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3071 ompt_mutex_nest_lock, omp_lock_hint_none,
3072 __ompt_get_mutex_impl_type(), (omp_wait_id_t)lck, codeptr);
3073 }
3074 #endif
3075
3076 rc = TEST_NESTED_LOCK(lck, gtid);
3077 #if USE_ITT_BUILD
3078 if (rc) {
3079 __kmp_itt_lock_acquired(lck);
3080 } else {
3081 __kmp_itt_lock_cancelled(lck);
3082 }
3083 #endif /* USE_ITT_BUILD */
3084 #if OMPT_SUPPORT && OMPT_OPTIONAL
3085 if (ompt_enabled.enabled && rc) {
3086 if (rc == 1) {
3087 if (ompt_enabled.ompt_callback_mutex_acquired) {
3088 // lock_first
3089 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3090 ompt_mutex_nest_lock, (omp_wait_id_t)lck, codeptr);
3091 }
3092 } else {
3093 if (ompt_enabled.ompt_callback_nest_lock) {
3094 // lock_next
3095
ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( 3096 ompt_mutex_scope_begin, (omp_wait_id_t)lck, codeptr); 3097 } 3098 } 3099 } 3100 #endif 3101 return rc; 3102 3103 /* Can't use serial interval since not block structured */ 3104 3105 #endif // KMP_USE_DYNAMIC_LOCK 3106 } 3107 3108 // Interface to fast scalable reduce methods routines 3109 3110 // keep the selected method in a thread local structure for cross-function 3111 // usage: will be used in __kmpc_end_reduce* functions; 3112 // another solution: to re-determine the method one more time in 3113 // __kmpc_end_reduce* functions (new prototype required then) 3114 // AT: which solution is better? 3115 #define __KMP_SET_REDUCTION_METHOD(gtid, rmethod) \ 3116 ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod)) 3117 3118 #define __KMP_GET_REDUCTION_METHOD(gtid) \ 3119 (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) 3120 3121 // description of the packed_reduction_method variable: look at the macros in 3122 // kmp.h 3123 3124 // used in a critical section reduce block 3125 static __forceinline void 3126 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid, 3127 kmp_critical_name *crit) { 3128 3129 // this lock was visible to a customer and to the threading profile tool as a 3130 // serial overhead span (although it's used for an internal purpose only) 3131 // why was it visible in previous implementation? 3132 // should we keep it visible in new reduce block? 3133 kmp_user_lock_p lck; 3134 3135 #if KMP_USE_DYNAMIC_LOCK 3136 3137 kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit; 3138 // Check if it is initialized. 3139 if (*lk == 0) { 3140 if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) { 3141 KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0, 3142 KMP_GET_D_TAG(__kmp_user_lock_seq)); 3143 } else { 3144 __kmp_init_indirect_csptr(crit, loc, global_tid, 3145 KMP_GET_I_TAG(__kmp_user_lock_seq)); 3146 } 3147 } 3148 // Branch for accessing the actual lock object and set operation. This 3149 // branching is inevitable since this lock initialization does not follow the 3150 // normal dispatch path (lock table is not used). 3151 if (KMP_EXTRACT_D_TAG(lk) != 0) { 3152 lck = (kmp_user_lock_p)lk; 3153 KMP_DEBUG_ASSERT(lck != NULL); 3154 if (__kmp_env_consistency_check) { 3155 __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq); 3156 } 3157 KMP_D_LOCK_FUNC(lk, set)(lk, global_tid); 3158 } else { 3159 kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk); 3160 lck = ilk->lock; 3161 KMP_DEBUG_ASSERT(lck != NULL); 3162 if (__kmp_env_consistency_check) { 3163 __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq); 3164 } 3165 KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid); 3166 } 3167 3168 #else // KMP_USE_DYNAMIC_LOCK 3169 3170 // We know that the fast reduction code is only emitted by Intel compilers 3171 // with 32 byte critical sections. If there isn't enough space, then we 3172 // have to use a pointer. 
3173 if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) { 3174 lck = (kmp_user_lock_p)crit; 3175 } else { 3176 lck = __kmp_get_critical_section_ptr(crit, loc, global_tid); 3177 } 3178 KMP_DEBUG_ASSERT(lck != NULL); 3179 3180 if (__kmp_env_consistency_check) 3181 __kmp_push_sync(global_tid, ct_critical, loc, lck); 3182 3183 __kmp_acquire_user_lock_with_checks(lck, global_tid); 3184 3185 #endif // KMP_USE_DYNAMIC_LOCK 3186 } 3187 3188 // used in a critical section reduce block 3189 static __forceinline void 3190 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid, 3191 kmp_critical_name *crit) { 3192 3193 kmp_user_lock_p lck; 3194 3195 #if KMP_USE_DYNAMIC_LOCK 3196 3197 if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) { 3198 lck = (kmp_user_lock_p)crit; 3199 if (__kmp_env_consistency_check) 3200 __kmp_pop_sync(global_tid, ct_critical, loc); 3201 KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid); 3202 } else { 3203 kmp_indirect_lock_t *ilk = 3204 (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit)); 3205 if (__kmp_env_consistency_check) 3206 __kmp_pop_sync(global_tid, ct_critical, loc); 3207 KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid); 3208 } 3209 3210 #else // KMP_USE_DYNAMIC_LOCK 3211 3212 // We know that the fast reduction code is only emitted by Intel compilers 3213 // with 32 byte critical sections. If there isn't enough space, then we have 3214 // to use a pointer. 3215 if (__kmp_base_user_lock_size > 32) { 3216 lck = *((kmp_user_lock_p *)crit); 3217 KMP_ASSERT(lck != NULL); 3218 } else { 3219 lck = (kmp_user_lock_p)crit; 3220 } 3221 3222 if (__kmp_env_consistency_check) 3223 __kmp_pop_sync(global_tid, ct_critical, loc); 3224 3225 __kmp_release_user_lock_with_checks(lck, global_tid); 3226 3227 #endif // KMP_USE_DYNAMIC_LOCK 3228 } // __kmp_end_critical_section_reduce_block 3229 3230 #if OMP_40_ENABLED 3231 static __forceinline int 3232 __kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p, 3233 int *task_state) { 3234 kmp_team_t *team; 3235 3236 // Check if we are inside the teams construct? 3237 if (th->th.th_teams_microtask) { 3238 *team_p = team = th->th.th_team; 3239 if (team->t.t_level == th->th.th_teams_level) { 3240 // This is reduction at teams construct. 3241 KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0 3242 // Let's swap teams temporarily for the reduction. 3243 th->th.th_info.ds.ds_tid = team->t.t_master_tid; 3244 th->th.th_team = team->t.t_parent; 3245 th->th.th_team_nproc = th->th.th_team->t.t_nproc; 3246 th->th.th_task_team = th->th.th_team->t.t_task_team[0]; 3247 *task_state = th->th.th_task_state; 3248 th->th.th_task_state = 0; 3249 3250 return 1; 3251 } 3252 } 3253 return 0; 3254 } 3255 3256 static __forceinline void 3257 __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) { 3258 // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction. 3259 th->th.th_info.ds.ds_tid = 0; 3260 th->th.th_team = team; 3261 th->th.th_team_nproc = team->t.t_nproc; 3262 th->th.th_task_team = team->t.t_task_team[task_state]; 3263 th->th.th_task_state = task_state; 3264 } 3265 #endif 3266 3267 /* 2.a.i. Reduce Block without a terminating barrier */ 3268 /*! 
3269 @ingroup SYNCHRONIZATION 3270 @param loc source location information 3271 @param global_tid global thread number 3272 @param num_vars number of items (variables) to be reduced 3273 @param reduce_size size of data in bytes to be reduced 3274 @param reduce_data pointer to data to be reduced 3275 @param reduce_func callback function providing reduction operation on two 3276 operands and returning result of reduction in lhs_data 3277 @param lck pointer to the unique lock data structure 3278 @result 1 for the master thread, 0 for all other team threads, 2 for all team 3279 threads if atomic reduction needed 3280 3281 The nowait version is used for a reduce clause with the nowait argument. 3282 */ 3283 kmp_int32 3284 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, 3285 size_t reduce_size, void *reduce_data, 3286 void (*reduce_func)(void *lhs_data, void *rhs_data), 3287 kmp_critical_name *lck) { 3288 3289 KMP_COUNT_BLOCK(REDUCE_nowait); 3290 int retval = 0; 3291 PACKED_REDUCTION_METHOD_T packed_reduction_method; 3292 #if OMP_40_ENABLED 3293 kmp_info_t *th; 3294 kmp_team_t *team; 3295 int teams_swapped = 0, task_state; 3296 #endif 3297 KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid)); 3298 3299 // why do we need this initialization here at all? 3300 // Reduction clause can not be used as a stand-alone directive. 3301 3302 // do not call __kmp_serial_initialize(), it will be called by 3303 // __kmp_parallel_initialize() if needed 3304 // possible detection of false-positive race by the threadchecker ??? 3305 if (!TCR_4(__kmp_init_parallel)) 3306 __kmp_parallel_initialize(); 3307 3308 // check correctness of reduce block nesting 3309 #if KMP_USE_DYNAMIC_LOCK 3310 if (__kmp_env_consistency_check) 3311 __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0); 3312 #else 3313 if (__kmp_env_consistency_check) 3314 __kmp_push_sync(global_tid, ct_reduce, loc, NULL); 3315 #endif 3316 3317 #if OMP_40_ENABLED 3318 th = __kmp_thread_from_gtid(global_tid); 3319 teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state); 3320 #endif // OMP_40_ENABLED 3321 3322 // packed_reduction_method value will be reused by __kmp_end_reduce* function, 3323 // the value should be kept in a variable 3324 // the variable should be either a construct-specific or thread-specific 3325 // property, not a team specific property 3326 // (a thread can reach the next reduce block on the next construct, reduce 3327 // method may differ on the next construct) 3328 // an ident_t "loc" parameter could be used as a construct-specific property 3329 // (what if loc == 0?) 
3330 // (if both construct-specific and team-specific variables were shared, 3331 // then unness extra syncs should be needed) 3332 // a thread-specific variable is better regarding two issues above (next 3333 // construct and extra syncs) 3334 // a thread-specific "th_local.reduction_method" variable is used currently 3335 // each thread executes 'determine' and 'set' lines (no need to execute by one 3336 // thread, to avoid unness extra syncs) 3337 3338 packed_reduction_method = __kmp_determine_reduction_method( 3339 loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck); 3340 __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method); 3341 3342 if (packed_reduction_method == critical_reduce_block) { 3343 3344 __kmp_enter_critical_section_reduce_block(loc, global_tid, lck); 3345 retval = 1; 3346 3347 } else if (packed_reduction_method == empty_reduce_block) { 3348 3349 // usage: if team size == 1, no synchronization is required ( Intel 3350 // platforms only ) 3351 retval = 1; 3352 3353 } else if (packed_reduction_method == atomic_reduce_block) { 3354 3355 retval = 2; 3356 3357 // all threads should do this pop here (because __kmpc_end_reduce_nowait() 3358 // won't be called by the code gen) 3359 // (it's not quite good, because the checking block has been closed by 3360 // this 'pop', 3361 // but atomic operation has not been executed yet, will be executed 3362 // slightly later, literally on next instruction) 3363 if (__kmp_env_consistency_check) 3364 __kmp_pop_sync(global_tid, ct_reduce, loc); 3365 3366 } else if (TEST_REDUCTION_METHOD(packed_reduction_method, 3367 tree_reduce_block)) { 3368 3369 // AT: performance issue: a real barrier here 3370 // AT: (if master goes slow, other threads are blocked here waiting for the 3371 // master to come and release them) 3372 // AT: (it's not what a customer might expect specifying NOWAIT clause) 3373 // AT: (specifying NOWAIT won't result in improvement of performance, it'll 3374 // be confusing to a customer) 3375 // AT: another implementation of *barrier_gather*nowait() (or some other design) 3376 // might go faster and be more in line with sense of NOWAIT 3377 // AT: TO DO: do epcc test and compare times 3378 3379 // this barrier should be invisible to a customer and to the threading profile 3380 // tool (it's neither a terminating barrier nor customer's code, it's 3381 // used for an internal purpose) 3382 #if OMPT_SUPPORT 3383 // JP: can this barrier potentially leed to task scheduling? 3384 // JP: as long as there is a barrier in the implementation, OMPT should and 3385 // will provide the barrier events 3386 // so we set-up the necessary frame/return addresses. 3387 omp_frame_t *ompt_frame; 3388 if (ompt_enabled.enabled) { 3389 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); 3390 if (ompt_frame->enter_frame == NULL) 3391 ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1); 3392 OMPT_STORE_RETURN_ADDRESS(global_tid); 3393 } 3394 #endif 3395 #if USE_ITT_NOTIFY 3396 __kmp_threads[global_tid]->th.th_ident = loc; 3397 #endif 3398 retval = 3399 __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method), 3400 global_tid, FALSE, reduce_size, reduce_data, reduce_func); 3401 retval = (retval != 0) ? 
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ompt_frame->enter_frame = NULL;
    }
#endif

    // All workers except the master should do this pop here
    // (none of the other workers will get to __kmpc_end_reduce_nowait()).
    if (__kmp_env_consistency_check) {
      if (retval == 0) {
        __kmp_pop_sync(global_tid, ct_reduce, loc);
      }
    }

  } else {

    // should never reach this block
    KMP_ASSERT(0); // "unexpected method"
  }
#if OMP_40_ENABLED
  if (teams_swapped) {
    __kmp_restore_swapped_teams(th, team, task_state);
  }
#endif
  KA_TRACE(
      10,
      ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
       global_tid, packed_reduction_method, retval));

  return retval;
}

/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid global thread id.
@param lck pointer to the unique lock data structure

Finish the execution of a reduce nowait.
*/
void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
                              kmp_critical_name *lck) {

  PACKED_REDUCTION_METHOD_T packed_reduction_method;

  KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));

  packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);

  if (packed_reduction_method == critical_reduce_block) {

    __kmp_end_critical_section_reduce_block(loc, global_tid, lck);

  } else if (packed_reduction_method == empty_reduce_block) {

    // usage: if team size == 1, no synchronization is required (on Intel
    // platforms only)

  } else if (packed_reduction_method == atomic_reduce_block) {

    // Neither the master nor the other workers should get here
    // (the code gen does not generate this call in case 2: atomic reduce
    // block). It would actually be better to remove this else-if entirely;
    // after removal the value would be checked by the 'else' and would assert.

  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
                                   tree_reduce_block)) {

    // only master gets here

  } else {

    // should never reach this block
    KMP_ASSERT(0); // "unexpected method"
  }

  if (__kmp_env_consistency_check)
    __kmp_pop_sync(global_tid, ct_reduce, loc);

  KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
                global_tid, packed_reduction_method));

  return;
}
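
/* Illustrative only (not part of the runtime): a minimal sketch of how a
   compiler might lower "#pragma omp for nowait reduction(+ : sum)" onto the
   entry points above, following the documented return protocol: 1 means this
   thread combines the private copies and calls __kmpc_end_reduce_nowait(),
   2 means combine with atomics (no end call on this path), 0 means nothing
   left to do. The variable names, reducer and reduce_data layout below are
   hypothetical.

   static void reducer(void *lhs, void *rhs) {
     *(double *)lhs += *(double *)rhs; // combine two private partial sums
   }
   static kmp_critical_name reduction_lock; // zero-initialized lock storage

   // ... at the end of the outlined worksharing region:
   double priv_sum = ...;          // this thread's partial result
   void *reduce_data = &priv_sum;  // single reduction item
   switch (__kmpc_reduce_nowait(loc, gtid, 1, sizeof(double), reduce_data,
                                reducer, &reduction_lock)) {
   case 1: // this thread folds its value into the shared variable
     shared_sum += priv_sum;
     __kmpc_end_reduce_nowait(loc, gtid, &reduction_lock);
     break;
   case 2: // atomic path; note __kmpc_end_reduce_nowait() is not called here
     // shared_sum is updated with a single atomic add of priv_sum
     break;
   default: // 0: another thread already owns the combined result
     break;
   }
*/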

/* 2.a.ii. Reduce Block with a terminating barrier */

/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid global thread number
@param num_vars number of items (variables) to be reduced
@param reduce_size size of data in bytes to be reduced
@param reduce_data pointer to data to be reduced
@param reduce_func callback function providing reduction operation on two
operands and returning result of reduction in lhs_data
@param lck pointer to the unique lock data structure
@result 1 for the master thread, 0 for all other team threads, 2 for all team
threads if atomic reduction needed

A blocking reduce that includes an implicit barrier.
*/
kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
                        size_t reduce_size, void *reduce_data,
                        void (*reduce_func)(void *lhs_data, void *rhs_data),
                        kmp_critical_name *lck) {
  KMP_COUNT_BLOCK(REDUCE_wait);
  int retval = 0;
  PACKED_REDUCTION_METHOD_T packed_reduction_method;
#if OMP_40_ENABLED
  kmp_info_t *th;
  kmp_team_t *team;
  int teams_swapped = 0, task_state;
#endif

  KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));

  // Why do we need this initialization here at all?
  // A reduction clause cannot be a stand-alone directive.

  // Do not call __kmp_serial_initialize(); it will be called by
  // __kmp_parallel_initialize() if needed.
  // Possible false-positive race detection by the thread checker?
  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  // check correctness of reduce block nesting
#if KMP_USE_DYNAMIC_LOCK
  if (__kmp_env_consistency_check)
    __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
#else
  if (__kmp_env_consistency_check)
    __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
#endif

#if OMP_40_ENABLED
  th = __kmp_thread_from_gtid(global_tid);
  teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
#endif // OMP_40_ENABLED

  packed_reduction_method = __kmp_determine_reduction_method(
      loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
  __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);

  if (packed_reduction_method == critical_reduce_block) {

    __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
    retval = 1;

  } else if (packed_reduction_method == empty_reduce_block) {

    // usage: if team size == 1, no synchronization is required (Intel
    // platforms only)
    retval = 1;

  } else if (packed_reduction_method == atomic_reduce_block) {

    retval = 2;

  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
                                   tree_reduce_block)) {

    // case tree_reduce_block:
    // This barrier should be visible to a customer and to the threading
    // profile tool (it is a terminating barrier on constructs if NOWAIT is
    // not specified).
#if OMPT_SUPPORT
    omp_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
      if (ompt_frame->enter_frame == NULL)
        ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
      OMPT_STORE_RETURN_ADDRESS(global_tid);
    }
#endif
#if USE_ITT_NOTIFY
    __kmp_threads[global_tid]->th.th_ident =
        loc; // needed for correct notification of frames
#endif
    retval =
        __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
                      global_tid, TRUE, reduce_size, reduce_data, reduce_func);
    retval = (retval != 0) ? (0) : (1);
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ompt_frame->enter_frame = NULL;
    }
#endif

    // All workers except the master should do this pop here
    // (no worker other than the master will enter __kmpc_end_reduce()).
    if (__kmp_env_consistency_check) {
      if (retval == 0) { // 0: all other workers; 1: master
        __kmp_pop_sync(global_tid, ct_reduce, loc);
      }
    }

  } else {

    // should never reach this block
    KMP_ASSERT(0); // "unexpected method"
  }
#if OMP_40_ENABLED
  if (teams_swapped) {
    __kmp_restore_swapped_teams(th, team, task_state);
  }
#endif

  KA_TRACE(10,
           ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
            global_tid, packed_reduction_method, retval));

  return retval;
}

/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid global thread id.
@param lck pointer to the unique lock data structure

Finish the execution of a blocking reduce.
The <tt>lck</tt> pointer must be the same as that used in the corresponding
start function.
*/
void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
                       kmp_critical_name *lck) {

  PACKED_REDUCTION_METHOD_T packed_reduction_method;
#if OMP_40_ENABLED
  kmp_info_t *th;
  kmp_team_t *team;
  int teams_swapped = 0, task_state;
#endif

  KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));

#if OMP_40_ENABLED
  th = __kmp_thread_from_gtid(global_tid);
  teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
#endif // OMP_40_ENABLED

  packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);

  // This barrier should be visible to a customer and to the threading profile
  // tool (it is a terminating barrier on constructs if NOWAIT is not
  // specified).

  if (packed_reduction_method == critical_reduce_block) {

    __kmp_end_critical_section_reduce_block(loc, global_tid, lck);

    // TODO: implicit barrier: should be exposed
#if OMPT_SUPPORT
    omp_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
      if (ompt_frame->enter_frame == NULL)
        ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
      OMPT_STORE_RETURN_ADDRESS(global_tid);
    }
#endif
#if USE_ITT_NOTIFY
    __kmp_threads[global_tid]->th.th_ident = loc;
#endif
    __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ompt_frame->enter_frame = NULL;
    }
#endif

  } else if (packed_reduction_method == empty_reduce_block) {

    // usage: if team size == 1, no synchronization is required (Intel
    // platforms only)

    // TODO: implicit barrier: should be exposed
#if OMPT_SUPPORT
    omp_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
      if (ompt_frame->enter_frame == NULL)
        ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
      OMPT_STORE_RETURN_ADDRESS(global_tid);
    }
#endif
#if USE_ITT_NOTIFY
    __kmp_threads[global_tid]->th.th_ident = loc;
#endif
    __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ompt_frame->enter_frame = NULL;
    }
#endif

  } else if (packed_reduction_method == atomic_reduce_block) {

#if OMPT_SUPPORT
    omp_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
      if (ompt_frame->enter_frame == NULL)
        ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
      OMPT_STORE_RETURN_ADDRESS(global_tid);
    }
#endif
    // TODO: implicit barrier: should be exposed
#if USE_ITT_NOTIFY
    __kmp_threads[global_tid]->th.th_ident = loc;
#endif
    __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ompt_frame->enter_frame = NULL;
    }
#endif

  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
                                   tree_reduce_block)) {

    // only master executes here (master releases all other workers)
    __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
                            global_tid);

  } else {

    // should never reach this block
    KMP_ASSERT(0); // "unexpected method"
  }
#if OMP_40_ENABLED
  if (teams_swapped) {
    __kmp_restore_swapped_teams(th, team, task_state);
  }
#endif

  if (__kmp_env_consistency_check)
    __kmp_pop_sync(global_tid, ct_reduce, loc);

  KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
                global_tid, packed_reduction_method));

  return;
}
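
/* Illustrative only (not part of the runtime): the blocking counterpart of the
   sketch above, e.g. for "#pragma omp for reduction(+ : sum)" without nowait.
   The main difference a compiler has to respect is that __kmpc_end_reduce()
   supplies the terminating barrier, so it is reached on the atomic path
   (return value 2) as well, not only on the value-1 path. The names are the
   same hypothetical ones as in the earlier sketch.

   switch (__kmpc_reduce(loc, gtid, 1, sizeof(double), reduce_data, reducer,
                         &reduction_lock)) {
   case 1:
     shared_sum += priv_sum;
     __kmpc_end_reduce(loc, gtid, &reduction_lock); // barrier for the team
     break;
   case 2:
     // shared_sum is updated with a single atomic add of priv_sum
     __kmpc_end_reduce(loc, gtid, &reduction_lock); // barrier for the team
     break;
   default: // 0: worker in the tree reduction; nothing more to do here
     break;
   }
*/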

#undef __KMP_GET_REDUCTION_METHOD
#undef __KMP_SET_REDUCTION_METHOD

/* end of interface to fast scalable reduce routines */

kmp_uint64 __kmpc_get_taskid() {

  kmp_int32 gtid;
  kmp_info_t *thread;

  gtid = __kmp_get_gtid();
  if (gtid < 0) {
    return 0;
  }
  thread = __kmp_thread_from_gtid(gtid);
  return thread->th.th_current_task->td_task_id;

} // __kmpc_get_taskid

kmp_uint64 __kmpc_get_parent_taskid() {

  kmp_int32 gtid;
  kmp_info_t *thread;
  kmp_taskdata_t *parent_task;

  gtid = __kmp_get_gtid();
  if (gtid < 0) {
    return 0;
  }
  thread = __kmp_thread_from_gtid(gtid);
  parent_task = thread->th.th_current_task->td_parent;
  return (parent_task == NULL ? 0 : parent_task->td_task_id);

} // __kmpc_get_parent_taskid

#if OMP_45_ENABLED
/*!
@ingroup WORK_SHARING
@param loc source location information.
@param gtid global thread number.
@param num_dims number of associated doacross loops.
@param dims info on loops bounds.

Initialize doacross loop information.
The compiler is expected to send us inclusive bounds,
e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2.
*/
void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
                          const struct kmp_dim *dims) {
  int j, idx;
  kmp_int64 last, trace_count;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_uint32 *flags;
  kmp_disp_t *pr_buf = th->th.th_dispatch;
  dispatch_shared_info_t *sh_buf;

  KA_TRACE(
      20,
      ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
       gtid, num_dims, !team->t.t_serialized));
  KMP_DEBUG_ASSERT(dims != NULL);
  KMP_DEBUG_ASSERT(num_dims > 0);

  if (team->t.t_serialized) {
    KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
    return; // no dependencies if team is serialized
  }
  KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
  idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
                                       // the next loop
  sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];

  // Save bounds info into allocated private buffer
  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
  pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
      th, sizeof(kmp_int64) * (4 * num_dims + 1));
  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
  pr_buf->th_doacross_info[0] =
      (kmp_int64)num_dims; // first element is number of dimensions
  // Save also address of num_done in order to access it later without knowing
  // the buffer index
  pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
  pr_buf->th_doacross_info[2] = dims[0].lo;
  pr_buf->th_doacross_info[3] = dims[0].up;
  pr_buf->th_doacross_info[4] = dims[0].st;
  last = 5;
  for (j = 1; j < num_dims; ++j) {
    kmp_int64
        range_length; // To keep ranges of all dimensions but the first dims[0]
    if (dims[j].st == 1) { // most common case
      // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
      range_length = dims[j].up - dims[j].lo + 1;
    } else {
      if (dims[j].st > 0) {
        KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
        range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
      } else { // negative increment
        KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
        range_length =
            (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
      }
    }
    pr_buf->th_doacross_info[last++] = range_length;
    pr_buf->th_doacross_info[last++] = dims[j].lo;
    pr_buf->th_doacross_info[last++] = dims[j].up;
    pr_buf->th_doacross_info[last++] = dims[j].st;
  }

  // Compute total trip count.
  // Start with the range of dims[0], which we don't need to keep in the buffer.
  if (dims[0].st == 1) { // most common case
    trace_count = dims[0].up - dims[0].lo + 1;
  } else if (dims[0].st > 0) {
    KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
    trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
  } else { // negative increment
    KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
    trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
  }
  for (j = 1; j < num_dims; ++j) {
    trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
  }
  KMP_DEBUG_ASSERT(trace_count > 0);

  // Check if the shared buffer is not occupied by another loop (idx -
  // __kmp_dispatch_num_buffers)
  if (idx != sh_buf->doacross_buf_idx) {
    // Shared buffer is occupied, wait for it to be free
    __kmp_wait_yield_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
                       __kmp_eq_4, NULL);
  }
#if KMP_32_BIT_ARCH
  // Check if we are the first thread. After the CAS the first thread gets 0,
  // others get 1 if initialization is in progress, allocated pointer otherwise.
  // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
  flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
      (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
#else
  flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
      (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
#endif
  if (flags == NULL) {
    // we are the first thread, allocate the array of flags
    size_t size = trace_count / 8 + 8; // in bytes, use single bit per iteration
    flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
    KMP_MB();
    sh_buf->doacross_flags = flags;
  } else if (flags == (kmp_uint32 *)1) {
#if KMP_32_BIT_ARCH
    // initialization is still in progress, need to wait
    while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
#else
    while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
#endif
      KMP_YIELD(TRUE);
    KMP_MB();
  } else {
    KMP_MB();
  }
  KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
  pr_buf->th_doacross_flags =
      sh_buf->doacross_flags; // save private copy in order to not
                              // touch shared buffer on each iteration
  KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
}
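
/* Illustrative only (not part of the runtime): a minimal sketch of the call
   sequence a compiler might emit for a one-dimensional doacross loop such as

     #pragma omp for ordered(1)
     for (i = 2; i < 9; i += 2) {
       #pragma omp ordered depend(sink : i - 2)
       // ... loop body ...
       #pragma omp ordered depend(source)
     }

   using the inclusive bounds from the comment above (lo=2, up=8, st=2). The
   variable names are hypothetical; lb/ub denote this thread's chunk bounds,
   and every thread of the team executes this sequence.

   struct kmp_dim dim;
   dim.lo = 2; dim.up = 8; dim.st = 2;       // inclusive bounds, as documented
   __kmpc_doacross_init(loc, gtid, 1, &dim);
   for (kmp_int64 i = lb; i <= ub; i += 2) { // this thread's chunk
     kmp_int64 vec = i - 2;                  // depend(sink : i - 2)
     __kmpc_doacross_wait(loc, gtid, &vec);  // out-of-range sinks return early
     // ... loop body ...
     vec = i;                                // depend(source)
     __kmpc_doacross_post(loc, gtid, &vec);
   }
   __kmpc_doacross_fini(loc, gtid);
*/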

void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
  kmp_int32 shft, num_dims, i;
  kmp_uint32 flag;
  kmp_int64 iter_number; // iteration number of "collapsed" loop nest
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_disp_t *pr_buf;
  kmp_int64 lo, up, st;

  KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
  if (team->t.t_serialized) {
    KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
    return; // no dependencies if team is serialized
  }

  // calculate sequential iteration number and check out-of-bounds condition
  pr_buf = th->th.th_dispatch;
  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
  num_dims = pr_buf->th_doacross_info[0];
  lo = pr_buf->th_doacross_info[2];
  up = pr_buf->th_doacross_info[3];
  st = pr_buf->th_doacross_info[4];
  if (st == 1) { // most common case
    if (vec[0] < lo || vec[0] > up) {
      KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                    "bounds [%lld,%lld]\n",
                    gtid, vec[0], lo, up));
      return;
    }
    iter_number = vec[0] - lo;
  } else if (st > 0) {
    if (vec[0] < lo || vec[0] > up) {
      KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                    "bounds [%lld,%lld]\n",
                    gtid, vec[0], lo, up));
      return;
    }
    iter_number = (kmp_uint64)(vec[0] - lo) / st;
  } else { // negative increment
    if (vec[0] > lo || vec[0] < up) {
      KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                    "bounds [%lld,%lld]\n",
                    gtid, vec[0], lo, up));
      return;
    }
    iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
  }
  for (i = 1; i < num_dims; ++i) {
    kmp_int64 iter, ln;
    kmp_int32 j = i * 4;
    ln = pr_buf->th_doacross_info[j + 1];
    lo = pr_buf->th_doacross_info[j + 2];
    up = pr_buf->th_doacross_info[j + 3];
    st = pr_buf->th_doacross_info[j + 4];
    if (st == 1) {
      if (vec[i] < lo || vec[i] > up) {
        KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                      "bounds [%lld,%lld]\n",
                      gtid, vec[i], lo, up));
        return;
      }
      iter = vec[i] - lo;
    } else if (st > 0) {
      if (vec[i] < lo || vec[i] > up) {
        KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                      "bounds [%lld,%lld]\n",
                      gtid, vec[i], lo, up));
        return;
      }
      iter = (kmp_uint64)(vec[i] - lo) / st;
    } else { // st < 0
      if (vec[i] > lo || vec[i] < up) {
        KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                      "bounds [%lld,%lld]\n",
                      gtid, vec[i], lo, up));
        return;
      }
      iter = (kmp_uint64)(lo - vec[i]) / (-st);
    }
    iter_number = iter + ln * iter_number;
  }
  shft = iter_number % 32; // use 32-bit granularity
  iter_number >>= 5; // divided by 32
  flag = 1 << shft;
  while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
    KMP_YIELD(TRUE);
  }
  KMP_MB();
  KA_TRACE(20,
           ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
            gtid, (iter_number << 5) + shft));
}
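
/* Worked example of the flag indexing used above and in __kmpc_doacross_post()
   below: the linearized iteration number selects one bit in the shared array
   of 32-bit flag words. For iter_number == 70:
     shft = 70 % 32 = 6      -> bit 6
     word = 70 >> 5 = 2      -> th_doacross_flags[2]
     flag = 1 << 6  = 0x40
   so the waiter spins until (th_doacross_flags[2] & 0x40) becomes non-zero,
   which is exactly the bit the posting thread ORs in. */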

void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
  kmp_int32 shft, num_dims, i;
  kmp_uint32 flag;
  kmp_int64 iter_number; // iteration number of "collapsed" loop nest
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_disp_t *pr_buf;
  kmp_int64 lo, st;

  KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
  if (team->t.t_serialized) {
    KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
    return; // no dependencies if team is serialized
  }

  // calculate sequential iteration number (same as in "wait" but no
  // out-of-bounds checks)
  pr_buf = th->th.th_dispatch;
  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
  num_dims = pr_buf->th_doacross_info[0];
  lo = pr_buf->th_doacross_info[2];
  st = pr_buf->th_doacross_info[4];
  if (st == 1) { // most common case
    iter_number = vec[0] - lo;
  } else if (st > 0) {
    iter_number = (kmp_uint64)(vec[0] - lo) / st;
  } else { // negative increment
    iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
  }
  for (i = 1; i < num_dims; ++i) {
    kmp_int64 iter, ln;
    kmp_int32 j = i * 4;
    ln = pr_buf->th_doacross_info[j + 1];
    lo = pr_buf->th_doacross_info[j + 2];
    st = pr_buf->th_doacross_info[j + 4];
    if (st == 1) {
      iter = vec[i] - lo;
    } else if (st > 0) {
      iter = (kmp_uint64)(vec[i] - lo) / st;
    } else { // st < 0
      iter = (kmp_uint64)(lo - vec[i]) / (-st);
    }
    iter_number = iter + ln * iter_number;
  }
  shft = iter_number % 32; // use 32-bit granularity
  iter_number >>= 5; // divided by 32
  flag = 1 << shft;
  KMP_MB();
  if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
    KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
  KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
                (iter_number << 5) + shft));
}

void __kmpc_doacross_fini(ident_t *loc, int gtid) {
  kmp_int32 num_done;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_disp_t *pr_buf = th->th.th_dispatch;

  KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
  if (team->t.t_serialized) {
    KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
    return; // nothing to do
  }
  num_done = KMP_TEST_THEN_INC32((kmp_int32 *)pr_buf->th_doacross_info[1]) + 1;
  if (num_done == th->th.th_team_nproc) {
    // we are the last thread, need to free shared resources
    int idx = pr_buf->th_doacross_buf_idx - 1;
    dispatch_shared_info_t *sh_buf =
        &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
    KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
                     (kmp_int64)&sh_buf->doacross_num_done);
    KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
    KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
    __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
    sh_buf->doacross_flags = NULL;
    sh_buf->doacross_num_done = 0;
    sh_buf->doacross_buf_idx +=
        __kmp_dispatch_num_buffers; // free buffer for future re-use
  }
  // free private resources (need to keep buffer index forever)
  pr_buf->th_doacross_flags = NULL;
  __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
  pr_buf->th_doacross_info = NULL;
  KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
}
#endif

#if OMP_50_ENABLED
int __kmpc_get_target_offload(void) {
  if (!__kmp_init_serial) {
    __kmp_serial_initialize();
  }
  return __kmp_target_offload;
}
#endif // OMP_50_ENABLED

// end of file //