/*
 * kmp_barrier.cpp
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp_wait_release.h"
#include "kmp_barrier.h"
#include "kmp_itt.h"
#include "kmp_os.h"
#include "kmp_stats.h"
#include "ompt-specific.h"
// for distributed barrier
#include "kmp_affinity.h"

#if KMP_MIC
#include <immintrin.h>
#define USE_NGO_STORES 1
#endif // KMP_MIC

#include "tsan_annotations.h"

// ICV / go-flag copy shims.
//
// With KMP_MIC && USE_NGO_STORES:
//  - ngo_load(src) DECLARES a local `__m512d Vt` loaded from src, so it must
//    appear at most once per scope and before any ngo_store_* in that scope.
//  - ngo_store_icvs / ngo_store_go write that `Vt` to dst with
//    _mm512_storenrngo_pd and IGNORE their src argument.
//  - ngo_sync() issues a locked add of zero to the stack with a compiler
//    memory clobber (presumably to order the preceding stores -- confirm
//    against the intrinsic's documentation).
// Otherwise:
//  - ngo_load and ngo_sync expand to no-ops,
//  - ngo_store_icvs forwards to copy_icvs(dst, src),
//  - ngo_store_go copies CACHE_LINE bytes from src to dst with KMP_MEMCPY.
#if KMP_MIC && USE_NGO_STORES
// ICV copying
#define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
#define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
#else
#define ngo_load(src) ((void)0)
#define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
#define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
#define ngo_sync() ((void)0)
#endif /* KMP_MIC && USE_NGO_STORES */

void __kmp_print_structure(void); // Forward declaration

// ---------------------------- Barrier Algorithms ----------------------------
// Distributed barrier

// Compute how many threads to have polling each cache-line.
// We want to limit the number of writes to IDEAL_GO_RESOLUTION.
49 void distributedBarrier::computeVarsForN(size_t n) { 50 int nsockets = 1; 51 if (__kmp_topology) { 52 int socket_level = __kmp_topology->get_level(KMP_HW_SOCKET); 53 int core_level = __kmp_topology->get_level(KMP_HW_CORE); 54 int ncores_per_socket = 55 __kmp_topology->calculate_ratio(core_level, socket_level); 56 nsockets = __kmp_topology->get_count(socket_level); 57 58 if (nsockets <= 0) 59 nsockets = 1; 60 if (ncores_per_socket <= 0) 61 ncores_per_socket = 1; 62 63 threads_per_go = ncores_per_socket >> 1; 64 if (!fix_threads_per_go) { 65 // Minimize num_gos 66 if (threads_per_go > 4) { 67 if (KMP_OPTIMIZE_FOR_REDUCTIONS) { 68 threads_per_go = threads_per_go >> 1; 69 } 70 if (threads_per_go > 4 && nsockets == 1) 71 threads_per_go = threads_per_go >> 1; 72 } 73 } 74 if (threads_per_go == 0) 75 threads_per_go = 1; 76 fix_threads_per_go = true; 77 num_gos = n / threads_per_go; 78 if (n % threads_per_go) 79 num_gos++; 80 if (nsockets == 1 || num_gos == 1) 81 num_groups = 1; 82 else { 83 num_groups = num_gos / nsockets; 84 if (num_gos % nsockets) 85 num_groups++; 86 } 87 if (num_groups <= 0) 88 num_groups = 1; 89 gos_per_group = num_gos / num_groups; 90 if (num_gos % num_groups) 91 gos_per_group++; 92 threads_per_group = threads_per_go * gos_per_group; 93 } else { 94 num_gos = n / threads_per_go; 95 if (n % threads_per_go) 96 num_gos++; 97 if (num_gos == 1) 98 num_groups = 1; 99 else { 100 num_groups = num_gos / 2; 101 if (num_gos % 2) 102 num_groups++; 103 } 104 gos_per_group = num_gos / num_groups; 105 if (num_gos % num_groups) 106 gos_per_group++; 107 threads_per_group = threads_per_go * gos_per_group; 108 } 109 } 110 111 void distributedBarrier::computeGo(size_t n) { 112 // Minimize num_gos 113 for (num_gos = 1;; num_gos++) 114 if (IDEAL_CONTENTION * num_gos >= n) 115 break; 116 threads_per_go = n / num_gos; 117 if (n % num_gos) 118 threads_per_go++; 119 while (num_gos > MAX_GOS) { 120 threads_per_go++; 121 num_gos = n / threads_per_go; 122 if (n % 
threads_per_go) 123 num_gos++; 124 } 125 computeVarsForN(n); 126 } 127 128 // This function is to resize the barrier arrays when the new number of threads 129 // exceeds max_threads, which is the current size of all the arrays 130 void distributedBarrier::resize(size_t nthr) { 131 KMP_DEBUG_ASSERT(nthr > max_threads); 132 133 // expand to requested size * 2 134 max_threads = nthr * 2; 135 136 // allocate arrays to new max threads 137 for (int i = 0; i < MAX_ITERS; ++i) { 138 if (flags[i]) 139 flags[i] = (flags_s *)KMP_INTERNAL_REALLOC(flags[i], 140 max_threads * sizeof(flags_s)); 141 else 142 flags[i] = (flags_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(flags_s)); 143 } 144 145 if (go) 146 go = (go_s *)KMP_INTERNAL_REALLOC(go, max_threads * sizeof(go_s)); 147 else 148 go = (go_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(go_s)); 149 150 if (iter) 151 iter = (iter_s *)KMP_INTERNAL_REALLOC(iter, max_threads * sizeof(iter_s)); 152 else 153 iter = (iter_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(iter_s)); 154 155 if (sleep) 156 sleep = 157 (sleep_s *)KMP_INTERNAL_REALLOC(sleep, max_threads * sizeof(sleep_s)); 158 else 159 sleep = (sleep_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(sleep_s)); 160 } 161 162 // This function is to set all the go flags that threads might be waiting 163 // on, and when blocktime is not infinite, it should be followed by a wake-up 164 // call to each thread 165 kmp_uint64 distributedBarrier::go_release() { 166 kmp_uint64 next_go = iter[0].iter + distributedBarrier::MAX_ITERS; 167 for (size_t j = 0; j < num_gos; j++) { 168 go[j].go.store(next_go); 169 } 170 return next_go; 171 } 172 173 void distributedBarrier::go_reset() { 174 for (size_t j = 0; j < max_threads; ++j) { 175 for (size_t i = 0; i < distributedBarrier::MAX_ITERS; ++i) { 176 flags[i][j].stillNeed = 1; 177 } 178 go[j].go.store(0); 179 iter[j].iter = 0; 180 } 181 } 182 183 // This function inits/re-inits the distributed barrier for a particular number 184 // of threads. 
If a resize of arrays is needed, it calls the resize function. 185 void distributedBarrier::init(size_t nthr) { 186 size_t old_max = max_threads; 187 if (nthr > max_threads) { // need more space in arrays 188 resize(nthr); 189 } 190 191 for (size_t i = 0; i < max_threads; i++) { 192 for (size_t j = 0; j < distributedBarrier::MAX_ITERS; j++) { 193 flags[j][i].stillNeed = 1; 194 } 195 go[i].go.store(0); 196 iter[i].iter = 0; 197 if (i >= old_max) 198 sleep[i].sleep = false; 199 } 200 201 // Recalculate num_gos, etc. based on new nthr 202 computeVarsForN(nthr); 203 204 num_threads = nthr; 205 206 if (team_icvs == NULL) 207 team_icvs = __kmp_allocate(sizeof(kmp_internal_control_t)); 208 } 209 210 // This function is used only when KMP_BLOCKTIME is not infinite. 211 // static 212 void __kmp_dist_barrier_wakeup(enum barrier_type bt, kmp_team_t *team, 213 size_t start, size_t stop, size_t inc, 214 size_t tid) { 215 KMP_DEBUG_ASSERT(__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME); 216 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) 217 return; 218 219 kmp_info_t **other_threads = team->t.t_threads; 220 for (size_t thr = start; thr < stop; thr += inc) { 221 KMP_DEBUG_ASSERT(other_threads[thr]); 222 int gtid = other_threads[thr]->th.th_info.ds.ds_gtid; 223 // Wake up worker regardless of if it appears to be sleeping or not 224 __kmp_atomic_resume_64(gtid, (kmp_atomic_flag_64<> *)NULL); 225 } 226 } 227 228 static void __kmp_dist_barrier_gather( 229 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, 230 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) { 231 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_gather); 232 kmp_team_t *team; 233 distributedBarrier *b; 234 kmp_info_t **other_threads; 235 kmp_uint64 my_current_iter, my_next_iter; 236 kmp_uint32 nproc; 237 bool group_leader; 238 239 team = this_thr->th.th_team; 240 nproc = this_thr->th.th_team_nproc; 241 other_threads = team->t.t_threads; 242 b = team->t.b; 243 my_current_iter = 
b->iter[tid].iter; 244 my_next_iter = (my_current_iter + 1) % distributedBarrier::MAX_ITERS; 245 group_leader = ((tid % b->threads_per_group) == 0); 246 247 KA_TRACE(20, 248 ("__kmp_dist_barrier_gather: T#%d(%d:%d) enter; barrier type %d\n", 249 gtid, team->t.t_id, tid, bt)); 250 251 #if USE_ITT_BUILD && USE_ITT_NOTIFY 252 // Barrier imbalance - save arrive time to the thread 253 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) { 254 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = 255 __itt_get_timestamp(); 256 } 257 #endif 258 259 if (group_leader) { 260 // Start from the thread after the group leader 261 size_t group_start = tid + 1; 262 size_t group_end = tid + b->threads_per_group; 263 size_t threads_pending = 0; 264 265 if (group_end > nproc) 266 group_end = nproc; 267 do { // wait for threads in my group 268 threads_pending = 0; 269 // Check all the flags every time to avoid branch misspredict 270 for (size_t thr = group_start; thr < group_end; thr++) { 271 // Each thread uses a different cache line 272 threads_pending += b->flags[my_current_iter][thr].stillNeed; 273 } 274 // Execute tasks here 275 if (__kmp_tasking_mode != tskm_immediate_exec) { 276 kmp_task_team_t *task_team = this_thr->th.th_task_team; 277 if (task_team != NULL) { 278 if (TCR_SYNC_4(task_team->tt.tt_active)) { 279 if (KMP_TASKING_ENABLED(task_team)) { 280 int tasks_completed = FALSE; 281 __kmp_atomic_execute_tasks_64( 282 this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE, 283 &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0); 284 } else 285 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; 286 } 287 } else { 288 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; 289 } // if 290 } 291 if (TCR_4(__kmp_global.g.g_done)) { 292 if (__kmp_global.g.g_abort) 293 __kmp_abort_thread(); 294 break; 295 } else if (__kmp_tasking_mode != tskm_immediate_exec && 296 this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) { 297 this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP; 
298 } 299 } while (threads_pending > 0); 300 301 if (reduce) { // Perform reduction if needed 302 OMPT_REDUCTION_DECL(this_thr, gtid); 303 OMPT_REDUCTION_BEGIN; 304 // Group leader reduces all threads in group 305 for (size_t thr = group_start; thr < group_end; thr++) { 306 (*reduce)(this_thr->th.th_local.reduce_data, 307 other_threads[thr]->th.th_local.reduce_data); 308 } 309 OMPT_REDUCTION_END; 310 } 311 312 // Set flag for next iteration 313 b->flags[my_next_iter][tid].stillNeed = 1; 314 // Each thread uses a different cache line; resets stillNeed to 0 to 315 // indicate it has reached the barrier 316 b->flags[my_current_iter][tid].stillNeed = 0; 317 318 do { // wait for all group leaders 319 threads_pending = 0; 320 for (size_t thr = 0; thr < nproc; thr += b->threads_per_group) { 321 threads_pending += b->flags[my_current_iter][thr].stillNeed; 322 } 323 // Execute tasks here 324 if (__kmp_tasking_mode != tskm_immediate_exec) { 325 kmp_task_team_t *task_team = this_thr->th.th_task_team; 326 if (task_team != NULL) { 327 if (TCR_SYNC_4(task_team->tt.tt_active)) { 328 if (KMP_TASKING_ENABLED(task_team)) { 329 int tasks_completed = FALSE; 330 __kmp_atomic_execute_tasks_64( 331 this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE, 332 &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0); 333 } else 334 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; 335 } 336 } else { 337 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; 338 } // if 339 } 340 if (TCR_4(__kmp_global.g.g_done)) { 341 if (__kmp_global.g.g_abort) 342 __kmp_abort_thread(); 343 break; 344 } else if (__kmp_tasking_mode != tskm_immediate_exec && 345 this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) { 346 this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP; 347 } 348 } while (threads_pending > 0); 349 350 if (reduce) { // Perform reduction if needed 351 if (KMP_MASTER_TID(tid)) { // Master reduces over group leaders 352 OMPT_REDUCTION_DECL(this_thr, gtid); 353 OMPT_REDUCTION_BEGIN; 354 for (size_t thr = 
b->threads_per_group; thr < nproc; 355 thr += b->threads_per_group) { 356 (*reduce)(this_thr->th.th_local.reduce_data, 357 other_threads[thr]->th.th_local.reduce_data); 358 } 359 OMPT_REDUCTION_END; 360 } 361 } 362 } else { 363 // Set flag for next iteration 364 b->flags[my_next_iter][tid].stillNeed = 1; 365 // Each thread uses a different cache line; resets stillNeed to 0 to 366 // indicate it has reached the barrier 367 b->flags[my_current_iter][tid].stillNeed = 0; 368 } 369 370 KMP_MFENCE(); 371 372 KA_TRACE(20, 373 ("__kmp_dist_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n", 374 gtid, team->t.t_id, tid, bt)); 375 } 376 377 static void __kmp_dist_barrier_release( 378 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, 379 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) { 380 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_release); 381 kmp_team_t *team; 382 distributedBarrier *b; 383 kmp_bstate_t *thr_bar; 384 kmp_uint64 my_current_iter, next_go; 385 size_t my_go_index; 386 bool group_leader; 387 388 KA_TRACE(20, ("__kmp_dist_barrier_release: T#%d(%d) enter; barrier type %d\n", 389 gtid, tid, bt)); 390 391 thr_bar = &this_thr->th.th_bar[bt].bb; 392 393 if (!KMP_MASTER_TID(tid)) { 394 // workers and non-master group leaders need to check their presence in team 395 do { 396 if (this_thr->th.th_used_in_team.load() != 1 && 397 this_thr->th.th_used_in_team.load() != 3) { 398 // Thread is not in use in a team. Wait on location in tid's thread 399 // struct. 
The 0 value tells anyone looking that this thread is spinning 400 // or sleeping until this location becomes 3 again; 3 is the transition 401 // state to get to 1 which is waiting on go and being in the team 402 kmp_flag_32<false, false> my_flag(&(this_thr->th.th_used_in_team), 3); 403 if (KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 2, 404 0) || 405 this_thr->th.th_used_in_team.load() == 0) { 406 my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj)); 407 } 408 #if USE_ITT_BUILD && USE_ITT_NOTIFY 409 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) { 410 // In fork barrier where we could not get the object reliably 411 itt_sync_obj = 412 __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1); 413 // Cancel wait on previous parallel region... 414 __kmp_itt_task_starting(itt_sync_obj); 415 416 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) 417 return; 418 419 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); 420 if (itt_sync_obj != NULL) 421 // Call prepare as early as possible for "new" barrier 422 __kmp_itt_task_finished(itt_sync_obj); 423 } else 424 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ 425 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) 426 return; 427 } 428 if (this_thr->th.th_used_in_team.load() != 1 && 429 this_thr->th.th_used_in_team.load() != 3) // spurious wake-up? 430 continue; 431 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) 432 return; 433 434 // At this point, the thread thinks it is in use in a team, or in 435 // transition to be used in a team, but it might have reached this barrier 436 // before it was marked unused by the team. Unused threads are awoken and 437 // shifted to wait on local thread struct elsewhere. It also might reach 438 // this point by being picked up for use by a different team. Either way, 439 // we need to update the tid. 
440 tid = __kmp_tid_from_gtid(gtid); 441 team = this_thr->th.th_team; 442 KMP_DEBUG_ASSERT(tid >= 0); 443 KMP_DEBUG_ASSERT(team); 444 b = team->t.b; 445 my_current_iter = b->iter[tid].iter; 446 next_go = my_current_iter + distributedBarrier::MAX_ITERS; 447 my_go_index = tid / b->threads_per_go; 448 if (this_thr->th.th_used_in_team.load() == 3) { 449 KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 3, 1); 450 } 451 // Check if go flag is set 452 if (b->go[my_go_index].go.load() != next_go) { 453 // Wait on go flag on team 454 kmp_atomic_flag_64<false, true> my_flag( 455 &(b->go[my_go_index].go), next_go, &(b->sleep[tid].sleep)); 456 my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj)); 457 KMP_DEBUG_ASSERT(my_current_iter == b->iter[tid].iter || 458 b->iter[tid].iter == 0); 459 KMP_DEBUG_ASSERT(b->sleep[tid].sleep == false); 460 } 461 462 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) 463 return; 464 // At this point, the thread's go location was set. This means the primary 465 // thread is safely in the barrier, and so this thread's data is 466 // up-to-date, but we should check again that this thread is really in 467 // use in the team, as it could have been woken up for the purpose of 468 // changing team size, or reaping threads at shutdown. 469 if (this_thr->th.th_used_in_team.load() == 1) 470 break; 471 } while (1); 472 473 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) 474 return; 475 476 group_leader = ((tid % b->threads_per_group) == 0); 477 if (group_leader) { 478 // Tell all the threads in my group they can go! 479 for (size_t go_idx = my_go_index + 1; 480 go_idx < my_go_index + b->gos_per_group; go_idx++) { 481 b->go[go_idx].go.store(next_go); 482 } 483 // Fence added so that workers can see changes to go. sfence inadequate. 
484 KMP_MFENCE(); 485 } 486 487 #if KMP_BARRIER_ICV_PUSH 488 if (propagate_icvs) { // copy ICVs to final dest 489 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, 490 tid, FALSE); 491 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, 492 (kmp_internal_control_t *)team->t.b->team_icvs); 493 copy_icvs(&thr_bar->th_fixed_icvs, 494 &team->t.t_implicit_task_taskdata[tid].td_icvs); 495 } 496 #endif 497 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && group_leader) { 498 // This thread is now awake and participating in the barrier; 499 // wake up the other threads in the group 500 size_t nproc = this_thr->th.th_team_nproc; 501 size_t group_end = tid + b->threads_per_group; 502 if (nproc < group_end) 503 group_end = nproc; 504 __kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid); 505 } 506 } else { // Primary thread 507 team = this_thr->th.th_team; 508 b = team->t.b; 509 my_current_iter = b->iter[tid].iter; 510 next_go = my_current_iter + distributedBarrier::MAX_ITERS; 511 #if KMP_BARRIER_ICV_PUSH 512 if (propagate_icvs) { 513 // primary thread has ICVs in final destination; copy 514 copy_icvs(&thr_bar->th_fixed_icvs, 515 &team->t.t_implicit_task_taskdata[tid].td_icvs); 516 } 517 #endif 518 // Tell all the group leaders they can go! 519 for (size_t go_idx = 0; go_idx < b->num_gos; go_idx += b->gos_per_group) { 520 b->go[go_idx].go.store(next_go); 521 } 522 523 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { 524 // Wake-up the group leaders 525 size_t nproc = this_thr->th.th_team_nproc; 526 __kmp_dist_barrier_wakeup(bt, team, tid + b->threads_per_group, nproc, 527 b->threads_per_group, tid); 528 } 529 530 // Tell all the threads in my group they can go! 531 for (size_t go_idx = 1; go_idx < b->gos_per_group; go_idx++) { 532 b->go[go_idx].go.store(next_go); 533 } 534 535 // Fence added so that workers can see changes to go. sfence inadequate. 
536 KMP_MFENCE(); 537 538 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { 539 // Wake-up the other threads in my group 540 size_t nproc = this_thr->th.th_team_nproc; 541 size_t group_end = tid + b->threads_per_group; 542 if (nproc < group_end) 543 group_end = nproc; 544 __kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid); 545 } 546 } 547 // Update to next iteration 548 KMP_ASSERT(my_current_iter == b->iter[tid].iter); 549 b->iter[tid].iter = (b->iter[tid].iter + 1) % distributedBarrier::MAX_ITERS; 550 551 KA_TRACE( 552 20, ("__kmp_dist_barrier_release: T#%d(%d:%d) exit for barrier type %d\n", 553 gtid, team->t.t_id, tid, bt)); 554 } 555 556 // Linear Barrier 557 template <bool cancellable = false> 558 static bool __kmp_linear_barrier_gather_template( 559 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, 560 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) { 561 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather); 562 kmp_team_t *team = this_thr->th.th_team; 563 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; 564 kmp_info_t **other_threads = team->t.t_threads; 565 566 KA_TRACE( 567 20, 568 ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n", 569 gtid, team->t.t_id, tid, bt)); 570 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]); 571 572 #if USE_ITT_BUILD && USE_ITT_NOTIFY 573 // Barrier imbalance - save arrive time to the thread 574 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) { 575 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = 576 __itt_get_timestamp(); 577 } 578 #endif 579 // We now perform a linear reduction to signal that all of the threads have 580 // arrived. 
581 if (!KMP_MASTER_TID(tid)) { 582 KA_TRACE(20, 583 ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)" 584 "arrived(%p): %llu => %llu\n", 585 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team), 586 team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived, 587 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP)); 588 // Mark arrival to primary thread 589 /* After performing this write, a worker thread may not assume that the team 590 is valid any more - it could be deallocated by the primary thread at any 591 time. */ 592 ANNOTATE_BARRIER_BEGIN(this_thr); 593 kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[0]); 594 flag.release(); 595 } else { 596 kmp_balign_team_t *team_bar = &team->t.t_bar[bt]; 597 int nproc = this_thr->th.th_team_nproc; 598 int i; 599 // Don't have to worry about sleep bit here or atomic since team setting 600 kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP; 601 602 // Collect all the worker team member threads. 603 for (i = 1; i < nproc; ++i) { 604 #if KMP_CACHE_MANAGE 605 // Prefetch next thread's arrived count 606 if (i + 1 < nproc) 607 KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived); 608 #endif /* KMP_CACHE_MANAGE */ 609 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) " 610 "arrived(%p) == %llu\n", 611 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team), 612 team->t.t_id, i, 613 &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state)); 614 615 // Wait for worker thread to arrive 616 if (cancellable) { 617 kmp_flag_64<true, false> flag( 618 &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state); 619 if (flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj))) 620 return true; 621 } else { 622 kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived, 623 new_state); 624 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); 625 } 626 ANNOTATE_BARRIER_END(other_threads[i]); 627 #if USE_ITT_BUILD && USE_ITT_NOTIFY 628 // Barrier imbalance - 
write min of the thread time and the other thread 629 // time to the thread. 630 if (__kmp_forkjoin_frames_mode == 2) { 631 this_thr->th.th_bar_min_time = KMP_MIN( 632 this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time); 633 } 634 #endif 635 if (reduce) { 636 KA_TRACE(100, 637 ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", 638 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team), 639 team->t.t_id, i)); 640 ANNOTATE_REDUCE_AFTER(reduce); 641 OMPT_REDUCTION_DECL(this_thr, gtid); 642 OMPT_REDUCTION_BEGIN; 643 (*reduce)(this_thr->th.th_local.reduce_data, 644 other_threads[i]->th.th_local.reduce_data); 645 OMPT_REDUCTION_END; 646 ANNOTATE_REDUCE_BEFORE(reduce); 647 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar); 648 } 649 } 650 // Don't have to worry about sleep bit here or atomic since team setting 651 team_bar->b_arrived = new_state; 652 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d " 653 "arrived(%p) = %llu\n", 654 gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived, 655 new_state)); 656 } 657 KA_TRACE( 658 20, 659 ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n", 660 gtid, team->t.t_id, tid, bt)); 661 return false; 662 } 663 664 template <bool cancellable = false> 665 static bool __kmp_linear_barrier_release_template( 666 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, 667 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) { 668 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release); 669 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; 670 kmp_team_t *team; 671 672 if (KMP_MASTER_TID(tid)) { 673 unsigned int i; 674 kmp_uint32 nproc = this_thr->th.th_team_nproc; 675 kmp_info_t **other_threads; 676 677 team = __kmp_threads[gtid]->th.th_team; 678 KMP_DEBUG_ASSERT(team != NULL); 679 other_threads = team->t.t_threads; 680 681 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) primary enter for " 682 "barrier type %d\n", 683 gtid, team->t.t_id, tid, bt)); 684 685 
if (nproc > 1) { 686 #if KMP_BARRIER_ICV_PUSH 687 { 688 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy); 689 if (propagate_icvs) { 690 ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs); 691 for (i = 1; i < nproc; ++i) { 692 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i], 693 team, i, FALSE); 694 ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs, 695 &team->t.t_implicit_task_taskdata[0].td_icvs); 696 } 697 ngo_sync(); 698 } 699 } 700 #endif // KMP_BARRIER_ICV_PUSH 701 702 // Now, release all of the worker threads 703 for (i = 1; i < nproc; ++i) { 704 #if KMP_CACHE_MANAGE 705 // Prefetch next thread's go flag 706 if (i + 1 < nproc) 707 KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go); 708 #endif /* KMP_CACHE_MANAGE */ 709 KA_TRACE( 710 20, 711 ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) " 712 "go(%p): %u => %u\n", 713 gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid, 714 team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go, 715 other_threads[i]->th.th_bar[bt].bb.b_go, 716 other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP)); 717 ANNOTATE_BARRIER_BEGIN(other_threads[i]); 718 kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_go, 719 other_threads[i]); 720 flag.release(); 721 } 722 } 723 } else { // Wait for the PRIMARY thread to release us 724 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n", 725 gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP)); 726 if (cancellable) { 727 kmp_flag_64<true, false> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); 728 if (flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj))) 729 return true; 730 } else { 731 kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); 732 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); 733 } 734 ANNOTATE_BARRIER_END(this_thr); 735 #if USE_ITT_BUILD && USE_ITT_NOTIFY 736 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) { 737 // In a fork 
barrier; cannot get the object reliably (or ITTNOTIFY is 738 // disabled) 739 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1); 740 // Cancel wait on previous parallel region... 741 __kmp_itt_task_starting(itt_sync_obj); 742 743 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) 744 return false; 745 746 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); 747 if (itt_sync_obj != NULL) 748 // Call prepare as early as possible for "new" barrier 749 __kmp_itt_task_finished(itt_sync_obj); 750 } else 751 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ 752 // Early exit for reaping threads releasing forkjoin barrier 753 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) 754 return false; 755 // The worker thread may now assume that the team is valid. 756 #ifdef KMP_DEBUG 757 tid = __kmp_tid_from_gtid(gtid); 758 team = __kmp_threads[gtid]->th.th_team; 759 #endif 760 KMP_DEBUG_ASSERT(team != NULL); 761 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE); 762 KA_TRACE(20, 763 ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", 764 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE)); 765 KMP_MB(); // Flush all pending memory write invalidates. 
766 } 767 KA_TRACE( 768 20, 769 ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n", 770 gtid, team->t.t_id, tid, bt)); 771 return false; 772 } 773 774 static void __kmp_linear_barrier_gather( 775 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, 776 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) { 777 __kmp_linear_barrier_gather_template<false>( 778 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj)); 779 } 780 781 static bool __kmp_linear_barrier_gather_cancellable( 782 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, 783 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) { 784 return __kmp_linear_barrier_gather_template<true>( 785 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj)); 786 } 787 788 static void __kmp_linear_barrier_release( 789 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, 790 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) { 791 __kmp_linear_barrier_release_template<false>( 792 bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj)); 793 } 794 795 static bool __kmp_linear_barrier_release_cancellable( 796 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, 797 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) { 798 return __kmp_linear_barrier_release_template<true>( 799 bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj)); 800 } 801 802 // Tree barrier 803 static void __kmp_tree_barrier_gather( 804 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, 805 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) { 806 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather); 807 kmp_team_t *team = this_thr->th.th_team; 808 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; 809 kmp_info_t **other_threads = team->t.t_threads; 810 kmp_uint32 nproc = this_thr->th.th_team_nproc; 811 kmp_uint32 branch_bits = 
__kmp_barrier_gather_branch_bits[bt]; 812 kmp_uint32 branch_factor = 1 << branch_bits; 813 kmp_uint32 child; 814 kmp_uint32 child_tid; 815 kmp_uint64 new_state = 0; 816 817 KA_TRACE( 818 20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n", 819 gtid, team->t.t_id, tid, bt)); 820 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]); 821 822 #if USE_ITT_BUILD && USE_ITT_NOTIFY 823 // Barrier imbalance - save arrive time to the thread 824 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) { 825 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = 826 __itt_get_timestamp(); 827 } 828 #endif 829 // Perform tree gather to wait until all threads have arrived; reduce any 830 // required data as we go 831 child_tid = (tid << branch_bits) + 1; 832 if (child_tid < nproc) { 833 // Parent threads wait for all their children to arrive 834 new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP; 835 child = 1; 836 do { 837 kmp_info_t *child_thr = other_threads[child_tid]; 838 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; 839 #if KMP_CACHE_MANAGE 840 // Prefetch next thread's arrived count 841 if (child + 1 <= branch_factor && child_tid + 1 < nproc) 842 KMP_CACHE_PREFETCH( 843 &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived); 844 #endif /* KMP_CACHE_MANAGE */ 845 KA_TRACE(20, 846 ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) " 847 "arrived(%p) == %llu\n", 848 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), 849 team->t.t_id, child_tid, &child_bar->b_arrived, new_state)); 850 // Wait for child to arrive 851 kmp_flag_64<> flag(&child_bar->b_arrived, new_state); 852 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); 853 ANNOTATE_BARRIER_END(child_thr); 854 #if USE_ITT_BUILD && USE_ITT_NOTIFY 855 // Barrier imbalance - write min of the thread time and a child time to 856 // the thread. 
857 if (__kmp_forkjoin_frames_mode == 2) { 858 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time, 859 child_thr->th.th_bar_min_time); 860 } 861 #endif 862 if (reduce) { 863 KA_TRACE(100, 864 ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n", 865 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), 866 team->t.t_id, child_tid)); 867 ANNOTATE_REDUCE_AFTER(reduce); 868 OMPT_REDUCTION_DECL(this_thr, gtid); 869 OMPT_REDUCTION_BEGIN; 870 (*reduce)(this_thr->th.th_local.reduce_data, 871 child_thr->th.th_local.reduce_data); 872 OMPT_REDUCTION_END; 873 ANNOTATE_REDUCE_BEFORE(reduce); 874 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar); 875 } 876 child++; 877 child_tid++; 878 } while (child <= branch_factor && child_tid < nproc); 879 } 880 881 if (!KMP_MASTER_TID(tid)) { // Worker threads 882 kmp_int32 parent_tid = (tid - 1) >> branch_bits; 883 884 KA_TRACE(20, 885 ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) " 886 "arrived(%p): %llu => %llu\n", 887 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team), 888 team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived, 889 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP)); 890 891 // Mark arrival to parent thread 892 /* After performing this write, a worker thread may not assume that the team 893 is valid any more - it could be deallocated by the primary thread at any 894 time. 
*/ 895 ANNOTATE_BARRIER_BEGIN(this_thr); 896 kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[parent_tid]); 897 flag.release(); 898 } else { 899 // Need to update the team arrived pointer if we are the primary thread 900 if (nproc > 1) // New value was already computed above 901 team->t.t_bar[bt].b_arrived = new_state; 902 else 903 team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP; 904 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d " 905 "arrived(%p) = %llu\n", 906 gtid, team->t.t_id, tid, team->t.t_id, 907 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived)); 908 } 909 KA_TRACE(20, 910 ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n", 911 gtid, team->t.t_id, tid, bt)); 912 } 913 914 static void __kmp_tree_barrier_release( 915 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, 916 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) { 917 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release); 918 kmp_team_t *team; 919 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; 920 kmp_uint32 nproc; 921 kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt]; 922 kmp_uint32 branch_factor = 1 << branch_bits; 923 kmp_uint32 child; 924 kmp_uint32 child_tid; 925 926 // Perform a tree release for all of the threads that have been gathered 927 if (!KMP_MASTER_TID( 928 tid)) { // Handle fork barrier workers who aren't part of a team yet 929 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid, 930 &thr_bar->b_go, KMP_BARRIER_STATE_BUMP)); 931 // Wait for parent thread to release us 932 kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); 933 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); 934 ANNOTATE_BARRIER_END(this_thr); 935 #if USE_ITT_BUILD && USE_ITT_NOTIFY 936 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) { 937 // In fork barrier where we could not get the object reliably (or 938 // ITTNOTIFY is disabled) 939 itt_sync_obj = 
__kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1); 940 // Cancel wait on previous parallel region... 941 __kmp_itt_task_starting(itt_sync_obj); 942 943 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) 944 return; 945 946 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); 947 if (itt_sync_obj != NULL) 948 // Call prepare as early as possible for "new" barrier 949 __kmp_itt_task_finished(itt_sync_obj); 950 } else 951 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ 952 // Early exit for reaping threads releasing forkjoin barrier 953 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) 954 return; 955 956 // The worker thread may now assume that the team is valid. 957 team = __kmp_threads[gtid]->th.th_team; 958 KMP_DEBUG_ASSERT(team != NULL); 959 tid = __kmp_tid_from_gtid(gtid); 960 961 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE); 962 KA_TRACE(20, 963 ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid, 964 team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE)); 965 KMP_MB(); // Flush all pending memory write invalidates. 
966 } else { 967 team = __kmp_threads[gtid]->th.th_team; 968 KMP_DEBUG_ASSERT(team != NULL); 969 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) primary enter for " 970 "barrier type %d\n", 971 gtid, team->t.t_id, tid, bt)); 972 } 973 nproc = this_thr->th.th_team_nproc; 974 child_tid = (tid << branch_bits) + 1; 975 976 if (child_tid < nproc) { 977 kmp_info_t **other_threads = team->t.t_threads; 978 child = 1; 979 // Parent threads release all their children 980 do { 981 kmp_info_t *child_thr = other_threads[child_tid]; 982 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; 983 #if KMP_CACHE_MANAGE 984 // Prefetch next thread's go count 985 if (child + 1 <= branch_factor && child_tid + 1 < nproc) 986 KMP_CACHE_PREFETCH( 987 &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go); 988 #endif /* KMP_CACHE_MANAGE */ 989 990 #if KMP_BARRIER_ICV_PUSH 991 { 992 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy); 993 if (propagate_icvs) { 994 __kmp_init_implicit_task(team->t.t_ident, 995 team->t.t_threads[child_tid], team, 996 child_tid, FALSE); 997 copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs, 998 &team->t.t_implicit_task_taskdata[0].td_icvs); 999 } 1000 } 1001 #endif // KMP_BARRIER_ICV_PUSH 1002 KA_TRACE(20, 1003 ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)" 1004 "go(%p): %u => %u\n", 1005 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), 1006 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go, 1007 child_bar->b_go + KMP_BARRIER_STATE_BUMP)); 1008 // Release child from barrier 1009 ANNOTATE_BARRIER_BEGIN(child_thr); 1010 kmp_flag_64<> flag(&child_bar->b_go, child_thr); 1011 flag.release(); 1012 child++; 1013 child_tid++; 1014 } while (child <= branch_factor && child_tid < nproc); 1015 } 1016 KA_TRACE( 1017 20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n", 1018 gtid, team->t.t_id, tid, bt)); 1019 } 1020 1021 // Hyper Barrier 1022 static void __kmp_hyper_barrier_gather( 1023 enum 
barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, 1024 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) { 1025 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather); 1026 kmp_team_t *team = this_thr->th.th_team; 1027 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; 1028 kmp_info_t **other_threads = team->t.t_threads; 1029 kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE; 1030 kmp_uint32 num_threads = this_thr->th.th_team_nproc; 1031 kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt]; 1032 kmp_uint32 branch_factor = 1 << branch_bits; 1033 kmp_uint32 offset; 1034 kmp_uint32 level; 1035 1036 KA_TRACE( 1037 20, 1038 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n", 1039 gtid, team->t.t_id, tid, bt)); 1040 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]); 1041 1042 #if USE_ITT_BUILD && USE_ITT_NOTIFY 1043 // Barrier imbalance - save arrive time to the thread 1044 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) { 1045 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = 1046 __itt_get_timestamp(); 1047 } 1048 #endif 1049 /* Perform a hypercube-embedded tree gather to wait until all of the threads 1050 have arrived, and reduce any required data as we go. */ 1051 kmp_flag_64<> p_flag(&thr_bar->b_arrived); 1052 for (level = 0, offset = 1; offset < num_threads; 1053 level += branch_bits, offset <<= branch_bits) { 1054 kmp_uint32 child; 1055 kmp_uint32 child_tid; 1056 1057 if (((tid >> level) & (branch_factor - 1)) != 0) { 1058 kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1); 1059 1060 KMP_MB(); // Synchronize parent and child threads. 
1061 KA_TRACE(20, 1062 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) " 1063 "arrived(%p): %llu => %llu\n", 1064 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team), 1065 team->t.t_id, parent_tid, &thr_bar->b_arrived, 1066 thr_bar->b_arrived, 1067 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP)); 1068 // Mark arrival to parent thread 1069 /* After performing this write (in the last iteration of the enclosing for 1070 loop), a worker thread may not assume that the team is valid any more 1071 - it could be deallocated by the primary thread at any time. */ 1072 ANNOTATE_BARRIER_BEGIN(this_thr); 1073 p_flag.set_waiter(other_threads[parent_tid]); 1074 p_flag.release(); 1075 break; 1076 } 1077 1078 // Parent threads wait for children to arrive 1079 if (new_state == KMP_BARRIER_UNUSED_STATE) 1080 new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP; 1081 for (child = 1, child_tid = tid + (1 << level); 1082 child < branch_factor && child_tid < num_threads; 1083 child++, child_tid += (1 << level)) { 1084 kmp_info_t *child_thr = other_threads[child_tid]; 1085 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; 1086 #if KMP_CACHE_MANAGE 1087 kmp_uint32 next_child_tid = child_tid + (1 << level); 1088 // Prefetch next thread's arrived count 1089 if (child + 1 < branch_factor && next_child_tid < num_threads) 1090 KMP_CACHE_PREFETCH( 1091 &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived); 1092 #endif /* KMP_CACHE_MANAGE */ 1093 KA_TRACE(20, 1094 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) " 1095 "arrived(%p) == %llu\n", 1096 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), 1097 team->t.t_id, child_tid, &child_bar->b_arrived, new_state)); 1098 // Wait for child to arrive 1099 kmp_flag_64<> c_flag(&child_bar->b_arrived, new_state); 1100 c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); 1101 ANNOTATE_BARRIER_END(child_thr); 1102 KMP_MB(); // Synchronize parent and child threads. 
1103 #if USE_ITT_BUILD && USE_ITT_NOTIFY 1104 // Barrier imbalance - write min of the thread time and a child time to 1105 // the thread. 1106 if (__kmp_forkjoin_frames_mode == 2) { 1107 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time, 1108 child_thr->th.th_bar_min_time); 1109 } 1110 #endif 1111 if (reduce) { 1112 KA_TRACE(100, 1113 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n", 1114 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), 1115 team->t.t_id, child_tid)); 1116 ANNOTATE_REDUCE_AFTER(reduce); 1117 OMPT_REDUCTION_DECL(this_thr, gtid); 1118 OMPT_REDUCTION_BEGIN; 1119 (*reduce)(this_thr->th.th_local.reduce_data, 1120 child_thr->th.th_local.reduce_data); 1121 OMPT_REDUCTION_END; 1122 ANNOTATE_REDUCE_BEFORE(reduce); 1123 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar); 1124 } 1125 } 1126 } 1127 1128 if (KMP_MASTER_TID(tid)) { 1129 // Need to update the team arrived pointer if we are the primary thread 1130 if (new_state == KMP_BARRIER_UNUSED_STATE) 1131 team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP; 1132 else 1133 team->t.t_bar[bt].b_arrived = new_state; 1134 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d " 1135 "arrived(%p) = %llu\n", 1136 gtid, team->t.t_id, tid, team->t.t_id, 1137 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived)); 1138 } 1139 KA_TRACE( 1140 20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n", 1141 gtid, team->t.t_id, tid, bt)); 1142 } 1143 1144 // The reverse versions seem to beat the forward versions overall 1145 #define KMP_REVERSE_HYPER_BAR 1146 static void __kmp_hyper_barrier_release( 1147 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, 1148 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) { 1149 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release); 1150 kmp_team_t *team; 1151 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; 1152 kmp_info_t **other_threads; 1153 kmp_uint32 num_threads; 1154 kmp_uint32 branch_bits 
= __kmp_barrier_release_branch_bits[bt]; 1155 kmp_uint32 branch_factor = 1 << branch_bits; 1156 kmp_uint32 child; 1157 kmp_uint32 child_tid; 1158 kmp_uint32 offset; 1159 kmp_uint32 level; 1160 1161 /* Perform a hypercube-embedded tree release for all of the threads that have 1162 been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads 1163 are released in the reverse order of the corresponding gather, otherwise 1164 threads are released in the same order. */ 1165 if (KMP_MASTER_TID(tid)) { // primary thread 1166 team = __kmp_threads[gtid]->th.th_team; 1167 KMP_DEBUG_ASSERT(team != NULL); 1168 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) primary enter for " 1169 "barrier type %d\n", 1170 gtid, team->t.t_id, tid, bt)); 1171 #if KMP_BARRIER_ICV_PUSH 1172 if (propagate_icvs) { // primary already has ICVs in final destination; copy 1173 copy_icvs(&thr_bar->th_fixed_icvs, 1174 &team->t.t_implicit_task_taskdata[tid].td_icvs); 1175 } 1176 #endif 1177 } else { // Handle fork barrier workers who aren't part of a team yet 1178 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid, 1179 &thr_bar->b_go, KMP_BARRIER_STATE_BUMP)); 1180 // Wait for parent thread to release us 1181 kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); 1182 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); 1183 ANNOTATE_BARRIER_END(this_thr); 1184 #if USE_ITT_BUILD && USE_ITT_NOTIFY 1185 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) { 1186 // In fork barrier where we could not get the object reliably 1187 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1); 1188 // Cancel wait on previous parallel region... 
1189 __kmp_itt_task_starting(itt_sync_obj); 1190 1191 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) 1192 return; 1193 1194 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); 1195 if (itt_sync_obj != NULL) 1196 // Call prepare as early as possible for "new" barrier 1197 __kmp_itt_task_finished(itt_sync_obj); 1198 } else 1199 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ 1200 // Early exit for reaping threads releasing forkjoin barrier 1201 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) 1202 return; 1203 1204 // The worker thread may now assume that the team is valid. 1205 team = __kmp_threads[gtid]->th.th_team; 1206 KMP_DEBUG_ASSERT(team != NULL); 1207 tid = __kmp_tid_from_gtid(gtid); 1208 1209 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE); 1210 KA_TRACE(20, 1211 ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", 1212 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE)); 1213 KMP_MB(); // Flush all pending memory write invalidates. 1214 } 1215 num_threads = this_thr->th.th_team_nproc; 1216 other_threads = team->t.t_threads; 1217 1218 #ifdef KMP_REVERSE_HYPER_BAR 1219 // Count up to correct level for parent 1220 for (level = 0, offset = 1; 1221 offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0); 1222 level += branch_bits, offset <<= branch_bits) 1223 ; 1224 1225 // Now go down from there 1226 for (level -= branch_bits, offset >>= branch_bits; offset != 0; 1227 level -= branch_bits, offset >>= branch_bits) 1228 #else 1229 // Go down the tree, level by level 1230 for (level = 0, offset = 1; offset < num_threads; 1231 level += branch_bits, offset <<= branch_bits) 1232 #endif // KMP_REVERSE_HYPER_BAR 1233 { 1234 #ifdef KMP_REVERSE_HYPER_BAR 1235 /* Now go in reverse order through the children, highest to lowest. 1236 Initial setting of child is conservative here. */ 1237 child = num_threads >> ((level == 0) ? level : level - 1); 1238 for (child = (child < branch_factor - 1) ? 
child : branch_factor - 1, 1239 child_tid = tid + (child << level); 1240 child >= 1; child--, child_tid -= (1 << level)) 1241 #else 1242 if (((tid >> level) & (branch_factor - 1)) != 0) 1243 // No need to go lower than this, since this is the level parent would be 1244 // notified 1245 break; 1246 // Iterate through children on this level of the tree 1247 for (child = 1, child_tid = tid + (1 << level); 1248 child < branch_factor && child_tid < num_threads; 1249 child++, child_tid += (1 << level)) 1250 #endif // KMP_REVERSE_HYPER_BAR 1251 { 1252 if (child_tid >= num_threads) 1253 continue; // Child doesn't exist so keep going 1254 else { 1255 kmp_info_t *child_thr = other_threads[child_tid]; 1256 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; 1257 #if KMP_CACHE_MANAGE 1258 kmp_uint32 next_child_tid = child_tid - (1 << level); 1259 // Prefetch next thread's go count 1260 #ifdef KMP_REVERSE_HYPER_BAR 1261 if (child - 1 >= 1 && next_child_tid < num_threads) 1262 #else 1263 if (child + 1 < branch_factor && next_child_tid < num_threads) 1264 #endif // KMP_REVERSE_HYPER_BAR 1265 KMP_CACHE_PREFETCH( 1266 &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go); 1267 #endif /* KMP_CACHE_MANAGE */ 1268 1269 #if KMP_BARRIER_ICV_PUSH 1270 if (propagate_icvs) // push my fixed ICVs to my child 1271 copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs); 1272 #endif // KMP_BARRIER_ICV_PUSH 1273 1274 KA_TRACE( 1275 20, 1276 ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)" 1277 "go(%p): %u => %u\n", 1278 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), 1279 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go, 1280 child_bar->b_go + KMP_BARRIER_STATE_BUMP)); 1281 // Release child from barrier 1282 ANNOTATE_BARRIER_BEGIN(child_thr); 1283 kmp_flag_64<> flag(&child_bar->b_go, child_thr); 1284 flag.release(); 1285 } 1286 } 1287 } 1288 #if KMP_BARRIER_ICV_PUSH 1289 if (propagate_icvs && 1290 !KMP_MASTER_TID(tid)) { // copy ICVs locally 
to final dest 1291 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, 1292 FALSE); 1293 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, 1294 &thr_bar->th_fixed_icvs); 1295 } 1296 #endif 1297 KA_TRACE( 1298 20, 1299 ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n", 1300 gtid, team->t.t_id, tid, bt)); 1301 } 1302 1303 // Hierarchical Barrier 1304 1305 // Initialize thread barrier data 1306 /* Initializes/re-initializes the hierarchical barrier data stored on a thread. 1307 Performs the minimum amount of initialization required based on how the team 1308 has changed. Returns true if leaf children will require both on-core and 1309 traditional wake-up mechanisms. For example, if the team size increases, 1310 threads already in the team will respond to on-core wakeup on their parent 1311 thread, but threads newly added to the team will only be listening on the 1312 their local b_go. */ 1313 static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt, 1314 kmp_bstate_t *thr_bar, 1315 kmp_uint32 nproc, int gtid, 1316 int tid, kmp_team_t *team) { 1317 // Checks to determine if (re-)initialization is needed 1318 bool uninitialized = thr_bar->team == NULL; 1319 bool team_changed = team != thr_bar->team; 1320 bool team_sz_changed = nproc != thr_bar->nproc; 1321 bool tid_changed = tid != thr_bar->old_tid; 1322 bool retval = false; 1323 1324 if (uninitialized || team_sz_changed) { 1325 __kmp_get_hierarchy(nproc, thr_bar); 1326 } 1327 1328 if (uninitialized || team_sz_changed || tid_changed) { 1329 thr_bar->my_level = thr_bar->depth - 1; // default for primary thread 1330 thr_bar->parent_tid = -1; // default for primary thread 1331 if (!KMP_MASTER_TID(tid)) { 1332 // if not primary thread, find parent thread in hierarchy 1333 kmp_uint32 d = 0; 1334 while (d < thr_bar->depth) { // find parent based on level of thread in 1335 // hierarchy, and note level 1336 kmp_uint32 rem; 1337 if (d == thr_bar->depth - 2) { 
// reached level right below the primary 1338 thr_bar->parent_tid = 0; 1339 thr_bar->my_level = d; 1340 break; 1341 } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) != 0) { 1342 // TODO: can we make the above op faster? 1343 // thread is not a subtree root at next level, so this is max 1344 thr_bar->parent_tid = tid - rem; 1345 thr_bar->my_level = d; 1346 break; 1347 } 1348 ++d; 1349 } 1350 } 1351 __kmp_type_convert(7 - ((tid - thr_bar->parent_tid) / 1352 (thr_bar->skip_per_level[thr_bar->my_level])), 1353 &(thr_bar->offset)); 1354 thr_bar->old_tid = tid; 1355 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING; 1356 thr_bar->team = team; 1357 thr_bar->parent_bar = 1358 &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb; 1359 } 1360 if (uninitialized || team_changed || tid_changed) { 1361 thr_bar->team = team; 1362 thr_bar->parent_bar = 1363 &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb; 1364 retval = true; 1365 } 1366 if (uninitialized || team_sz_changed || tid_changed) { 1367 thr_bar->nproc = nproc; 1368 thr_bar->leaf_kids = thr_bar->base_leaf_kids; 1369 if (thr_bar->my_level == 0) 1370 thr_bar->leaf_kids = 0; 1371 if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc) 1372 __kmp_type_convert(nproc - tid - 1, &(thr_bar->leaf_kids)); 1373 thr_bar->leaf_state = 0; 1374 for (int i = 0; i < thr_bar->leaf_kids; ++i) 1375 ((char *)&(thr_bar->leaf_state))[7 - i] = 1; 1376 } 1377 return retval; 1378 } 1379 1380 static void __kmp_hierarchical_barrier_gather( 1381 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, 1382 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) { 1383 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather); 1384 kmp_team_t *team = this_thr->th.th_team; 1385 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; 1386 kmp_uint32 nproc = this_thr->th.th_team_nproc; 1387 kmp_info_t **other_threads = team->t.t_threads; 1388 kmp_uint64 new_state = 0; 1389 1390 int level = team->t.t_level; 
1391 if (other_threads[0] 1392 ->th.th_teams_microtask) // are we inside the teams construct? 1393 if (this_thr->th.th_teams_size.nteams > 1) 1394 ++level; // level was not increased in teams construct for team_of_masters 1395 if (level == 1) 1396 thr_bar->use_oncore_barrier = 1; 1397 else 1398 thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested 1399 1400 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for " 1401 "barrier type %d\n", 1402 gtid, team->t.t_id, tid, bt)); 1403 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]); 1404 1405 #if USE_ITT_BUILD && USE_ITT_NOTIFY 1406 // Barrier imbalance - save arrive time to the thread 1407 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) { 1408 this_thr->th.th_bar_arrive_time = __itt_get_timestamp(); 1409 } 1410 #endif 1411 1412 (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, 1413 team); 1414 1415 if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf) 1416 kmp_int32 child_tid; 1417 new_state = 1418 (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP; 1419 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && 1420 thr_bar->use_oncore_barrier) { 1421 if (thr_bar->leaf_kids) { 1422 // First, wait for leaf children to check-in on my b_arrived flag 1423 kmp_uint64 leaf_state = 1424 KMP_MASTER_TID(tid) 1425 ? 
thr_bar->b_arrived | thr_bar->leaf_state 1426 : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state; 1427 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting " 1428 "for leaf kids\n", 1429 gtid, team->t.t_id, tid)); 1430 kmp_flag_64<> flag(&thr_bar->b_arrived, leaf_state); 1431 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); 1432 if (reduce) { 1433 ANNOTATE_REDUCE_AFTER(reduce); 1434 OMPT_REDUCTION_DECL(this_thr, gtid); 1435 OMPT_REDUCTION_BEGIN; 1436 for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids; 1437 ++child_tid) { 1438 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += " 1439 "T#%d(%d:%d)\n", 1440 gtid, team->t.t_id, tid, 1441 __kmp_gtid_from_tid(child_tid, team), team->t.t_id, 1442 child_tid)); 1443 ANNOTATE_BARRIER_END(other_threads[child_tid]); 1444 (*reduce)(this_thr->th.th_local.reduce_data, 1445 other_threads[child_tid]->th.th_local.reduce_data); 1446 } 1447 OMPT_REDUCTION_END; 1448 ANNOTATE_REDUCE_BEFORE(reduce); 1449 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar); 1450 } 1451 // clear leaf_state bits 1452 KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state)); 1453 } 1454 // Next, wait for higher level children on each child's b_arrived flag 1455 for (kmp_uint32 d = 1; d < thr_bar->my_level; 1456 ++d) { // gather lowest level threads first, but skip 0 1457 kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1], 1458 skip = thr_bar->skip_per_level[d]; 1459 if (last > nproc) 1460 last = nproc; 1461 for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) { 1462 kmp_info_t *child_thr = other_threads[child_tid]; 1463 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; 1464 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait " 1465 "T#%d(%d:%d) " 1466 "arrived(%p) == %llu\n", 1467 gtid, team->t.t_id, tid, 1468 __kmp_gtid_from_tid(child_tid, team), team->t.t_id, 1469 child_tid, &child_bar->b_arrived, new_state)); 1470 kmp_flag_64<> flag(&child_bar->b_arrived, 
new_state); 1471 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); 1472 ANNOTATE_BARRIER_END(child_thr); 1473 if (reduce) { 1474 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += " 1475 "T#%d(%d:%d)\n", 1476 gtid, team->t.t_id, tid, 1477 __kmp_gtid_from_tid(child_tid, team), team->t.t_id, 1478 child_tid)); 1479 ANNOTATE_REDUCE_AFTER(reduce); 1480 (*reduce)(this_thr->th.th_local.reduce_data, 1481 child_thr->th.th_local.reduce_data); 1482 ANNOTATE_REDUCE_BEFORE(reduce); 1483 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar); 1484 } 1485 } 1486 } 1487 } else { // Blocktime is not infinite 1488 for (kmp_uint32 d = 0; d < thr_bar->my_level; 1489 ++d) { // Gather lowest level threads first 1490 kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1], 1491 skip = thr_bar->skip_per_level[d]; 1492 if (last > nproc) 1493 last = nproc; 1494 for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) { 1495 kmp_info_t *child_thr = other_threads[child_tid]; 1496 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; 1497 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait " 1498 "T#%d(%d:%d) " 1499 "arrived(%p) == %llu\n", 1500 gtid, team->t.t_id, tid, 1501 __kmp_gtid_from_tid(child_tid, team), team->t.t_id, 1502 child_tid, &child_bar->b_arrived, new_state)); 1503 kmp_flag_64<> flag(&child_bar->b_arrived, new_state); 1504 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); 1505 ANNOTATE_BARRIER_END(child_thr); 1506 if (reduce) { 1507 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += " 1508 "T#%d(%d:%d)\n", 1509 gtid, team->t.t_id, tid, 1510 __kmp_gtid_from_tid(child_tid, team), team->t.t_id, 1511 child_tid)); 1512 ANNOTATE_REDUCE_AFTER(reduce); 1513 (*reduce)(this_thr->th.th_local.reduce_data, 1514 child_thr->th.th_local.reduce_data); 1515 ANNOTATE_REDUCE_BEFORE(reduce); 1516 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar); 1517 } 1518 } 1519 } 1520 } 1521 } 1522 // All subordinates are gathered; now release parent if not 
primary thread 1523 1524 if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy 1525 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing" 1526 " T#%d(%d:%d) arrived(%p): %llu => %llu\n", 1527 gtid, team->t.t_id, tid, 1528 __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id, 1529 thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived, 1530 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP)); 1531 /* Mark arrival to parent: After performing this write, a worker thread may 1532 not assume that the team is valid any more - it could be deallocated by 1533 the primary thread at any time. */ 1534 if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || 1535 !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived 1536 // flag; release it 1537 ANNOTATE_BARRIER_BEGIN(this_thr); 1538 kmp_flag_64<> flag(&thr_bar->b_arrived, 1539 other_threads[thr_bar->parent_tid]); 1540 flag.release(); 1541 } else { 1542 // Leaf does special release on "offset" bits of parent's b_arrived flag 1543 thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP; 1544 kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, 1545 thr_bar->offset + 1); 1546 flag.set_waiter(other_threads[thr_bar->parent_tid]); 1547 flag.release(); 1548 } 1549 } else { // Primary thread needs to update the team's b_arrived value 1550 team->t.t_bar[bt].b_arrived = new_state; 1551 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d " 1552 "arrived(%p) = %llu\n", 1553 gtid, team->t.t_id, tid, team->t.t_id, 1554 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived)); 1555 } 1556 // Is the team access below unsafe or just technically invalid? 
1557 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for " 1558 "barrier type %d\n", 1559 gtid, team->t.t_id, tid, bt)); 1560 } 1561 1562 static void __kmp_hierarchical_barrier_release( 1563 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, 1564 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) { 1565 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release); 1566 kmp_team_t *team; 1567 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; 1568 kmp_uint32 nproc; 1569 bool team_change = false; // indicates on-core barrier shouldn't be used 1570 1571 if (KMP_MASTER_TID(tid)) { 1572 team = __kmp_threads[gtid]->th.th_team; 1573 KMP_DEBUG_ASSERT(team != NULL); 1574 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) primary " 1575 "entered barrier type %d\n", 1576 gtid, team->t.t_id, tid, bt)); 1577 } else { // Worker threads 1578 // Wait for parent thread to release me 1579 if (!thr_bar->use_oncore_barrier || 1580 __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 || 1581 thr_bar->team == NULL) { 1582 // Use traditional method of waiting on my own b_go flag 1583 thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG; 1584 kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); 1585 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); 1586 ANNOTATE_BARRIER_END(this_thr); 1587 TCW_8(thr_bar->b_go, 1588 KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time 1589 } else { // Thread barrier data is initialized, this is a leaf, blocktime is 1590 // infinite, not nested 1591 // Wait on my "offset" bits on parent's b_go flag 1592 thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG; 1593 kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP, 1594 thr_bar->offset + 1, bt, 1595 this_thr USE_ITT_BUILD_ARG(itt_sync_obj)); 1596 flag.wait(this_thr, TRUE); 1597 if (thr_bar->wait_flag == 1598 KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go 1599 TCW_8(thr_bar->b_go, 1600 KMP_INIT_BARRIER_STATE); // 
Reset my b_go flag for next time 1601 } else { // Reset my bits on parent's b_go flag 1602 (RCAST(volatile char *, 1603 &(thr_bar->parent_bar->b_go)))[thr_bar->offset + 1] = 0; 1604 } 1605 } 1606 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING; 1607 // Early exit for reaping threads releasing forkjoin barrier 1608 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) 1609 return; 1610 // The worker thread may now assume that the team is valid. 1611 team = __kmp_threads[gtid]->th.th_team; 1612 KMP_DEBUG_ASSERT(team != NULL); 1613 tid = __kmp_tid_from_gtid(gtid); 1614 1615 KA_TRACE( 1616 20, 1617 ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", 1618 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE)); 1619 KMP_MB(); // Flush all pending memory write invalidates. 1620 } 1621 1622 nproc = this_thr->th.th_team_nproc; 1623 int level = team->t.t_level; 1624 if (team->t.t_threads[0] 1625 ->th.th_teams_microtask) { // are we inside the teams construct? 1626 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && 1627 this_thr->th.th_teams_level == level) 1628 ++level; // level was not increased in teams construct for team_of_workers 1629 if (this_thr->th.th_teams_size.nteams > 1) 1630 ++level; // level was not increased in teams construct for team_of_masters 1631 } 1632 if (level == 1) 1633 thr_bar->use_oncore_barrier = 1; 1634 else 1635 thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested 1636 1637 // If the team size has increased, we still communicate with old leaves via 1638 // oncore barrier. 
1639 unsigned short int old_leaf_kids = thr_bar->leaf_kids; 1640 kmp_uint64 old_leaf_state = thr_bar->leaf_state; 1641 team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, 1642 tid, team); 1643 // But if the entire team changes, we won't use oncore barrier at all 1644 if (team_change) 1645 old_leaf_kids = 0; 1646 1647 #if KMP_BARRIER_ICV_PUSH 1648 if (propagate_icvs) { 1649 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, 1650 FALSE); 1651 if (KMP_MASTER_TID( 1652 tid)) { // primary already has copy in final destination; copy 1653 copy_icvs(&thr_bar->th_fixed_icvs, 1654 &team->t.t_implicit_task_taskdata[tid].td_icvs); 1655 } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && 1656 thr_bar->use_oncore_barrier) { // optimization for inf blocktime 1657 if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0) 1658 // leaves (on-core children) pull parent's fixed ICVs directly to local 1659 // ICV store 1660 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, 1661 &thr_bar->parent_bar->th_fixed_icvs); 1662 // non-leaves will get ICVs piggybacked with b_go via NGO store 1663 } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs 1664 if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can 1665 // access 1666 copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs); 1667 else // leaves copy parent's fixed ICVs directly to local ICV store 1668 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, 1669 &thr_bar->parent_bar->th_fixed_icvs); 1670 } 1671 } 1672 #endif // KMP_BARRIER_ICV_PUSH 1673 1674 // Now, release my children 1675 if (thr_bar->my_level) { // not a leaf 1676 kmp_int32 child_tid; 1677 kmp_uint32 last; 1678 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && 1679 thr_bar->use_oncore_barrier) { 1680 if (KMP_MASTER_TID(tid)) { // do a flat release 1681 // Set local b_go to bump children via NGO store of the cache line 1682 // containing 
IVCs and b_go. 1683 thr_bar->b_go = KMP_BARRIER_STATE_BUMP; 1684 // Use ngo stores if available; b_go piggybacks in the last 8 bytes of 1685 // the cache line 1686 ngo_load(&thr_bar->th_fixed_icvs); 1687 // This loops over all the threads skipping only the leaf nodes in the 1688 // hierarchy 1689 for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc; 1690 child_tid += thr_bar->skip_per_level[1]) { 1691 kmp_bstate_t *child_bar = 1692 &team->t.t_threads[child_tid]->th.th_bar[bt].bb; 1693 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) " 1694 "releasing T#%d(%d:%d)" 1695 " go(%p): %u => %u\n", 1696 gtid, team->t.t_id, tid, 1697 __kmp_gtid_from_tid(child_tid, team), team->t.t_id, 1698 child_tid, &child_bar->b_go, child_bar->b_go, 1699 child_bar->b_go + KMP_BARRIER_STATE_BUMP)); 1700 // Use ngo store (if available) to both store ICVs and release child 1701 // via child's b_go 1702 ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs); 1703 } 1704 ngo_sync(); 1705 } 1706 TCW_8(thr_bar->b_go, 1707 KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time 1708 // Now, release leaf children 1709 if (thr_bar->leaf_kids) { // if there are any 1710 // We test team_change on the off-chance that the level 1 team changed. 
1711 if (team_change || 1712 old_leaf_kids < thr_bar->leaf_kids) { // some old, some new 1713 if (old_leaf_kids) { // release old leaf kids 1714 thr_bar->b_go |= old_leaf_state; 1715 } 1716 // Release new leaf kids 1717 last = tid + thr_bar->skip_per_level[1]; 1718 if (last > nproc) 1719 last = nproc; 1720 for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last; 1721 ++child_tid) { // skip_per_level[0]=1 1722 kmp_info_t *child_thr = team->t.t_threads[child_tid]; 1723 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; 1724 KA_TRACE( 1725 20, 1726 ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing" 1727 " T#%d(%d:%d) go(%p): %u => %u\n", 1728 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), 1729 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go, 1730 child_bar->b_go + KMP_BARRIER_STATE_BUMP)); 1731 // Release child using child's b_go flag 1732 ANNOTATE_BARRIER_BEGIN(child_thr); 1733 kmp_flag_64<> flag(&child_bar->b_go, child_thr); 1734 flag.release(); 1735 } 1736 } else { // Release all children at once with leaf_state bits on my own 1737 // b_go flag 1738 thr_bar->b_go |= thr_bar->leaf_state; 1739 } 1740 } 1741 } else { // Blocktime is not infinite; do a simple hierarchical release 1742 for (int d = thr_bar->my_level - 1; d >= 0; 1743 --d) { // Release highest level threads first 1744 last = tid + thr_bar->skip_per_level[d + 1]; 1745 kmp_uint32 skip = thr_bar->skip_per_level[d]; 1746 if (last > nproc) 1747 last = nproc; 1748 for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) { 1749 kmp_info_t *child_thr = team->t.t_threads[child_tid]; 1750 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; 1751 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) " 1752 "releasing T#%d(%d:%d) go(%p): %u => %u\n", 1753 gtid, team->t.t_id, tid, 1754 __kmp_gtid_from_tid(child_tid, team), team->t.t_id, 1755 child_tid, &child_bar->b_go, child_bar->b_go, 1756 child_bar->b_go + KMP_BARRIER_STATE_BUMP)); 1757 // 
Release child using child's b_go flag 1758 ANNOTATE_BARRIER_BEGIN(child_thr); 1759 kmp_flag_64<> flag(&child_bar->b_go, child_thr); 1760 flag.release(); 1761 } 1762 } 1763 } 1764 #if KMP_BARRIER_ICV_PUSH 1765 if (propagate_icvs && !KMP_MASTER_TID(tid)) 1766 // non-leaves copy ICVs from fixed ICVs to local dest 1767 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, 1768 &thr_bar->th_fixed_icvs); 1769 #endif // KMP_BARRIER_ICV_PUSH 1770 } 1771 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for " 1772 "barrier type %d\n", 1773 gtid, team->t.t_id, tid, bt)); 1774 } 1775 1776 // End of Barrier Algorithms 1777 1778 // type traits for cancellable value 1779 // if cancellable is true, then is_cancellable is a normal boolean variable 1780 // if cancellable is false, then is_cancellable is a compile time constant 1781 template <bool cancellable> struct is_cancellable {}; 1782 template <> struct is_cancellable<true> { 1783 bool value; 1784 is_cancellable() : value(false) {} 1785 is_cancellable(bool b) : value(b) {} 1786 is_cancellable &operator=(bool b) { 1787 value = b; 1788 return *this; 1789 } 1790 operator bool() const { return value; } 1791 }; 1792 template <> struct is_cancellable<false> { 1793 is_cancellable &operator=(bool b) { return *this; } 1794 constexpr operator bool() const { return false; } 1795 }; 1796 1797 // Internal function to do a barrier. 1798 /* If is_split is true, do a split barrier, otherwise, do a plain barrier 1799 If reduce is non-NULL, do a split reduction barrier, otherwise, do a split 1800 barrier 1801 When cancellable = false, 1802 Returns 0 if primary thread, 1 if worker thread. 1803 When cancellable = true 1804 Returns 0 if not cancelled, 1 if cancelled. 
*/ 1805 template <bool cancellable = false> 1806 static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split, 1807 size_t reduce_size, void *reduce_data, 1808 void (*reduce)(void *, void *)) { 1809 KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier); 1810 KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER); 1811 int tid = __kmp_tid_from_gtid(gtid); 1812 kmp_info_t *this_thr = __kmp_threads[gtid]; 1813 kmp_team_t *team = this_thr->th.th_team; 1814 int status = 0; 1815 is_cancellable<cancellable> cancelled; 1816 #if OMPT_SUPPORT && OMPT_OPTIONAL 1817 ompt_data_t *my_task_data; 1818 ompt_data_t *my_parallel_data; 1819 void *return_address; 1820 ompt_sync_region_t barrier_kind; 1821 #endif 1822 1823 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid, 1824 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid))); 1825 1826 ANNOTATE_BARRIER_BEGIN(&team->t.t_bar); 1827 #if OMPT_SUPPORT 1828 if (ompt_enabled.enabled) { 1829 #if OMPT_OPTIONAL 1830 my_task_data = OMPT_CUR_TASK_DATA(this_thr); 1831 my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr); 1832 return_address = OMPT_LOAD_RETURN_ADDRESS(gtid); 1833 barrier_kind = __ompt_get_barrier_kind(bt, this_thr); 1834 if (ompt_enabled.ompt_callback_sync_region) { 1835 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 1836 barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data, 1837 return_address); 1838 } 1839 if (ompt_enabled.ompt_callback_sync_region_wait) { 1840 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 1841 barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data, 1842 return_address); 1843 } 1844 #endif 1845 // It is OK to report the barrier state after the barrier begin callback. 1846 // According to the OMPT specification, a compliant implementation may 1847 // even delay reporting this state until the barrier begins to wait. 
1848 this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier; 1849 } 1850 #endif 1851 1852 if (!team->t.t_serialized) { 1853 #if USE_ITT_BUILD 1854 // This value will be used in itt notify events below. 1855 void *itt_sync_obj = NULL; 1856 #if USE_ITT_NOTIFY 1857 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) 1858 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1); 1859 #endif 1860 #endif /* USE_ITT_BUILD */ 1861 if (__kmp_tasking_mode == tskm_extra_barrier) { 1862 __kmp_tasking_barrier(team, this_thr, gtid); 1863 KA_TRACE(15, 1864 ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid, 1865 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid))); 1866 } 1867 1868 /* Copy the blocktime info to the thread, where __kmp_wait_template() can 1869 access it when the team struct is not guaranteed to exist. */ 1870 // See note about the corresponding code in __kmp_join_barrier() being 1871 // performance-critical. 1872 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { 1873 #if KMP_USE_MONITOR 1874 this_thr->th.th_team_bt_intervals = 1875 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals; 1876 this_thr->th.th_team_bt_set = 1877 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set; 1878 #else 1879 this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid); 1880 #endif 1881 } 1882 1883 #if USE_ITT_BUILD 1884 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) 1885 __kmp_itt_barrier_starting(gtid, itt_sync_obj); 1886 #endif /* USE_ITT_BUILD */ 1887 #if USE_DEBUGGER 1888 // Let the debugger know: the thread arrived to the barrier and waiting. 
1889 if (KMP_MASTER_TID(tid)) { // Primary thread counter stored in team struct 1890 team->t.t_bar[bt].b_master_arrived += 1; 1891 } else { 1892 this_thr->th.th_bar[bt].bb.b_worker_arrived += 1; 1893 } // if 1894 #endif /* USE_DEBUGGER */ 1895 if (reduce != NULL) { 1896 // KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956 1897 this_thr->th.th_local.reduce_data = reduce_data; 1898 } 1899 1900 if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec) 1901 // use 0 to only setup the current team if nthreads > 1 1902 __kmp_task_team_setup(this_thr, team, 0); 1903 1904 if (cancellable) { 1905 cancelled = __kmp_linear_barrier_gather_cancellable( 1906 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj)); 1907 } else { 1908 switch (__kmp_barrier_gather_pattern[bt]) { 1909 case bp_dist_bar: { 1910 __kmp_dist_barrier_gather(bt, this_thr, gtid, tid, 1911 reduce USE_ITT_BUILD_ARG(itt_sync_obj)); 1912 break; 1913 } 1914 case bp_hyper_bar: { 1915 // don't set branch bits to 0; use linear 1916 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); 1917 __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid, 1918 reduce USE_ITT_BUILD_ARG(itt_sync_obj)); 1919 break; 1920 } 1921 case bp_hierarchical_bar: { 1922 __kmp_hierarchical_barrier_gather( 1923 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj)); 1924 break; 1925 } 1926 case bp_tree_bar: { 1927 // don't set branch bits to 0; use linear 1928 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); 1929 __kmp_tree_barrier_gather(bt, this_thr, gtid, tid, 1930 reduce USE_ITT_BUILD_ARG(itt_sync_obj)); 1931 break; 1932 } 1933 default: { 1934 __kmp_linear_barrier_gather(bt, this_thr, gtid, tid, 1935 reduce USE_ITT_BUILD_ARG(itt_sync_obj)); 1936 } 1937 } 1938 } 1939 1940 KMP_MB(); 1941 1942 if (KMP_MASTER_TID(tid)) { 1943 status = 0; 1944 if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) { 1945 __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj)); 1946 } 1947 #if USE_DEBUGGER 1948 // Let 
the debugger know: All threads are arrived and starting leaving the 1949 // barrier. 1950 team->t.t_bar[bt].b_team_arrived += 1; 1951 #endif 1952 1953 if (__kmp_omp_cancellation) { 1954 kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request); 1955 // Reset cancellation flag for worksharing constructs 1956 if (cancel_request == cancel_loop || 1957 cancel_request == cancel_sections) { 1958 KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq); 1959 } 1960 } 1961 #if USE_ITT_BUILD 1962 /* TODO: In case of split reduction barrier, primary thread may send 1963 acquired event early, before the final summation into the shared 1964 variable is done (final summation can be a long operation for array 1965 reductions). */ 1966 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) 1967 __kmp_itt_barrier_middle(gtid, itt_sync_obj); 1968 #endif /* USE_ITT_BUILD */ 1969 #if USE_ITT_BUILD && USE_ITT_NOTIFY 1970 // Barrier - report frame end (only if active_level == 1) 1971 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 1972 __kmp_forkjoin_frames_mode && 1973 (this_thr->th.th_teams_microtask == NULL || // either not in teams 1974 this_thr->th.th_teams_size.nteams == 1) && // or inside single team 1975 team->t.t_active_level == 1) { 1976 ident_t *loc = __kmp_threads[gtid]->th.th_ident; 1977 kmp_uint64 cur_time = __itt_get_timestamp(); 1978 kmp_info_t **other_threads = team->t.t_threads; 1979 int nproc = this_thr->th.th_team_nproc; 1980 int i; 1981 switch (__kmp_forkjoin_frames_mode) { 1982 case 1: 1983 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, 1984 loc, nproc); 1985 this_thr->th.th_frame_time = cur_time; 1986 break; 1987 case 2: // AC 2015-01-19: currently does not work for hierarchical (to 1988 // be fixed) 1989 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1990 1, loc, nproc); 1991 break; 1992 case 3: 1993 if (__itt_metadata_add_ptr) { 1994 // Initialize with primary thread's wait time 1995 kmp_uint64 delta = 
cur_time - this_thr->th.th_bar_arrive_time; 1996 // Set arrive time to zero to be able to check it in 1997 // __kmp_invoke_task(); the same is done inside the loop below 1998 this_thr->th.th_bar_arrive_time = 0; 1999 for (i = 1; i < nproc; ++i) { 2000 delta += (cur_time - other_threads[i]->th.th_bar_arrive_time); 2001 other_threads[i]->th.th_bar_arrive_time = 0; 2002 } 2003 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, 2004 cur_time, delta, 2005 (kmp_uint64)(reduce != NULL)); 2006 } 2007 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, 2008 loc, nproc); 2009 this_thr->th.th_frame_time = cur_time; 2010 break; 2011 } 2012 } 2013 #endif /* USE_ITT_BUILD */ 2014 } else { 2015 status = 1; 2016 #if USE_ITT_BUILD 2017 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) 2018 __kmp_itt_barrier_middle(gtid, itt_sync_obj); 2019 #endif /* USE_ITT_BUILD */ 2020 } 2021 if ((status == 1 || !is_split) && !cancelled) { 2022 if (cancellable) { 2023 cancelled = __kmp_linear_barrier_release_cancellable( 2024 bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); 2025 } else { 2026 switch (__kmp_barrier_release_pattern[bt]) { 2027 case bp_dist_bar: { 2028 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); 2029 __kmp_dist_barrier_release(bt, this_thr, gtid, tid, 2030 FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); 2031 break; 2032 } 2033 case bp_hyper_bar: { 2034 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); 2035 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, 2036 FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); 2037 break; 2038 } 2039 case bp_hierarchical_bar: { 2040 __kmp_hierarchical_barrier_release( 2041 bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); 2042 break; 2043 } 2044 case bp_tree_bar: { 2045 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); 2046 __kmp_tree_barrier_release(bt, this_thr, gtid, tid, 2047 FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); 2048 break; 2049 } 2050 default: { 2051 __kmp_linear_barrier_release(bt, this_thr, 
gtid, tid, 2052 FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); 2053 } 2054 } 2055 } 2056 if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) { 2057 __kmp_task_team_sync(this_thr, team); 2058 } 2059 } 2060 2061 #if USE_ITT_BUILD 2062 /* GEH: TODO: Move this under if-condition above and also include in 2063 __kmp_end_split_barrier(). This will more accurately represent the actual 2064 release time of the threads for split barriers. */ 2065 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) 2066 __kmp_itt_barrier_finished(gtid, itt_sync_obj); 2067 #endif /* USE_ITT_BUILD */ 2068 } else { // Team is serialized. 2069 status = 0; 2070 if (__kmp_tasking_mode != tskm_immediate_exec) { 2071 if (this_thr->th.th_task_team != NULL) { 2072 #if USE_ITT_NOTIFY 2073 void *itt_sync_obj = NULL; 2074 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { 2075 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1); 2076 __kmp_itt_barrier_starting(gtid, itt_sync_obj); 2077 } 2078 #endif 2079 2080 KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks == 2081 TRUE); 2082 __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj)); 2083 __kmp_task_team_setup(this_thr, team, 0); 2084 2085 #if USE_ITT_BUILD 2086 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) 2087 __kmp_itt_barrier_finished(gtid, itt_sync_obj); 2088 #endif /* USE_ITT_BUILD */ 2089 } 2090 } 2091 } 2092 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n", 2093 gtid, __kmp_team_from_gtid(gtid)->t.t_id, 2094 __kmp_tid_from_gtid(gtid), status)); 2095 2096 #if OMPT_SUPPORT 2097 if (ompt_enabled.enabled) { 2098 #if OMPT_OPTIONAL 2099 if (ompt_enabled.ompt_callback_sync_region_wait) { 2100 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 2101 barrier_kind, ompt_scope_end, my_parallel_data, my_task_data, 2102 return_address); 2103 } 2104 if (ompt_enabled.ompt_callback_sync_region) { 2105 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 2106 barrier_kind, ompt_scope_end, 
my_parallel_data, my_task_data, 2107 return_address); 2108 } 2109 #endif 2110 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel; 2111 } 2112 #endif 2113 ANNOTATE_BARRIER_END(&team->t.t_bar); 2114 2115 if (cancellable) 2116 return (int)cancelled; 2117 return status; 2118 } 2119 2120 // Returns 0 if primary thread, 1 if worker thread. 2121 int __kmp_barrier(enum barrier_type bt, int gtid, int is_split, 2122 size_t reduce_size, void *reduce_data, 2123 void (*reduce)(void *, void *)) { 2124 return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data, 2125 reduce); 2126 } 2127 2128 #if defined(KMP_GOMP_COMPAT) 2129 // Returns 1 if cancelled, 0 otherwise 2130 int __kmp_barrier_gomp_cancel(int gtid) { 2131 if (__kmp_omp_cancellation) { 2132 int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE, 2133 0, NULL, NULL); 2134 if (cancelled) { 2135 int tid = __kmp_tid_from_gtid(gtid); 2136 kmp_info_t *this_thr = __kmp_threads[gtid]; 2137 if (KMP_MASTER_TID(tid)) { 2138 // Primary thread does not need to revert anything 2139 } else { 2140 // Workers need to revert their private b_arrived flag 2141 this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -= 2142 KMP_BARRIER_STATE_BUMP; 2143 } 2144 } 2145 return cancelled; 2146 } 2147 __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); 2148 return FALSE; 2149 } 2150 #endif 2151 2152 void __kmp_end_split_barrier(enum barrier_type bt, int gtid) { 2153 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier); 2154 KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER); 2155 KMP_DEBUG_ASSERT(bt < bs_last_barrier); 2156 int tid = __kmp_tid_from_gtid(gtid); 2157 kmp_info_t *this_thr = __kmp_threads[gtid]; 2158 kmp_team_t *team = this_thr->th.th_team; 2159 2160 ANNOTATE_BARRIER_BEGIN(&team->t.t_bar); 2161 if (!team->t.t_serialized) { 2162 if (KMP_MASTER_GTID(gtid)) { 2163 switch (__kmp_barrier_release_pattern[bt]) { 2164 case bp_dist_bar: { 2165 __kmp_dist_barrier_release(bt, this_thr, gtid, 
tid, 2166 FALSE USE_ITT_BUILD_ARG(NULL)); 2167 break; 2168 } 2169 case bp_hyper_bar: { 2170 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); 2171 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, 2172 FALSE USE_ITT_BUILD_ARG(NULL)); 2173 break; 2174 } 2175 case bp_hierarchical_bar: { 2176 __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, 2177 FALSE USE_ITT_BUILD_ARG(NULL)); 2178 break; 2179 } 2180 case bp_tree_bar: { 2181 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); 2182 __kmp_tree_barrier_release(bt, this_thr, gtid, tid, 2183 FALSE USE_ITT_BUILD_ARG(NULL)); 2184 break; 2185 } 2186 default: { 2187 __kmp_linear_barrier_release(bt, this_thr, gtid, tid, 2188 FALSE USE_ITT_BUILD_ARG(NULL)); 2189 } 2190 } 2191 if (__kmp_tasking_mode != tskm_immediate_exec) { 2192 __kmp_task_team_sync(this_thr, team); 2193 } // if 2194 } 2195 } 2196 ANNOTATE_BARRIER_END(&team->t.t_bar); 2197 } 2198 2199 void __kmp_join_barrier(int gtid) { 2200 KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier); 2201 KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER); 2202 2203 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); 2204 2205 kmp_info_t *this_thr = __kmp_threads[gtid]; 2206 kmp_team_t *team; 2207 kmp_uint nproc; 2208 kmp_info_t *master_thread; 2209 int tid; 2210 #ifdef KMP_DEBUG 2211 int team_id; 2212 #endif /* KMP_DEBUG */ 2213 #if USE_ITT_BUILD 2214 void *itt_sync_obj = NULL; 2215 #if USE_ITT_NOTIFY 2216 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need 2217 // Get object created at fork_barrier 2218 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); 2219 #endif 2220 #endif /* USE_ITT_BUILD */ 2221 KMP_MB(); 2222 2223 // Get current info 2224 team = this_thr->th.th_team; 2225 nproc = this_thr->th.th_team_nproc; 2226 KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc); 2227 tid = __kmp_tid_from_gtid(gtid); 2228 #ifdef KMP_DEBUG 2229 team_id = team->t.t_id; 2230 #endif /* KMP_DEBUG */ 2231 master_thread = this_thr->th.th_team_master; 
2232 #ifdef KMP_DEBUG 2233 if (master_thread != team->t.t_threads[0]) { 2234 __kmp_print_structure(); 2235 } 2236 #endif /* KMP_DEBUG */ 2237 KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]); 2238 KMP_MB(); 2239 2240 // Verify state 2241 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team)); 2242 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root)); 2243 KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]); 2244 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n", 2245 gtid, team_id, tid)); 2246 2247 ANNOTATE_BARRIER_BEGIN(&team->t.t_bar); 2248 #if OMPT_SUPPORT 2249 if (ompt_enabled.enabled) { 2250 #if OMPT_OPTIONAL 2251 ompt_data_t *my_task_data; 2252 ompt_data_t *my_parallel_data; 2253 void *codeptr = NULL; 2254 int ds_tid = this_thr->th.th_info.ds.ds_tid; 2255 if (KMP_MASTER_TID(ds_tid) && 2256 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) || 2257 ompt_callbacks.ompt_callback(ompt_callback_sync_region))) 2258 codeptr = team->t.ompt_team_info.master_return_address; 2259 my_task_data = OMPT_CUR_TASK_DATA(this_thr); 2260 my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr); 2261 if (ompt_enabled.ompt_callback_sync_region) { 2262 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 2263 ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data, 2264 my_task_data, codeptr); 2265 } 2266 if (ompt_enabled.ompt_callback_sync_region_wait) { 2267 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 2268 ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data, 2269 my_task_data, codeptr); 2270 } 2271 if (!KMP_MASTER_TID(ds_tid)) 2272 this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr); 2273 #endif 2274 this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier_implicit; 2275 } 2276 #endif 2277 2278 if (__kmp_tasking_mode == tskm_extra_barrier) { 2279 __kmp_tasking_barrier(team, this_thr, gtid); 2280 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n", 2281 gtid, 
team_id, tid)); 2282 } 2283 #ifdef KMP_DEBUG 2284 if (__kmp_tasking_mode != tskm_immediate_exec) { 2285 KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = " 2286 "%p, th_task_team = %p\n", 2287 __kmp_gtid_from_thread(this_thr), team_id, 2288 team->t.t_task_team[this_thr->th.th_task_state], 2289 this_thr->th.th_task_team)); 2290 if (this_thr->th.th_task_team) 2291 KMP_DEBUG_ASSERT(this_thr->th.th_task_team == 2292 team->t.t_task_team[this_thr->th.th_task_state]); 2293 } 2294 #endif /* KMP_DEBUG */ 2295 2296 /* Copy the blocktime info to the thread, where __kmp_wait_template() can 2297 access it when the team struct is not guaranteed to exist. Doing these 2298 loads causes a cache miss slows down EPCC parallel by 2x. As a workaround, 2299 we do not perform the copy if blocktime=infinite, since the values are not 2300 used by __kmp_wait_template() in that case. */ 2301 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { 2302 #if KMP_USE_MONITOR 2303 this_thr->th.th_team_bt_intervals = 2304 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals; 2305 this_thr->th.th_team_bt_set = 2306 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set; 2307 #else 2308 this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid); 2309 #endif 2310 } 2311 2312 #if USE_ITT_BUILD 2313 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) 2314 __kmp_itt_barrier_starting(gtid, itt_sync_obj); 2315 #endif /* USE_ITT_BUILD */ 2316 2317 switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) { 2318 case bp_dist_bar: { 2319 __kmp_dist_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, 2320 NULL USE_ITT_BUILD_ARG(itt_sync_obj)); 2321 break; 2322 } 2323 case bp_hyper_bar: { 2324 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]); 2325 __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, 2326 NULL USE_ITT_BUILD_ARG(itt_sync_obj)); 2327 break; 2328 } 2329 case bp_hierarchical_bar: { 2330 
__kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, 2331 NULL USE_ITT_BUILD_ARG(itt_sync_obj)); 2332 break; 2333 } 2334 case bp_tree_bar: { 2335 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]); 2336 __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, 2337 NULL USE_ITT_BUILD_ARG(itt_sync_obj)); 2338 break; 2339 } 2340 default: { 2341 __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, 2342 NULL USE_ITT_BUILD_ARG(itt_sync_obj)); 2343 } 2344 } 2345 2346 /* From this point on, the team data structure may be deallocated at any time 2347 by the primary thread - it is unsafe to reference it in any of the worker 2348 threads. Any per-team data items that need to be referenced before the 2349 end of the barrier should be moved to the kmp_task_team_t structs. */ 2350 if (KMP_MASTER_TID(tid)) { 2351 if (__kmp_tasking_mode != tskm_immediate_exec) { 2352 __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj)); 2353 } 2354 if (__kmp_display_affinity) { 2355 KMP_CHECK_UPDATE(team->t.t_display_affinity, 0); 2356 } 2357 #if KMP_STATS_ENABLED 2358 // Have primary thread flag the workers to indicate they are now waiting for 2359 // next parallel region, Also wake them up so they switch their timers to 2360 // idle. 
2361 for (int i = 0; i < team->t.t_nproc; ++i) { 2362 kmp_info_t *team_thread = team->t.t_threads[i]; 2363 if (team_thread == this_thr) 2364 continue; 2365 team_thread->th.th_stats->setIdleFlag(); 2366 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && 2367 team_thread->th.th_sleep_loc != NULL) 2368 __kmp_null_resume_wrapper(team_thread); 2369 } 2370 #endif 2371 #if USE_ITT_BUILD 2372 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) 2373 __kmp_itt_barrier_middle(gtid, itt_sync_obj); 2374 #endif /* USE_ITT_BUILD */ 2375 2376 #if USE_ITT_BUILD && USE_ITT_NOTIFY 2377 // Join barrier - report frame end 2378 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 2379 __kmp_forkjoin_frames_mode && 2380 (this_thr->th.th_teams_microtask == NULL || // either not in teams 2381 this_thr->th.th_teams_size.nteams == 1) && // or inside single team 2382 team->t.t_active_level == 1) { 2383 kmp_uint64 cur_time = __itt_get_timestamp(); 2384 ident_t *loc = team->t.t_ident; 2385 kmp_info_t **other_threads = team->t.t_threads; 2386 int nproc = this_thr->th.th_team_nproc; 2387 int i; 2388 switch (__kmp_forkjoin_frames_mode) { 2389 case 1: 2390 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, 2391 loc, nproc); 2392 break; 2393 case 2: 2394 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, 2395 loc, nproc); 2396 break; 2397 case 3: 2398 if (__itt_metadata_add_ptr) { 2399 // Initialize with primary thread's wait time 2400 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time; 2401 // Set arrive time to zero to be able to check it in 2402 // __kmp_invoke_task(); the same is done inside the loop below 2403 this_thr->th.th_bar_arrive_time = 0; 2404 for (i = 1; i < nproc; ++i) { 2405 delta += (cur_time - other_threads[i]->th.th_bar_arrive_time); 2406 other_threads[i]->th.th_bar_arrive_time = 0; 2407 } 2408 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, 2409 cur_time, delta, 0); 2410 } 2411 __kmp_itt_frame_submit(gtid, 
this_thr->th.th_frame_time, cur_time, 0, 2412 loc, nproc); 2413 this_thr->th.th_frame_time = cur_time; 2414 break; 2415 } 2416 } 2417 #endif /* USE_ITT_BUILD */ 2418 } 2419 #if USE_ITT_BUILD 2420 else { 2421 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) 2422 __kmp_itt_barrier_middle(gtid, itt_sync_obj); 2423 } 2424 #endif /* USE_ITT_BUILD */ 2425 2426 #if KMP_DEBUG 2427 if (KMP_MASTER_TID(tid)) { 2428 KA_TRACE( 2429 15, 2430 ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n", 2431 gtid, team_id, tid, nproc)); 2432 } 2433 #endif /* KMP_DEBUG */ 2434 2435 // TODO now, mark worker threads as done so they may be disbanded 2436 KMP_MB(); // Flush all pending memory write invalidates. 2437 KA_TRACE(10, 2438 ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid)); 2439 2440 ANNOTATE_BARRIER_END(&team->t.t_bar); 2441 } 2442 2443 // TODO release worker threads' fork barriers as we are ready instead of all at 2444 // once 2445 void __kmp_fork_barrier(int gtid, int tid) { 2446 KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier); 2447 KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER); 2448 kmp_info_t *this_thr = __kmp_threads[gtid]; 2449 kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL; 2450 #if USE_ITT_BUILD 2451 void *itt_sync_obj = NULL; 2452 #endif /* USE_ITT_BUILD */ 2453 if (team) 2454 ANNOTATE_BARRIER_END(&team->t.t_bar); 2455 2456 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid, 2457 (team != NULL) ? 
team->t.t_id : -1, tid)); 2458 2459 // th_team pointer only valid for primary thread here 2460 if (KMP_MASTER_TID(tid)) { 2461 #if USE_ITT_BUILD && USE_ITT_NOTIFY 2462 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { 2463 // Create itt barrier object 2464 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1); 2465 __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing 2466 } 2467 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ 2468 2469 #ifdef KMP_DEBUG 2470 KMP_DEBUG_ASSERT(team); 2471 kmp_info_t **other_threads = team->t.t_threads; 2472 int i; 2473 2474 // Verify state 2475 KMP_MB(); 2476 2477 for (i = 1; i < team->t.t_nproc; ++i) { 2478 KA_TRACE(500, 2479 ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go " 2480 "== %u.\n", 2481 gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid, 2482 team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid, 2483 other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go)); 2484 KMP_DEBUG_ASSERT( 2485 (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) & 2486 ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE); 2487 KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team); 2488 } 2489 #endif 2490 2491 if (__kmp_tasking_mode != tskm_immediate_exec) { 2492 // 0 indicates setup current task team if nthreads > 1 2493 __kmp_task_team_setup(this_thr, team, 0); 2494 } 2495 2496 /* The primary thread may have changed its blocktime between join barrier 2497 and fork barrier. Copy the blocktime info to the thread, where 2498 __kmp_wait_template() can access it when the team struct is not 2499 guaranteed to exist. 
*/ 2500 // See note about the corresponding code in __kmp_join_barrier() being 2501 // performance-critical 2502 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { 2503 #if KMP_USE_MONITOR 2504 this_thr->th.th_team_bt_intervals = 2505 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals; 2506 this_thr->th.th_team_bt_set = 2507 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set; 2508 #else 2509 this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid); 2510 #endif 2511 } 2512 } // primary thread 2513 2514 switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) { 2515 case bp_dist_bar: { 2516 __kmp_dist_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, 2517 TRUE USE_ITT_BUILD_ARG(NULL)); 2518 break; 2519 } 2520 case bp_hyper_bar: { 2521 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]); 2522 __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, 2523 TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); 2524 break; 2525 } 2526 case bp_hierarchical_bar: { 2527 __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, 2528 TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); 2529 break; 2530 } 2531 case bp_tree_bar: { 2532 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]); 2533 __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, 2534 TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); 2535 break; 2536 } 2537 default: { 2538 __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, 2539 TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); 2540 } 2541 } 2542 2543 #if OMPT_SUPPORT 2544 if (ompt_enabled.enabled && 2545 this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) { 2546 int ds_tid = this_thr->th.th_info.ds.ds_tid; 2547 ompt_data_t *task_data = (team) 2548 ? 
OMPT_CUR_TASK_DATA(this_thr) 2549 : &(this_thr->th.ompt_thread_info.task_data); 2550 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 2551 #if OMPT_OPTIONAL 2552 void *codeptr = NULL; 2553 if (KMP_MASTER_TID(ds_tid) && 2554 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) || 2555 ompt_callbacks.ompt_callback(ompt_callback_sync_region))) 2556 codeptr = team ? team->t.ompt_team_info.master_return_address : NULL; 2557 if (ompt_enabled.ompt_callback_sync_region_wait) { 2558 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 2559 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, 2560 codeptr); 2561 } 2562 if (ompt_enabled.ompt_callback_sync_region) { 2563 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 2564 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, 2565 codeptr); 2566 } 2567 #endif 2568 if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) { 2569 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 2570 ompt_scope_end, NULL, task_data, 0, ds_tid, 2571 ompt_task_implicit); // TODO: Can this be ompt_task_initial? 2572 } 2573 } 2574 #endif 2575 2576 // Early exit for reaping threads releasing forkjoin barrier 2577 if (TCR_4(__kmp_global.g.g_done)) { 2578 this_thr->th.th_task_team = NULL; 2579 2580 #if USE_ITT_BUILD && USE_ITT_NOTIFY 2581 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { 2582 if (!KMP_MASTER_TID(tid)) { 2583 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); 2584 if (itt_sync_obj) 2585 __kmp_itt_barrier_finished(gtid, itt_sync_obj); 2586 } 2587 } 2588 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ 2589 KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid)); 2590 return; 2591 } 2592 2593 /* We can now assume that a valid team structure has been allocated by the 2594 primary thread and propagated to all worker threads. 
The current thread, 2595 however, may not be part of the team, so we can't blindly assume that the 2596 team pointer is non-null. */ 2597 team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team); 2598 KMP_DEBUG_ASSERT(team != NULL); 2599 tid = __kmp_tid_from_gtid(gtid); 2600 2601 #if KMP_BARRIER_ICV_PULL 2602 /* Primary thread's copy of the ICVs was set up on the implicit taskdata in 2603 __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's 2604 implicit task has this data before this function is called. We cannot 2605 modify __kmp_fork_call() to look at the fixed ICVs in the primary thread's 2606 thread struct, because it is not always the case that the threads arrays 2607 have been allocated when __kmp_fork_call() is executed. */ 2608 { 2609 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy); 2610 if (!KMP_MASTER_TID(tid)) { // primary thread already has ICVs 2611 // Copy the initial ICVs from the primary thread's thread struct to the 2612 // implicit task for this tid. 2613 KA_TRACE(10, 2614 ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid)); 2615 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, 2616 tid, FALSE); 2617 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, 2618 &team->t.t_threads[0] 2619 ->th.th_bar[bs_forkjoin_barrier] 2620 .bb.th_fixed_icvs); 2621 } 2622 } 2623 #endif // KMP_BARRIER_ICV_PULL 2624 2625 if (__kmp_tasking_mode != tskm_immediate_exec) { 2626 __kmp_task_team_sync(this_thr, team); 2627 } 2628 2629 #if KMP_AFFINITY_SUPPORTED 2630 kmp_proc_bind_t proc_bind = team->t.t_proc_bind; 2631 if (proc_bind == proc_bind_intel) { 2632 // Call dynamic affinity settings 2633 if (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) { 2634 __kmp_balanced_affinity(this_thr, team->t.t_nproc); 2635 } 2636 } else if (proc_bind != proc_bind_false) { 2637 if (this_thr->th.th_new_place == this_thr->th.th_current_place) { 2638 KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place 
%d\n", 2639 __kmp_gtid_from_thread(this_thr), 2640 this_thr->th.th_current_place)); 2641 } else { 2642 __kmp_affinity_set_place(gtid); 2643 } 2644 } 2645 #endif // KMP_AFFINITY_SUPPORTED 2646 // Perform the display affinity functionality 2647 if (__kmp_display_affinity) { 2648 if (team->t.t_display_affinity 2649 #if KMP_AFFINITY_SUPPORTED 2650 || (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) 2651 #endif 2652 ) { 2653 // NULL means use the affinity-format-var ICV 2654 __kmp_aux_display_affinity(gtid, NULL); 2655 this_thr->th.th_prev_num_threads = team->t.t_nproc; 2656 this_thr->th.th_prev_level = team->t.t_level; 2657 } 2658 } 2659 if (!KMP_MASTER_TID(tid)) 2660 KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator); 2661 2662 #if USE_ITT_BUILD && USE_ITT_NOTIFY 2663 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { 2664 if (!KMP_MASTER_TID(tid)) { 2665 // Get correct barrier object 2666 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); 2667 __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired 2668 } // (prepare called inside barrier_release) 2669 } 2670 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ 2671 ANNOTATE_BARRIER_END(&team->t.t_bar); 2672 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid, 2673 team->t.t_id, tid)); 2674 } 2675 2676 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, 2677 kmp_internal_control_t *new_icvs, ident_t *loc) { 2678 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy); 2679 2680 KMP_DEBUG_ASSERT(team && new_nproc && new_icvs); 2681 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc); 2682 2683 /* Primary thread's copy of the ICVs was set up on the implicit taskdata in 2684 __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's 2685 implicit task has this data before this function is called. 
*/ 2686 #if KMP_BARRIER_ICV_PULL 2687 /* Copy ICVs to primary thread's thread structure into th_fixed_icvs (which 2688 remains untouched), where all of the worker threads can access them and 2689 make their own copies after the barrier. */ 2690 KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be 2691 // allocated at this point 2692 copy_icvs( 2693 &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs, 2694 new_icvs); 2695 KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0, 2696 team->t.t_threads[0], team)); 2697 #elif KMP_BARRIER_ICV_PUSH 2698 // The ICVs will be propagated in the fork barrier, so nothing needs to be 2699 // done here. 2700 KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0, 2701 team->t.t_threads[0], team)); 2702 #else 2703 // Copy the ICVs to each of the non-primary threads. This takes O(nthreads) 2704 // time. 2705 ngo_load(new_icvs); 2706 KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be 2707 // allocated at this point 2708 for (int f = 1; f < new_nproc; ++f) { // Skip the primary thread 2709 // TODO: GEH - pass in better source location info since usually NULL here 2710 KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n", 2711 f, team->t.t_threads[f], team)); 2712 __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE); 2713 ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs); 2714 KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n", 2715 f, team->t.t_threads[f], team)); 2716 } 2717 ngo_sync(); 2718 #endif // KMP_BARRIER_ICV_PULL 2719 } 2720