/*
 * kmp_barrier.cpp
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_wait_release.h"
#include "kmp_itt.h"
#include "kmp_os.h"
#include "kmp_stats.h"
#include "ompt-specific.h"

#if KMP_MIC
#include <immintrin.h>
#define USE_NGO_STORES 1
#endif // KMP_MIC

#include "tsan_annotations.h"

#if KMP_MIC && USE_NGO_STORES
// ICV copying
#define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
#define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
#else
#define ngo_load(src) ((void)0)
#define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
#define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
#define ngo_sync() ((void)0)
#endif /* KMP_MIC && USE_NGO_STORES */

void __kmp_print_structure(void); // Forward declaration

// ---------------------------- Barrier Algorithms ----------------------------

// Linear Barrier
template <bool cancellable = false>
static bool __kmp_linear_barrier_gather_template(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads = team->t.t_threads;

  KA_TRACE(
      20,
      ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  }
#endif
  // We now perform a linear reduction to signal that all of the threads have
  // arrived.
  if (!KMP_MASTER_TID(tid)) {
    KA_TRACE(20,
             ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
              "arrived(%p): %llu => %llu\n",
              gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
              team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
              thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
    // Mark arrival to master thread
    /* After performing this write, a worker thread may not assume that the team
       is valid any more - it could be deallocated by the master thread at any
       time. */
    ANNOTATE_BARRIER_BEGIN(this_thr);
    kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[0]);
    flag.release();
  } else {
    kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
    int nproc = this_thr->th.th_team_nproc;
    int i;
    // Don't have to worry about sleep bit here or atomic since team setting
    kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;

    // Collect all the worker team member threads.
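    /* Note: each worker bumps its own b_arrived by KMP_BARRIER_STATE_BUMP on
       arrival, so the master detects arrival by waiting for each worker's
       b_arrived to reach new_state (the team counter plus one bump). */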
    for (i = 1; i < nproc; ++i) {
#if KMP_CACHE_MANAGE
      // Prefetch next thread's arrived count
      if (i + 1 < nproc)
        KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
      KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                    "arrived(%p) == %llu\n",
                    gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
                    team->t.t_id, i,
                    &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));

      // Wait for worker thread to arrive
      if (cancellable) {
        kmp_flag_64<true, false> flag(
            &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
        if (flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)))
          return true;
      } else {
        kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
                           new_state);
        flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
      }
      ANNOTATE_BARRIER_END(other_threads[i]);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier imbalance - write min of the thread time and the other thread
      // time to the thread.
      if (__kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_min_time = KMP_MIN(
            this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
      }
#endif
      if (reduce) {
        KA_TRACE(100,
                 ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
                  team->t.t_id, i));
        ANNOTATE_REDUCE_AFTER(reduce);
        OMPT_REDUCTION_DECL(this_thr, gtid);
        OMPT_REDUCTION_BEGIN;
        (*reduce)(this_thr->th.th_local.reduce_data,
                  other_threads[i]->th.th_local.reduce_data);
        OMPT_REDUCTION_END;
        ANNOTATE_REDUCE_BEFORE(reduce);
        ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
      }
    }
    // Don't have to worry about sleep bit here or atomic since team setting
    team_bar->b_arrived = new_state;
    KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
                  new_state));
  }
  KA_TRACE(
      20,
      ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  return false;
}

template <bool cancellable = false>
static bool __kmp_linear_barrier_release_template(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_team_t *team;

  if (KMP_MASTER_TID(tid)) {
    unsigned int i;
    kmp_uint32 nproc = this_thr->th.th_team_nproc;
    kmp_info_t **other_threads;

    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    other_threads = team->t.t_threads;

    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for "
                  "barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));

    if (nproc > 1) {
#if KMP_BARRIER_ICV_PUSH
      {
        KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
        if (propagate_icvs) {
          ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
          for (i = 1; i < nproc; ++i) {
            __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
                                     team, i, FALSE);
            ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
                           &team->t.t_implicit_task_taskdata[0].td_icvs);
          }
          ngo_sync();
        }
      }
#endif // KMP_BARRIER_ICV_PUSH

      // Now, release all of the worker threads
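      /* Each worker is released by bumping its b_go flag by
         KMP_BARRIER_STATE_BUMP (the flag release also wakes the worker if it
         went to sleep in the wait); the worker resets its own b_go to
         KMP_INIT_BARRIER_STATE once it resumes. */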
      for (i = 1; i < nproc; ++i) {
#if KMP_CACHE_MANAGE
        // Prefetch next thread's go flag
        if (i + 1 < nproc)
          KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */
        KA_TRACE(
            20,
            ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
             "go(%p): %u => %u\n",
             gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
             team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
             other_threads[i]->th.th_bar[bt].bb.b_go,
             other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
        ANNOTATE_BARRIER_BEGIN(other_threads[i]);
        kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
                           other_threads[i]);
        flag.release();
      }
    }
  } else { // Wait for the MASTER thread to release us
    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
                  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
    if (cancellable) {
      kmp_flag_64<true, false> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
      if (flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)))
        return true;
    } else {
      kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
      flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    }
    ANNOTATE_BARRIER_END(this_thr);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
      // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
      // disabled)
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
      // Cancel wait on previous parallel region...
      __kmp_itt_task_starting(itt_sync_obj);

      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return false;

      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      if (itt_sync_obj != NULL)
        // Call prepare as early as possible for "new" barrier
        __kmp_itt_task_finished(itt_sync_obj);
    } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        // Early exit for reaping threads releasing forkjoin barrier
        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
      return false;
    // The worker thread may now assume that the team is valid.
#ifdef KMP_DEBUG
    tid = __kmp_tid_from_gtid(gtid);
    team = __kmp_threads[gtid]->th.th_team;
#endif
    KMP_DEBUG_ASSERT(team != NULL);
    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
    KA_TRACE(20,
             ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
              gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
  }
  KA_TRACE(
      20,
      ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  return false;
}

static void __kmp_linear_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  __kmp_linear_barrier_gather_template<false>(
      bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
}

static bool __kmp_linear_barrier_gather_cancellable(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  return __kmp_linear_barrier_gather_template<true>(
      bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
}

static void __kmp_linear_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  __kmp_linear_barrier_release_template<false>(
      bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
}

static bool __kmp_linear_barrier_release_cancellable(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  return __kmp_linear_barrier_release_template<true>(
      bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
}

// Tree barrier
static void __kmp_tree_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads = team->t.t_threads;
  kmp_uint32 nproc = this_thr->th.th_team_nproc;
  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 child;
  kmp_uint32 child_tid;
  kmp_uint64 new_state;

  KA_TRACE(
      20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
           gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  }
#endif
  // Perform tree gather to wait until all threads have arrived; reduce any
  // required data as we go
  child_tid = (tid << branch_bits) + 1;
  if (child_tid < nproc) {
    // Parent threads wait for all their children to arrive
    new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
    child = 1;
    do {
      kmp_info_t *child_thr = other_threads[child_tid];
      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
      // Prefetch next thread's arrived count
      if (child + 1 <= branch_factor && child_tid + 1 < nproc)
        KMP_CACHE_PREFETCH(
            &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
      KA_TRACE(20,
               ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                "arrived(%p) == %llu\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
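      /* Tree indexing: thread tid's children are tids (tid << branch_bits) + 1
         through (tid << branch_bits) + branch_factor, and its parent is
         (tid - 1) >> branch_bits; e.g. with branch_bits == 2, thread 1 gathers
         from threads 5..8 and then reports to thread 0. A child's b_arrived
         only reaches new_state after its own subtree has checked in, so
         waiting on direct children is sufficient. */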
      // Wait for child to arrive
      kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
      flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
      ANNOTATE_BARRIER_END(child_thr);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier imbalance - write min of the thread time and a child time to
      // the thread.
      if (__kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                               child_thr->th.th_bar_min_time);
      }
#endif
      if (reduce) {
        KA_TRACE(100,
                 ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                  team->t.t_id, child_tid));
        ANNOTATE_REDUCE_AFTER(reduce);
        OMPT_REDUCTION_DECL(this_thr, gtid);
        OMPT_REDUCTION_BEGIN;
        (*reduce)(this_thr->th.th_local.reduce_data,
                  child_thr->th.th_local.reduce_data);
        OMPT_REDUCTION_END;
        ANNOTATE_REDUCE_BEFORE(reduce);
        ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
      }
      child++;
      child_tid++;
    } while (child <= branch_factor && child_tid < nproc);
  }

  if (!KMP_MASTER_TID(tid)) { // Worker threads
    kmp_int32 parent_tid = (tid - 1) >> branch_bits;

    KA_TRACE(20,
             ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
              "arrived(%p): %llu => %llu\n",
              gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
              team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
              thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));

    // Mark arrival to parent thread
    /* After performing this write, a worker thread may not assume that the team
       is valid any more - it could be deallocated by the master thread at any
       time. */
    ANNOTATE_BARRIER_BEGIN(this_thr);
    kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[parent_tid]);
    flag.release();
  } else {
    // Need to update the team arrived pointer if we are the master thread
    if (nproc > 1) // New value was already computed above
      team->t.t_bar[bt].b_arrived = new_state;
    else
      team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
    KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id,
                  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
  }
  KA_TRACE(20,
           ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
            gtid, team->t.t_id, tid, bt));
}

static void __kmp_tree_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
  kmp_team_t *team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_uint32 nproc;
  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 child;
  kmp_uint32 child_tid;

  // Perform a tree release for all of the threads that have been gathered
  if (!KMP_MASTER_TID(
          tid)) { // Handle fork barrier workers who aren't part of a team yet
    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
                  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
    // Wait for parent thread to release us
    kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
    flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    ANNOTATE_BARRIER_END(this_thr);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
      // In fork barrier where we could not get the object reliably (or
      // ITTNOTIFY is disabled)
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
      // Cancel wait on previous parallel region...
      __kmp_itt_task_starting(itt_sync_obj);

      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return;

      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      if (itt_sync_obj != NULL)
        // Call prepare as early as possible for "new" barrier
        __kmp_itt_task_finished(itt_sync_obj);
    } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        // Early exit for reaping threads releasing forkjoin barrier
        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
      return;

    // The worker thread may now assume that the team is valid.
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    tid = __kmp_tid_from_gtid(gtid);

    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
    KA_TRACE(20,
             ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
              team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
  } else {
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for "
                  "barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
  }
  nproc = this_thr->th.th_team_nproc;
  child_tid = (tid << branch_bits) + 1;

  if (child_tid < nproc) {
    kmp_info_t **other_threads = team->t.t_threads;
    child = 1;
    // Parent threads release all their children
    do {
      kmp_info_t *child_thr = other_threads[child_tid];
      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
      // Prefetch next thread's go count
      if (child + 1 <= branch_factor && child_tid + 1 < nproc)
        KMP_CACHE_PREFETCH(
            &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */

#if KMP_BARRIER_ICV_PUSH
      {
        KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
        if (propagate_icvs) {
          __kmp_init_implicit_task(team->t.t_ident,
                                   team->t.t_threads[child_tid], team,
                                   child_tid, FALSE);
          copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
                    &team->t.t_implicit_task_taskdata[0].td_icvs);
        }
      }
#endif // KMP_BARRIER_ICV_PUSH
      KA_TRACE(20,
               ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
                "go(%p): %u => %u\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                child_bar->b_go + KMP_BARRIER_STATE_BUMP));
      // Release child from barrier
      ANNOTATE_BARRIER_BEGIN(child_thr);
      kmp_flag_64<> flag(&child_bar->b_go, child_thr);
      flag.release();
      child++;
      child_tid++;
    } while (child <= branch_factor && child_tid < nproc);
  }
  KA_TRACE(
      20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
           gtid, team->t.t_id, tid, bt));
}

// Hyper Barrier
static void __kmp_hyper_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
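  /* Gather pattern: at each level the stride is (1 << level); a thread whose
     tid has a nonzero digit in that level's branch_factor-wide field reports
     to the parent obtained by clearing the low (level + branch_bits) bits of
     its tid. E.g. with branch_bits == 1, thread 5 reports to thread 4 at
     level 0, and thread 4 reports to thread 0 at level 2. */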
  kmp_info_t **other_threads = team->t.t_threads;
  kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
  kmp_uint32 num_threads = this_thr->th.th_team_nproc;
  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 offset;
  kmp_uint32 level;

  KA_TRACE(
      20,
      ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  }
#endif
  /* Perform a hypercube-embedded tree gather to wait until all of the threads
     have arrived, and reduce any required data as we go. */
  kmp_flag_64<> p_flag(&thr_bar->b_arrived);
  for (level = 0, offset = 1; offset < num_threads;
       level += branch_bits, offset <<= branch_bits) {
    kmp_uint32 child;
    kmp_uint32 child_tid;

    if (((tid >> level) & (branch_factor - 1)) != 0) {
      kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);

      KMP_MB(); // Synchronize parent and child threads.
      KA_TRACE(20,
               ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                "arrived(%p): %llu => %llu\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
                team->t.t_id, parent_tid, &thr_bar->b_arrived,
                thr_bar->b_arrived,
                thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
      // Mark arrival to parent thread
      /* After performing this write (in the last iteration of the enclosing for
         loop), a worker thread may not assume that the team is valid any more
         - it could be deallocated by the master thread at any time. */
      ANNOTATE_BARRIER_BEGIN(this_thr);
      p_flag.set_waiter(other_threads[parent_tid]);
      p_flag.release();
      break;
    }

    // Parent threads wait for children to arrive
    if (new_state == KMP_BARRIER_UNUSED_STATE)
      new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
    for (child = 1, child_tid = tid + (1 << level);
         child < branch_factor && child_tid < num_threads;
         child++, child_tid += (1 << level)) {
      kmp_info_t *child_thr = other_threads[child_tid];
      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
      kmp_uint32 next_child_tid = child_tid + (1 << level);
      // Prefetch next thread's arrived count
      if (child + 1 < branch_factor && next_child_tid < num_threads)
        KMP_CACHE_PREFETCH(
            &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
      KA_TRACE(20,
               ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                "arrived(%p) == %llu\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
      // Wait for child to arrive
      kmp_flag_64<> c_flag(&child_bar->b_arrived, new_state);
      c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
      ANNOTATE_BARRIER_END(child_thr);
      KMP_MB(); // Synchronize parent and child threads.
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier imbalance - write min of the thread time and a child time to
      // the thread.
      if (__kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                               child_thr->th.th_bar_min_time);
      }
#endif
      if (reduce) {
        KA_TRACE(100,
                 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                  team->t.t_id, child_tid));
        ANNOTATE_REDUCE_AFTER(reduce);
        OMPT_REDUCTION_DECL(this_thr, gtid);
        OMPT_REDUCTION_BEGIN;
        (*reduce)(this_thr->th.th_local.reduce_data,
                  child_thr->th.th_local.reduce_data);
        OMPT_REDUCTION_END;
        ANNOTATE_REDUCE_BEFORE(reduce);
        ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
      }
    }
  }

  if (KMP_MASTER_TID(tid)) {
    // Need to update the team arrived pointer if we are the master thread
    if (new_state == KMP_BARRIER_UNUSED_STATE)
      team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
    else
      team->t.t_bar[bt].b_arrived = new_state;
    KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id,
                  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
  }
  KA_TRACE(
      20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
           gtid, team->t.t_id, tid, bt));
}

// The reverse versions seem to beat the forward versions overall
#define KMP_REVERSE_HYPER_BAR
static void __kmp_hyper_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
  kmp_team_t *team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads;
  kmp_uint32 num_threads;
  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 child;
  kmp_uint32 child_tid;
  kmp_uint32 offset;
  kmp_uint32 level;

  /* Perform a hypercube-embedded tree release for all of the threads that have
     been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
     are released in the reverse order of the corresponding gather, otherwise
     threads are released in the same order. */
  if (KMP_MASTER_TID(tid)) { // master
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for "
                  "barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs) { // master already has ICVs in final destination; copy
      copy_icvs(&thr_bar->th_fixed_icvs,
                &team->t.t_implicit_task_taskdata[tid].td_icvs);
    }
#endif
  } else { // Handle fork barrier workers who aren't part of a team yet
    KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
                  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
    // Wait for parent thread to release us
    kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
    flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    ANNOTATE_BARRIER_END(this_thr);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
      // In fork barrier where we could not get the object reliably
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
      // Cancel wait on previous parallel region...
      __kmp_itt_task_starting(itt_sync_obj);

      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return;

      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      if (itt_sync_obj != NULL)
        // Call prepare as early as possible for "new" barrier
        __kmp_itt_task_finished(itt_sync_obj);
    } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        // Early exit for reaping threads releasing forkjoin barrier
        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
      return;

    // The worker thread may now assume that the team is valid.
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    tid = __kmp_tid_from_gtid(gtid);

    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
    KA_TRACE(20,
             ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
              gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
  }
  num_threads = this_thr->th.th_team_nproc;
  other_threads = team->t.t_threads;

#ifdef KMP_REVERSE_HYPER_BAR
  // Count up to correct level for parent
  for (level = 0, offset = 1;
       offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
       level += branch_bits, offset <<= branch_bits)
    ;

  // Now go down from there
  for (level -= branch_bits, offset >>= branch_bits; offset != 0;
       level -= branch_bits, offset >>= branch_bits)
#else
  // Go down the tree, level by level
  for (level = 0, offset = 1; offset < num_threads;
       level += branch_bits, offset <<= branch_bits)
#endif // KMP_REVERSE_HYPER_BAR
  {
#ifdef KMP_REVERSE_HYPER_BAR
    /* Now go in reverse order through the children, highest to lowest.
       Initial setting of child is conservative here. */
    child = num_threads >> ((level == 0) ? level : level - 1);
    for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
        child_tid = tid + (child << level);
         child >= 1; child--, child_tid -= (1 << level))
#else
    if (((tid >> level) & (branch_factor - 1)) != 0)
      // No need to go lower than this, since this is the level parent would be
      // notified
      break;
    // Iterate through children on this level of the tree
    for (child = 1, child_tid = tid + (1 << level);
         child < branch_factor && child_tid < num_threads;
         child++, child_tid += (1 << level))
#endif // KMP_REVERSE_HYPER_BAR
    {
      if (child_tid >= num_threads)
        continue; // Child doesn't exist so keep going
      else {
        kmp_info_t *child_thr = other_threads[child_tid];
        kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
        kmp_uint32 next_child_tid = child_tid - (1 << level);
// Prefetch next thread's go count
#ifdef KMP_REVERSE_HYPER_BAR
        if (child - 1 >= 1 && next_child_tid < num_threads)
#else
        if (child + 1 < branch_factor && next_child_tid < num_threads)
#endif // KMP_REVERSE_HYPER_BAR
          KMP_CACHE_PREFETCH(
              &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */

#if KMP_BARRIER_ICV_PUSH
        if (propagate_icvs) // push my fixed ICVs to my child
          copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
#endif // KMP_BARRIER_ICV_PUSH

        KA_TRACE(
            20,
            ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
             "go(%p): %u => %u\n",
             gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
             team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
             child_bar->b_go + KMP_BARRIER_STATE_BUMP));
        // Release child from barrier
        ANNOTATE_BARRIER_BEGIN(child_thr);
        kmp_flag_64<> flag(&child_bar->b_go, child_thr);
        flag.release();
      }
    }
  }
#if KMP_BARRIER_ICV_PUSH
  if (propagate_icvs &&
      !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
    __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
                             FALSE);
    copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
              &thr_bar->th_fixed_icvs);
  }
#endif
  KA_TRACE(
      20,
      ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
}

// Hierarchical Barrier

// Initialize thread barrier data
/* Initializes/re-initializes the hierarchical barrier data stored on a thread.
   Performs the minimum amount of initialization required based on how the team
   has changed. Returns true if leaf children will require both on-core and
   traditional wake-up mechanisms. For example, if the team size increases,
   threads already in the team will respond to on-core wakeup on their parent
   thread, but threads newly added to the team will only be listening on
   their local b_go. */
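/* Layout notes (derived from the code below): skip_per_level[d] is the tid
   stride between adjacent subtrees rooted at level d, so a non-master thread's
   parent_tid is its tid rounded down to a multiple of
   skip_per_level[my_level + 1]. offset selects which byte of the parent's
   64-bit flag this thread signals: offset = 7 - (tid - parent_tid) /
   skip_per_level[my_level]. For example, with skip_per_level = {1, 4, 16},
   thread 6 gets my_level 0, parent_tid 4 and offset 5, while thread 4 gets
   my_level 1 and parent_tid 0. leaf_state sets one byte (index 7 - i) per leaf
   child, so a parent can wait for all of its on-core leaves with a single
   64-bit flag value. */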
static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
                                                   kmp_bstate_t *thr_bar,
                                                   kmp_uint32 nproc, int gtid,
                                                   int tid, kmp_team_t *team) {
  // Checks to determine if (re-)initialization is needed
  bool uninitialized = thr_bar->team == NULL;
  bool team_changed = team != thr_bar->team;
  bool team_sz_changed = nproc != thr_bar->nproc;
  bool tid_changed = tid != thr_bar->old_tid;
  bool retval = false;

  if (uninitialized || team_sz_changed) {
    __kmp_get_hierarchy(nproc, thr_bar);
  }

  if (uninitialized || team_sz_changed || tid_changed) {
    thr_bar->my_level = thr_bar->depth - 1; // default for master
    thr_bar->parent_tid = -1; // default for master
    if (!KMP_MASTER_TID(
            tid)) { // if not master, find parent thread in hierarchy
      kmp_uint32 d = 0;
      while (d < thr_bar->depth) { // find parent based on level of thread in
        // hierarchy, and note level
        kmp_uint32 rem;
        if (d == thr_bar->depth - 2) { // reached level right below the master
          thr_bar->parent_tid = 0;
          thr_bar->my_level = d;
          break;
        } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) != 0) {
          // TODO: can we make the above op faster?
          // thread is not a subtree root at next level, so this is max
          thr_bar->parent_tid = tid - rem;
          thr_bar->my_level = d;
          break;
        }
        ++d;
      }
    }
    __kmp_type_convert(7 - ((tid - thr_bar->parent_tid) /
                            (thr_bar->skip_per_level[thr_bar->my_level])),
                       &(thr_bar->offset));
    thr_bar->old_tid = tid;
    thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
    thr_bar->team = team;
    thr_bar->parent_bar =
        &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
  }
  if (uninitialized || team_changed || tid_changed) {
    thr_bar->team = team;
    thr_bar->parent_bar =
        &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
    retval = true;
  }
  if (uninitialized || team_sz_changed || tid_changed) {
    thr_bar->nproc = nproc;
    thr_bar->leaf_kids = thr_bar->base_leaf_kids;
    if (thr_bar->my_level == 0)
      thr_bar->leaf_kids = 0;
    if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
      __kmp_type_convert(nproc - tid - 1, &(thr_bar->leaf_kids));
    thr_bar->leaf_state = 0;
    for (int i = 0; i < thr_bar->leaf_kids; ++i)
      ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
  }
  return retval;
}

static void __kmp_hierarchical_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_uint32 nproc = this_thr->th.th_team_nproc;
  kmp_info_t **other_threads = team->t.t_threads;
  kmp_uint64 new_state;

  int level = team->t.t_level;
  if (other_threads[0]
          ->th.th_teams_microtask) // are we inside the teams construct?
    if (this_thr->th.th_teams_size.nteams > 1)
      ++level; // level was not increased in teams construct for team_of_masters
  if (level == 1)
    thr_bar->use_oncore_barrier = 1;
  else
    thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested

  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
                "barrier type %d\n",
                gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
  }
#endif

  (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
                                               team);

  if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
    kmp_int32 child_tid;
    new_state =
        (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
    if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
        thr_bar->use_oncore_barrier) {
      if (thr_bar->leaf_kids) {
        // First, wait for leaf children to check-in on my b_arrived flag
        kmp_uint64 leaf_state =
            KMP_MASTER_TID(tid)
                ? thr_bar->b_arrived | thr_bar->leaf_state
                : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
        KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
                      "for leaf kids\n",
                      gtid, team->t.t_id, tid));
        kmp_flag_64<> flag(&thr_bar->b_arrived, leaf_state);
        flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
        if (reduce) {
          ANNOTATE_REDUCE_AFTER(reduce);
          OMPT_REDUCTION_DECL(this_thr, gtid);
          OMPT_REDUCTION_BEGIN;
          for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
               ++child_tid) {
            KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
                           "T#%d(%d:%d)\n",
                           gtid, team->t.t_id, tid,
                           __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                           child_tid));
            ANNOTATE_BARRIER_END(other_threads[child_tid]);
            (*reduce)(this_thr->th.th_local.reduce_data,
                      other_threads[child_tid]->th.th_local.reduce_data);
          }
          OMPT_REDUCTION_END;
          ANNOTATE_REDUCE_BEFORE(reduce);
          ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
        }
        // clear leaf_state bits
        KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
      }
      // Next, wait for higher level children on each child's b_arrived flag
      for (kmp_uint32 d = 1; d < thr_bar->my_level;
           ++d) { // gather lowest level threads first, but skip 0
        kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
                   skip = thr_bar->skip_per_level[d];
        if (last > nproc)
          last = nproc;
        for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
          kmp_info_t *child_thr = other_threads[child_tid];
          kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
          KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
                        "T#%d(%d:%d) "
                        "arrived(%p) == %llu\n",
                        gtid, team->t.t_id, tid,
                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                        child_tid, &child_bar->b_arrived, new_state));
          kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
          flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          ANNOTATE_BARRIER_END(child_thr);
          if (reduce) {
            KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
                           "T#%d(%d:%d)\n",
                           gtid, team->t.t_id, tid,
                           __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                           child_tid));
            ANNOTATE_REDUCE_AFTER(reduce);
            (*reduce)(this_thr->th.th_local.reduce_data,
                      child_thr->th.th_local.reduce_data);
            ANNOTATE_REDUCE_BEFORE(reduce);
            ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
          }
        }
      }
    } else { // Blocktime is not infinite
      for (kmp_uint32 d = 0; d < thr_bar->my_level;
           ++d) { // Gather lowest level threads first
        kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
                   skip = thr_bar->skip_per_level[d];
        if (last > nproc)
          last = nproc;
        for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
          kmp_info_t *child_thr = other_threads[child_tid];
          kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
          KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
                        "T#%d(%d:%d) "
                        "arrived(%p) == %llu\n",
                        gtid, team->t.t_id, tid,
                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                        child_tid, &child_bar->b_arrived, new_state));
          kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
          flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          ANNOTATE_BARRIER_END(child_thr);
          if (reduce) {
            KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
                           "T#%d(%d:%d)\n",
                           gtid, team->t.t_id, tid,
                           __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                           child_tid));
            ANNOTATE_REDUCE_AFTER(reduce);
            (*reduce)(this_thr->th.th_local.reduce_data,
                      child_thr->th.th_local.reduce_data);
            ANNOTATE_REDUCE_BEFORE(reduce);
            ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
          }
        }
      }
    }
  }
  // All subordinates are gathered; now release parent if not master thread

  if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
                  " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
                  gtid, team->t.t_id, tid,
                  __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
                  thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
                  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
    /* Mark arrival to parent: After performing this write, a worker thread may
       not assume that the team is valid any more - it could be deallocated by
       the master thread at any time. */
    if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
        !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
      // flag; release it
      ANNOTATE_BARRIER_BEGIN(this_thr);
      kmp_flag_64<> flag(&thr_bar->b_arrived,
                         other_threads[thr_bar->parent_tid]);
      flag.release();
    } else {
      // Leaf does special release on "offset" bits of parent's b_arrived flag
      thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
      kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived,
                           thr_bar->offset + 1);
      flag.set_waiter(other_threads[thr_bar->parent_tid]);
      flag.release();
    }
  } else { // Master thread needs to update the team's b_arrived value
    team->t.t_bar[bt].b_arrived = new_state;
    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id,
                  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
  }
  // Is the team access below unsafe or just technically invalid?
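  // (For a worker, the trace below reads team->t.t_id after arrival has been
  // signaled, so the master could already have freed the team; the access is
  // debug-tracing only.)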
  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
                "barrier type %d\n",
                gtid, team->t.t_id, tid, bt));
}

static void __kmp_hierarchical_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
  kmp_team_t *team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_uint32 nproc;
  bool team_change = false; // indicates on-core barrier shouldn't be used

  if (KMP_MASTER_TID(tid)) {
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master "
                  "entered barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
  } else { // Worker threads
    // Wait for parent thread to release me
    if (!thr_bar->use_oncore_barrier ||
        __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
        thr_bar->team == NULL) {
      // Use traditional method of waiting on my own b_go flag
      thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
      kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
      flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
      ANNOTATE_BARRIER_END(this_thr);
      TCW_8(thr_bar->b_go,
            KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
    } else { // Thread barrier data is initialized, this is a leaf, blocktime is
      // infinite, not nested
      // Wait on my "offset" bits on parent's b_go flag
      thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
      kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
                           thr_bar->offset + 1, bt,
                           this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
      flag.wait(this_thr, TRUE);
      if (thr_bar->wait_flag ==
          KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
        TCW_8(thr_bar->b_go,
              KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
      } else { // Reset my bits on parent's b_go flag
        (RCAST(volatile char *,
               &(thr_bar->parent_bar->b_go)))[thr_bar->offset + 1] = 0;
      }
    }
    thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
    // Early exit for reaping threads releasing forkjoin barrier
    if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
      return;
    // The worker thread may now assume that the team is valid.
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    tid = __kmp_tid_from_gtid(gtid);

    KA_TRACE(
        20,
        ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
         gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
  }

  nproc = this_thr->th.th_team_nproc;
  int level = team->t.t_level;
  if (team->t.t_threads[0]
          ->th.th_teams_microtask) { // are we inside the teams construct?
    if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
        this_thr->th.th_teams_level == level)
      ++level; // level was not increased in teams construct for team_of_workers
    if (this_thr->th.th_teams_size.nteams > 1)
      ++level; // level was not increased in teams construct for team_of_masters
  }
  if (level == 1)
    thr_bar->use_oncore_barrier = 1;
  else
    thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested

  // If the team size has increased, we still communicate with old leaves via
  // oncore barrier.
  unsigned short int old_leaf_kids = thr_bar->leaf_kids;
  kmp_uint64 old_leaf_state = thr_bar->leaf_state;
  team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
                                                       tid, team);
  // But if the entire team changes, we won't use oncore barrier at all
  if (team_change)
    old_leaf_kids = 0;

#if KMP_BARRIER_ICV_PUSH
  if (propagate_icvs) {
    __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
                             FALSE);
    if (KMP_MASTER_TID(
            tid)) { // master already has copy in final destination; copy
      copy_icvs(&thr_bar->th_fixed_icvs,
                &team->t.t_implicit_task_taskdata[tid].td_icvs);
    } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
               thr_bar->use_oncore_barrier) { // optimization for inf blocktime
      if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
        // leaves (on-core children) pull parent's fixed ICVs directly to local
        // ICV store
        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                  &thr_bar->parent_bar->th_fixed_icvs);
      // non-leaves will get ICVs piggybacked with b_go via NGO store
    } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
      if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can
        // access
        copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
      else // leaves copy parent's fixed ICVs directly to local ICV store
        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                  &thr_bar->parent_bar->th_fixed_icvs);
    }
  }
#endif // KMP_BARRIER_ICV_PUSH

  // Now, release my children
  if (thr_bar->my_level) { // not a leaf
    kmp_int32 child_tid;
    kmp_uint32 last;
    if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
        thr_bar->use_oncore_barrier) {
      if (KMP_MASTER_TID(tid)) { // do a flat release
        // Set local b_go to bump children via NGO store of the cache line
        // containing ICVs and b_go.
        thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
        // Use ngo stores if available; b_go piggybacks in the last 8 bytes of
        // the cache line
        ngo_load(&thr_bar->th_fixed_icvs);
        // This loops over all the threads skipping only the leaf nodes in the
        // hierarchy
        for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
             child_tid += thr_bar->skip_per_level[1]) {
          kmp_bstate_t *child_bar =
              &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
          KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
                        "releasing T#%d(%d:%d)"
                        " go(%p): %u => %u\n",
                        gtid, team->t.t_id, tid,
                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                        child_tid, &child_bar->b_go, child_bar->b_go,
                        child_bar->b_go + KMP_BARRIER_STATE_BUMP));
          // Use ngo store (if available) to both store ICVs and release child
          // via child's b_go
          ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
        }
        ngo_sync();
      }
      TCW_8(thr_bar->b_go,
            KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
      // Now, release leaf children
      if (thr_bar->leaf_kids) { // if there are any
        // We test team_change on the off-chance that the level 1 team changed.
        if (team_change ||
            old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
          if (old_leaf_kids) { // release old leaf kids
            thr_bar->b_go |= old_leaf_state;
          }
          // Release new leaf kids
          last = tid + thr_bar->skip_per_level[1];
          if (last > nproc)
            last = nproc;
          for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
               ++child_tid) { // skip_per_level[0]=1
            kmp_info_t *child_thr = team->t.t_threads[child_tid];
            kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
            KA_TRACE(
                20,
                ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
                 " T#%d(%d:%d) go(%p): %u => %u\n",
                 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
            // Release child using child's b_go flag
            ANNOTATE_BARRIER_BEGIN(child_thr);
            kmp_flag_64<> flag(&child_bar->b_go, child_thr);
            flag.release();
          }
        } else { // Release all children at once with leaf_state bits on my own
          // b_go flag
          thr_bar->b_go |= thr_bar->leaf_state;
        }
      }
    } else { // Blocktime is not infinite; do a simple hierarchical release
      for (int d = thr_bar->my_level - 1; d >= 0;
           --d) { // Release highest level threads first
        last = tid + thr_bar->skip_per_level[d + 1];
        kmp_uint32 skip = thr_bar->skip_per_level[d];
        if (last > nproc)
          last = nproc;
        for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
          kmp_info_t *child_thr = team->t.t_threads[child_tid];
          kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
          KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
                        "releasing T#%d(%d:%d) go(%p): %u => %u\n",
                        gtid, team->t.t_id, tid,
                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                        child_tid, &child_bar->b_go, child_bar->b_go,
                        child_bar->b_go + KMP_BARRIER_STATE_BUMP));
          // Release child using child's b_go flag
          ANNOTATE_BARRIER_BEGIN(child_thr);
          kmp_flag_64<> flag(&child_bar->b_go, child_thr);
          flag.release();
        }
      }
    }
#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs && !KMP_MASTER_TID(tid))
      // non-leaves copy ICVs from fixed ICVs to local dest
      copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                &thr_bar->th_fixed_icvs);
#endif // KMP_BARRIER_ICV_PUSH
  }
  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
                "barrier type %d\n",
                gtid, team->t.t_id, tid, bt));
}

// End of Barrier Algorithms

// type traits for cancellable value
// if cancellable is true, then is_cancellable is a normal boolean variable
// if cancellable is false, then is_cancellable is a compile time constant
template <bool cancellable> struct is_cancellable {};
template <> struct is_cancellable<true> {
  bool value;
  is_cancellable() : value(false) {}
  is_cancellable(bool b) : value(b) {}
  is_cancellable &operator=(bool b) {
    value = b;
    return *this;
  }
  operator bool() const { return value; }
};
template <> struct is_cancellable<false> {
  is_cancellable &operator=(bool b) { return *this; }
  constexpr operator bool() const { return false; }
};

// Internal function to do a barrier.
/* If is_split is true, do a split barrier, otherwise, do a plain barrier.
   If reduce is non-NULL, do a reduction barrier, otherwise, do a barrier
   without a reduction.
   When cancellable = false,
   Returns 0 if master thread, 1 if worker thread.
   When cancellable = true,
   Returns 0 if not cancelled, 1 if cancelled. */
template <bool cancellable = false>
static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
                                  size_t reduce_size, void *reduce_data,
                                  void (*reduce)(void *, void *)) {
  KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
  int tid = __kmp_tid_from_gtid(gtid);
  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team = this_thr->th.th_team;
  int status = 0;
  is_cancellable<cancellable> cancelled;
#if OMPT_SUPPORT && OMPT_OPTIONAL
  ompt_data_t *my_task_data;
  ompt_data_t *my_parallel_data;
  void *return_address;
  ompt_sync_region_t barrier_kind;
#endif

  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
                __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));

  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
#if OMPT_SUPPORT
  if (ompt_enabled.enabled) {
#if OMPT_OPTIONAL
    my_task_data = OMPT_CUR_TASK_DATA(this_thr);
    my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
    return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
    barrier_kind = __ompt_get_barrier_kind(bt, this_thr);
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
          return_address);
    }
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
          return_address);
    }
#endif
    // It is OK to report the barrier state after the barrier begin callback.
    // According to the OMPT specification, a compliant implementation may
    // even delay reporting this state until the barrier begins to wait.
    this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
  }
#endif

  if (!team->t.t_serialized) {
#if USE_ITT_BUILD
    // This value will be used in itt notify events below.
    void *itt_sync_obj = NULL;
#if USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
#endif
#endif /* USE_ITT_BUILD */
    if (__kmp_tasking_mode == tskm_extra_barrier) {
      __kmp_tasking_barrier(team, this_thr, gtid);
      KA_TRACE(15,
               ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
                __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
    }

    /* Copy the blocktime info to the thread, where __kmp_wait_template() can
       access it when the team struct is not guaranteed to exist. */
    // See note about the corresponding code in __kmp_join_barrier() being
    // performance-critical.
    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
#if KMP_USE_MONITOR
      this_thr->th.th_team_bt_intervals =
          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
      this_thr->th.th_team_bt_set =
          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
#else
      this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
#endif
    }

#if USE_ITT_BUILD
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      __kmp_itt_barrier_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
#if USE_DEBUGGER
    // Let the debugger know: the thread has arrived at the barrier and is
    // waiting.
    if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
      team->t.t_bar[bt].b_master_arrived += 1;
    } else {
      this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
    } // if
#endif /* USE_DEBUGGER */
    if (reduce != NULL) {
      // KMP_DEBUG_ASSERT( is_split == TRUE );  // #C69956
      this_thr->th.th_local.reduce_data = reduce_data;
    }

    if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
      // use 0 to only setup the current team if nthreads > 1
      __kmp_task_team_setup(this_thr, team, 0);

    if (cancellable) {
      cancelled = __kmp_linear_barrier_gather_cancellable(
          bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
    } else {
      switch (__kmp_barrier_gather_pattern[bt]) {
      case bp_hyper_bar: {
        // don't set branch bits to 0; use linear
        KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
        __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
                                   reduce USE_ITT_BUILD_ARG(itt_sync_obj));
        break;
      }
      case bp_hierarchical_bar: {
        __kmp_hierarchical_barrier_gather(
            bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
        break;
      }
      case bp_tree_bar: {
        // don't set branch bits to 0; use linear
        KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
        __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
                                  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
        break;
      }
      default: {
        __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
                                    reduce USE_ITT_BUILD_ARG(itt_sync_obj));
      }
      }
    }

    KMP_MB();

    if (KMP_MASTER_TID(tid)) {
      status = 0;
      if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
        __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
      }
#if USE_DEBUGGER
      // Let the debugger know: all threads have arrived and are starting to
      // leave the barrier.
      team->t.t_bar[bt].b_team_arrived += 1;
#endif

      if (__kmp_omp_cancellation) {
        kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
        // Reset cancellation flag for worksharing constructs
        if (cancel_request == cancel_loop ||
            cancel_request == cancel_sections) {
          KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
        }
      }
#if USE_ITT_BUILD
      /* TODO: In case of split reduction barrier, master thread may send
         acquired event early, before the final summation into the shared
         variable is done (final summation can be a long operation for array
         reductions). */
      if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
        __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier - report frame end (only if active_level == 1)
      if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
          __kmp_forkjoin_frames_mode &&
          (this_thr->th.th_teams_microtask == NULL || // either not in teams
           this_thr->th.th_teams_size.nteams == 1) && // or inside single team
          team->t.t_active_level == 1) {
        ident_t *loc = __kmp_threads[gtid]->th.th_ident;
        kmp_uint64 cur_time = __itt_get_timestamp();
        kmp_info_t **other_threads = team->t.t_threads;
        int nproc = this_thr->th.th_team_nproc;
        int i;
        switch (__kmp_forkjoin_frames_mode) {
        case 1:
          __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                                 loc, nproc);
          this_thr->th.th_frame_time = cur_time;
          break;
        case 2: // AC 2015-01-19: currently does not work for hierarchical (to
                // be fixed)
          __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
                                 1, loc, nproc);
          break;
        case 3:
          if (__itt_metadata_add_ptr) {
            // Initialize with master's wait time
            kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
            // Set arrive time to zero to be able to check it in
            // __kmp_invoke_task(); the same is done inside the loop below
            this_thr->th.th_bar_arrive_time = 0;
            for (i = 1; i < nproc; ++i) {
              delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
              other_threads[i]->th.th_bar_arrive_time = 0;
            }
            __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
                                         cur_time, delta,
                                         (kmp_uint64)(reduce != NULL));
          }
          __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                                 loc, nproc);
          this_thr->th.th_frame_time = cur_time;
          break;
        }
      }
#endif /* USE_ITT_BUILD */
    } else {
      status = 1;
#if USE_ITT_BUILD
      if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
        __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
    }
    if ((status == 1 || !is_split) && !cancelled) {
      if (cancellable) {
        cancelled = __kmp_linear_barrier_release_cancellable(
            bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
      } else {
        switch (__kmp_barrier_release_pattern[bt]) {
        case bp_hyper_bar: {
          KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
          __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
                                      FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          break;
        }
        case bp_hierarchical_bar: {
          __kmp_hierarchical_barrier_release(
              bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          break;
        }
        case bp_tree_bar: {
          KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
          __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
                                     FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          break;
        }
        default: {
          __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
                                       FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
        }
        }
      }
      if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
        __kmp_task_team_sync(this_thr, team);
      }
    }

#if USE_ITT_BUILD
    /* GEH: TODO: Move this under if-condition above and also include in
       __kmp_end_split_barrier(). This will more accurately represent the
       actual release time of the threads for split barriers. */
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      __kmp_itt_barrier_finished(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
  } else { // Team is serialized.
    status = 0;
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      if (this_thr->th.th_task_team != NULL) {
#if USE_ITT_NOTIFY
        void *itt_sync_obj = NULL;
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
          itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
          __kmp_itt_barrier_starting(gtid, itt_sync_obj);
        }
#endif

        KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks ==
                         TRUE);
        __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
        __kmp_task_team_setup(this_thr, team, 0);

#if USE_ITT_BUILD
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
          __kmp_itt_barrier_finished(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
      }
    }
  }
  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
                gtid, __kmp_team_from_gtid(gtid)->t.t_id,
                __kmp_tid_from_gtid(gtid), status));

#if OMPT_SUPPORT
  if (ompt_enabled.enabled) {
#if OMPT_OPTIONAL
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
          return_address);
    }
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
          return_address);
    }
#endif
    this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
  }
#endif
  ANNOTATE_BARRIER_END(&team->t.t_bar);

  if (cancellable)
    return (int)cancelled;
  return status;
}
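/* Illustrative sketch (not part of the runtime): how the is_cancellable<>
   trait above behaves for the two instantiations of __kmp_barrier_template().
   For cancellable == false the conversion operator is a constexpr that always
   yields false, so every "if (cancelled)" test in the non-cancellable
   instantiation is a compile-time constant and can be folded away; for
   cancellable == true the trait carries an ordinary runtime flag. */
#if 0
static_assert(!is_cancellable<false>(),
              "non-cancellable trait is always false");
static inline bool __kmp_demo_cancellable_flag() { // hypothetical helper
  is_cancellable<true> cancelled; // starts out false
  cancelled = true;               // records a runtime cancellation
  return (bool)cancelled;         // true
}
#endif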
// Returns 0 if master thread, 1 if worker thread.
int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
                  size_t reduce_size, void *reduce_data,
                  void (*reduce)(void *, void *)) {
  return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
                                  reduce);
}

#if defined(KMP_GOMP_COMPAT)
// Returns 1 if cancelled, 0 otherwise
int __kmp_barrier_gomp_cancel(int gtid) {
  if (__kmp_omp_cancellation) {
    int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE,
                                                 0, NULL, NULL);
    if (cancelled) {
      int tid = __kmp_tid_from_gtid(gtid);
      kmp_info_t *this_thr = __kmp_threads[gtid];
      if (KMP_MASTER_TID(tid)) {
        // Master does not need to revert anything
      } else {
        // Workers need to revert their private b_arrived flag
        this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -=
            KMP_BARRIER_STATE_BUMP;
      }
    }
    return cancelled;
  }
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
  return FALSE;
}
#endif

void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
  int tid = __kmp_tid_from_gtid(gtid);
  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team = this_thr->th.th_team;

  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
  if (!team->t.t_serialized) {
    if (KMP_MASTER_GTID(gtid)) {
      switch (__kmp_barrier_release_pattern[bt]) {
      case bp_hyper_bar: {
        KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
        __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
                                    FALSE USE_ITT_BUILD_ARG(NULL));
        break;
      }
      case bp_hierarchical_bar: {
        __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
                                           FALSE USE_ITT_BUILD_ARG(NULL));
        break;
      }
      case bp_tree_bar: {
        KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
        __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
                                   FALSE USE_ITT_BUILD_ARG(NULL));
        break;
      }
      default: {
        __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
                                     FALSE USE_ITT_BUILD_ARG(NULL));
      }
      }
      if (__kmp_tasking_mode != tskm_immediate_exec) {
        __kmp_task_team_sync(this_thr, team);
      } // if
    }
  }
  ANNOTATE_BARRIER_END(&team->t.t_bar);
}

void __kmp_join_barrier(int gtid) {
  KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team;
  kmp_uint nproc;
  kmp_info_t *master_thread;
  int tid;
#ifdef KMP_DEBUG
  int team_id;
#endif /* KMP_DEBUG */
#if USE_ITT_BUILD
  void *itt_sync_obj = NULL;
#if USE_ITT_NOTIFY
  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
    // Get object created at fork_barrier
    itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
#endif
#endif /* USE_ITT_BUILD */
  KMP_MB();

  // Get current info
  team = this_thr->th.th_team;
  nproc = this_thr->th.th_team_nproc;
  KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
  tid = __kmp_tid_from_gtid(gtid);
#ifdef KMP_DEBUG
  team_id = team->t.t_id;
#endif /* KMP_DEBUG */
  master_thread = this_thr->th.th_team_master;
#ifdef KMP_DEBUG
  if (master_thread != team->t.t_threads[0]) {
    __kmp_print_structure();
  }
#endif /* KMP_DEBUG */
  KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
  KMP_MB();

  // Verify state
  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
  KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
                gtid, team_id, tid));

  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
#if OMPT_SUPPORT
  if (ompt_enabled.enabled) {
#if OMPT_OPTIONAL
    ompt_data_t *my_task_data;
    ompt_data_t *my_parallel_data;
    void *codeptr = NULL;
    int ds_tid = this_thr->th.th_info.ds.ds_tid;
    if (KMP_MASTER_TID(ds_tid) &&
        (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
         ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
      codeptr = team->t.ompt_team_info.master_return_address;
    my_task_data = OMPT_CUR_TASK_DATA(this_thr);
    my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
          my_task_data, codeptr);
    }
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
          my_task_data, codeptr);
    }
    if (!KMP_MASTER_TID(ds_tid))
      this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
#endif
    this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier_implicit;
  }
#endif

  if (__kmp_tasking_mode == tskm_extra_barrier) {
    __kmp_tasking_barrier(team, this_thr, gtid);
    KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n",
                  gtid, team_id, tid));
  }
#ifdef KMP_DEBUG
  if (__kmp_tasking_mode != tskm_immediate_exec) {
    KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
                  "%p, th_task_team = %p\n",
                  __kmp_gtid_from_thread(this_thr), team_id,
                  team->t.t_task_team[this_thr->th.th_task_state],
                  this_thr->th.th_task_team));
    KMP_DEBUG_ASSERT(this_thr->th.th_task_team ==
                     team->t.t_task_team[this_thr->th.th_task_state]);
  }
#endif /* KMP_DEBUG */

  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
     access it when the team struct is not guaranteed to exist. Doing these
     loads causes a cache miss and slows down EPCC parallel by 2x. As a
     workaround, we do not perform the copy if blocktime=infinite, since the
     values are not used by __kmp_wait_template() in that case. */
  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
#if KMP_USE_MONITOR
    this_thr->th.th_team_bt_intervals =
        team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
    this_thr->th.th_team_bt_set =
        team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
#else
    this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
#endif
  }

#if USE_ITT_BUILD
  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
    __kmp_itt_barrier_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

  switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
  case bp_hyper_bar: {
    KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
    __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                               NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_hierarchical_bar: {
    __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                                      NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_tree_bar: {
    KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
    __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                              NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  default: {
    __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                                NULL USE_ITT_BUILD_ARG(itt_sync_obj));
  }
  }

  /* From this point on, the team data structure may be deallocated at any time
     by the master thread - it is unsafe to reference it in any of the worker
     threads. Any per-team data items that need to be referenced before the
     end of the barrier should be moved to the kmp_task_team_t structs. */
  if (KMP_MASTER_TID(tid)) {
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
    }
    if (__kmp_display_affinity) {
      KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
    }
#if KMP_STATS_ENABLED
    // Have the master thread flag the workers to indicate they are now waiting
    // for the next parallel region. Also wake them up so they switch their
    // timers to idle.
    for (int i = 0; i < team->t.t_nproc; ++i) {
      kmp_info_t *team_thread = team->t.t_threads[i];
      if (team_thread == this_thr)
        continue;
      team_thread->th.th_stats->setIdleFlag();
      if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
          team_thread->th.th_sleep_loc != NULL)
        __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread),
                                  team_thread->th.th_sleep_loc);
    }
#endif
#if USE_ITT_BUILD
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Join barrier - report frame end
    if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
        __kmp_forkjoin_frames_mode &&
        (this_thr->th.th_teams_microtask == NULL || // either not in teams
         this_thr->th.th_teams_size.nteams == 1) && // or inside single team
        team->t.t_active_level == 1) {
      kmp_uint64 cur_time = __itt_get_timestamp();
      ident_t *loc = team->t.t_ident;
      kmp_info_t **other_threads = team->t.t_threads;
      int nproc = this_thr->th.th_team_nproc;
      int i;
      switch (__kmp_forkjoin_frames_mode) {
      case 1:
        __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                               loc, nproc);
        break;
      case 2:
        __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
                               loc, nproc);
        break;
      case 3:
        if (__itt_metadata_add_ptr) {
          // Initialize with master's wait time
          kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
          // Set arrive time to zero to be able to check it in
          // __kmp_invoke_task(); the same is done inside the loop below
          this_thr->th.th_bar_arrive_time = 0;
          for (i = 1; i < nproc; ++i) {
            delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
            other_threads[i]->th.th_bar_arrive_time = 0;
          }
          __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
                                       cur_time, delta, 0);
        }
        __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                               loc, nproc);
        this_thr->th.th_frame_time = cur_time;
        break;
      }
    }
#endif /* USE_ITT_BUILD */
  }
#if USE_ITT_BUILD
  else {
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      __kmp_itt_barrier_middle(gtid, itt_sync_obj);
  }
#endif /* USE_ITT_BUILD */

#if KMP_DEBUG
  if (KMP_MASTER_TID(tid)) {
    KA_TRACE(
        15,
        ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
         gtid, team_id, tid, nproc));
  }
#endif /* KMP_DEBUG */

  // TODO now, mark worker threads as done so they may be disbanded
  KMP_MB(); // Flush all pending memory write invalidates.
  KA_TRACE(10,
           ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));

  ANNOTATE_BARRIER_END(&team->t.t_bar);
}

// TODO release worker threads' fork barriers as we are ready instead of all at
// once
void __kmp_fork_barrier(int gtid, int tid) {
  KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
#if USE_ITT_BUILD
  void *itt_sync_obj = NULL;
#endif /* USE_ITT_BUILD */
  if (team)
    ANNOTATE_BARRIER_END(&team->t.t_bar);
  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
                (team != NULL) ? team->t.t_id : -1, tid));

  // th_team pointer only valid for master thread here
  if (KMP_MASTER_TID(tid)) {
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
      // Create itt barrier object
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
      __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
    }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */

#ifdef KMP_DEBUG
    kmp_info_t **other_threads = team->t.t_threads;
    int i;

    // Verify state
    KMP_MB();

    for (i = 1; i < team->t.t_nproc; ++i) {
      KA_TRACE(500,
               ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
                "== %u.\n",
                gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
                team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
                other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
      KMP_DEBUG_ASSERT(
          (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
           ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
      KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
    }
#endif

    if (__kmp_tasking_mode != tskm_immediate_exec) {
      // 0 indicates setup current task team if nthreads > 1
      __kmp_task_team_setup(this_thr, team, 0);
    }

    /* The master thread may have changed its blocktime between the join
       barrier and the fork barrier. Copy the blocktime info to the thread,
       where __kmp_wait_template() can access it when the team struct is not
       guaranteed to exist. */
    // See note about the corresponding code in __kmp_join_barrier() being
    // performance-critical
    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
#if KMP_USE_MONITOR
      this_thr->th.th_team_bt_intervals =
          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
      this_thr->th.th_team_bt_set =
          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
#else
      this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
#endif
    }
  } // master

  switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
  case bp_hyper_bar: {
    KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
    __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                                TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_hierarchical_bar: {
    __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                                       TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_tree_bar: {
    KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
    __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                               TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  default: {
    __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                                 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
  }
  }

#if OMPT_SUPPORT
  if (ompt_enabled.enabled &&
      this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
    int ds_tid = this_thr->th.th_info.ds.ds_tid;
    ompt_data_t *task_data = (team)
                                 ? OMPT_CUR_TASK_DATA(this_thr)
                                 : &(this_thr->th.ompt_thread_info.task_data);
    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
#if OMPT_OPTIONAL
    void *codeptr = NULL;
    if (KMP_MASTER_TID(ds_tid) &&
        (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
         ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
      codeptr = team->t.ompt_team_info.master_return_address;
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
          codeptr);
    }
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
          codeptr);
    }
#endif
    if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_end, NULL, task_data, 0, ds_tid,
          ompt_task_implicit); // TODO: Can this be ompt_task_initial?
    }
  }
#endif

  // Early exit for reaping threads releasing forkjoin barrier
  if (TCR_4(__kmp_global.g.g_done)) {
    this_thr->th.th_task_team = NULL;

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
      if (!KMP_MASTER_TID(tid)) {
        itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
        if (itt_sync_obj)
          __kmp_itt_barrier_finished(gtid, itt_sync_obj);
      }
    }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
    KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
    return;
  }

  /* We can now assume that a valid team structure has been allocated by the
     master and propagated to all worker threads. The current thread, however,
     may not be part of the team, so we can't blindly assume that the team
     pointer is non-null. */
  team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
  KMP_DEBUG_ASSERT(team != NULL);
  tid = __kmp_tid_from_gtid(gtid);

#if KMP_BARRIER_ICV_PULL
  /* Master thread's copy of the ICVs was set up on the implicit taskdata in
     __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
     implicit task has this data before this function is called. We cannot
     modify __kmp_fork_call() to look at the fixed ICVs in the master's thread
     struct, because it is not always the case that the threads arrays have
     been allocated when __kmp_fork_call() is executed. */
  {
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
    if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
      // Copy the initial ICVs from the master's thread struct to the implicit
      // task for this tid.
      KA_TRACE(10,
               ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
      __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
                               tid, FALSE);
      copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                &team->t.t_threads[0]
                     ->th.th_bar[bs_forkjoin_barrier]
                     .bb.th_fixed_icvs);
    }
  }
#endif // KMP_BARRIER_ICV_PULL

  if (__kmp_tasking_mode != tskm_immediate_exec) {
    __kmp_task_team_sync(this_thr, team);
  }

#if KMP_AFFINITY_SUPPORTED
  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
  if (proc_bind == proc_bind_intel) {
    // Call dynamic affinity settings
    if (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
      __kmp_balanced_affinity(this_thr, team->t.t_nproc);
    }
  } else if (proc_bind != proc_bind_false) {
    if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
      KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
                     __kmp_gtid_from_thread(this_thr),
                     this_thr->th.th_current_place));
    } else {
      __kmp_affinity_set_place(gtid);
    }
  }
#endif // KMP_AFFINITY_SUPPORTED
  // Perform the display affinity functionality
  if (__kmp_display_affinity) {
    if (team->t.t_display_affinity
#if KMP_AFFINITY_SUPPORTED
        || (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed)
#endif
    ) {
      // NULL means use the affinity-format-var ICV
      __kmp_aux_display_affinity(gtid, NULL);
      this_thr->th.th_prev_num_threads = team->t.t_nproc;
      this_thr->th.th_prev_level = team->t.t_level;
    }
  }
  if (!KMP_MASTER_TID(tid))
    KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
    if (!KMP_MASTER_TID(tid)) {
      // Get correct barrier object
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
    } // (prepare called inside barrier_release)
  }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
  ANNOTATE_BARRIER_END(&team->t.t_bar);
  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
                team->t.t_id, tid));
}

void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
                          kmp_internal_control_t *new_icvs, ident_t *loc) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);

  KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);

  /* Master thread's copy of the ICVs was set up on the implicit taskdata in
     __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
     implicit task has this data before this function is called. */
#if KMP_BARRIER_ICV_PULL
  /* Copy the ICVs into th_fixed_icvs in the master thread's structure (which
     remains untouched), where all of the worker threads can access them and
     make their own copies after the barrier. */
  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
  // allocated at this point
  copy_icvs(
      &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
      new_icvs);
  KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
                team->t.t_threads[0], team));
#elif KMP_BARRIER_ICV_PUSH
  // The ICVs will be propagated in the fork barrier, so nothing needs to be
  // done here.
  KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
                team->t.t_threads[0], team));
#else
  // Copy the ICVs to each of the non-master threads. This takes O(nthreads)
  // time.
  ngo_load(new_icvs);
  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
  // allocated at this point
  for (int f = 1; f < new_nproc; ++f) { // Skip the master thread
    // TODO: GEH - pass in better source location info since usually NULL here
    KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                  f, team->t.t_threads[f], team));
    __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
    ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
    KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                  f, team->t.t_threads[f], team));
  }
  ngo_sync();
#endif // KMP_BARRIER_ICV_PULL
}
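
/* Illustrative sketch (not taken from the runtime): the intended pairing of
   __kmp_barrier() with is_split=TRUE and __kmp_end_split_barrier(). Per the
   logic in __kmp_barrier_template(), the master (return value 0) skips the
   release phase and returns as soon as the gather completes, while workers
   (return value 1) remain parked in the release phase; a later call to
   __kmp_end_split_barrier() by the master performs the release and lets them
   go. The caller below is hypothetical. */
#if 0
static void __kmp_demo_split_barrier_caller(int gtid) { // hypothetical helper
  if (__kmp_barrier(bs_plain_barrier, gtid, TRUE /* is_split */, 0, NULL,
                    NULL) == 0) {
    // Master only: all team members have gathered; do serialized work here
    // while the workers wait in the release phase, then let them continue.
    __kmp_end_split_barrier(bs_plain_barrier, gtid);
  }
  // Workers reach this point only after the master's end-split release.
}
#endif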