1 /* 2 * kmp_barrier.cpp 3 */ 4 5 6 //===----------------------------------------------------------------------===// 7 // 8 // The LLVM Compiler Infrastructure 9 // 10 // This file is dual licensed under the MIT and the University of Illinois Open 11 // Source Licenses. See LICENSE.txt for details. 12 // 13 //===----------------------------------------------------------------------===// 14 15 16 #include "kmp.h" 17 #include "kmp_wait_release.h" 18 #include "kmp_itt.h" 19 #include "kmp_os.h" 20 #include "kmp_stats.h" 21 22 23 #if KMP_MIC 24 #include <immintrin.h> 25 #define USE_NGO_STORES 1 26 #endif // KMP_MIC 27 28 #include "tsan_annotations.h" 29 30 #if KMP_MIC && USE_NGO_STORES 31 // ICV copying 32 #define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src)) 33 #define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt) 34 #define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt) 35 #define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory") 36 #else 37 #define ngo_load(src) ((void)0) 38 #define ngo_store_icvs(dst, src) copy_icvs((dst), (src)) 39 #define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE) 40 #define ngo_sync() ((void)0) 41 #endif /* KMP_MIC && USE_NGO_STORES */ 42 43 void __kmp_print_structure(void); // Forward declaration 44 45 // ---------------------------- Barrier Algorithms ---------------------------- 46 47 // Linear Barrier 48 static void __kmp_linear_barrier_gather( 49 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, 50 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) { 51 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather); 52 register kmp_team_t *team = this_thr->th.th_team; 53 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; 54 register kmp_info_t **other_threads = team->t.t_threads; 55 56 KA_TRACE( 57 20, 58 ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n", 59 gtid, team->t.t_id, tid, bt)); 60 
KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]); 61 62 #if USE_ITT_BUILD && USE_ITT_NOTIFY 63 // Barrier imbalance - save arrive time to the thread 64 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) { 65 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = 66 __itt_get_timestamp(); 67 } 68 #endif 69 // We now perform a linear reduction to signal that all of the threads have 70 // arrived. 71 if (!KMP_MASTER_TID(tid)) { 72 KA_TRACE(20, 73 ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)" 74 "arrived(%p): %llu => %llu\n", 75 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team), 76 team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived, 77 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP)); 78 // Mark arrival to master thread 79 /* After performing this write, a worker thread may not assume that the team 80 is valid any more - it could be deallocated by the master thread at any 81 time. */ 82 ANNOTATE_BARRIER_BEGIN(this_thr); 83 kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]); 84 flag.release(); 85 } else { 86 register kmp_balign_team_t *team_bar = &team->t.t_bar[bt]; 87 register int nproc = this_thr->th.th_team_nproc; 88 register int i; 89 // Don't have to worry about sleep bit here or atomic since team setting 90 register kmp_uint64 new_state = 91 team_bar->b_arrived + KMP_BARRIER_STATE_BUMP; 92 93 // Collect all the worker team member threads. 
94 for (i = 1; i < nproc; ++i) { 95 #if KMP_CACHE_MANAGE 96 // Prefetch next thread's arrived count 97 if (i + 1 < nproc) 98 KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived); 99 #endif /* KMP_CACHE_MANAGE */ 100 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) " 101 "arrived(%p) == %llu\n", 102 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team), 103 team->t.t_id, i, 104 &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state)); 105 106 // Wait for worker thread to arrive 107 kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived, 108 new_state); 109 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); 110 ANNOTATE_BARRIER_END(other_threads[i]); 111 #if USE_ITT_BUILD && USE_ITT_NOTIFY 112 // Barrier imbalance - write min of the thread time and the other thread 113 // time to the thread. 114 if (__kmp_forkjoin_frames_mode == 2) { 115 this_thr->th.th_bar_min_time = KMP_MIN( 116 this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time); 117 } 118 #endif 119 if (reduce) { 120 KA_TRACE(100, 121 ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", 122 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team), 123 team->t.t_id, i)); 124 ANNOTATE_REDUCE_AFTER(reduce); 125 (*reduce)(this_thr->th.th_local.reduce_data, 126 other_threads[i]->th.th_local.reduce_data); 127 ANNOTATE_REDUCE_BEFORE(reduce); 128 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar); 129 } 130 } 131 // Don't have to worry about sleep bit here or atomic since team setting 132 team_bar->b_arrived = new_state; 133 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d " 134 "arrived(%p) = %llu\n", 135 gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived, 136 new_state)); 137 } 138 KA_TRACE( 139 20, 140 ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n", 141 gtid, team->t.t_id, tid, bt)); 142 } 143 144 static void __kmp_linear_barrier_release( 145 enum barrier_type bt, kmp_info_t *this_thr, int gtid, 
int tid, 146 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) { 147 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release); 148 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; 149 register kmp_team_t *team; 150 151 if (KMP_MASTER_TID(tid)) { 152 register unsigned int i; 153 register kmp_uint32 nproc = this_thr->th.th_team_nproc; 154 register kmp_info_t **other_threads; 155 156 team = __kmp_threads[gtid]->th.th_team; 157 KMP_DEBUG_ASSERT(team != NULL); 158 other_threads = team->t.t_threads; 159 160 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for " 161 "barrier type %d\n", 162 gtid, team->t.t_id, tid, bt)); 163 164 if (nproc > 1) { 165 #if KMP_BARRIER_ICV_PUSH 166 { 167 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy); 168 if (propagate_icvs) { 169 ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs); 170 for (i = 1; i < nproc; ++i) { 171 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i], 172 team, i, FALSE); 173 ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs, 174 &team->t.t_implicit_task_taskdata[0].td_icvs); 175 } 176 ngo_sync(); 177 } 178 } 179 #endif // KMP_BARRIER_ICV_PUSH 180 181 // Now, release all of the worker threads 182 for (i = 1; i < nproc; ++i) { 183 #if KMP_CACHE_MANAGE 184 // Prefetch next thread's go flag 185 if (i + 1 < nproc) 186 KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go); 187 #endif /* KMP_CACHE_MANAGE */ 188 KA_TRACE( 189 20, 190 ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) " 191 "go(%p): %u => %u\n", 192 gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid, 193 team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go, 194 other_threads[i]->th.th_bar[bt].bb.b_go, 195 other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP)); 196 ANNOTATE_BARRIER_BEGIN(other_threads[i]); 197 kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go, 198 other_threads[i]); 199 flag.release(); 200 } 201 } 202 } else { // 
Wait for the MASTER thread to release us 203 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n", 204 gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP)); 205 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); 206 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); 207 ANNOTATE_BARRIER_END(this_thr); 208 #if USE_ITT_BUILD && USE_ITT_NOTIFY 209 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) { 210 // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is 211 // disabled) 212 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1); 213 // Cancel wait on previous parallel region... 214 __kmp_itt_task_starting(itt_sync_obj); 215 216 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) 217 return; 218 219 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); 220 if (itt_sync_obj != NULL) 221 // Call prepare as early as possible for "new" barrier 222 __kmp_itt_task_finished(itt_sync_obj); 223 } else 224 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ 225 // Early exit for reaping threads releasing forkjoin barrier 226 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) 227 return; 228 // The worker thread may now assume that the team is valid. 229 #ifdef KMP_DEBUG 230 tid = __kmp_tid_from_gtid(gtid); 231 team = __kmp_threads[gtid]->th.th_team; 232 #endif 233 KMP_DEBUG_ASSERT(team != NULL); 234 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE); 235 KA_TRACE(20, 236 ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", 237 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE)); 238 KMP_MB(); // Flush all pending memory write invalidates. 
239 } 240 KA_TRACE( 241 20, 242 ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n", 243 gtid, team->t.t_id, tid, bt)); 244 } 245 246 // Tree barrier 247 static void 248 __kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, 249 int tid, void (*reduce)(void *, void *) 250 USE_ITT_BUILD_ARG(void *itt_sync_obj)) { 251 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather); 252 register kmp_team_t *team = this_thr->th.th_team; 253 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; 254 register kmp_info_t **other_threads = team->t.t_threads; 255 register kmp_uint32 nproc = this_thr->th.th_team_nproc; 256 register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt]; 257 register kmp_uint32 branch_factor = 1 << branch_bits; 258 register kmp_uint32 child; 259 register kmp_uint32 child_tid; 260 register kmp_uint64 new_state; 261 262 KA_TRACE( 263 20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n", 264 gtid, team->t.t_id, tid, bt)); 265 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]); 266 267 #if USE_ITT_BUILD && USE_ITT_NOTIFY 268 // Barrier imbalance - save arrive time to the thread 269 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) { 270 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = 271 __itt_get_timestamp(); 272 } 273 #endif 274 // Perform tree gather to wait until all threads have arrived; reduce any 275 // required data as we go 276 child_tid = (tid << branch_bits) + 1; 277 if (child_tid < nproc) { 278 // Parent threads wait for all their children to arrive 279 new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP; 280 child = 1; 281 do { 282 register kmp_info_t *child_thr = other_threads[child_tid]; 283 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; 284 #if KMP_CACHE_MANAGE 285 // Prefetch next thread's arrived count 286 if (child + 1 <= branch_factor && child_tid + 1 < nproc) 287 
KMP_CACHE_PREFETCH( 288 &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived); 289 #endif /* KMP_CACHE_MANAGE */ 290 KA_TRACE(20, 291 ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) " 292 "arrived(%p) == %llu\n", 293 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), 294 team->t.t_id, child_tid, &child_bar->b_arrived, new_state)); 295 // Wait for child to arrive 296 kmp_flag_64 flag(&child_bar->b_arrived, new_state); 297 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); 298 ANNOTATE_BARRIER_END(child_thr); 299 #if USE_ITT_BUILD && USE_ITT_NOTIFY 300 // Barrier imbalance - write min of the thread time and a child time to 301 // the thread. 302 if (__kmp_forkjoin_frames_mode == 2) { 303 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time, 304 child_thr->th.th_bar_min_time); 305 } 306 #endif 307 if (reduce) { 308 KA_TRACE(100, 309 ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n", 310 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), 311 team->t.t_id, child_tid)); 312 ANNOTATE_REDUCE_AFTER(reduce); 313 (*reduce)(this_thr->th.th_local.reduce_data, 314 child_thr->th.th_local.reduce_data); 315 ANNOTATE_REDUCE_BEFORE(reduce); 316 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar); 317 } 318 child++; 319 child_tid++; 320 } while (child <= branch_factor && child_tid < nproc); 321 } 322 323 if (!KMP_MASTER_TID(tid)) { // Worker threads 324 register kmp_int32 parent_tid = (tid - 1) >> branch_bits; 325 326 KA_TRACE(20, 327 ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) " 328 "arrived(%p): %llu => %llu\n", 329 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team), 330 team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived, 331 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP)); 332 333 // Mark arrival to parent thread 334 /* After performing this write, a worker thread may not assume that the team 335 is valid any more - it could be deallocated by the master thread at any 336 time. 
*/ 337 ANNOTATE_BARRIER_BEGIN(this_thr); 338 kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]); 339 flag.release(); 340 } else { 341 // Need to update the team arrived pointer if we are the master thread 342 if (nproc > 1) // New value was already computed above 343 team->t.t_bar[bt].b_arrived = new_state; 344 else 345 team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP; 346 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d " 347 "arrived(%p) = %llu\n", 348 gtid, team->t.t_id, tid, team->t.t_id, 349 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived)); 350 } 351 KA_TRACE(20, 352 ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n", 353 gtid, team->t.t_id, tid, bt)); 354 } 355 356 static void __kmp_tree_barrier_release( 357 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, 358 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) { 359 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release); 360 register kmp_team_t *team; 361 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; 362 register kmp_uint32 nproc; 363 register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt]; 364 register kmp_uint32 branch_factor = 1 << branch_bits; 365 register kmp_uint32 child; 366 register kmp_uint32 child_tid; 367 368 // Perform a tree release for all of the threads that have been gathered 369 if (!KMP_MASTER_TID( 370 tid)) { // Handle fork barrier workers who aren't part of a team yet 371 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid, 372 &thr_bar->b_go, KMP_BARRIER_STATE_BUMP)); 373 // Wait for parent thread to release us 374 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); 375 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); 376 ANNOTATE_BARRIER_END(this_thr); 377 #if USE_ITT_BUILD && USE_ITT_NOTIFY 378 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) { 379 // In fork barrier where we could not get the object reliably 
(or 380 // ITTNOTIFY is disabled) 381 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1); 382 // Cancel wait on previous parallel region... 383 __kmp_itt_task_starting(itt_sync_obj); 384 385 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) 386 return; 387 388 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); 389 if (itt_sync_obj != NULL) 390 // Call prepare as early as possible for "new" barrier 391 __kmp_itt_task_finished(itt_sync_obj); 392 } else 393 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ 394 // Early exit for reaping threads releasing forkjoin barrier 395 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) 396 return; 397 398 // The worker thread may now assume that the team is valid. 399 team = __kmp_threads[gtid]->th.th_team; 400 KMP_DEBUG_ASSERT(team != NULL); 401 tid = __kmp_tid_from_gtid(gtid); 402 403 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE); 404 KA_TRACE(20, 405 ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid, 406 team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE)); 407 KMP_MB(); // Flush all pending memory write invalidates. 
408 } else { 409 team = __kmp_threads[gtid]->th.th_team; 410 KMP_DEBUG_ASSERT(team != NULL); 411 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for " 412 "barrier type %d\n", 413 gtid, team->t.t_id, tid, bt)); 414 } 415 nproc = this_thr->th.th_team_nproc; 416 child_tid = (tid << branch_bits) + 1; 417 418 if (child_tid < nproc) { 419 register kmp_info_t **other_threads = team->t.t_threads; 420 child = 1; 421 // Parent threads release all their children 422 do { 423 register kmp_info_t *child_thr = other_threads[child_tid]; 424 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; 425 #if KMP_CACHE_MANAGE 426 // Prefetch next thread's go count 427 if (child + 1 <= branch_factor && child_tid + 1 < nproc) 428 KMP_CACHE_PREFETCH( 429 &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go); 430 #endif /* KMP_CACHE_MANAGE */ 431 432 #if KMP_BARRIER_ICV_PUSH 433 { 434 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy); 435 if (propagate_icvs) { 436 __kmp_init_implicit_task(team->t.t_ident, 437 team->t.t_threads[child_tid], team, 438 child_tid, FALSE); 439 copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs, 440 &team->t.t_implicit_task_taskdata[0].td_icvs); 441 } 442 } 443 #endif // KMP_BARRIER_ICV_PUSH 444 KA_TRACE(20, 445 ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)" 446 "go(%p): %u => %u\n", 447 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), 448 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go, 449 child_bar->b_go + KMP_BARRIER_STATE_BUMP)); 450 // Release child from barrier 451 ANNOTATE_BARRIER_BEGIN(child_thr); 452 kmp_flag_64 flag(&child_bar->b_go, child_thr); 453 flag.release(); 454 child++; 455 child_tid++; 456 } while (child <= branch_factor && child_tid < nproc); 457 } 458 KA_TRACE( 459 20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n", 460 gtid, team->t.t_id, tid, bt)); 461 } 462 463 // Hyper Barrier 464 static void 465 __kmp_hyper_barrier_gather(enum 
barrier_type bt, kmp_info_t *this_thr, int gtid, 466 int tid, void (*reduce)(void *, void *) 467 USE_ITT_BUILD_ARG(void *itt_sync_obj)) { 468 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather); 469 register kmp_team_t *team = this_thr->th.th_team; 470 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; 471 register kmp_info_t **other_threads = team->t.t_threads; 472 register kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE; 473 register kmp_uint32 num_threads = this_thr->th.th_team_nproc; 474 register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt]; 475 register kmp_uint32 branch_factor = 1 << branch_bits; 476 register kmp_uint32 offset; 477 register kmp_uint32 level; 478 479 KA_TRACE( 480 20, 481 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n", 482 gtid, team->t.t_id, tid, bt)); 483 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]); 484 485 #if USE_ITT_BUILD && USE_ITT_NOTIFY 486 // Barrier imbalance - save arrive time to the thread 487 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) { 488 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = 489 __itt_get_timestamp(); 490 } 491 #endif 492 /* Perform a hypercube-embedded tree gather to wait until all of the threads 493 have arrived, and reduce any required data as we go. 
*/ 494 kmp_flag_64 p_flag(&thr_bar->b_arrived); 495 for (level = 0, offset = 1; offset < num_threads; 496 level += branch_bits, offset <<= branch_bits) { 497 register kmp_uint32 child; 498 register kmp_uint32 child_tid; 499 500 if (((tid >> level) & (branch_factor - 1)) != 0) { 501 register kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1); 502 503 KA_TRACE(20, 504 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) " 505 "arrived(%p): %llu => %llu\n", 506 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team), 507 team->t.t_id, parent_tid, &thr_bar->b_arrived, 508 thr_bar->b_arrived, 509 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP)); 510 // Mark arrival to parent thread 511 /* After performing this write (in the last iteration of the enclosing for 512 loop), a worker thread may not assume that the team is valid any more 513 - it could be deallocated by the master thread at any time. */ 514 ANNOTATE_BARRIER_BEGIN(this_thr); 515 p_flag.set_waiter(other_threads[parent_tid]); 516 p_flag.release(); 517 break; 518 } 519 520 // Parent threads wait for children to arrive 521 if (new_state == KMP_BARRIER_UNUSED_STATE) 522 new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP; 523 for (child = 1, child_tid = tid + (1 << level); 524 child < branch_factor && child_tid < num_threads; 525 child++, child_tid += (1 << level)) { 526 register kmp_info_t *child_thr = other_threads[child_tid]; 527 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; 528 #if KMP_CACHE_MANAGE 529 register kmp_uint32 next_child_tid = child_tid + (1 << level); 530 // Prefetch next thread's arrived count 531 if (child + 1 < branch_factor && next_child_tid < num_threads) 532 KMP_CACHE_PREFETCH( 533 &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived); 534 #endif /* KMP_CACHE_MANAGE */ 535 KA_TRACE(20, 536 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) " 537 "arrived(%p) == %llu\n", 538 gtid, team->t.t_id, tid, 
__kmp_gtid_from_tid(child_tid, team), 539 team->t.t_id, child_tid, &child_bar->b_arrived, new_state)); 540 // Wait for child to arrive 541 kmp_flag_64 c_flag(&child_bar->b_arrived, new_state); 542 c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); 543 ANNOTATE_BARRIER_END(child_thr); 544 #if USE_ITT_BUILD && USE_ITT_NOTIFY 545 // Barrier imbalance - write min of the thread time and a child time to 546 // the thread. 547 if (__kmp_forkjoin_frames_mode == 2) { 548 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time, 549 child_thr->th.th_bar_min_time); 550 } 551 #endif 552 if (reduce) { 553 KA_TRACE(100, 554 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n", 555 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), 556 team->t.t_id, child_tid)); 557 ANNOTATE_REDUCE_AFTER(reduce); 558 (*reduce)(this_thr->th.th_local.reduce_data, 559 child_thr->th.th_local.reduce_data); 560 ANNOTATE_REDUCE_BEFORE(reduce); 561 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar); 562 } 563 } 564 } 565 566 if (KMP_MASTER_TID(tid)) { 567 // Need to update the team arrived pointer if we are the master thread 568 if (new_state == KMP_BARRIER_UNUSED_STATE) 569 team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP; 570 else 571 team->t.t_bar[bt].b_arrived = new_state; 572 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d " 573 "arrived(%p) = %llu\n", 574 gtid, team->t.t_id, tid, team->t.t_id, 575 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived)); 576 } 577 KA_TRACE( 578 20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n", 579 gtid, team->t.t_id, tid, bt)); 580 } 581 582 // The reverse versions seem to beat the forward versions overall 583 #define KMP_REVERSE_HYPER_BAR 584 static void __kmp_hyper_barrier_release( 585 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, 586 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) { 587 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release); 588 register 
kmp_team_t *team; 589 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; 590 register kmp_info_t **other_threads; 591 register kmp_uint32 num_threads; 592 register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt]; 593 register kmp_uint32 branch_factor = 1 << branch_bits; 594 register kmp_uint32 child; 595 register kmp_uint32 child_tid; 596 register kmp_uint32 offset; 597 register kmp_uint32 level; 598 599 /* Perform a hypercube-embedded tree release for all of the threads that have 600 been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads 601 are released in the reverse order of the corresponding gather, otherwise 602 threads are released in the same order. */ 603 if (KMP_MASTER_TID(tid)) { // master 604 team = __kmp_threads[gtid]->th.th_team; 605 KMP_DEBUG_ASSERT(team != NULL); 606 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for " 607 "barrier type %d\n", 608 gtid, team->t.t_id, tid, bt)); 609 #if KMP_BARRIER_ICV_PUSH 610 if (propagate_icvs) { // master already has ICVs in final destination; copy 611 copy_icvs(&thr_bar->th_fixed_icvs, 612 &team->t.t_implicit_task_taskdata[tid].td_icvs); 613 } 614 #endif 615 } else { // Handle fork barrier workers who aren't part of a team yet 616 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid, 617 &thr_bar->b_go, KMP_BARRIER_STATE_BUMP)); 618 // Wait for parent thread to release us 619 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); 620 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); 621 ANNOTATE_BARRIER_END(this_thr); 622 #if USE_ITT_BUILD && USE_ITT_NOTIFY 623 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) { 624 // In fork barrier where we could not get the object reliably 625 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1); 626 // Cancel wait on previous parallel region... 
627 __kmp_itt_task_starting(itt_sync_obj); 628 629 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) 630 return; 631 632 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); 633 if (itt_sync_obj != NULL) 634 // Call prepare as early as possible for "new" barrier 635 __kmp_itt_task_finished(itt_sync_obj); 636 } else 637 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ 638 // Early exit for reaping threads releasing forkjoin barrier 639 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) 640 return; 641 642 // The worker thread may now assume that the team is valid. 643 team = __kmp_threads[gtid]->th.th_team; 644 KMP_DEBUG_ASSERT(team != NULL); 645 tid = __kmp_tid_from_gtid(gtid); 646 647 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE); 648 KA_TRACE(20, 649 ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", 650 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE)); 651 KMP_MB(); // Flush all pending memory write invalidates. 652 } 653 num_threads = this_thr->th.th_team_nproc; 654 other_threads = team->t.t_threads; 655 656 #ifdef KMP_REVERSE_HYPER_BAR 657 // Count up to correct level for parent 658 for (level = 0, offset = 1; 659 offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0); 660 level += branch_bits, offset <<= branch_bits) 661 ; 662 663 // Now go down from there 664 for (level -= branch_bits, offset >>= branch_bits; offset != 0; 665 level -= branch_bits, offset >>= branch_bits) 666 #else 667 // Go down the tree, level by level 668 for (level = 0, offset = 1; offset < num_threads; 669 level += branch_bits, offset <<= branch_bits) 670 #endif // KMP_REVERSE_HYPER_BAR 671 { 672 #ifdef KMP_REVERSE_HYPER_BAR 673 /* Now go in reverse order through the children, highest to lowest. 674 Initial setting of child is conservative here. */ 675 child = num_threads >> ((level == 0) ? level : level - 1); 676 for (child = (child < branch_factor - 1) ? 
child : branch_factor - 1, 677 child_tid = tid + (child << level); 678 child >= 1; child--, child_tid -= (1 << level)) 679 #else 680 if (((tid >> level) & (branch_factor - 1)) != 0) 681 // No need to go lower than this, since this is the level parent would be 682 // notified 683 break; 684 // Iterate through children on this level of the tree 685 for (child = 1, child_tid = tid + (1 << level); 686 child < branch_factor && child_tid < num_threads; 687 child++, child_tid += (1 << level)) 688 #endif // KMP_REVERSE_HYPER_BAR 689 { 690 if (child_tid >= num_threads) 691 continue; // Child doesn't exist so keep going 692 else { 693 register kmp_info_t *child_thr = other_threads[child_tid]; 694 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; 695 #if KMP_CACHE_MANAGE 696 register kmp_uint32 next_child_tid = child_tid - (1 << level); 697 // Prefetch next thread's go count 698 #ifdef KMP_REVERSE_HYPER_BAR 699 if (child - 1 >= 1 && next_child_tid < num_threads) 700 #else 701 if (child + 1 < branch_factor && next_child_tid < num_threads) 702 #endif // KMP_REVERSE_HYPER_BAR 703 KMP_CACHE_PREFETCH( 704 &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go); 705 #endif /* KMP_CACHE_MANAGE */ 706 707 #if KMP_BARRIER_ICV_PUSH 708 if (propagate_icvs) // push my fixed ICVs to my child 709 copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs); 710 #endif // KMP_BARRIER_ICV_PUSH 711 712 KA_TRACE( 713 20, 714 ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)" 715 "go(%p): %u => %u\n", 716 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), 717 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go, 718 child_bar->b_go + KMP_BARRIER_STATE_BUMP)); 719 // Release child from barrier 720 ANNOTATE_BARRIER_BEGIN(child_thr); 721 kmp_flag_64 flag(&child_bar->b_go, child_thr); 722 flag.release(); 723 } 724 } 725 } 726 #if KMP_BARRIER_ICV_PUSH 727 if (propagate_icvs && 728 !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest 729 
__kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, 730 FALSE); 731 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, 732 &thr_bar->th_fixed_icvs); 733 } 734 #endif 735 KA_TRACE( 736 20, 737 ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n", 738 gtid, team->t.t_id, tid, bt)); 739 } 740 741 // Hierarchical Barrier 742 743 // Initialize thread barrier data 744 /* Initializes/re-initializes the hierarchical barrier data stored on a thread. 745 Performs the minimum amount of initialization required based on how the team 746 has changed. Returns true if leaf children will require both on-core and 747 traditional wake-up mechanisms. For example, if the team size increases, 748 threads already in the team will respond to on-core wakeup on their parent 749 thread, but threads newly added to the team will only be listening on the 750 their local b_go. */ 751 static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt, 752 kmp_bstate_t *thr_bar, 753 kmp_uint32 nproc, int gtid, 754 int tid, kmp_team_t *team) { 755 // Checks to determine if (re-)initialization is needed 756 bool uninitialized = thr_bar->team == NULL; 757 bool team_changed = team != thr_bar->team; 758 bool team_sz_changed = nproc != thr_bar->nproc; 759 bool tid_changed = tid != thr_bar->old_tid; 760 bool retval = false; 761 762 if (uninitialized || team_sz_changed) { 763 __kmp_get_hierarchy(nproc, thr_bar); 764 } 765 766 if (uninitialized || team_sz_changed || tid_changed) { 767 thr_bar->my_level = thr_bar->depth - 1; // default for master 768 thr_bar->parent_tid = -1; // default for master 769 if (!KMP_MASTER_TID( 770 tid)) { // if not master, find parent thread in hierarchy 771 kmp_uint32 d = 0; 772 while (d < thr_bar->depth) { // find parent based on level of thread in 773 // hierarchy, and note level 774 kmp_uint32 rem; 775 if (d == thr_bar->depth - 2) { // reached level right below the master 776 thr_bar->parent_tid = 0; 777 
thr_bar->my_level = d; 778 break; 779 } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) != 780 0) { // TODO: can we make this op faster? 781 // thread is not a subtree root at next level, so this is max 782 thr_bar->parent_tid = tid - rem; 783 thr_bar->my_level = d; 784 break; 785 } 786 ++d; 787 } 788 } 789 thr_bar->offset = 7 - (tid - thr_bar->parent_tid - 1); 790 thr_bar->old_tid = tid; 791 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING; 792 thr_bar->team = team; 793 thr_bar->parent_bar = 794 &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb; 795 } 796 if (uninitialized || team_changed || tid_changed) { 797 thr_bar->team = team; 798 thr_bar->parent_bar = 799 &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb; 800 retval = true; 801 } 802 if (uninitialized || team_sz_changed || tid_changed) { 803 thr_bar->nproc = nproc; 804 thr_bar->leaf_kids = thr_bar->base_leaf_kids; 805 if (thr_bar->my_level == 0) 806 thr_bar->leaf_kids = 0; 807 if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc) 808 thr_bar->leaf_kids = nproc - tid - 1; 809 thr_bar->leaf_state = 0; 810 for (int i = 0; i < thr_bar->leaf_kids; ++i) 811 ((char *)&(thr_bar->leaf_state))[7 - i] = 1; 812 } 813 return retval; 814 } 815 816 static void __kmp_hierarchical_barrier_gather( 817 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, 818 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) { 819 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather); 820 register kmp_team_t *team = this_thr->th.th_team; 821 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; 822 register kmp_uint32 nproc = this_thr->th.th_team_nproc; 823 register kmp_info_t **other_threads = team->t.t_threads; 824 register kmp_uint64 new_state; 825 826 int level = team->t.t_level; 827 #if OMP_40_ENABLED 828 if (other_threads[0] 829 ->th.th_teams_microtask) // are we inside the teams construct? 
/* Gather phase of the hierarchical barrier.
   Waits for every thread in this thread's subtree to arrive — leaf children
   via byte-flags on this thread's own b_arrived (oncore path), deeper
   children via each child's b_arrived — optionally folding their reduce_data
   through the `reduce` callback.  Workers then signal arrival to their
   parent; the master publishes the new team-wide arrived state. */
static void __kmp_hierarchical_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
  register kmp_team_t *team = this_thr->th.th_team;
  register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  register kmp_uint32 nproc = this_thr->th.th_team_nproc;
  register kmp_info_t **other_threads = team->t.t_threads;
  register kmp_uint64 new_state; // set only on the non-leaf path below

  int level = team->t.t_level;
#if OMP_40_ENABLED
  if (other_threads[0]
          ->th.th_teams_microtask) // are we inside the teams construct?
    if (this_thr->th.th_teams_size.nteams > 1)
      ++level; // level was not increased in teams construct for team_of_masters
#endif
  if (level == 1)
    thr_bar->use_oncore_barrier = 1;
  else
    thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested

  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
                "barrier type %d\n",
                gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
  }
#endif

  (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
                                               team);

  if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
    register kmp_int32 child_tid;
    new_state =
        (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
    if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
        thr_bar->use_oncore_barrier) {
      if (thr_bar->leaf_kids) { // First, wait for leaf children to check-in on
        // my b_arrived flag
        // Each leaf sets its own byte in my b_arrived; leaf_state has those
        // bytes pre-set, so the wait target is "base value with all leaf
        // bytes on".
        kmp_uint64 leaf_state =
            KMP_MASTER_TID(tid)
                ? thr_bar->b_arrived | thr_bar->leaf_state
                : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
        KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
                      "for leaf kids\n",
                      gtid, team->t.t_id, tid));
        kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
        flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
        if (reduce) {
          ANNOTATE_REDUCE_AFTER(reduce);
          for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
               ++child_tid) { // leaves are the tids immediately after mine
            KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
                           "T#%d(%d:%d)\n",
                           gtid, team->t.t_id, tid,
                           __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                           child_tid));
            ANNOTATE_BARRIER_END(other_threads[child_tid]);
            (*reduce)(this_thr->th.th_local.reduce_data,
                      other_threads[child_tid]->th.th_local.reduce_data);
          }
          ANNOTATE_REDUCE_BEFORE(reduce);
          ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
        }
        (void)KMP_TEST_THEN_AND64(
            (volatile kmp_int64 *)&thr_bar->b_arrived,
            ~(thr_bar->leaf_state)); // clear leaf_state bits
      }
      // Next, wait for higher level children on each child's b_arrived flag
      for (kmp_uint32 d = 1; d < thr_bar->my_level;
           ++d) { // gather lowest level threads first, but skip 0
        kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
                   skip = thr_bar->skip_per_level[d];
        if (last > nproc)
          last = nproc;
        for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
          register kmp_info_t *child_thr = other_threads[child_tid];
          register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
          KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
                        "T#%d(%d:%d) "
                        "arrived(%p) == %llu\n",
                        gtid, team->t.t_id, tid,
                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                        child_tid, &child_bar->b_arrived, new_state));
          kmp_flag_64 flag(&child_bar->b_arrived, new_state);
          flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          ANNOTATE_BARRIER_END(child_thr);
          if (reduce) {
            KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
                           "T#%d(%d:%d)\n",
                           gtid, team->t.t_id, tid,
                           __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                           child_tid));
            ANNOTATE_REDUCE_AFTER(reduce);
            (*reduce)(this_thr->th.th_local.reduce_data,
                      child_thr->th.th_local.reduce_data);
            ANNOTATE_REDUCE_BEFORE(reduce);
            ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
          }
        }
      }
    } else { // Blocktime is not infinite
      for (kmp_uint32 d = 0; d < thr_bar->my_level;
           ++d) { // Gather lowest level threads first
        kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
                   skip = thr_bar->skip_per_level[d];
        if (last > nproc)
          last = nproc;
        for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
          register kmp_info_t *child_thr = other_threads[child_tid];
          register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
          KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
                        "T#%d(%d:%d) "
                        "arrived(%p) == %llu\n",
                        gtid, team->t.t_id, tid,
                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                        child_tid, &child_bar->b_arrived, new_state));
          kmp_flag_64 flag(&child_bar->b_arrived, new_state);
          flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          ANNOTATE_BARRIER_END(child_thr);
          if (reduce) {
            KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
                           "T#%d(%d:%d)\n",
                           gtid, team->t.t_id, tid,
                           __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                           child_tid));
            ANNOTATE_REDUCE_AFTER(reduce);
            (*reduce)(this_thr->th.th_local.reduce_data,
                      child_thr->th.th_local.reduce_data);
            ANNOTATE_REDUCE_BEFORE(reduce);
            ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
          }
        }
      }
    }
  }
  // All subordinates are gathered; now release parent if not master thread

  if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
    KA_TRACE(
        20,
        ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
         "arrived(%p): %llu => %llu\n",
         gtid, team->t.t_id, tid,
         __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
         thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
         thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
    /* Mark arrival to parent: After performing this write, a worker thread may
       not assume that the team is valid any more - it could be deallocated by
       the master thread at any time. */
    if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
        !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
      // flag; release it
      ANNOTATE_BARRIER_BEGIN(this_thr);
      kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
      flag.release();
    } else { // Leaf does special release on the "offset" bits of parent's
      // b_arrived flag
      thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
      kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
      flag.set_waiter(other_threads[thr_bar->parent_tid]);
      flag.release();
    }
  } else { // Master thread needs to update the team's b_arrived value
    // NOTE(review): new_state is only assigned on the my_level!=0 path above;
    // this relies on the master being a non-leaf whenever this barrier is
    // reached — confirm for the degenerate depth==1 case.
    team->t.t_bar[bt].b_arrived = new_state;
    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id,
                  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
  }
  // Is the team access below unsafe or just technically invalid?
  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
                "barrier type %d\n",
                gtid, team->t.t_id, tid, bt));
}
/* Release phase of the hierarchical barrier.
   Workers first wait to be released by their parent — either on their own
   b_go flag, or (leaf + infinite blocktime + non-nested) on their byte slot
   of the parent's b_go.  ICVs are optionally propagated down the tree
   (KMP_BARRIER_ICV_PUSH), then each non-leaf releases its own subtree. */
static void __kmp_hierarchical_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
  register kmp_team_t *team;
  register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  register kmp_uint32 nproc;
  bool team_change = false; // indicates on-core barrier shouldn't be used

  if (KMP_MASTER_TID(tid)) {
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master "
                  "entered barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
  } else { // Worker threads
    // Wait for parent thread to release me
    if (!thr_bar->use_oncore_barrier ||
        __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
        thr_bar->team == NULL) {
      // Use traditional method of waiting on my own b_go flag
      thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
      kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
      flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
      ANNOTATE_BARRIER_END(this_thr);
      TCW_8(thr_bar->b_go,
            KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
    } else { // Thread barrier data is initialized, this is a leaf, blocktime is
      // infinite, not nested
      // Wait on my "offset" bits on parent's b_go flag
      thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
      kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
                           thr_bar->offset, bt,
                           this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
      flag.wait(this_thr, TRUE);
      if (thr_bar->wait_flag ==
          KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
        TCW_8(thr_bar->b_go,
              KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
      } else { // Reset my bits on parent's b_go flag
        ((char *)&(thr_bar->parent_bar->b_go))[thr_bar->offset] = 0;
      }
    }
    thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
    // Early exit for reaping threads releasing forkjoin barrier
    if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
      return;
    // The worker thread may now assume that the team is valid.
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    tid = __kmp_tid_from_gtid(gtid);

    KA_TRACE(
        20,
        ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
         gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
  }

  nproc = this_thr->th.th_team_nproc;
  int level = team->t.t_level;
#if OMP_40_ENABLED
  if (team->t.t_threads[0]
          ->th.th_teams_microtask) { // are we inside the teams construct?
    if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
        this_thr->th.th_teams_level == level)
      ++level; // level was not increased in teams construct for team_of_workers
    if (this_thr->th.th_teams_size.nteams > 1)
      ++level; // level was not increased in teams construct for team_of_masters
  }
#endif
  if (level == 1)
    thr_bar->use_oncore_barrier = 1;
  else
    thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested

  // If the team size has increased, we still communicate with old leaves via
  // oncore barrier.
  unsigned short int old_leaf_kids = thr_bar->leaf_kids;
  kmp_uint64 old_leaf_state = thr_bar->leaf_state;
  team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
                                                       tid, team);
  // But if the entire team changes, we won't use oncore barrier at all
  if (team_change)
    old_leaf_kids = 0;

#if KMP_BARRIER_ICV_PUSH
  if (propagate_icvs) {
    __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
                             FALSE);
    if (KMP_MASTER_TID(
            tid)) { // master already has copy in final destination; copy
      copy_icvs(&thr_bar->th_fixed_icvs,
                &team->t.t_implicit_task_taskdata[tid].td_icvs);
    } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
               thr_bar->use_oncore_barrier) { // optimization for inf blocktime
      if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
        // leaves (on-core children) pull parent's fixed ICVs directly to local
        // ICV store
        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                  &thr_bar->parent_bar->th_fixed_icvs);
      // non-leaves will get ICVs piggybacked with b_go via NGO store
    } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
      if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can
        // access
        copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
      else // leaves copy parent's fixed ICVs directly to local ICV store
        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                  &thr_bar->parent_bar->th_fixed_icvs);
    }
  }
#endif // KMP_BARRIER_ICV_PUSH

  // Now, release my children
  if (thr_bar->my_level) { // not a leaf
    register kmp_int32 child_tid;
    kmp_uint32 last;
    if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
        thr_bar->use_oncore_barrier) {
      if (KMP_MASTER_TID(tid)) { // do a flat release
        // Set local b_go to bump children via NGO store of the cache line
        // containing IVCs and b_go.
        thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
        // Use ngo stores if available; b_go piggybacks in the last 8 bytes of
        // the cache line
        ngo_load(&thr_bar->th_fixed_icvs);
        // This loops over all the threads skipping only the leaf nodes in the
        // hierarchy
        for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
             child_tid += thr_bar->skip_per_level[1]) {
          register kmp_bstate_t *child_bar =
              &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
          KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
                        "releasing T#%d(%d:%d)"
                        " go(%p): %u => %u\n",
                        gtid, team->t.t_id, tid,
                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                        child_tid, &child_bar->b_go, child_bar->b_go,
                        child_bar->b_go + KMP_BARRIER_STATE_BUMP));
          // Use ngo store (if available) to both store ICVs and release child
          // via child's b_go
          ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
        }
        ngo_sync();
      }
      TCW_8(thr_bar->b_go,
            KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
      // Now, release leaf children
      if (thr_bar->leaf_kids) { // if there are any
        // We test team_change on the off-chance that the level 1 team changed.
        if (team_change ||
            old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
          if (old_leaf_kids) { // release old leaf kids
            thr_bar->b_go |= old_leaf_state;
          }
          // Release new leaf kids individually via their own b_go flags,
          // since they are not yet listening on my b_go bytes.
          last = tid + thr_bar->skip_per_level[1];
          if (last > nproc)
            last = nproc;
          for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
               ++child_tid) { // skip_per_level[0]=1
            register kmp_info_t *child_thr = team->t.t_threads[child_tid];
            register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
            KA_TRACE(
                20,
                ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
                 " T#%d(%d:%d) go(%p): %u => %u\n",
                 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
            // Release child using child's b_go flag
            ANNOTATE_BARRIER_BEGIN(child_thr);
            kmp_flag_64 flag(&child_bar->b_go, child_thr);
            flag.release();
          }
        } else { // Release all children at once with leaf_state bits on my own
          // b_go flag
          thr_bar->b_go |= thr_bar->leaf_state;
        }
      }
    } else { // Blocktime is not infinite; do a simple hierarchical release
      for (int d = thr_bar->my_level - 1; d >= 0;
           --d) { // Release highest level threads first
        last = tid + thr_bar->skip_per_level[d + 1];
        kmp_uint32 skip = thr_bar->skip_per_level[d];
        if (last > nproc)
          last = nproc;
        for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
          register kmp_info_t *child_thr = team->t.t_threads[child_tid];
          register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
          KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
                        "releasing T#%d(%d:%d) go(%p): %u => %u\n",
                        gtid, team->t.t_id, tid,
                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                        child_tid, &child_bar->b_go, child_bar->b_go,
                        child_bar->b_go + KMP_BARRIER_STATE_BUMP));
          // Release child using child's b_go flag
          ANNOTATE_BARRIER_BEGIN(child_thr);
          kmp_flag_64 flag(&child_bar->b_go, child_thr);
          flag.release();
        }
      }
    }
#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs && !KMP_MASTER_TID(tid))
      // non-leaves copy ICVs from fixed ICVs to local dest
      copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                &thr_bar->th_fixed_icvs);
#endif // KMP_BARRIER_ICV_PUSH
  }
  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
                "barrier type %d\n",
                gtid, team->t.t_id, tid, bt));
}
// End of Barrier Algorithms

// Internal function to do a barrier.
/* If is_split is true, do a split barrier, otherwise, do a plain barrier
   If reduce is non-NULL, do a split reduction barrier, otherwise, do a split
   barrier
   Returns 0 if master thread, 1 if worker thread.
   NOTE(review): reduce_size is not referenced in this function body; it
   appears to be kept for interface compatibility with callers — confirm. */
int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
                  size_t reduce_size, void *reduce_data,
                  void (*reduce)(void *, void *)) {
  KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
  register int tid = __kmp_tid_from_gtid(gtid);
  register kmp_info_t *this_thr = __kmp_threads[gtid];
  register kmp_team_t *team = this_thr->th.th_team;
  register int status = 0;
  ident_t *loc = __kmp_threads[gtid]->th.th_ident;
#if OMPT_SUPPORT
  // Only assigned/used under OMPT_BLAME below.
  ompt_task_id_t my_task_id;
  ompt_parallel_id_t my_parallel_id;
#endif

  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
                __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));

  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
#if OMPT_SUPPORT
  if (ompt_enabled) {
#if OMPT_BLAME
    my_task_id = team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id;
    my_parallel_id = team->t.ompt_team_info.parallel_id;

#if OMPT_TRACE
    if (this_thr->th.ompt_thread_info.state == ompt_state_wait_single) {
      if (ompt_callbacks.ompt_callback(ompt_event_single_others_end)) {
        ompt_callbacks.ompt_callback(ompt_event_single_others_end)(
            my_parallel_id, my_task_id);
      }
    }
#endif
    if (ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
      ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(my_parallel_id,
                                                             my_task_id);
    }
#endif
    // It is OK to report the barrier state after the barrier begin callback.
    // According to the OMPT specification, a compliant implementation may
    // even delay reporting this state until the barrier begins to wait.
    this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
  }
#endif

  if (!team->t.t_serialized) {
#if USE_ITT_BUILD
    // This value will be used in itt notify events below.
    void *itt_sync_obj = NULL;
#if USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
#endif
#endif /* USE_ITT_BUILD */
    if (__kmp_tasking_mode == tskm_extra_barrier) {
      __kmp_tasking_barrier(team, this_thr, gtid);
      KA_TRACE(15,
               ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
                __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
    }

    /* Copy the blocktime info to the thread, where __kmp_wait_template() can
       access it when the team struct is not guaranteed to exist. */
    // See note about the corresponding code in __kmp_join_barrier() being
    // performance-critical.
    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
#if KMP_USE_MONITOR
      this_thr->th.th_team_bt_intervals =
          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
      this_thr->th.th_team_bt_set =
          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
#else
      this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL();
#endif
    }

#if USE_ITT_BUILD
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      __kmp_itt_barrier_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
#if USE_DEBUGGER
    // Let the debugger know: the thread arrived to the barrier and waiting.
    if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
      team->t.t_bar[bt].b_master_arrived += 1;
    } else {
      this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
    } // if
#endif /* USE_DEBUGGER */
    if (reduce != NULL) {
      // KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
      this_thr->th.th_local.reduce_data = reduce_data;
    }

    if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
      __kmp_task_team_setup(
          this_thr, team,
          0); // use 0 to only setup the current team if nthreads > 1

    // Gather phase: dispatch on the configured gather pattern.
    switch (__kmp_barrier_gather_pattern[bt]) {
    case bp_hyper_bar: {
      KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits
      // to 0; use linear
      __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
                                 reduce USE_ITT_BUILD_ARG(itt_sync_obj));
      break;
    }
    case bp_hierarchical_bar: {
      __kmp_hierarchical_barrier_gather(bt, this_thr, gtid, tid,
                                        reduce USE_ITT_BUILD_ARG(itt_sync_obj));
      break;
    }
    case bp_tree_bar: {
      KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits
      // to 0; use linear
      __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
                                reduce USE_ITT_BUILD_ARG(itt_sync_obj));
      break;
    }
    default: {
      __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
                                  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
    }
    }

    KMP_MB();

    if (KMP_MASTER_TID(tid)) {
      status = 0;
      if (__kmp_tasking_mode != tskm_immediate_exec) {
        __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
      }
#if USE_DEBUGGER
      // Let the debugger know: All threads are arrived and starting leaving the
      // barrier.
      team->t.t_bar[bt].b_team_arrived += 1;
#endif

#if OMP_40_ENABLED
      // Reset cancellation flag for worksharing constructs
      if (team->t.t_cancel_request == cancel_loop ||
          team->t.t_cancel_request == cancel_sections) {
        team->t.t_cancel_request = cancel_noreq;
      }
#endif
#if USE_ITT_BUILD
      /* TODO: In case of split reduction barrier, master thread may send
         acquired event early, before the final summation into the shared
         variable is done (final summation can be a long operation for array
         reductions). */
      if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
        __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier - report frame end (only if active_level == 1)
      if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
          __kmp_forkjoin_frames_mode &&
#if OMP_40_ENABLED
          this_thr->th.th_teams_microtask == NULL &&
#endif
          team->t.t_active_level == 1) {
        kmp_uint64 cur_time = __itt_get_timestamp();
        kmp_info_t **other_threads = team->t.t_threads;
        int nproc = this_thr->th.th_team_nproc;
        int i;
        switch (__kmp_forkjoin_frames_mode) {
        case 1:
          __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                                 loc, nproc);
          this_thr->th.th_frame_time = cur_time;
          break;
        case 2: // AC 2015-01-19: currently does not work for hierarchical (to
          // be fixed)
          __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
                                 1, loc, nproc);
          break;
        case 3:
          if (__itt_metadata_add_ptr) {
            // Initialize with master's wait time
            kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
            // Set arrive time to zero to be able to check it in
            // __kmp_invoke_task(); the same is done inside the loop below
            this_thr->th.th_bar_arrive_time = 0;
            for (i = 1; i < nproc; ++i) {
              delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
              other_threads[i]->th.th_bar_arrive_time = 0;
            }
            __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
                                         cur_time, delta,
                                         (kmp_uint64)(reduce != NULL));
          }
          __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                                 loc, nproc);
          this_thr->th.th_frame_time = cur_time;
          break;
        }
      }
#endif /* USE_ITT_BUILD */
    } else {
      status = 1;
#if USE_ITT_BUILD
      if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
        __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
    }
    // Release phase: workers always pass through; master only for a full
    // (non-split) barrier.
    if (status == 1 || !is_split) {
      switch (__kmp_barrier_release_pattern[bt]) {
      case bp_hyper_bar: {
        KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
        __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
                                    FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
        break;
      }
      case bp_hierarchical_bar: {
        __kmp_hierarchical_barrier_release(
            bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
        break;
      }
      case bp_tree_bar: {
        KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
        __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
                                   FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
        break;
      }
      default: {
        __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
                                     FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
      }
      }
      if (__kmp_tasking_mode != tskm_immediate_exec) {
        __kmp_task_team_sync(this_thr, team);
      }
    }

#if USE_ITT_BUILD
    /* GEH: TODO: Move this under if-condition above and also include in
       __kmp_end_split_barrier(). This will more accurately represent the actual
       release time of the threads for split barriers. */
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      __kmp_itt_barrier_finished(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
  } else { // Team is serialized.
    status = 0;
    if (__kmp_tasking_mode != tskm_immediate_exec) {
#if OMP_45_ENABLED
      if (this_thr->th.th_task_team != NULL) {
        void *itt_sync_obj = NULL;
#if USE_ITT_NOTIFY
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
          itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
          __kmp_itt_barrier_starting(gtid, itt_sync_obj);
        }
#endif

        // A non-NULL task team here can only mean proxy tasks exist.
        KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks ==
                         TRUE);
        __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
        __kmp_task_team_setup(this_thr, team, 0);

#if USE_ITT_BUILD
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
          __kmp_itt_barrier_finished(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
      }
#else
      // The task team should be NULL for serialized code (tasks will be
      // executed immediately)
      KMP_DEBUG_ASSERT(team->t.t_task_team[this_thr->th.th_task_state] == NULL);
      KMP_DEBUG_ASSERT(this_thr->th.th_task_team == NULL);
#endif
    }
  }
  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
                gtid, __kmp_team_from_gtid(gtid)->t.t_id,
                __kmp_tid_from_gtid(gtid), status));

#if OMPT_SUPPORT
  if (ompt_enabled) {
#if OMPT_BLAME
    if (ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
      ompt_callbacks.ompt_callback(ompt_event_barrier_end)(my_parallel_id,
                                                           my_task_id);
    }
#endif
    this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
  }
#endif
  ANNOTATE_BARRIER_END(&team->t.t_bar);

  return status;
}
{ 1513 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier); 1514 KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER); 1515 int tid = __kmp_tid_from_gtid(gtid); 1516 kmp_info_t *this_thr = __kmp_threads[gtid]; 1517 kmp_team_t *team = this_thr->th.th_team; 1518 1519 ANNOTATE_BARRIER_BEGIN(&team->t.t_bar); 1520 if (!team->t.t_serialized) { 1521 if (KMP_MASTER_GTID(gtid)) { 1522 switch (__kmp_barrier_release_pattern[bt]) { 1523 case bp_hyper_bar: { 1524 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); 1525 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, 1526 FALSE USE_ITT_BUILD_ARG(NULL)); 1527 break; 1528 } 1529 case bp_hierarchical_bar: { 1530 __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, 1531 FALSE USE_ITT_BUILD_ARG(NULL)); 1532 break; 1533 } 1534 case bp_tree_bar: { 1535 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); 1536 __kmp_tree_barrier_release(bt, this_thr, gtid, tid, 1537 FALSE USE_ITT_BUILD_ARG(NULL)); 1538 break; 1539 } 1540 default: { 1541 __kmp_linear_barrier_release(bt, this_thr, gtid, tid, 1542 FALSE USE_ITT_BUILD_ARG(NULL)); 1543 } 1544 } 1545 if (__kmp_tasking_mode != tskm_immediate_exec) { 1546 __kmp_task_team_sync(this_thr, team); 1547 } // if 1548 } 1549 } 1550 ANNOTATE_BARRIER_END(&team->t.t_bar); 1551 } 1552 1553 void __kmp_join_barrier(int gtid) { 1554 KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier); 1555 KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER); 1556 register kmp_info_t *this_thr = __kmp_threads[gtid]; 1557 register kmp_team_t *team; 1558 register kmp_uint nproc; 1559 kmp_info_t *master_thread; 1560 int tid; 1561 #ifdef KMP_DEBUG 1562 int team_id; 1563 #endif /* KMP_DEBUG */ 1564 #if USE_ITT_BUILD 1565 void *itt_sync_obj = NULL; 1566 #if USE_ITT_NOTIFY 1567 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need 1568 // Get object created at fork_barrier 1569 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); 1570 #endif 1571 #endif /* USE_ITT_BUILD */ 1572 KMP_MB(); 1573 1574 // 
Get current info 1575 team = this_thr->th.th_team; 1576 nproc = this_thr->th.th_team_nproc; 1577 KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc); 1578 tid = __kmp_tid_from_gtid(gtid); 1579 #ifdef KMP_DEBUG 1580 team_id = team->t.t_id; 1581 #endif /* KMP_DEBUG */ 1582 master_thread = this_thr->th.th_team_master; 1583 #ifdef KMP_DEBUG 1584 if (master_thread != team->t.t_threads[0]) { 1585 __kmp_print_structure(); 1586 } 1587 #endif /* KMP_DEBUG */ 1588 KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]); 1589 KMP_MB(); 1590 1591 // Verify state 1592 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); 1593 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team)); 1594 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root)); 1595 KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]); 1596 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n", 1597 gtid, team_id, tid)); 1598 1599 ANNOTATE_BARRIER_BEGIN(&team->t.t_bar); 1600 #if OMPT_SUPPORT 1601 #if OMPT_TRACE 1602 if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) { 1603 ompt_callbacks.ompt_callback(ompt_event_barrier_begin)( 1604 team->t.ompt_team_info.parallel_id, 1605 team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id); 1606 } 1607 #endif 1608 this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier; 1609 #endif 1610 1611 if (__kmp_tasking_mode == tskm_extra_barrier) { 1612 __kmp_tasking_barrier(team, this_thr, gtid); 1613 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past taking barrier\n", gtid, 1614 team_id, tid)); 1615 } 1616 #ifdef KMP_DEBUG 1617 if (__kmp_tasking_mode != tskm_immediate_exec) { 1618 KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = " 1619 "%p, th_task_team = %p\n", 1620 __kmp_gtid_from_thread(this_thr), team_id, 1621 team->t.t_task_team[this_thr->th.th_task_state], 1622 this_thr->th.th_task_team)); 1623 KMP_DEBUG_ASSERT(this_thr->th.th_task_team == 1624 team->t.t_task_team[this_thr->th.th_task_state]); 1625 } 1626 #endif 
/* KMP_DEBUG */ 1627 1628 /* Copy the blocktime info to the thread, where __kmp_wait_template() can 1629 access it when the team struct is not guaranteed to exist. Doing these 1630 loads causes a cache miss slows down EPCC parallel by 2x. As a workaround, 1631 we do not perform the copy if blocktime=infinite, since the values are not 1632 used by __kmp_wait_template() in that case. */ 1633 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { 1634 #if KMP_USE_MONITOR 1635 this_thr->th.th_team_bt_intervals = 1636 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals; 1637 this_thr->th.th_team_bt_set = 1638 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set; 1639 #else 1640 this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(); 1641 #endif 1642 } 1643 1644 #if USE_ITT_BUILD 1645 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) 1646 __kmp_itt_barrier_starting(gtid, itt_sync_obj); 1647 #endif /* USE_ITT_BUILD */ 1648 1649 switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) { 1650 case bp_hyper_bar: { 1651 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]); 1652 __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, 1653 NULL USE_ITT_BUILD_ARG(itt_sync_obj)); 1654 break; 1655 } 1656 case bp_hierarchical_bar: { 1657 __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, 1658 NULL USE_ITT_BUILD_ARG(itt_sync_obj)); 1659 break; 1660 } 1661 case bp_tree_bar: { 1662 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]); 1663 __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, 1664 NULL USE_ITT_BUILD_ARG(itt_sync_obj)); 1665 break; 1666 } 1667 default: { 1668 __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, 1669 NULL USE_ITT_BUILD_ARG(itt_sync_obj)); 1670 } 1671 } 1672 1673 /* From this point on, the team data structure may be deallocated at any time 1674 by the master thread - it is unsafe to reference it in any of the worker 1675 threads. 
Any per-team data items that need to be referenced before the 1676 end of the barrier should be moved to the kmp_task_team_t structs. */ 1677 if (KMP_MASTER_TID(tid)) { 1678 if (__kmp_tasking_mode != tskm_immediate_exec) { 1679 __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj)); 1680 } 1681 #if KMP_STATS_ENABLED 1682 // Have master thread flag the workers to indicate they are now waiting for 1683 // next parallel region, Also wake them up so they switch their timers to 1684 // idle. 1685 for (int i = 0; i < team->t.t_nproc; ++i) { 1686 kmp_info_t *team_thread = team->t.t_threads[i]; 1687 if (team_thread == this_thr) 1688 continue; 1689 team_thread->th.th_stats->setIdleFlag(); 1690 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && 1691 team_thread->th.th_sleep_loc != NULL) 1692 __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread), 1693 team_thread->th.th_sleep_loc); 1694 } 1695 #endif 1696 #if USE_ITT_BUILD 1697 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) 1698 __kmp_itt_barrier_middle(gtid, itt_sync_obj); 1699 #endif /* USE_ITT_BUILD */ 1700 1701 #if USE_ITT_BUILD && USE_ITT_NOTIFY 1702 // Join barrier - report frame end 1703 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 1704 __kmp_forkjoin_frames_mode && 1705 #if OMP_40_ENABLED 1706 this_thr->th.th_teams_microtask == NULL && 1707 #endif 1708 team->t.t_active_level == 1) { 1709 kmp_uint64 cur_time = __itt_get_timestamp(); 1710 ident_t *loc = team->t.t_ident; 1711 kmp_info_t **other_threads = team->t.t_threads; 1712 int nproc = this_thr->th.th_team_nproc; 1713 int i; 1714 switch (__kmp_forkjoin_frames_mode) { 1715 case 1: 1716 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, 1717 loc, nproc); 1718 break; 1719 case 2: 1720 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, 1721 loc, nproc); 1722 break; 1723 case 3: 1724 if (__itt_metadata_add_ptr) { 1725 // Initialize with master's wait time 1726 kmp_uint64 delta = cur_time - 
this_thr->th.th_bar_arrive_time; 1727 // Set arrive time to zero to be able to check it in 1728 // __kmp_invoke_task(); the same is done inside the loop below 1729 this_thr->th.th_bar_arrive_time = 0; 1730 for (i = 1; i < nproc; ++i) { 1731 delta += (cur_time - other_threads[i]->th.th_bar_arrive_time); 1732 other_threads[i]->th.th_bar_arrive_time = 0; 1733 } 1734 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, 1735 cur_time, delta, 0); 1736 } 1737 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, 1738 loc, nproc); 1739 this_thr->th.th_frame_time = cur_time; 1740 break; 1741 } 1742 } 1743 #endif /* USE_ITT_BUILD */ 1744 } 1745 #if USE_ITT_BUILD 1746 else { 1747 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) 1748 __kmp_itt_barrier_middle(gtid, itt_sync_obj); 1749 } 1750 #endif /* USE_ITT_BUILD */ 1751 1752 #if KMP_DEBUG 1753 if (KMP_MASTER_TID(tid)) { 1754 KA_TRACE( 1755 15, 1756 ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n", 1757 gtid, team_id, tid, nproc)); 1758 } 1759 #endif /* KMP_DEBUG */ 1760 1761 // TODO now, mark worker threads as done so they may be disbanded 1762 KMP_MB(); // Flush all pending memory write invalidates. 
  // Tail of __kmp_join_barrier (the body of this function begins earlier in
  // the file): final trace, OMPT end-of-barrier notification, and exit.
  KA_TRACE(10,
           ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));

#if OMPT_SUPPORT
  if (ompt_enabled) {
#if OMPT_BLAME
    // Report barrier completion to an attached OMPT tool, if one registered
    // the barrier_end callback.
    if (ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
      ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
          team->t.ompt_team_info.parallel_id,
          team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
    }
#endif

    // return to default state
    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
  }
#endif
  ANNOTATE_BARRIER_END(&team->t.t_bar);
}

// TODO release worker threads' fork barriers as we are ready instead of all at
// once

/* Fork barrier: executed by every thread of a team before it begins (or
   resumes) work in a parallel region.  gtid is the caller's global thread id,
   tid its id within the team.  On entry, only the master (tid == 0) may
   safely dereference its th_team pointer; workers re-load the team pointer
   from their thread struct after they are released (see below). */
void __kmp_fork_barrier(int gtid, int tid) {
  KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
  kmp_info_t *this_thr = __kmp_threads[gtid];
  // Workers deliberately start with team == NULL; their cached th_team may be
  // stale until the master has set up the new team and released them.
  kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
#if USE_ITT_BUILD
  void *itt_sync_obj = NULL;
#endif /* USE_ITT_BUILD */
  if (team)
    ANNOTATE_BARRIER_END(&team->t.t_bar);

  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
                (team != NULL) ? team->t.t_id : -1, tid));

  // th_team pointer only valid for master thread here
  if (KMP_MASTER_TID(tid)) {
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
      // Create itt barrier object
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
      __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
    }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */

#ifdef KMP_DEBUG
    register kmp_info_t **other_threads = team->t.t_threads;
    register int i;

    // Verify state
    KMP_MB();

    // Debug-only sanity pass: every worker's fork-barrier b_go flag (sleep
    // bit masked off) must still be in the initial state, and each worker
    // must already point at this team.
    for (i = 1; i < team->t.t_nproc; ++i) {
      KA_TRACE(500,
               ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
                "== %u.\n",
                gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
                team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
                other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
      KMP_DEBUG_ASSERT(
          (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
           ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
      KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
    }
#endif

    if (__kmp_tasking_mode != tskm_immediate_exec) {
      // 0 indicates setup current task team if nthreads > 1
      __kmp_task_team_setup(this_thr, team, 0);
    }

    /* The master thread may have changed its blocktime between the join barrier
       and the fork barrier. Copy the blocktime info to the thread, where
       __kmp_wait_template() can access it when the team struct is not
       guaranteed to exist. */
    // See note about the corresponding code in __kmp_join_barrier() being
    // performance-critical
    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
#if KMP_USE_MONITOR
      this_thr->th.th_team_bt_intervals =
          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
      this_thr->th.th_team_bt_set =
          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
#else
      this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL();
#endif
    }
  } // master

  // Release (workers: wait in) the fork barrier using the configured release
  // pattern; linear is the fallback.  TRUE requests ICV/flag propagation by
  // the release routine (final_spin semantics differ per algorithm).
  switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
  case bp_hyper_bar: {
    // Hyper barrier requires a nonzero branch-bits configuration.
    KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
    __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                                TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_hierarchical_bar: {
    __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                                       TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_tree_bar: {
    // Tree barrier likewise requires configured branch bits.
    KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
    __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                               TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  default: {
    __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                                 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
  }
  }

  // Early exit for reaping threads releasing forkjoin barrier
  if (TCR_4(__kmp_global.g.g_done)) {
    // Runtime is shutting down: drop the task-team reference and leave
    // without touching the (possibly gone) team structure.
    this_thr->th.th_task_team = NULL;

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
      if (!KMP_MASTER_TID(tid)) {
        itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
        if (itt_sync_obj)
          __kmp_itt_barrier_finished(gtid, itt_sync_obj);
      }
    }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
    KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
    return;
  }

  /* We can now assume that a valid team structure has been allocated by the
     master and propagated to all worker threads. The current thread, however,
     may not be part of the team, so we can't blindly assume that the team
     pointer is non-null. */
  team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
  KMP_DEBUG_ASSERT(team != NULL);
  tid = __kmp_tid_from_gtid(gtid);

#if KMP_BARRIER_ICV_PULL
  /* Master thread's copy of the ICVs was set up on the implicit taskdata in
     __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
     implicit task has this data before this function is called. We cannot
     modify __kmp_fork_call() to look at the fixed ICVs in the master's thread
     struct, because it is not always the case that the threads arrays have
     been allocated when __kmp_fork_call() is executed. */
  {
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
    if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
      // Copy the initial ICVs from the master's thread struct (th_fixed_icvs,
      // filled in by __kmp_setup_icv_copy) to the implicit task for this tid.
      KA_TRACE(10,
               ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
      __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
                               tid, FALSE);
      copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                &team->t.t_threads[0]
                     ->th.th_bar[bs_forkjoin_barrier]
                     .bb.th_fixed_icvs);
    }
  }
#endif // KMP_BARRIER_ICV_PULL

  if (__kmp_tasking_mode != tskm_immediate_exec) {
    __kmp_task_team_sync(this_thr, team);
  }

#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
  if (proc_bind == proc_bind_intel) {
#endif
#if KMP_AFFINITY_SUPPORTED
    // Call dynamic affinity settings
    if (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
      __kmp_balanced_affinity(tid, team->t.t_nproc);
    }
#endif // KMP_AFFINITY_SUPPORTED
#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
  } else if (proc_bind != proc_bind_false) {
    // OMP 4.0 place binding: move the thread only if its assigned place
    // changed since the last region.
    if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
      KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
                     __kmp_gtid_from_thread(this_thr),
                     this_thr->th.th_current_place));
    } else {
      __kmp_affinity_set_place(gtid);
    }
  }
#endif

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
    if (!KMP_MASTER_TID(tid)) {
      // Get correct barrier object
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
    } // (prepare called inside barrier_release)
  }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
  ANNOTATE_BARRIER_END(&team->t.t_bar);
  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
                team->t.t_id, tid));
}

/* Stage the initial ICVs (internal control variables) for a team of
   new_nproc threads.  Depending on the barrier ICV configuration this either
   stashes them where workers can PULL them in the fork barrier, defers to the
   PUSH mechanism, or copies them linearly into each worker's implicit task
   (body continues below). */
void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
                          kmp_internal_control_t *new_icvs, ident_t *loc) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);

  KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);

  /* Master thread's copy of the ICVs was set up on the implicit taskdata in
     __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
     implicit task has this data before this function is called. */
#if KMP_BARRIER_ICV_PULL
  /* Copy ICVs to master's thread structure into th_fixed_icvs (which remains
     untouched), where all of the worker threads can access them and make their
     own copies after the barrier. */
  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
  // allocated at this point
  copy_icvs(
      &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
      new_icvs);
  KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
                team->t.t_threads[0], team));
#elif KMP_BARRIER_ICV_PUSH
  // The ICVs will be propagated in the fork barrier, so nothing needs to be
  // done here.
  KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
                team->t.t_threads[0], team));
#else
  // Copy the ICVs to each of the non-master threads.  This takes O(nthreads)
  // time.
  // ngo_load/ngo_store_icvs/ngo_sync expand to non-globally-ordered vector
  // stores on KMP_MIC and to plain copy_icvs otherwise (see macros at top of
  // file); the load must precede the stores and ngo_sync must follow them.
  ngo_load(new_icvs);
  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
  // allocated at this point
  for (int f = 1; f < new_nproc; ++f) { // Skip the master thread
    // TODO: GEH - pass in better source location info since usually NULL here
    KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                  f, team->t.t_threads[f], team));
    // Initialize the worker's implicit task, then store the new ICVs into it.
    __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
    ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
    KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                  f, team->t.t_threads[f], team));
  }
  ngo_sync();
#endif // KMP_BARRIER_ICV_PULL
}