/*
 * kmp_barrier.cpp
 */

//===----------------------------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_wait_release.h"
#include "kmp_stats.h"
#include "kmp_itt.h"
#include "kmp_os.h"

#if KMP_MIC
#include <immintrin.h>
#define USE_NGO_STORES 1
#endif // KMP_MIC

#include "tsan_annotations.h"

#if KMP_MIC && USE_NGO_STORES
// ICV copying
#define ngo_load(src)            __m512d Vt = _mm512_load_pd((void *)(src))
#define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_store_go(dst, src)   _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_sync()               __asm__ volatile ("lock; addl $0,0(%%rsp)" ::: "memory")
#else
#define ngo_load(src)            ((void)0)
#define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
#define ngo_store_go(dst, src)   KMP_MEMCPY((dst), (src), CACHE_LINE)
#define ngo_sync()               ((void)0)
#endif /* KMP_MIC && USE_NGO_STORES */

void __kmp_print_structure(void); // Forward declaration

// ---------------------------- Barrier Algorithms ----------------------------

// Linear Barrier
static void
__kmp_linear_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                            void (*reduce)(void *, void *)
                            USE_ITT_BUILD_ARG(void *itt_sync_obj))
{
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
    register kmp_team_t *team = this_thr->th.th_team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_info_t **other_threads = team->t.t_threads;

    KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Barrier imbalance - save arrive time to the thread
    if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
    }
#endif
    // We now perform a linear reduction to signal that all of the threads have arrived.
    if (!KMP_MASTER_TID(tid)) {
        KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                      "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
                      __kmp_gtid_from_tid(0, team), team->t.t_id, 0, &thr_bar->b_arrived,
                      thr_bar->b_arrived, thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
        // Mark arrival to master thread
        /* After performing this write, a worker thread may not assume that the team is valid
           any more - it could be deallocated by the master thread at any time. */
        kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
        flag.release();
    } else {
        register kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
        register int nproc = this_thr->th.th_team_nproc;
        register int i;
        // Don't have to worry about sleep bit here or atomic since team setting
        register kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
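
        /* The loop below implements the gather phase of the linear barrier: the
           master waits, one worker at a time, for each worker's b_arrived counter
           to reach new_state (the team counter plus one KMP_BARRIER_STATE_BUMP).
           Waiting time is therefore linear in the team size, which is why the
           tree/hyper/hierarchical variants below are preferred for large teams. */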
        // Collect all the worker team member threads.
        for (i = 1; i < nproc; ++i) {
#if KMP_CACHE_MANAGE
            // Prefetch next thread's arrived count
            if (i + 1 < nproc)
                KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
            KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                          "arrived(%p) == %llu\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(i, team), team->t.t_id, i,
                          &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));

            // Wait for worker thread to arrive
            kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
            flag.wait(this_thr, FALSE
                      USE_ITT_BUILD_ARG(itt_sync_obj));
#if USE_ITT_BUILD && USE_ITT_NOTIFY
            // Barrier imbalance - write min of the thread time and the other thread time to the thread.
            if (__kmp_forkjoin_frames_mode == 2) {
                this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                                       other_threads[i]->th.th_bar_min_time);
            }
#endif
            if (reduce) {
                KA_TRACE(100, ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", gtid,
                               team->t.t_id, tid, __kmp_gtid_from_tid(i, team), team->t.t_id, i));
                ANNOTATE_REDUCE_AFTER(reduce);
                (*reduce)(this_thr->th.th_local.reduce_data,
                          other_threads[i]->th.th_local.reduce_data);
                ANNOTATE_REDUCE_BEFORE(reduce);
                ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
            }
        }
        // Don't have to worry about sleep bit here or atomic since team setting
        team_bar->b_arrived = new_state;
        KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
                      gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived, new_state));
    }
    KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

static void
__kmp_linear_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                             int propagate_icvs
                             USE_ITT_BUILD_ARG(void *itt_sync_obj))
{
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_team_t *team;

    if (KMP_MASTER_TID(tid)) {
        register unsigned int i;
        register kmp_uint32 nproc = this_thr->th.th_team_nproc;
        register kmp_info_t **other_threads;

        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        other_threads = team->t.t_threads;

        KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
                      gtid, team->t.t_id, tid, bt));

        if (nproc > 1) {
#if KMP_BARRIER_ICV_PUSH
            {
                KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
                if (propagate_icvs) {
                    ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
                    for (i = 1; i < nproc; ++i) {
                        __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
                                                 team, i, FALSE);
                        ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
                                       &team->t.t_implicit_task_taskdata[0].td_icvs);
                    }
                    ngo_sync();
                }
            }
#endif // KMP_BARRIER_ICV_PUSH
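
            /* Release loop: the master bumps each worker's b_go flag by
               KMP_BARRIER_STATE_BUMP; flag.release() also wakes the worker if it
               went to sleep on the flag after its blocktime expired. */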

            // Now, release all of the worker threads
            for (i = 1; i < nproc; ++i) {
#if KMP_CACHE_MANAGE
                // Prefetch next thread's go flag
                if (i + 1 < nproc)
                    KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */
                KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
                              "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
                              other_threads[i]->th.th_info.ds.ds_gtid, team->t.t_id, i,
                              &other_threads[i]->th.th_bar[bt].bb.b_go,
                              other_threads[i]->th.th_bar[bt].bb.b_go,
                              other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
                kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go, other_threads[i]);
                flag.release();
            }
        }
    } else { // Wait for the MASTER thread to release us
        KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
                      gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
        kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
        flag.wait(this_thr, TRUE
                  USE_ITT_BUILD_ARG(itt_sync_obj));
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
            // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is disabled)
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
            // Cancel wait on previous parallel region...
            __kmp_itt_task_starting(itt_sync_obj);

            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
            if (itt_sync_obj != NULL)
                // Call prepare as early as possible for "new" barrier
                __kmp_itt_task_finished(itt_sync_obj);
        } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        // Early exit for reaping threads releasing forkjoin barrier
        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
            return;
        // The worker thread may now assume that the team is valid.
#ifdef KMP_DEBUG
        tid = __kmp_tid_from_gtid(gtid);
        team = __kmp_threads[gtid]->th.th_team;
#endif
        KMP_DEBUG_ASSERT(team != NULL);
        TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
        KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
        KMP_MB(); // Flush all pending memory write invalidates.
    }
    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}
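
/* In the tree gather below, thread tid treats threads
   (tid << branch_bits) + 1 .. (tid << branch_bits) + branch_factor (clipped to
   nproc) as its children and (tid - 1) >> branch_bits as its parent, so arrivals
   propagate up a branch_factor-ary tree rooted at tid 0. */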

// Tree barrier
static void
__kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                          void (*reduce)(void *, void *)
                          USE_ITT_BUILD_ARG(void *itt_sync_obj))
{
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
    register kmp_team_t *team = this_thr->th.th_team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_info_t **other_threads = team->t.t_threads;
    register kmp_uint32 nproc = this_thr->th.th_team_nproc;
    register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
    register kmp_uint32 branch_factor = 1 << branch_bits;
    register kmp_uint32 child;
    register kmp_uint32 child_tid;
    register kmp_uint64 new_state;

    KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Barrier imbalance - save arrive time to the thread
    if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
    }
#endif
    // Perform tree gather to wait until all threads have arrived; reduce any required data as we go
    child_tid = (tid << branch_bits) + 1;
    if (child_tid < nproc) {
        // Parent threads wait for all their children to arrive
        new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
        child = 1;
        do {
            register kmp_info_t *child_thr = other_threads[child_tid];
            register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
            // Prefetch next thread's arrived count
            if (child + 1 <= branch_factor && child_tid + 1 < nproc)
                KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
            KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                          "arrived(%p) == %llu\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
                          &child_bar->b_arrived, new_state));
            // Wait for child to arrive
            kmp_flag_64 flag(&child_bar->b_arrived, new_state);
            flag.wait(this_thr, FALSE
                      USE_ITT_BUILD_ARG(itt_sync_obj));
#if USE_ITT_BUILD && USE_ITT_NOTIFY
            // Barrier imbalance - write min of the thread time and a child time to the thread.
            if (__kmp_forkjoin_frames_mode == 2) {
                this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                                       child_thr->th.th_bar_min_time);
            }
#endif
            if (reduce) {
                KA_TRACE(100, ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                               gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                               team->t.t_id, child_tid));
                ANNOTATE_REDUCE_AFTER(reduce);
                (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
                ANNOTATE_REDUCE_BEFORE(reduce);
                ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
            }
            child++;
            child_tid++;
        }
        while (child <= branch_factor && child_tid < nproc);
    }

    if (!KMP_MASTER_TID(tid)) { // Worker threads
        register kmp_int32 parent_tid = (tid - 1) >> branch_bits;

        KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                      "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
                      __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
                      &thr_bar->b_arrived, thr_bar->b_arrived,
                      thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));

        // Mark arrival to parent thread
        /* After performing this write, a worker thread may not assume that the team is valid
           any more - it could be deallocated by the master thread at any time. */
        kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
        flag.release();
    } else {
        // Need to update the team arrived pointer if we are the master thread
        if (nproc > 1) // New value was already computed above
            team->t.t_bar[bt].b_arrived = new_state;
        else
            team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
        KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
                      gtid, team->t.t_id, tid, team->t.t_id,
                      &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
    }
    KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}
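
/* The matching release walks the same tree top-down: each parent writes the b_go
   flag of each of its children, and every non-master thread first waits on its
   own b_go before releasing its subtree. */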

static void
__kmp_tree_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                           int propagate_icvs
                           USE_ITT_BUILD_ARG(void *itt_sync_obj))
{
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
    register kmp_team_t *team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_uint32 nproc;
    register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
    register kmp_uint32 branch_factor = 1 << branch_bits;
    register kmp_uint32 child;
    register kmp_uint32 child_tid;

    // Perform a tree release for all of the threads that have been gathered
    if (!KMP_MASTER_TID(tid)) { // Handle fork barrier workers who aren't part of a team yet
        KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n",
                      gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
        // Wait for parent thread to release us
        kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
        flag.wait(this_thr, TRUE
                  USE_ITT_BUILD_ARG(itt_sync_obj));
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
            // In fork barrier where we could not get the object reliably (or ITTNOTIFY is disabled)
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
            // Cancel wait on previous parallel region...
            __kmp_itt_task_starting(itt_sync_obj);

            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
            if (itt_sync_obj != NULL)
                // Call prepare as early as possible for "new" barrier
                __kmp_itt_task_finished(itt_sync_obj);
        } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        // Early exit for reaping threads releasing forkjoin barrier
        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
            return;

        // The worker thread may now assume that the team is valid.
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        tid = __kmp_tid_from_gtid(gtid);

        TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
        KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
        KMP_MB(); // Flush all pending memory write invalidates.
    } else {
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
                      gtid, team->t.t_id, tid, bt));
    }
    nproc = this_thr->th.th_team_nproc;
    child_tid = (tid << branch_bits) + 1;

    if (child_tid < nproc) {
        register kmp_info_t **other_threads = team->t.t_threads;
        child = 1;
        // Parent threads release all their children
        do {
            register kmp_info_t *child_thr = other_threads[child_tid];
            register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
            // Prefetch next thread's go count
            if (child + 1 <= branch_factor && child_tid + 1 < nproc)
                KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */

#if KMP_BARRIER_ICV_PUSH
            {
                KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
                if (propagate_icvs) {
                    __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[child_tid],
                                             team, child_tid, FALSE);
                    copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
                              &team->t.t_implicit_task_taskdata[0].td_icvs);
                }
            }
#endif // KMP_BARRIER_ICV_PUSH
            KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u) "
                          "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                          child_tid, &child_bar->b_go, child_bar->b_go,
                          child_bar->b_go + KMP_BARRIER_STATE_BUMP));
            // Release child from barrier
            kmp_flag_64 flag(&child_bar->b_go, child_thr);
            flag.release();
            child++;
            child_tid++;
        }
        while (child <= branch_factor && child_tid < nproc);
    }
    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}


// Hyper Barrier
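/* Hypercube-embedded tree: on pass k, a thread whose level digit
   (tid >> level) & (branch_factor-1) is nonzero signals the subtree root obtained
   by clearing those low bits, then drops out; subtree roots keep collecting
   children spaced (1 << level) apart. The release phase runs the same pairing in
   the opposite direction. */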
static void
__kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                           void (*reduce)(void *, void *)
                           USE_ITT_BUILD_ARG(void *itt_sync_obj))
{
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
    register kmp_team_t *team = this_thr->th.th_team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_info_t **other_threads = team->t.t_threads;
    register kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
    register kmp_uint32 num_threads = this_thr->th.th_team_nproc;
    register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
    register kmp_uint32 branch_factor = 1 << branch_bits;
    register kmp_uint32 offset;
    register kmp_uint32 level;

    KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));

    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Barrier imbalance - save arrive time to the thread
    if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
    }
#endif
    /* Perform a hypercube-embedded tree gather to wait until all of the threads have
       arrived, and reduce any required data as we go. */
    kmp_flag_64 p_flag(&thr_bar->b_arrived);
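    /* One iteration per tree level: a thread that is not a subtree root at this
       level releases its parent via p_flag and exits the loop; a subtree root
       waits for each of its (up to branch_factor-1) children at this level
       before moving up. */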
    for (level = 0, offset = 1; offset < num_threads; level += branch_bits, offset <<= branch_bits)
    {
        register kmp_uint32 child;
        register kmp_uint32 child_tid;

        if (((tid >> level) & (branch_factor - 1)) != 0) {
            register kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);

            KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                          "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
                          &thr_bar->b_arrived, thr_bar->b_arrived,
                          thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
            // Mark arrival to parent thread
            /* After performing this write (in the last iteration of the enclosing for loop),
               a worker thread may not assume that the team is valid any more - it could be
               deallocated by the master thread at any time. */
            p_flag.set_waiter(other_threads[parent_tid]);
            p_flag.release();
            break;
        }

        // Parent threads wait for children to arrive
        if (new_state == KMP_BARRIER_UNUSED_STATE)
            new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
        for (child = 1, child_tid = tid + (1 << level);
             child < branch_factor && child_tid < num_threads;
             child++, child_tid += (1 << level))
        {
            register kmp_info_t *child_thr = other_threads[child_tid];
            register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
            register kmp_uint32 next_child_tid = child_tid + (1 << level);
            // Prefetch next thread's arrived count
            if (child + 1 < branch_factor && next_child_tid < num_threads)
                KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
            KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                          "arrived(%p) == %llu\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
                          &child_bar->b_arrived, new_state));
            // Wait for child to arrive
            kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
            c_flag.wait(this_thr, FALSE
                        USE_ITT_BUILD_ARG(itt_sync_obj));
#if USE_ITT_BUILD && USE_ITT_NOTIFY
            // Barrier imbalance - write min of the thread time and a child time to the thread.
            if (__kmp_forkjoin_frames_mode == 2) {
                this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                                       child_thr->th.th_bar_min_time);
            }
#endif
            if (reduce) {
                KA_TRACE(100, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                               gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                               team->t.t_id, child_tid));
                ANNOTATE_REDUCE_AFTER(reduce);
                (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
                ANNOTATE_REDUCE_BEFORE(reduce);
                ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
            }
        }
    }

    if (KMP_MASTER_TID(tid)) {
        // Need to update the team arrived pointer if we are the master thread
        if (new_state == KMP_BARRIER_UNUSED_STATE)
            team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
        else
            team->t.t_bar[bt].b_arrived = new_state;
        KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
                      gtid, team->t.t_id, tid, team->t.t_id,
                      &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
    }
    KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

// The reverse versions seem to beat the forward versions overall
#define KMP_REVERSE_HYPER_BAR
static void
__kmp_hyper_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                            int propagate_icvs
                            USE_ITT_BUILD_ARG(void *itt_sync_obj))
{
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
    register kmp_team_t *team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_info_t **other_threads;
    register kmp_uint32 num_threads;
    register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
    register kmp_uint32 branch_factor = 1 << branch_bits;
    register kmp_uint32 child;
    register kmp_uint32 child_tid;
    register kmp_uint32 offset;
    register kmp_uint32 level;

    /* Perform a hypercube-embedded tree release for all of the threads that have been gathered.
       If KMP_REVERSE_HYPER_BAR is defined (default) the threads are released in the reverse
       order of the corresponding gather, otherwise threads are released in the same order. */
    if (KMP_MASTER_TID(tid)) { // master
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
                      gtid, team->t.t_id, tid, bt));
#if KMP_BARRIER_ICV_PUSH
        if (propagate_icvs) { // master already has ICVs in final destination; copy
            copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
        }
#endif
    }
    else { // Handle fork barrier workers who aren't part of a team yet
        KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n",
                      gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
        // Wait for parent thread to release us
        kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
        flag.wait(this_thr, TRUE
                  USE_ITT_BUILD_ARG(itt_sync_obj));
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
            // In fork barrier where we could not get the object reliably
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
            // Cancel wait on previous parallel region...
            __kmp_itt_task_starting(itt_sync_obj);

            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
            if (itt_sync_obj != NULL)
                // Call prepare as early as possible for "new" barrier
                __kmp_itt_task_finished(itt_sync_obj);
        } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        // Early exit for reaping threads releasing forkjoin barrier
        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
            return;

        // The worker thread may now assume that the team is valid.
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        tid = __kmp_tid_from_gtid(gtid);

        TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
        KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
        KMP_MB(); // Flush all pending memory write invalidates.
    }
    num_threads = this_thr->th.th_team_nproc;
    other_threads = team->t.t_threads;
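
    /* In the reverse variant, first climb to the highest level at which this
       thread is a subtree root, then walk back down, releasing children from
       the highest level (largest stride) to the lowest and, within a level,
       from the highest child_tid down to the lowest. */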

#ifdef KMP_REVERSE_HYPER_BAR
    // Count up to correct level for parent
    for (level = 0, offset = 1;
         offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
         level += branch_bits, offset <<= branch_bits);

    // Now go down from there
    for (level -= branch_bits, offset >>= branch_bits; offset != 0;
         level -= branch_bits, offset >>= branch_bits)
#else
    // Go down the tree, level by level
    for (level = 0, offset = 1; offset < num_threads; level += branch_bits, offset <<= branch_bits)
#endif // KMP_REVERSE_HYPER_BAR
    {
#ifdef KMP_REVERSE_HYPER_BAR
        /* Now go in reverse order through the children, highest to lowest.
           Initial setting of child is conservative here. */
        child = num_threads >> ((level == 0) ? level : level - 1);
        for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
                 child_tid = tid + (child << level);
             child >= 1; child--, child_tid -= (1 << level))
#else
        if (((tid >> level) & (branch_factor - 1)) != 0)
            // No need to go lower than this, since this is the level parent would be notified
            break;
        // Iterate through children on this level of the tree
        for (child = 1, child_tid = tid + (1 << level);
             child < branch_factor && child_tid < num_threads;
             child++, child_tid += (1 << level))
#endif // KMP_REVERSE_HYPER_BAR
        {
            if (child_tid >= num_threads) continue; // Child doesn't exist so keep going
            else {
                register kmp_info_t *child_thr = other_threads[child_tid];
                register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
                register kmp_uint32 next_child_tid = child_tid - (1 << level);
                // Prefetch next thread's go count
# ifdef KMP_REVERSE_HYPER_BAR
                if (child - 1 >= 1 && next_child_tid < num_threads)
# else
                if (child + 1 < branch_factor && next_child_tid < num_threads)
# endif // KMP_REVERSE_HYPER_BAR
                    KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */

#if KMP_BARRIER_ICV_PUSH
                if (propagate_icvs) // push my fixed ICVs to my child
                    copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
#endif // KMP_BARRIER_ICV_PUSH

                KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u) "
                              "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
                              __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                              child_tid, &child_bar->b_go, child_bar->b_go,
                              child_bar->b_go + KMP_BARRIER_STATE_BUMP));
                // Release child from barrier
                kmp_flag_64 flag(&child_bar->b_go, child_thr);
                flag.release();
            }
        }
    }
#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs && !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
        __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
    }
#endif
    KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

// Hierarchical Barrier

// Initialize thread barrier data
/* Initializes/re-initializes the hierarchical barrier data stored on a thread. Performs the
   minimum amount of initialization required based on how the team has changed. Returns true if
   leaf children will require both on-core and traditional wake-up mechanisms. For example, if the
   team size increases, threads already in the team will respond to on-core wakeup on their parent
   thread, but threads newly added to the team will only be listening on their local b_go. */
static bool
__kmp_init_hierarchical_barrier_thread(enum barrier_type bt, kmp_bstate_t *thr_bar,
                                       kmp_uint32 nproc, int gtid, int tid, kmp_team_t *team)
{
    // Checks to determine if (re-)initialization is needed
    bool uninitialized = thr_bar->team == NULL;
    bool team_changed = team != thr_bar->team;
    bool team_sz_changed = nproc != thr_bar->nproc;
    bool tid_changed = tid != thr_bar->old_tid;
    bool retval = false;

    if (uninitialized || team_sz_changed) {
        __kmp_get_hierarchy(nproc, thr_bar);
    }

    if (uninitialized || team_sz_changed || tid_changed) {
        thr_bar->my_level = thr_bar->depth - 1; // default for master
        thr_bar->parent_tid = -1;               // default for master
        if (!KMP_MASTER_TID(tid)) { // if not master, find parent thread in hierarchy
            kmp_uint32 d = 0;
            while (d < thr_bar->depth) { // find parent based on level of thread in hierarchy, and note level
                kmp_uint32 rem;
                if (d == thr_bar->depth - 2) { // reached level right below the master
                    thr_bar->parent_tid = 0;
                    thr_bar->my_level = d;
                    break;
                }
                else if ((rem = tid % thr_bar->skip_per_level[d+1]) != 0) { // TODO: can we make this op faster?
                    // thread is not a subtree root at next level, so this is max
                    thr_bar->parent_tid = tid - rem;
                    thr_bar->my_level = d;
                    break;
                }
                ++d;
            }
        }
        thr_bar->offset = 7 - (tid - thr_bar->parent_tid - 1);
        thr_bar->old_tid = tid;
        thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
        thr_bar->team = team;
        thr_bar->parent_bar = &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
    }
    if (uninitialized || team_changed || tid_changed) {
        thr_bar->team = team;
        thr_bar->parent_bar = &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
        retval = true;
    }
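    /* leaf_state reserves one byte of the parent's 64-bit b_arrived word per leaf
       child (at most 8), matching the offset = 7-(tid-parent_tid-1) computed
       above; each on-core leaf checks in by setting its byte, so the parent can
       wait for all of its leaves with a single 64-bit flag. */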
    if (uninitialized || team_sz_changed || tid_changed) {
        thr_bar->nproc = nproc;
        thr_bar->leaf_kids = thr_bar->base_leaf_kids;
        if (thr_bar->my_level == 0) thr_bar->leaf_kids = 0;
        if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
            thr_bar->leaf_kids = nproc - tid - 1;
        thr_bar->leaf_state = 0;
        for (int i = 0; i < thr_bar->leaf_kids; ++i)
            ((char *)&(thr_bar->leaf_state))[7-i] = 1;
    }
    return retval;
}

static void
__kmp_hierarchical_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr,
                                  int gtid, int tid, void (*reduce)(void *, void *)
                                  USE_ITT_BUILD_ARG(void *itt_sync_obj))
{
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
    register kmp_team_t *team = this_thr->th.th_team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_uint32 nproc = this_thr->th.th_team_nproc;
    register kmp_info_t **other_threads = team->t.t_threads;
    register kmp_uint64 new_state;

    int level = team->t.t_level;
#if OMP_40_ENABLED
    if (other_threads[0]->th.th_teams_microtask) // are we inside the teams construct?
        if (this_thr->th.th_teams_size.nteams > 1)
            ++level; // level was not increased in teams construct for team_of_masters
#endif
    if (level == 1) thr_bar->use_oncore_barrier = 1;
    else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested

    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Barrier imbalance - save arrive time to the thread
    if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
    }
#endif

    (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);

    if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
        register kmp_int32 child_tid;
        new_state = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
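        /* With infinite blocktime and an unnested team, leaf children check in by
           setting their reserved byte of this thread's own b_arrived word, so a
           single wait on b_arrived | leaf_state covers all of them; higher-level
           children are still gathered through their individual b_arrived flags. */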
        if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
            if (thr_bar->leaf_kids) { // First, wait for leaf children to check-in on my b_arrived flag
                kmp_uint64 leaf_state = KMP_MASTER_TID(tid)
                    ? thr_bar->b_arrived | thr_bar->leaf_state
                    : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
                KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting for leaf kids\n",
                              gtid, team->t.t_id, tid));
                kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
                flag.wait(this_thr, FALSE
                          USE_ITT_BUILD_ARG(itt_sync_obj));
                if (reduce) {
                    ANNOTATE_REDUCE_AFTER(reduce);
                    for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids; ++child_tid) {
                        KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
                                       gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                       team->t.t_id, child_tid));
                        (*reduce)(this_thr->th.th_local.reduce_data,
                                  other_threads[child_tid]->th.th_local.reduce_data);
                    }
                    ANNOTATE_REDUCE_BEFORE(reduce);
                    ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
                }
                // clear leaf_state bits
                (void)KMP_TEST_THEN_AND64((volatile kmp_int64 *)&thr_bar->b_arrived,
                                          ~(thr_bar->leaf_state));
            }
            // Next, wait for higher level children on each child's b_arrived flag
            for (kmp_uint32 d = 1; d < thr_bar->my_level; ++d) { // gather lowest level threads first, but skip 0
                kmp_uint32 last = tid + thr_bar->skip_per_level[d+1],
                           skip = thr_bar->skip_per_level[d];
                if (last > nproc) last = nproc;
                for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
                    register kmp_info_t *child_thr = other_threads[child_tid];
                    register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
                    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                                  "arrived(%p) == %llu\n",
                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
                    kmp_flag_64 flag(&child_bar->b_arrived, new_state);
                    flag.wait(this_thr, FALSE
                              USE_ITT_BUILD_ARG(itt_sync_obj));
                    if (reduce) {
                        KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
                                       gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                       team->t.t_id, child_tid));
                        ANNOTATE_REDUCE_AFTER(reduce);
                        (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
                        ANNOTATE_REDUCE_BEFORE(reduce);
                        ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
                    }
                }
            }
        }
        else { // Blocktime is not infinite
            for (kmp_uint32 d = 0; d < thr_bar->my_level; ++d) { // Gather lowest level threads first
                kmp_uint32 last = tid + thr_bar->skip_per_level[d+1],
                           skip = thr_bar->skip_per_level[d];
                if (last > nproc) last = nproc;
                for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
                    register kmp_info_t *child_thr = other_threads[child_tid];
                    register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
                    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                                  "arrived(%p) == %llu\n",
                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
                    kmp_flag_64 flag(&child_bar->b_arrived, new_state);
                    flag.wait(this_thr, FALSE
                              USE_ITT_BUILD_ARG(itt_sync_obj));
                    if (reduce) {
                        KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
                                       gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                       team->t.t_id, child_tid));
                        ANNOTATE_REDUCE_AFTER(reduce);
                        (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
                        ANNOTATE_REDUCE_BEFORE(reduce);
                        ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
                    }
                }
            }
        }
    }
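    /* A worker either releases its parent through its own b_arrived flag (when
       the parent waits on that flag) or, as an on-core leaf, by writing its
       reserved byte of the parent's b_arrived word via kmp_flag_oncore. */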
    // All subordinates are gathered; now release parent if not master thread

    if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
        KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                      "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
                      __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id, thr_bar->parent_tid,
                      &thr_bar->b_arrived, thr_bar->b_arrived,
                      thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
        /* Mark arrival to parent: After performing this write, a worker thread may not assume that
           the team is valid any more - it could be deallocated by the master thread at any time. */
        if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
            || !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived flag; release it
            kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
            flag.release();
        }
        else { // Leaf does special release on the "offset" bits of parent's b_arrived flag
            thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
            kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
            flag.set_waiter(other_threads[thr_bar->parent_tid]);
            flag.release();
        }
    } else { // Master thread needs to update the team's b_arrived value
        team->t.t_bar[bt].b_arrived = new_state;
        KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
                      gtid, team->t.t_id, tid, team->t.t_id,
                      &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
    }
    // Is the team access below unsafe or just technically invalid?
    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

static void
__kmp_hierarchical_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                                   int propagate_icvs
                                   USE_ITT_BUILD_ARG(void *itt_sync_obj))
{
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
    register kmp_team_t *team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_uint32 nproc;
    bool team_change = false; // indicates on-core barrier shouldn't be used

    if (KMP_MASTER_TID(tid)) {
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master entered barrier type %d\n",
                      gtid, team->t.t_id, tid, bt));
    }
    else { // Worker threads
        // Wait for parent thread to release me
        if (!thr_bar->use_oncore_barrier || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
            || thr_bar->my_level != 0 || thr_bar->team == NULL) {
            // Use traditional method of waiting on my own b_go flag
            thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
            kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
            flag.wait(this_thr, TRUE
                      USE_ITT_BUILD_ARG(itt_sync_obj));
            TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
        }
        else { // Thread barrier data is initialized, this is a leaf, blocktime is infinite, not nested
            // Wait on my "offset" bits on parent's b_go flag
            thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
            kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP, thr_bar->offset,
                                 bt, this_thr
                                 USE_ITT_BUILD_ARG(itt_sync_obj));
            flag.wait(this_thr, TRUE);
            if (thr_bar->wait_flag == KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
                TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
            }
            else { // Reset my bits on parent's b_go flag
                ((char*)&(thr_bar->parent_bar->b_go))[thr_bar->offset] = 0;
            }
        }
        thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
        // Early exit for reaping threads releasing forkjoin barrier
        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
            return;
        // The worker thread may now assume that the team is valid.
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        tid = __kmp_tid_from_gtid(gtid);

        KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
        KMP_MB(); // Flush all pending memory write invalidates.
    }

    nproc = this_thr->th.th_team_nproc;
    int level = team->t.t_level;
#if OMP_40_ENABLED
    if (team->t.t_threads[0]->th.th_teams_microtask) { // are we inside the teams construct?
        if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && this_thr->th.th_teams_level == level)
            ++level; // level was not increased in teams construct for team_of_workers
        if (this_thr->th.th_teams_size.nteams > 1)
            ++level; // level was not increased in teams construct for team_of_masters
    }
#endif
    if (level == 1) thr_bar->use_oncore_barrier = 1;
    else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested

    // If the team size has increased, we still communicate with old leaves via oncore barrier.
    unsigned short int old_leaf_kids = thr_bar->leaf_kids;
    kmp_uint64 old_leaf_state = thr_bar->leaf_state;
    team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);
    // But if the entire team changes, we won't use oncore barrier at all
    if (team_change)
        old_leaf_kids = 0;
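
    /* ICV propagation takes three paths below: the master copies its ICVs into
       its own th_fixed_icvs; with infinite blocktime plus the on-core barrier,
       leaves pull ICVs straight from the parent's th_fixed_icvs while non-leaves
       receive them piggybacked on the b_go cache line (ngo_store_go); otherwise
       each thread pulls from its parent's th_fixed_icvs as it is released. */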

#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs) {
        __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
        if (KMP_MASTER_TID(tid)) { // master already has copy in final destination; copy
            copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
        }
        else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME
                 && thr_bar->use_oncore_barrier) { // optimization for inf blocktime
            if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
                // leaves (on-core children) pull parent's fixed ICVs directly to local ICV store
                copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                          &thr_bar->parent_bar->th_fixed_icvs);
            // non-leaves will get ICVs piggybacked with b_go via NGO store
        }
        else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
            if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can access
                copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
            else // leaves copy parent's fixed ICVs directly to local ICV store
                copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                          &thr_bar->parent_bar->th_fixed_icvs);
        }
    }
#endif // KMP_BARRIER_ICV_PUSH

    // Now, release my children
    if (thr_bar->my_level) { // not a leaf
        register kmp_int32 child_tid;
        kmp_uint32 last;
        if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
            if (KMP_MASTER_TID(tid)) { // do a flat release
                // Set local b_go to bump children via NGO store of the cache line containing ICVs and b_go.
                thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
                // Use ngo stores if available; b_go piggybacks in the last 8 bytes of the cache line
                ngo_load(&thr_bar->th_fixed_icvs);
                // This loops over all the threads skipping only the leaf nodes in the hierarchy
                for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
                     child_tid += thr_bar->skip_per_level[1]) {
                    register kmp_bstate_t *child_bar = &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
                    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
                                  " go(%p): %u => %u\n",
                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                                  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
                    // Use ngo store (if available) to both store ICVs and release child via child's b_go
                    ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
                }
                ngo_sync();
            }
            TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
            // Now, release leaf children
            if (thr_bar->leaf_kids) { // if there are any
                // We test team_change on the off-chance that the level 1 team changed.
                if (team_change || old_leaf_kids < thr_bar->leaf_kids) { // some old leaf_kids, some new
                    if (old_leaf_kids) { // release old leaf kids
                        thr_bar->b_go |= old_leaf_state;
                    }
                    // Release new leaf kids
                    last = tid + thr_bar->skip_per_level[1];
                    if (last > nproc) last = nproc;
                    for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last; ++child_tid) { // skip_per_level[0]=1
                        register kmp_info_t *child_thr = team->t.t_threads[child_tid];
                        register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
                        KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
                                      " T#%d(%d:%d) go(%p): %u => %u\n",
                                      gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                      team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                                      child_bar->b_go + KMP_BARRIER_STATE_BUMP));
                        // Release child using child's b_go flag
                        kmp_flag_64 flag(&child_bar->b_go, child_thr);
                        flag.release();
                    }
                }
                else { // Release all children at once with leaf_state bits on my own b_go flag
                    thr_bar->b_go |= thr_bar->leaf_state;
                }
            }
        }
        else { // Blocktime is not infinite; do a simple hierarchical release
            for (int d = thr_bar->my_level - 1; d >= 0; --d) { // Release highest level threads first
                last = tid + thr_bar->skip_per_level[d+1];
                kmp_uint32 skip = thr_bar->skip_per_level[d];
                if (last > nproc) last = nproc;
                for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
                    register kmp_info_t *child_thr = team->t.t_threads[child_tid];
                    register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
                    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
                                  " go(%p): %u => %u\n",
                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                                  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
                    // Release child using child's b_go flag
                    kmp_flag_64 flag(&child_bar->b_go, child_thr);
                    flag.release();
                }
            }
        }
#if KMP_BARRIER_ICV_PUSH
        if (propagate_icvs && !KMP_MASTER_TID(tid)) // non-leaves copy ICVs from fixed ICVs to local dest
            copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
#endif // KMP_BARRIER_ICV_PUSH
    }
    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

// ---------------------------- End of Barrier Algorithms ----------------------------

// Internal function to do a barrier.
/* If is_split is true, do a split barrier, otherwise, do a plain barrier
   If reduce is non-NULL, do a split reduction barrier, otherwise, do a split barrier
   Returns 0 if master thread, 1 if worker thread. */
int
__kmp_barrier(enum barrier_type bt, int gtid, int is_split, size_t reduce_size,
              void *reduce_data, void (*reduce)(void *, void *))
{
    KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
    KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
    register int tid = __kmp_tid_from_gtid(gtid);
    register kmp_info_t *this_thr = __kmp_threads[gtid];
    register kmp_team_t *team = this_thr->th.th_team;
    register int status = 0;
    ident_t *loc = __kmp_threads[gtid]->th.th_ident;
#if OMPT_SUPPORT
    ompt_task_id_t my_task_id;
    ompt_parallel_id_t my_parallel_id;
#endif

    KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n",
                  gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));

    ANNOTATE_NEW_BARRIER_BEGIN(&team->t.t_bar);
#if OMPT_SUPPORT
    if (ompt_enabled) {
#if OMPT_BLAME
        my_task_id = team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id;
        my_parallel_id = team->t.ompt_team_info.parallel_id;

#if OMPT_TRACE
        if (this_thr->th.ompt_thread_info.state == ompt_state_wait_single) {
            if (ompt_callbacks.ompt_callback(ompt_event_single_others_end)) {
                ompt_callbacks.ompt_callback(ompt_event_single_others_end)(
                    my_parallel_id, my_task_id);
            }
        }
#endif
        if (ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
            ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
                my_parallel_id, my_task_id);
        }
#endif
        // It is OK to report the barrier state after the barrier begin callback.
        // According to the OMPT specification, a compliant implementation may
        // even delay reporting this state until the barrier begins to wait.
        this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
    }
#endif

    if (!team->t.t_serialized) {
#if USE_ITT_BUILD
        // This value will be used in itt notify events below.
        void *itt_sync_obj = NULL;
# if USE_ITT_NOTIFY
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
# endif
#endif /* USE_ITT_BUILD */
        if (__kmp_tasking_mode == tskm_extra_barrier) {
            __kmp_tasking_barrier(team, this_thr, gtid);
            KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n",
                          gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
        }

        /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when
           the team struct is not guaranteed to exist. */
        // See note about the corresponding code in __kmp_join_barrier() being performance-critical.
        if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
#if KMP_USE_MONITOR
            this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
#endif
            this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
        }

#if USE_ITT_BUILD
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
            __kmp_itt_barrier_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
#if USE_DEBUGGER
        // Let the debugger know: the thread has arrived at the barrier and is waiting.
        if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
            team->t.t_bar[bt].b_master_arrived += 1;
        } else {
            this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
        } // if
#endif /* USE_DEBUGGER */
        if (reduce != NULL) {
            //KMP_DEBUG_ASSERT( is_split == TRUE );  // #C69956
            this_thr->th.th_local.reduce_data = reduce_data;
        }

        if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
            __kmp_task_team_setup(this_thr, team, 0); // use 0 to only setup the current team if nthreads > 1
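
        /* The gather/release pattern for each barrier type comes from the
           __kmp_barrier_{gather,release}_pattern and matching branch-bits tables,
           which are set up at library startup (in this runtime they are
           configurable through environment settings such as
           KMP_PLAIN_BARRIER_PATTERN). */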

        switch (__kmp_barrier_gather_pattern[bt]) {
        case bp_hyper_bar: {
            KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
            __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid, reduce
                                       USE_ITT_BUILD_ARG(itt_sync_obj));
            break;
        }
        case bp_hierarchical_bar: {
            __kmp_hierarchical_barrier_gather(bt, this_thr, gtid, tid, reduce
                                              USE_ITT_BUILD_ARG(itt_sync_obj));
            break;
        }
        case bp_tree_bar: {
            KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
            __kmp_tree_barrier_gather(bt, this_thr, gtid, tid, reduce
                                      USE_ITT_BUILD_ARG(itt_sync_obj));
            break;
        }
        default: {
            __kmp_linear_barrier_gather(bt, this_thr, gtid, tid, reduce
                                        USE_ITT_BUILD_ARG(itt_sync_obj));
        }
        }

        KMP_MB();

        if (KMP_MASTER_TID(tid)) {
            status = 0;
            if (__kmp_tasking_mode != tskm_immediate_exec) {
                __kmp_task_team_wait(this_thr, team
                                     USE_ITT_BUILD_ARG(itt_sync_obj));
            }
#if USE_DEBUGGER
            // Let the debugger know: all threads have arrived and are starting to leave the barrier.
            team->t.t_bar[bt].b_team_arrived += 1;
#endif
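
            /* Frame reporting below: mode 1 submits the full fork-to-join frame,
               mode 2 submits a barrier-imbalance frame based on the minimum
               arrive time collected during the gather, and mode 3 additionally
               sums each thread's wait time into an imbalance metadata record. */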

#if USE_ITT_BUILD
            /* TODO: In case of split reduction barrier, master thread may send acquired event early,
               before the final summation into the shared variable is done (final summation can be a
               long operation for array reductions). */
            if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
                __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
#if USE_ITT_BUILD && USE_ITT_NOTIFY
            // Barrier - report frame end (only if active_level == 1)
            if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode &&
#if OMP_40_ENABLED
                this_thr->th.th_teams_microtask == NULL &&
#endif
                team->t.t_active_level == 1)
            {
                kmp_uint64 cur_time = __itt_get_timestamp();
                kmp_info_t **other_threads = team->t.t_threads;
                int nproc = this_thr->th.th_team_nproc;
                int i;
                switch (__kmp_forkjoin_frames_mode) {
                case 1:
                    __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
                    this_thr->th.th_frame_time = cur_time;
                    break;
                case 2: // AC 2015-01-19: currently does not work for hierarchical (to be fixed)
                    __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
                    break;
                case 3:
                    if (__itt_metadata_add_ptr) {
                        // Initialize with master's wait time
                        kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
                        // Set arrive time to zero to be able to check it in __kmp_invoke_task();
                        // the same is done inside the loop below
                        this_thr->th.th_bar_arrive_time = 0;
                        for (i = 1; i < nproc; ++i) {
                            delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
                            other_threads[i]->th.th_bar_arrive_time = 0;
                        }
                        __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time,
                                                     delta, (kmp_uint64)(reduce != NULL));
                    }
                    __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
                    this_thr->th.th_frame_time = cur_time;
                    break;
                }
            }
#endif /* USE_ITT_BUILD */
        } else {
            status = 1;
#if USE_ITT_BUILD
            if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
                __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
        }
        if (status == 1 || !is_split) {
            switch (__kmp_barrier_release_pattern[bt]) {
            case bp_hyper_bar: {
                KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
                __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
                                            USE_ITT_BUILD_ARG(itt_sync_obj));
                break;
            }
            case bp_hierarchical_bar: {
                __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
                                                   USE_ITT_BUILD_ARG(itt_sync_obj));
                break;
            }
            case bp_tree_bar: {
                KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
                __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
                                           USE_ITT_BUILD_ARG(itt_sync_obj));
                break;
            }
            default: {
                __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
                                             USE_ITT_BUILD_ARG(itt_sync_obj));
            }
            }
            if (__kmp_tasking_mode != tskm_immediate_exec) {
                __kmp_task_team_sync(this_thr, team);
            }
        }

#if USE_ITT_BUILD
        /* GEH: TODO: Move this under if-condition above and also include in
           __kmp_end_split_barrier(). This will more accurately represent the actual release time
           of the threads for split barriers. */
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
            __kmp_itt_barrier_finished(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
    } else { // Team is serialized.
        status = 0;
        if (__kmp_tasking_mode != tskm_immediate_exec) {
#if OMP_45_ENABLED
            if (this_thr->th.th_task_team != NULL) {
                void *itt_sync_obj = NULL;
#if USE_ITT_NOTIFY
                if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
                    itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
                    __kmp_itt_barrier_starting(gtid, itt_sync_obj);
                }
#endif

                KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks == TRUE);
                __kmp_task_team_wait(this_thr, team
                                     USE_ITT_BUILD_ARG(itt_sync_obj));
                __kmp_task_team_setup(this_thr, team, 0);

#if USE_ITT_BUILD
                if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
                    __kmp_itt_barrier_finished(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
            }
#else
            // The task team should be NULL for serialized code (tasks will be executed immediately)
            KMP_DEBUG_ASSERT(team->t.t_task_team[this_thr->th.th_task_state] == NULL);
            KMP_DEBUG_ASSERT(this_thr->th.th_task_team == NULL);
#endif
        }
    }
    KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
                  gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid), status));

#if OMPT_SUPPORT
    if (ompt_enabled) {
#if OMPT_BLAME
        if (ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
            ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
                my_parallel_id, my_task_id);
        }
#endif
        this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
    }
#endif
    ANNOTATE_NEW_BARRIER_END(&team->t.t_bar);

    return status;
}
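

/* For a split barrier, __kmp_barrier() returns to the master after the gather
   without running the release phase (status == 0 with is_split true); this
   function performs that deferred release once the master finishes the
   post-gather work (e.g. the final reduction). */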

void
__kmp_end_split_barrier(enum barrier_type bt, int gtid)
{
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
    KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
    int tid = __kmp_tid_from_gtid(gtid);
    kmp_info_t *this_thr = __kmp_threads[gtid];
    kmp_team_t *team = this_thr->th.th_team;

    ANNOTATE_NEW_BARRIER_BEGIN(&team->t.t_bar);
    if (!team->t.t_serialized) {
        if (KMP_MASTER_GTID(gtid)) {
            switch (__kmp_barrier_release_pattern[bt]) {
            case bp_hyper_bar: {
                KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
                __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
                                            USE_ITT_BUILD_ARG(NULL) );
                break;
            }
            case bp_hierarchical_bar: {
                __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
                                                   USE_ITT_BUILD_ARG(NULL) );
                break;
            }
            case bp_tree_bar: {
                KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
                __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
                                           USE_ITT_BUILD_ARG(NULL) );
                break;
            }
            default: {
                __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
                                             USE_ITT_BUILD_ARG(NULL) );
            }
            }
            if (__kmp_tasking_mode != tskm_immediate_exec) {
                __kmp_task_team_sync(this_thr, team);
            } // if
        }
    }
    ANNOTATE_NEW_BARRIER_END(&team->t.t_bar);
}


void
__kmp_join_barrier(int gtid)
{
    KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
    KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
    register kmp_info_t *this_thr = __kmp_threads[gtid];
    register kmp_team_t *team;
    register kmp_uint nproc;
    kmp_info_t *master_thread;
    int tid;
#ifdef KMP_DEBUG
    int team_id;
#endif /* KMP_DEBUG */
#if USE_ITT_BUILD
    void *itt_sync_obj = NULL;
# if USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
        // Get the object created at fork_barrier
        itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
# endif
#endif /* USE_ITT_BUILD */
    KMP_MB();

    // Get current info
    team = this_thr->th.th_team;
    nproc = this_thr->th.th_team_nproc;
    KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
    tid = __kmp_tid_from_gtid(gtid);
#ifdef KMP_DEBUG
    team_id = team->t.t_id;
#endif /* KMP_DEBUG */
    master_thread = this_thr->th.th_team_master;
#ifdef KMP_DEBUG
    if (master_thread != team->t.t_threads[0]) {
        __kmp_print_structure();
    }
#endif /* KMP_DEBUG */
    KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
    KMP_MB();

    // Verify state
    KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
    KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
    KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
    KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
    KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n", gtid, team_id, tid));

    ANNOTATE_NEW_BARRIER_BEGIN(&team->t.t_bar);
#if OMPT_SUPPORT
#if OMPT_TRACE
    if (ompt_enabled &&
        ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
        ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
            team->t.ompt_team_info.parallel_id,
            team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
    }
#endif
    this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
#endif

    if (__kmp_tasking_mode == tskm_extra_barrier) {
        __kmp_tasking_barrier(team, this_thr, gtid);
        KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n", gtid, team_id, tid));
    }
# ifdef KMP_DEBUG
    if (__kmp_tasking_mode != tskm_immediate_exec) {
        KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = %p, th_task_team = %p\n",
                      __kmp_gtid_from_thread(this_thr), team_id,
                      team->t.t_task_team[this_thr->th.th_task_state], this_thr->th.th_task_team));
        KMP_DEBUG_ASSERT(this_thr->th.th_task_team == team->t.t_task_team[this_thr->th.th_task_state]);
    }
# endif /* KMP_DEBUG */

    /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when
       the team struct is not guaranteed to exist. Doing these loads causes a cache miss and
       slows down EPCC parallel by 2x. As a workaround, we do not perform the copy if
       blocktime=infinite, since the values are not used by __kmp_wait_template() in that
       case. */
    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
#if KMP_USE_MONITOR
        this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
#endif
        this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
    }

#if USE_ITT_BUILD
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
        __kmp_itt_barrier_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
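
    /* The gather and release algorithms are selected per barrier type at library startup
       (in this runtime, the KMP_*_BARRIER_PATTERN environment variables select them) and
       recorded in __kmp_barrier_gather_pattern[] / __kmp_barrier_release_pattern[]. The
       hyper and tree algorithms also need a nonzero branching factor from the matching
       branch-bits table, hence the assertions below; any unrecognized pattern falls
       through to the linear barrier in the default case. */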
    switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
    case bp_hyper_bar: {
        KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
        __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
                                   USE_ITT_BUILD_ARG(itt_sync_obj) );
        break;
    }
    case bp_hierarchical_bar: {
        __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
                                          USE_ITT_BUILD_ARG(itt_sync_obj) );
        break;
    }
    case bp_tree_bar: {
        KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
        __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
                                  USE_ITT_BUILD_ARG(itt_sync_obj) );
        break;
    }
    default: {
        __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
                                    USE_ITT_BUILD_ARG(itt_sync_obj) );
    }
    }

    /* From this point on, the team data structure may be deallocated at any time by the
       master thread - it is unsafe to reference it in any of the worker threads. Any
       per-team data items that need to be referenced before the end of the barrier should
       be moved to the kmp_task_team_t structs. */
    if (KMP_MASTER_TID(tid)) {
        if (__kmp_tasking_mode != tskm_immediate_exec) {
            __kmp_task_team_wait(this_thr, team
                                 USE_ITT_BUILD_ARG(itt_sync_obj) );
        }
#if KMP_STATS_ENABLED
        // Have the master thread flag the workers to indicate they are now waiting for the
        // next parallel region; also wake them up so they switch their timers to idle.
        for (int i = 0; i < team->t.t_nproc; ++i) {
            kmp_info_t *team_thread = team->t.t_threads[i];
            if (team_thread == this_thr)
                continue;
            team_thread->th.th_stats->setIdleFlag();
            if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && team_thread->th.th_sleep_loc != NULL)
                __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread),
                                          team_thread->th.th_sleep_loc);
        }
#endif
#if USE_ITT_BUILD
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
            __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
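
        /* A worked example of the mode-3 imbalance metric computed below: delta sums,
           over the whole team, the time each thread spent waiting at this barrier. If
           four threads arrive at t = 10, 12, 14 and 16 and the join completes at
           cur_time = 20, the reported delta is (20-10) + (20-12) + (20-14) + (20-16)
           = 28 time units. The arrive timestamps are then zeroed so that they can be
           checked in __kmp_invoke_task(), as the inline comments note. */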
# if USE_ITT_BUILD && USE_ITT_NOTIFY
        // Join barrier - report frame end
        if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode &&
#if OMP_40_ENABLED
            this_thr->th.th_teams_microtask == NULL &&
#endif
            team->t.t_active_level == 1)
        {
            kmp_uint64 cur_time = __itt_get_timestamp();
            ident_t *loc = team->t.t_ident;
            kmp_info_t **other_threads = team->t.t_threads;
            int nproc = this_thr->th.th_team_nproc;
            int i;
            switch (__kmp_forkjoin_frames_mode) {
            case 1:
                __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
                break;
            case 2:
                __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
                break;
            case 3:
                if (__itt_metadata_add_ptr) {
                    // Initialize with the master's wait time
                    kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
                    // Set arrive time to zero to be able to check it in __kmp_invoke_task();
                    // the same is done inside the loop below
                    this_thr->th.th_bar_arrive_time = 0;
                    for (i = 1; i < nproc; ++i) {
                        delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
                        other_threads[i]->th.th_bar_arrive_time = 0;
                    }
                    __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, 0);
                }
                __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
                this_thr->th.th_frame_time = cur_time;
                break;
            }
        }
# endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
    }
#if USE_ITT_BUILD
    else {
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
            __kmp_itt_barrier_middle(gtid, itt_sync_obj);
    }
#endif /* USE_ITT_BUILD */

#if KMP_DEBUG
    if (KMP_MASTER_TID(tid)) {
        KA_TRACE(15, ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
                      gtid, team_id, tid, nproc));
    }
#endif /* KMP_DEBUG */

    // TODO now, mark worker threads as done so they may be disbanded
    KMP_MB(); // Flush all pending memory write invalidates.
    KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));

#if OMPT_SUPPORT
    if (ompt_enabled) {
#if OMPT_BLAME
        if (ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
            ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
                team->t.ompt_team_info.parallel_id,
                team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
        }
#endif

        // Return to the default state
        this_thr->th.ompt_thread_info.state = ompt_state_overhead;
    }
#endif
    ANNOTATE_NEW_BARRIER_END(&team->t.t_bar);
}
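
/* The fork barrier is the release half of the fork/join pair: workers park here between
   parallel regions and are woken by the master once the next team is set up. Because ICVs
   and task-team state can be propagated through this barrier, a released worker must
   re-read its th_team pointer below rather than assume the team it had at join time is
   still valid. */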

// TODO release worker threads' fork barriers as we are ready instead of all at once
void
__kmp_fork_barrier(int gtid, int tid)
{
    KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
    KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
    kmp_info_t *this_thr = __kmp_threads[gtid];
    kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
#if USE_ITT_BUILD
    void *itt_sync_obj = NULL;
#endif /* USE_ITT_BUILD */
    if (team)
        ANNOTATE_NEW_BARRIER_END(&team->t.t_bar);

    KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n",
                  gtid, (team != NULL) ? team->t.t_id : -1, tid));

    // The th_team pointer is only valid for the master thread here
    if (KMP_MASTER_TID(tid)) {
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
            // Create itt barrier object
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
            __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
        }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */

#ifdef KMP_DEBUG
        register kmp_info_t **other_threads = team->t.t_threads;
        register int i;

        // Verify state
        KMP_MB();

        for (i = 1; i < team->t.t_nproc; ++i) {
            KA_TRACE(500, ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go == %u.\n",
                           gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
                           team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
                           other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
            KMP_DEBUG_ASSERT((TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go)
                              & ~(KMP_BARRIER_SLEEP_STATE))
                             == KMP_INIT_BARRIER_STATE);
            KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
        }
#endif

        if (__kmp_tasking_mode != tskm_immediate_exec) {
            // 0 indicates setup of the current task team if nthreads > 1
            __kmp_task_team_setup(this_thr, team, 0);
        }
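
        /* Note: the team keeps alternating task-team slots indexed by th_task_state (see
           the t_task_team[this_thr->th.th_task_state] references in __kmp_join_barrier
           above), so the master can set up the task team for the next region while tasks
           queued on the previous one are still being drained. */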

        /* The master thread may have changed its blocktime between the join barrier and
           the fork barrier. Copy the blocktime info to the thread, where
           __kmp_wait_template() can access it when the team struct is not guaranteed to
           exist. See the note about the corresponding code in __kmp_join_barrier() being
           performance-critical. */
        if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
#if KMP_USE_MONITOR
            this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
#endif
            this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
        }
    } // master

    switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
    case bp_hyper_bar: {
        KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
        __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
                                    USE_ITT_BUILD_ARG(itt_sync_obj) );
        break;
    }
    case bp_hierarchical_bar: {
        __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
                                           USE_ITT_BUILD_ARG(itt_sync_obj) );
        break;
    }
    case bp_tree_bar: {
        KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
        __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
                                   USE_ITT_BUILD_ARG(itt_sync_obj) );
        break;
    }
    default: {
        __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
                                     USE_ITT_BUILD_ARG(itt_sync_obj) );
    }
    }

    // Early exit for reaping threads releasing the forkjoin barrier
    if (TCR_4(__kmp_global.g.g_done)) {
        this_thr->th.th_task_team = NULL;

#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
            if (!KMP_MASTER_TID(tid)) {
                itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
                if (itt_sync_obj)
                    __kmp_itt_barrier_finished(gtid, itt_sync_obj);
            }
        }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
        return;
    }

    /* We can now assume that a valid team structure has been allocated by the master and
       propagated to all worker threads. The current thread, however, may not be part of
       the team, so we can't blindly assume that the team pointer is non-null. */
    team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
    KMP_DEBUG_ASSERT(team != NULL);
    tid = __kmp_tid_from_gtid(gtid);
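
    /* ICV propagation strategy: under KMP_BARRIER_ICV_PULL the master deposits the new
       ICVs once in its th_fixed_icvs slot (see __kmp_setup_icv_copy() below) and every
       worker pulls its own copy here, after being released; under KMP_BARRIER_ICV_PUSH
       the copies are instead pushed into the workers' implicit tasks during the release
       itself, so there is nothing left to do at this point. */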
#if KMP_BARRIER_ICV_PULL
    /* The master thread's copy of the ICVs was set up on the implicit taskdata in
       __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task
       has this data before this function is called. We cannot modify __kmp_fork_call() to
       look at the fixed ICVs in the master's thread struct, because it is not always the
       case that the threads arrays have been allocated when __kmp_fork_call() is executed. */
    {
        KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
        if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
            // Copy the initial ICVs from the master's thread struct to the implicit task
            // for this tid.
            KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
            __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
            copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                      &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs);
        }
    }
#endif // KMP_BARRIER_ICV_PULL

    if (__kmp_tasking_mode != tskm_immediate_exec) {
        __kmp_task_team_sync(this_thr, team);
    }

#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
    kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
    if (proc_bind == proc_bind_intel) {
#endif
#if KMP_AFFINITY_SUPPORTED
        // Call dynamic affinity settings
        if (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
            __kmp_balanced_affinity(tid, team->t.t_nproc);
        }
#endif // KMP_AFFINITY_SUPPORTED
#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
    }
    else if (proc_bind != proc_bind_false) {
        if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
            KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
                           __kmp_gtid_from_thread(this_thr), this_thr->th.th_current_place));
        }
        else {
            __kmp_affinity_set_place(gtid);
        }
    }
#endif

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
        if (!KMP_MASTER_TID(tid)) {
            // Get the correct barrier object
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
            __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
        } // (prepare called inside barrier_release)
    }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
    ANNOTATE_NEW_BARRIER_END(&team->t.t_bar);
    KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid, team->t.t_id, tid));
}


void
__kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, kmp_internal_control_t *new_icvs, ident_t *loc)
{
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);

    KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
    KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);

    /* The master thread's copy of the ICVs was set up on the implicit taskdata in
       __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task
       has this data before this function is called. */
#if KMP_BARRIER_ICV_PULL
    /* Copy the ICVs into th_fixed_icvs in the master's thread structure (which remains
       untouched), where all of the worker threads can access them and make their own
       copies after the barrier. */
    KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point
    copy_icvs(&team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs, new_icvs);
    KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n",
                  0, team->t.t_threads[0], team));
#elif KMP_BARRIER_ICV_PUSH
    // The ICVs will be propagated in the fork barrier, so nothing needs to be done here.
    KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n",
                  0, team->t.t_threads[0], team));
#else
    // Copy the ICVs to each of the non-master threads. This takes O(nthreads) time.
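    /* A minimal sketch of what each iteration below amounts to, treating ngo_store_icvs()
       as a plain ICV copy:

           __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
           copy_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);

       The master (f == 0) is skipped because its implicit task already holds the new ICVs,
       as noted above; ngo_load()/ngo_sync() bracket the loop to allow the stores to be
       batched on platforms that support it. */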
    ngo_load(new_icvs);
    KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point
    for (int f = 1; f < new_nproc; ++f) { // Skip the master thread
        // TODO: GEH - pass in better source location info since usually NULL here
        KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                      f, team->t.t_threads[f], team));
        __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
        ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
        KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                      f, team->t.t_threads[f], team));
    }
    ngo_sync();
#endif // KMP_BARRIER_ICV_PULL
}