/*
 * kmp_barrier.cpp
 */


//===----------------------------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//


#include "kmp.h"
#include "kmp_wait_release.h"
#include "kmp_stats.h"
#include "kmp_itt.h"
#include "kmp_os.h"


#if KMP_MIC
#include <immintrin.h>
#define USE_NGO_STORES 1
#endif // KMP_MIC

#if KMP_MIC && USE_NGO_STORES
// ICV copying
// On MIC, ICVs (one cache line) are copied with non-globally-ordered
// streaming stores; ngo_sync() is the fence that makes them visible.
#define ngo_load(src)            __m512d Vt = _mm512_load_pd((void *)(src))
#define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_store_go(dst, src)   _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_sync()               __asm__ volatile ("lock; addl $0,0(%%rsp)" ::: "memory")
#else
#define ngo_load(src)            ((void)0)
#define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
#define ngo_store_go(dst, src)   KMP_MEMCPY((dst), (src), CACHE_LINE)
#define ngo_sync()               ((void)0)
#endif /* KMP_MIC && USE_NGO_STORES */

void __kmp_print_structure(void); // Forward declaration

// ---------------------------- Barrier Algorithms ----------------------------

// Linear Barrier

// Gather phase of the linear (centralized) barrier.  Each worker bumps its own
// b_arrived flag to wake the master; the master spins on every worker's flag
// in turn, folding reduce_data together when a reduction callback is given.
static void
__kmp_linear_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                            void (*reduce)(void *, void *)
                            USE_ITT_BUILD_ARG(void * itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_linear_gather);
    register kmp_team_t *team = this_thr->th.th_team;
    register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
    register kmp_info_t **other_threads = team->t.t_threads;

    KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Barrier imbalance - save arrive time to the thread
    if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
    }
#endif
    // We now perform a linear reduction to signal that all of the threads have arrived.
    if (!KMP_MASTER_TID(tid)) {
        KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
                      "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
                      __kmp_gtid_from_tid(0, team), team->t.t_id, 0, &thr_bar->b_arrived,
                      thr_bar->b_arrived, thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
        // Mark arrival to master thread
        /* After performing this write, a worker thread may not assume that the team is valid
           any more - it could be deallocated by the master thread at any time. */
        kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
        flag.release();
    } else {
        register kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
        register int nproc = this_thr->th.th_team_nproc;
        register int i;
        // Don't have to worry about sleep bit here or atomic since team setting
        register kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;

        // Collect all the worker team member threads.
        for (i=1; i<nproc; ++i) {
#if KMP_CACHE_MANAGE
            // Prefetch next thread's arrived count
            if (i+1 < nproc)
                KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
            KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                          "arrived(%p) == %llu\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(i, team), team->t.t_id, i,
                          &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));

            // Wait for worker thread to arrive
            kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
            flag.wait(this_thr, FALSE
                      USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
            // Barrier imbalance - write min of the thread time and the other thread time to the thread.
            if (__kmp_forkjoin_frames_mode == 2) {
                this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                                       other_threads[i]->th.th_bar_min_time);
            }
#endif
            if (reduce) {
                KA_TRACE(100, ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", gtid,
                               team->t.t_id, tid, __kmp_gtid_from_tid(i, team), team->t.t_id, i));
                (*reduce)(this_thr->th.th_local.reduce_data,
                          other_threads[i]->th.th_local.reduce_data);
            }
        }
        // Don't have to worry about sleep bit here or atomic since team setting
        team_bar->b_arrived = new_state;
        KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
                      gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived, new_state));
    }
    KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

// Release phase of the linear barrier.  The master (optionally after pushing
// ICVs to every implicit task) bumps each worker's b_go flag; workers spin on
// their own b_go, then reset it for the next barrier episode.
static void
__kmp_linear_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                             int propagate_icvs
                             USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_linear_release);
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_team_t *team;

    if (KMP_MASTER_TID(tid)) {
        register unsigned int i;
        register kmp_uint32 nproc = this_thr->th.th_team_nproc;
        register kmp_info_t **other_threads;

        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        other_threads = team->t.t_threads;

        KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
                      gtid, team->t.t_id, tid, bt));

        if (nproc > 1) {
#if KMP_BARRIER_ICV_PUSH
            {
                KMP_TIME_DEVELOPER_BLOCK(USER_icv_copy);
                if (propagate_icvs) {
                    ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
                    for (i=1; i<nproc; ++i) {
                        __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i], team, i, FALSE);
                        ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
                                       &team->t.t_implicit_task_taskdata[0].td_icvs);
                    }
                    ngo_sync();
                }
            }
#endif // KMP_BARRIER_ICV_PUSH

            // Now, release all of the worker threads
            for (i=1; i<nproc; ++i) {
#if KMP_CACHE_MANAGE
                // Prefetch next thread's go flag
                if (i+1 < nproc)
                    KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */
                KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
                              "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
                              other_threads[i]->th.th_info.ds.ds_gtid, team->t.t_id, i,
                              &other_threads[i]->th.th_bar[bt].bb.b_go,
                              other_threads[i]->th.th_bar[bt].bb.b_go,
                              other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
                kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go, other_threads[i]);
                flag.release();
            }
        }
    } else { // Wait for the MASTER thread to release us
        KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
                      gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
        kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
        flag.wait(this_thr, TRUE
                  USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
            // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is disabled)
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
            // Cancel wait on previous parallel region...
            __kmp_itt_task_starting(itt_sync_obj);

            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
            if (itt_sync_obj != NULL)
                // Call prepare as early as possible for "new" barrier
                __kmp_itt_task_finished(itt_sync_obj);
        } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
            // Early exit for reaping threads releasing forkjoin barrier
            if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) )
                return;
        // The worker thread may now assume that the team is valid.
#ifdef KMP_DEBUG
        // tid/team are only consumed by KA_TRACE/KMP_DEBUG_ASSERT below, so
        // they are refreshed only in debug builds.
        tid = __kmp_tid_from_gtid(gtid);
        team = __kmp_threads[gtid]->th.th_team;
#endif
        KMP_DEBUG_ASSERT(team != NULL);
        TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
        KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
        KMP_MB(); // Flush all pending memory write invalidates.
    }
    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

// Tree barrier

// Gather phase of the static tree barrier with branching factor
// 2^branch_bits: children of tid occupy tids (tid<<branch_bits)+1 .. +factor.
// Each inner node waits for its children (reducing as it goes), then signals
// its parent; the master finally publishes the new team arrived state.
static void
__kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                          void (*reduce)(void *, void *)
                          USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_tree_gather);
    register kmp_team_t *team = this_thr->th.th_team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_info_t **other_threads = team->t.t_threads;
    register kmp_uint32 nproc = this_thr->th.th_team_nproc;
    register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
    register kmp_uint32 branch_factor = 1 << branch_bits;
    register kmp_uint32 child;
    register kmp_uint32 child_tid;
    register kmp_uint64 new_state;

    KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Barrier imbalance - save arrive time to the thread
    if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
    }
#endif
    // Perform tree gather to wait until all threads have arrived; reduce any required data as we go
    child_tid = (tid << branch_bits) + 1;
    if (child_tid < nproc) {
        // Parent threads wait for all their children to arrive
        new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
        child = 1;
        do {
            register kmp_info_t *child_thr = other_threads[child_tid];
            register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
            // Prefetch next thread's arrived count
            if (child+1 <= branch_factor && child_tid+1 < nproc)
                KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
            KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                          "arrived(%p) == %llu\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
                          &child_bar->b_arrived, new_state));
            // Wait for child to arrive
            kmp_flag_64 flag(&child_bar->b_arrived, new_state);
            flag.wait(this_thr, FALSE
                      USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
            // Barrier imbalance - write min of the thread time and a child time to the thread.
            if (__kmp_forkjoin_frames_mode == 2) {
                this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                                       child_thr->th.th_bar_min_time);
            }
#endif
            if (reduce) {
                KA_TRACE(100, ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                               gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                               team->t.t_id, child_tid));
                (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
            }
            child++;
            child_tid++;
        }
        while (child <= branch_factor && child_tid < nproc);
    }

    if (!KMP_MASTER_TID(tid)) { // Worker threads
        register kmp_int32 parent_tid = (tid - 1) >> branch_bits;

        KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                      "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
                      __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
                      &thr_bar->b_arrived, thr_bar->b_arrived,
                      thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));

        // Mark arrival to parent thread
        /* After performing this write, a worker thread may not assume that the team is valid
           any more - it could be deallocated by the master thread at any time. */
        kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
        flag.release();
    } else {
        // Need to update the team arrived pointer if we are the master thread
        if (nproc > 1) // New value was already computed above
            team->t.t_bar[bt].b_arrived = new_state;
        else
            team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
        KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
                      gtid, team->t.t_id, tid, team->t.t_id,
                      &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
    }
    KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

// Release phase of the tree barrier.  Workers first wait on their own b_go;
// after the wait (or immediately, for the master) each thread releases its
// subtree children top-down, pushing ICVs per child when requested.
static void
__kmp_tree_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                           int propagate_icvs
                           USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_tree_release);
    register kmp_team_t *team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_uint32 nproc;
    register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
    register kmp_uint32 branch_factor = 1 << branch_bits;
    register kmp_uint32 child;
    register kmp_uint32 child_tid;

    // Perform a tree release for all of the threads that have been gathered
    if (!KMP_MASTER_TID(tid)) { // Handle fork barrier workers who aren't part of a team yet
        KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n",
                      gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
        // Wait for parent thread to release us
        kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
        flag.wait(this_thr, TRUE
                  USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
            // In fork barrier where we could not get the object reliably (or ITTNOTIFY is disabled)
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
            // Cancel wait on previous parallel region...
            __kmp_itt_task_starting(itt_sync_obj);

            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
            if (itt_sync_obj != NULL)
                // Call prepare as early as possible for "new" barrier
                __kmp_itt_task_finished(itt_sync_obj);
        } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
            // Early exit for reaping threads releasing forkjoin barrier
            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

        // The worker thread may now assume that the team is valid.
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        tid = __kmp_tid_from_gtid(gtid);

        TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
        KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
        KMP_MB(); // Flush all pending memory write invalidates.
    } else {
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
                      gtid, team->t.t_id, tid, bt));
    }
    nproc = this_thr->th.th_team_nproc;
    child_tid = (tid << branch_bits) + 1;

    if (child_tid < nproc) {
        register kmp_info_t **other_threads = team->t.t_threads;
        child = 1;
        // Parent threads release all their children
        do {
            register kmp_info_t *child_thr = other_threads[child_tid];
            register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
            // Prefetch next thread's go count
            if (child+1 <= branch_factor && child_tid+1 < nproc)
                KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */

#if KMP_BARRIER_ICV_PUSH
            {
                KMP_TIME_DEVELOPER_BLOCK(USER_icv_copy);
                if (propagate_icvs) {
                    __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[child_tid],
                                             team, child_tid, FALSE);
                    copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
                              &team->t.t_implicit_task_taskdata[0].td_icvs);
                }
            }
#endif // KMP_BARRIER_ICV_PUSH
            KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
                          "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                          child_tid, &child_bar->b_go, child_bar->b_go,
                          child_bar->b_go + KMP_BARRIER_STATE_BUMP));
            // Release child from barrier
            kmp_flag_64 flag(&child_bar->b_go, child_thr);
            flag.release();
            child++;
            child_tid++;
        }
        while (child <= branch_factor && child_tid < nproc);
    }
    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}


// Hyper Barrier

// Gather phase of the hypercube-embedded tree barrier.  At each level a
// thread either signals the parent for that level (and stops) or waits for
// its children at that level, reducing as it goes; only the master updates
// the team-wide arrived state at the end.
static void
__kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                           void (*reduce)(void *, void *)
                           USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_hyper_gather);
    register kmp_team_t *team = this_thr->th.th_team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_info_t **other_threads = team->t.t_threads;
    register kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
    register kmp_uint32 num_threads = this_thr->th.th_team_nproc;
    register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
    register kmp_uint32 branch_factor = 1 << branch_bits;
    register kmp_uint32 offset;
    register kmp_uint32 level;

    KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));

    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Barrier imbalance - save arrive time to the thread
    if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
    }
#endif
    /* Perform a hypercube-embedded tree gather to wait until all of the threads have
       arrived, and reduce any required data as we go.  */
    kmp_flag_64 p_flag(&thr_bar->b_arrived);
    for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
    {
        register kmp_uint32 child;
        register kmp_uint32 child_tid;

        if (((tid >> level) & (branch_factor - 1)) != 0) {
            register kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) -1);

            KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                          "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
                          &thr_bar->b_arrived, thr_bar->b_arrived,
                          thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
            // Mark arrival to parent thread
            /* After performing this write (in the last iteration of the enclosing for loop),
               a worker thread may not assume that the team is valid any more - it could be
               deallocated by the master thread at any time. */
            p_flag.set_waiter(other_threads[parent_tid]);
            p_flag.release();
            break;
        }

        // Parent threads wait for children to arrive
        if (new_state == KMP_BARRIER_UNUSED_STATE)
            new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
        for (child=1, child_tid=tid+(1 << level); child<branch_factor && child_tid<num_threads;
             child++, child_tid+=(1 << level))
        {
            register kmp_info_t *child_thr = other_threads[child_tid];
            register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
            register kmp_uint32 next_child_tid = child_tid + (1 << level);
            // Prefetch next thread's arrived count
            if (child+1 < branch_factor && next_child_tid < num_threads)
                KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
            KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                          "arrived(%p) == %llu\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
                          &child_bar->b_arrived, new_state));
            // Wait for child to arrive
            kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
            c_flag.wait(this_thr, FALSE
                        USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
            // Barrier imbalance - write min of the thread time and a child time to the thread.
            if (__kmp_forkjoin_frames_mode == 2) {
                this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                                       child_thr->th.th_bar_min_time);
            }
#endif
            if (reduce) {
                KA_TRACE(100, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                               gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                               team->t.t_id, child_tid));
                (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
            }
        }
    }

    if (KMP_MASTER_TID(tid)) {
        // Need to update the team arrived pointer if we are the master thread
        if (new_state == KMP_BARRIER_UNUSED_STATE)
            team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
        else
            team->t.t_bar[bt].b_arrived = new_state;
        KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
                      gtid, team->t.t_id, tid, team->t.t_id,
                      &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
    }
    KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

// The reverse versions seem to beat the forward versions overall
#define KMP_REVERSE_HYPER_BAR

// Release phase of the hypercube-embedded tree barrier.  With
// KMP_REVERSE_HYPER_BAR (default) children are released in the reverse order
// of the gather; ICVs travel through each thread's th_fixed_icvs staging slot.
static void
__kmp_hyper_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                            int propagate_icvs
                            USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_hyper_release);
    register kmp_team_t *team;
    register kmp_bstate_t *thr_bar = & this_thr -> th.th_bar[ bt ].bb;
    register kmp_info_t **other_threads;
    register kmp_uint32 num_threads;
    register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[ bt ];
    register kmp_uint32 branch_factor = 1 << branch_bits;
    register kmp_uint32 child;
    register kmp_uint32 child_tid;
    register kmp_uint32 offset;
    register kmp_uint32 level;

    /* Perform a hypercube-embedded tree release for all of the threads that have been gathered.
       If KMP_REVERSE_HYPER_BAR is defined (default) the threads are released in the reverse
       order of the corresponding gather, otherwise threads are released in the same order. */
    if (KMP_MASTER_TID(tid)) { // master
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
                      gtid, team->t.t_id, tid, bt));
#if KMP_BARRIER_ICV_PUSH
        if (propagate_icvs) { // master already has ICVs in final destination; copy
            copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
        }
#endif
    }
    else { // Handle fork barrier workers who aren't part of a team yet
        KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n",
                      gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
        // Wait for parent thread to release us
        kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
        flag.wait(this_thr, TRUE
                  USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
            // In fork barrier where we could not get the object reliably
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
            // Cancel wait on previous parallel region...
            __kmp_itt_task_starting(itt_sync_obj);

            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
            if (itt_sync_obj != NULL)
                // Call prepare as early as possible for "new" barrier
                __kmp_itt_task_finished(itt_sync_obj);
        } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
            // Early exit for reaping threads releasing forkjoin barrier
            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

        // The worker thread may now assume that the team is valid.
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        tid = __kmp_tid_from_gtid(gtid);

        TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
        KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
        KMP_MB(); // Flush all pending memory write invalidates.
    }
    num_threads = this_thr->th.th_team_nproc;
    other_threads = team->t.t_threads;

#ifdef KMP_REVERSE_HYPER_BAR
    // Count up to correct level for parent
    for (level=0, offset=1; offset<num_threads && (((tid>>level) & (branch_factor-1)) == 0);
         level+=branch_bits, offset<<=branch_bits);

    // Now go down from there
    for (level-=branch_bits, offset>>=branch_bits; offset != 0;
         level-=branch_bits, offset>>=branch_bits)
#else
    // Go down the tree, level by level
    for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
#endif // KMP_REVERSE_HYPER_BAR
    {
#ifdef KMP_REVERSE_HYPER_BAR
        /* Now go in reverse order through the children, highest to lowest.
           Initial setting of child is conservative here. */
        child = num_threads >> ((level==0)?level:level-1);
        for (child=(child<branch_factor-1) ? child : branch_factor-1, child_tid=tid+(child<<level);
             child>=1; child--, child_tid-=(1<<level))
#else
        if (((tid >> level) & (branch_factor - 1)) != 0)
            // No need to go lower than this, since this is the level parent would be notified
            break;
        // Iterate through children on this level of the tree
        for (child=1, child_tid=tid+(1<<level); child<branch_factor && child_tid<num_threads;
             child++, child_tid+=(1<<level))
#endif // KMP_REVERSE_HYPER_BAR
        {
            if (child_tid >= num_threads) continue; // Child doesn't exist so keep going
            else {
                register kmp_info_t *child_thr = other_threads[child_tid];
                register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
                register kmp_uint32 next_child_tid = child_tid - (1 << level);
                // Prefetch next thread's go count
# ifdef KMP_REVERSE_HYPER_BAR
                if (child-1 >= 1 && next_child_tid < num_threads)
# else
                if (child+1 < branch_factor && next_child_tid < num_threads)
# endif // KMP_REVERSE_HYPER_BAR
                    KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */

#if KMP_BARRIER_ICV_PUSH
                if (propagate_icvs) // push my fixed ICVs to my child
                    copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
#endif // KMP_BARRIER_ICV_PUSH

                KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
                              "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
                              __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                              child_tid, &child_bar->b_go, child_bar->b_go,
                              child_bar->b_go + KMP_BARRIER_STATE_BUMP));
                // Release child from barrier
                kmp_flag_64 flag(&child_bar->b_go, child_thr);
                flag.release();
            }
        }
    }
#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs && !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
        __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
    }
#endif
    KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

// Hierarchical Barrier

// Initialize thread barrier data
/* Initializes/re-initializes the hierarchical barrier data stored on a thread. Performs the
   minimum amount of initialization required based on how the team has changed. Returns true if
   leaf children will require both on-core and traditional wake-up mechanisms. For example, if the
   team size increases, threads already in the team will respond to on-core wakeup on their parent
   thread, but threads newly added to the team will only be listening on the their local b_go. */
static bool
__kmp_init_hierarchical_barrier_thread(enum barrier_type bt, kmp_bstate_t *thr_bar, kmp_uint32 nproc,
                                       int gtid, int tid, kmp_team_t *team)
{
    // Checks to determine if (re-)initialization is needed
    bool uninitialized = thr_bar->team == NULL;
    bool team_changed = team != thr_bar->team;
    bool team_sz_changed = nproc != thr_bar->nproc;
    bool tid_changed = tid != thr_bar->old_tid;
    bool retval = false;

    if (uninitialized || team_sz_changed) {
        __kmp_get_hierarchy(nproc, thr_bar);
    }

    if (uninitialized || team_sz_changed || tid_changed) {
        thr_bar->my_level = thr_bar->depth-1; // default for master
        thr_bar->parent_tid = -1; // default for master
        if (!KMP_MASTER_TID(tid)) { // if not master, find parent thread in hierarchy
            kmp_uint32 d=0;
            while (d<thr_bar->depth) { // find parent based on level of thread in hierarchy, and note level
                kmp_uint32 rem;
                if (d == thr_bar->depth-2) { // reached level right below the master
                    thr_bar->parent_tid = 0;
                    thr_bar->my_level = d;
                    break;
                }
                else if ((rem = tid%thr_bar->skip_per_level[d+1]) != 0) { // TODO: can we make this op faster?
                    // thread is not a subtree root at next level, so this is max
                    thr_bar->parent_tid = tid - rem;
                    thr_bar->my_level = d;
                    break;
                }
                ++d;
            }
        }
        // NOTE(review): the 7-... offset presumes a branch factor of at most 8
        // (one byte per child in a 64-bit flag word) — confirm against
        // __kmp_get_hierarchy.
        thr_bar->offset = 7-(tid-thr_bar->parent_tid-1);
        thr_bar->old_tid = tid;
        thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
    }
    if (uninitialized || team_changed || tid_changed) {
        thr_bar->team = team;
        thr_bar->parent_bar = &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
        retval = true;
    }
    if (uninitialized || team_sz_changed || tid_changed) {
        thr_bar->nproc = nproc;
        thr_bar->leaf_kids = thr_bar->base_leaf_kids;
        if (thr_bar->my_level == 0) thr_bar->leaf_kids=0;
        if (thr_bar->leaf_kids && (kmp_uint32)tid+thr_bar->leaf_kids+1 > nproc)
            thr_bar->leaf_kids = nproc - tid - 1;
        thr_bar->leaf_state = 0;
        // NOTE(review): indexing byte [7-i] looks little-endian-specific so
        // that each leaf child owns one byte of the 64-bit b_arrived word —
        // verify on big-endian targets.
        for (int i=0; i<thr_bar->leaf_kids; ++i) ((char *)&(thr_bar->leaf_state))[7-i] = 1;
    }
    return retval;
}

// Gather phase of the machine-hierarchy barrier.  Leaf children check in via
// per-byte bits of the parent's b_arrived word when infinite blocktime and the
// on-core barrier are in effect; deeper levels use per-child b_arrived flags.
// (Definition continues past the end of this chunk.)
static void
__kmp_hierarchical_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr,
                                  int gtid, int tid, void (*reduce) (void *, void *)
                                  USE_ITT_BUILD_ARG(void * itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_hier_gather);
    register kmp_team_t *team = this_thr->th.th_team;
    register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
    register kmp_uint32 nproc = this_thr->th.th_team_nproc;
    register kmp_info_t **other_threads = team->t.t_threads;
    register kmp_uint64 new_state;

    int level = team->t.t_level;
#if OMP_40_ENABLED
    if (other_threads[0]->th.th_teams_microtask) // are we inside the teams construct?
        if (this_thr->th.th_teams_size.nteams > 1)
            ++level; // level was not increased in teams construct for team_of_masters
#endif
    if (level == 1) thr_bar->use_oncore_barrier = 1;
    else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested

    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Barrier imbalance - save arrive time to the thread
    if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
    }
#endif

    (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);

    if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
        register kmp_int32 child_tid;
        new_state = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
        if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
            if (thr_bar->leaf_kids) { // First, wait for leaf children to check-in on my b_arrived flag
                kmp_uint64 leaf_state = KMP_MASTER_TID(tid) ? thr_bar->b_arrived | thr_bar->leaf_state : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
                kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
                flag.wait(this_thr, FALSE
                          USE_ITT_BUILD_ARG(itt_sync_obj) );
                if (reduce) {
                    for (child_tid=tid+1; child_tid<=tid+thr_bar->leaf_kids; ++child_tid) {
                        KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
                                       gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                       team->t.t_id, child_tid));
                        (*reduce)(this_thr->th.th_local.reduce_data, other_threads[child_tid]->th.th_local.reduce_data);
                    }
                }
                (void) KMP_TEST_THEN_AND64((volatile kmp_int64 *)&thr_bar->b_arrived, ~(thr_bar->leaf_state)); // clear leaf_state bits
            }
            // Next, wait for higher level children on each child's b_arrived flag
            for (kmp_uint32 d=1; d<thr_bar->my_level; ++d) { // gather lowest level threads first, but skip 0
                kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
                if (last > nproc) last = nproc;
                for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
                    register kmp_info_t *child_thr = other_threads[child_tid];
                    register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
                    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                                  "arrived(%p) == %llu\n",
                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
                    kmp_flag_64 flag(&child_bar->b_arrived, new_state);
                    flag.wait(this_thr, FALSE
                              USE_ITT_BUILD_ARG(itt_sync_obj) );
                    if (reduce) {
                        KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
                                       gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                       team->t.t_id, child_tid));
                        (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
                    }
                }
            }
        }
        else { // Blocktime is not infinite
            for (kmp_uint32 d=0; d<thr_bar->my_level; ++d) { // Gather lowest level threads first
                kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
                if (last > nproc) last = nproc;
                for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
                    register kmp_info_t *child_thr = other_threads[child_tid];
                    register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
                    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                                  "arrived(%p) == %llu\n",
                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
                    kmp_flag_64 flag(&child_bar->b_arrived, new_state);
                    flag.wait(this_thr, FALSE
                              USE_ITT_BUILD_ARG(itt_sync_obj) );
                    if (reduce) {
                        KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
                                       gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                       team->t.t_id, child_tid));
                        (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
                    }
                }
            }
        }
    }
    // All subordinates are gathered; now release parent if not master thread

    if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
        KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                      "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
                      __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id, thr_bar->parent_tid,
                      &thr_bar->b_arrived, thr_bar->b_arrived, thr_bar->b_arrived+KMP_BARRIER_STATE_BUMP));
        /* Mark arrival to parent: After performing this write, a worker thread may not assume that
           the team is valid any more - it could be deallocated by the master thread at any time.
*/ 836 if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME 837 || !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived flag; release it 838 kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]); 839 flag.release(); 840 } 841 else { // Leaf does special release on the "offset" bits of parent's b_arrived flag 842 thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP; 843 kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset); 844 flag.set_waiter(other_threads[thr_bar->parent_tid]); 845 flag.release(); 846 } 847 } else { // Master thread needs to update the team's b_arrived value 848 team->t.t_bar[bt].b_arrived = new_state; 849 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n", 850 gtid, team->t.t_id, tid, team->t.t_id, &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived)); 851 } 852 // Is the team access below unsafe or just technically invalid? 853 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n", 854 gtid, team->t.t_id, tid, bt)); 855 } 856 857 static void 858 __kmp_hierarchical_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, 859 int propagate_icvs 860 USE_ITT_BUILD_ARG(void * itt_sync_obj) ) 861 { 862 KMP_TIME_DEVELOPER_BLOCK(KMP_hier_release); 863 register kmp_team_t *team; 864 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; 865 register kmp_uint32 nproc; 866 bool team_change = false; // indicates on-core barrier shouldn't be used 867 868 if (KMP_MASTER_TID(tid)) { 869 team = __kmp_threads[gtid]->th.th_team; 870 KMP_DEBUG_ASSERT(team != NULL); 871 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master entered barrier type %d\n", 872 gtid, team->t.t_id, tid, bt)); 873 } 874 else { // Worker threads 875 // Wait for parent thread to release me 876 if (!thr_bar->use_oncore_barrier || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME 877 || 
            thr_bar->my_level != 0 || thr_bar->team == NULL) {
            // Use traditional method of waiting on my own b_go flag
            thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
            kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
            flag.wait(this_thr, TRUE
                      USE_ITT_BUILD_ARG(itt_sync_obj) );
            TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
        }
        else { // Thread barrier data is initialized, this is a leaf, blocktime is infinite, not nested
            // Wait on my "offset" bits on parent's b_go flag
            thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
            kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP, thr_bar->offset,
                                 bt, this_thr
                                 USE_ITT_BUILD_ARG(itt_sync_obj) );
            flag.wait(this_thr, TRUE);
            if (thr_bar->wait_flag == KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
                TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
            }
            else { // Reset my bits on parent's b_go flag
                ((char*)&(thr_bar->parent_bar->b_go))[thr_bar->offset] = 0;
            }
        }
        thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
        // Early exit for reaping threads releasing forkjoin barrier
        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
            return;
        // The worker thread may now assume that the team is valid.
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        tid = __kmp_tid_from_gtid(gtid);

        KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
        KMP_MB(); // Flush all pending memory write invalidates.
    }

    // Both master and workers reach this point with a valid team pointer.
    nproc = this_thr->th.th_team_nproc;
    int level = team->t.t_level;
#if OMP_40_ENABLED
    if (team->t.t_threads[0]->th.th_teams_microtask ) { // are we inside the teams construct?
        if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && this_thr->th.th_teams_level == level)
            ++level; // level was not increased in teams construct for team_of_workers
        if( this_thr->th.th_teams_size.nteams > 1 )
            ++level; // level was not increased in teams construct for team_of_masters
    }
#endif
    if (level == 1) thr_bar->use_oncore_barrier = 1;
    else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested

    // If the team size has increased, we still communicate with old leaves via oncore barrier.
    unsigned short int old_leaf_kids = thr_bar->leaf_kids;
    kmp_uint64 old_leaf_state = thr_bar->leaf_state;
    team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);
    // But if the entire team changes, we won't use oncore barrier at all
    if (team_change) old_leaf_kids = 0;

#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs) {
        __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
        if (KMP_MASTER_TID(tid)) { // master already has copy in final destination; copy
            copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
        }
        else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) { // optimization for inf blocktime
            if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
                // leaves (on-core children) pull parent's fixed ICVs directly to local ICV store
                copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                          &thr_bar->parent_bar->th_fixed_icvs);
            // non-leaves will get ICVs piggybacked with b_go via NGO store
        }
        else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
            if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can access
                copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
            else // leaves copy parent's fixed ICVs directly to local ICV store
                copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                          &thr_bar->parent_bar->th_fixed_icvs);
        }
    }
#endif // KMP_BARRIER_ICV_PUSH

    // Now, release my children
    if (thr_bar->my_level) { // not a leaf
        register kmp_int32 child_tid;
        kmp_uint32 last;
        if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
            if (KMP_MASTER_TID(tid)) { // do a flat release
                // Set local b_go to bump children via NGO store of the cache line containing IVCs and b_go.
                thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
                // Use ngo stores if available; b_go piggybacks in the last 8 bytes of the cache line
                ngo_load(&thr_bar->th_fixed_icvs);
                // This loops over all the threads skipping only the leaf nodes in the hierarchy
                for (child_tid=thr_bar->skip_per_level[1]; child_tid<(int)nproc; child_tid+=thr_bar->skip_per_level[1]) {
                    register kmp_bstate_t *child_bar = &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
                    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
                                  " go(%p): %u => %u\n",
                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                                  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
                    // Use ngo store (if available) to both store ICVs and release child via child's b_go
                    ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
                }
                ngo_sync();
            }
            TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
            // Now, release leaf children
            if (thr_bar->leaf_kids) { // if there are any
                // We test team_change on the off-chance that the level 1 team changed.
                if (team_change || old_leaf_kids < thr_bar->leaf_kids) { // some old leaf_kids, some new
                    if (old_leaf_kids) { // release old leaf kids
                        thr_bar->b_go |= old_leaf_state;
                    }
                    // Release new leaf kids
                    last = tid+thr_bar->skip_per_level[1];
                    if (last > nproc) last = nproc;
                    for (child_tid=tid+1+old_leaf_kids; child_tid<(int)last; ++child_tid) { // skip_per_level[0]=1
                        register kmp_info_t *child_thr = team->t.t_threads[child_tid];
                        register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
                        KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
                                      " T#%d(%d:%d) go(%p): %u => %u\n",
                                      gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                      team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                                      child_bar->b_go + KMP_BARRIER_STATE_BUMP));
                        // Release child using child's b_go flag
                        kmp_flag_64 flag(&child_bar->b_go, child_thr);
                        flag.release();
                    }
                }
                else { // Release all children at once with leaf_state bits on my own b_go flag
                    thr_bar->b_go |= thr_bar->leaf_state;
                }
            }
        }
        else { // Blocktime is not infinite; do a simple hierarchical release
            for (int d=thr_bar->my_level-1; d>=0; --d) { // Release highest level threads first
                last = tid+thr_bar->skip_per_level[d+1];
                kmp_uint32 skip = thr_bar->skip_per_level[d];
                if (last > nproc) last = nproc;
                for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
                    register kmp_info_t *child_thr = team->t.t_threads[child_tid];
                    register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
                    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
                                  " go(%p): %u => %u\n",
                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                                  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
                    // Release child using child's b_go flag
                    kmp_flag_64 flag(&child_bar->b_go, child_thr);
                    flag.release();
                }
            }
        }
#if KMP_BARRIER_ICV_PUSH
        if (propagate_icvs && !KMP_MASTER_TID(tid)) // non-leaves copy ICVs from fixed ICVs to local dest
            copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
#endif // KMP_BARRIER_ICV_PUSH
    }
    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

// ---------------------------- End of Barrier Algorithms ----------------------------

// Internal function to do a barrier.
/* If is_split is true, do a split barrier, otherwise, do a plain barrier
   If reduce is non-NULL, do a split reduction barrier, otherwise, do a split barrier
   Returns 0 if master thread, 1 if worker thread.  */
int
__kmp_barrier(enum barrier_type bt, int gtid, int is_split, size_t reduce_size,
              void *reduce_data, void (*reduce)(void *, void *))
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_barrier);
    // NOTE: reduce_size is not referenced in this function body.
    register int tid = __kmp_tid_from_gtid(gtid);
    register kmp_info_t *this_thr = __kmp_threads[gtid];
    register kmp_team_t *team = this_thr->th.th_team;
    register int status = 0;
    ident_t *loc = __kmp_threads[gtid]->th.th_ident;
#if OMPT_SUPPORT
    ompt_task_id_t my_task_id;
    ompt_parallel_id_t my_parallel_id;
#endif

    KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n",
                  gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));

#if OMPT_SUPPORT
    if (ompt_enabled) {
#if OMPT_BLAME
        my_task_id = team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id;
        my_parallel_id = team->t.ompt_team_info.parallel_id;

#if OMPT_TRACE
        if (this_thr->th.ompt_thread_info.state == ompt_state_wait_single) {
            if (ompt_callbacks.ompt_callback(ompt_event_single_others_end)) {
                ompt_callbacks.ompt_callback(ompt_event_single_others_end)(
                    my_parallel_id,
                    my_task_id);
            }
        }
#endif
        if (ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
            ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
                my_parallel_id, my_task_id);
        }
#endif
        // It is OK to report the barrier state after the barrier begin callback.
        // According to the OMPT specification, a compliant implementation may
        // even delay reporting this state until the barrier begins to wait.
        this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
    }
#endif

    if (! team->t.t_serialized) {
#if USE_ITT_BUILD
        // This value will be used in itt notify events below.
        void *itt_sync_obj = NULL;
# if USE_ITT_NOTIFY
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
# endif
#endif /* USE_ITT_BUILD */
        if (__kmp_tasking_mode == tskm_extra_barrier) {
            __kmp_tasking_barrier(team, this_thr, gtid);
            KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n",
                          gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
        }

        /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when
           the team struct is not guaranteed to exist. */
        // See note about the corresponding code in __kmp_join_barrier() being performance-critical.
        if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
            this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
            this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
        }

#if USE_ITT_BUILD
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
            __kmp_itt_barrier_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
#if USE_DEBUGGER
        // Let the debugger know: the thread arrived to the barrier and waiting.
        if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
            team->t.t_bar[bt].b_master_arrived += 1;
        } else {
            this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
        } // if
#endif /* USE_DEBUGGER */
        if (reduce != NULL) {
            //KMP_DEBUG_ASSERT( is_split == TRUE );  // #C69956
            this_thr->th.th_local.reduce_data = reduce_data;
        }
        // Gather phase: algorithm selected per barrier type; linear is the fallback.
        switch (__kmp_barrier_gather_pattern[bt]) {
        case bp_hyper_bar: {
            KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
            __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid, reduce
                                       USE_ITT_BUILD_ARG(itt_sync_obj) );
            break;
        }
        case bp_hierarchical_bar: {
            __kmp_hierarchical_barrier_gather(bt, this_thr, gtid, tid, reduce
                                              USE_ITT_BUILD_ARG(itt_sync_obj));
            break;
        }
        case bp_tree_bar: {
            KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
            __kmp_tree_barrier_gather(bt, this_thr, gtid, tid, reduce
                                      USE_ITT_BUILD_ARG(itt_sync_obj) );
            break;
        }
        default: {
            __kmp_linear_barrier_gather(bt, this_thr, gtid, tid, reduce
                                        USE_ITT_BUILD_ARG(itt_sync_obj) );
        }
        }

        KMP_MB();

        if (KMP_MASTER_TID(tid)) {
            status = 0;
            if (__kmp_tasking_mode != tskm_immediate_exec) {
                __kmp_task_team_wait(this_thr, team
                                     USE_ITT_BUILD_ARG(itt_sync_obj) );
                __kmp_task_team_setup(this_thr, team, 0, 0); // use 0,0 to only setup the current team if nthreads > 1
            }
#if USE_DEBUGGER
            // Let the debugger know: All threads are arrived and starting leaving the barrier.
            team->t.t_bar[bt].b_team_arrived += 1;
#endif

#if USE_ITT_BUILD
            /* TODO: In case of split reduction barrier, master thread may send acquired event early,
               before the final summation into the shared variable is done (final summation can be a
               long operation for array reductions).  */
            if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
                __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
#if USE_ITT_BUILD && USE_ITT_NOTIFY
            // Barrier - report frame end (only if active_level == 1)
            if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode &&
#if OMP_40_ENABLED
                this_thr->th.th_teams_microtask == NULL &&
#endif
                team->t.t_active_level == 1)
            {
                kmp_uint64 cur_time = __itt_get_timestamp();
                kmp_info_t **other_threads = team->t.t_threads;
                int nproc = this_thr->th.th_team_nproc;
                int i;
                switch(__kmp_forkjoin_frames_mode) {
                case 1:
                    __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
                    this_thr->th.th_frame_time = cur_time;
                    break;
                case 2: // AC 2015-01-19: currently does not work for hierarchical (to be fixed)
                    __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
                    break;
                case 3:
                    if( __itt_metadata_add_ptr ) {
                        // Initialize with master's wait time
                        kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
                        for (i=1; i<nproc; ++i) {
                            delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
                        }
                        __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, (kmp_uint64)( reduce != NULL));
                    }
                    __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
                    this_thr->th.th_frame_time = cur_time;
                    break;
                }
            }
#endif /* USE_ITT_BUILD */
        } else {
            status = 1;
#if USE_ITT_BUILD
            if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
                __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
        }
        // Release phase: workers always release; master releases only for a non-split barrier.
        if (status == 1 || ! is_split) {
            switch (__kmp_barrier_release_pattern[bt]) {
            case bp_hyper_bar: {
                KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
                __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
                                            USE_ITT_BUILD_ARG(itt_sync_obj) );
                break;
            }
            case bp_hierarchical_bar: {
                __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
                                                   USE_ITT_BUILD_ARG(itt_sync_obj) );
                break;
            }
            case bp_tree_bar: {
                KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
                __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
                                           USE_ITT_BUILD_ARG(itt_sync_obj) );
                break;
            }
            default: {
                __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
                                             USE_ITT_BUILD_ARG(itt_sync_obj) );
            }
            }
            if (__kmp_tasking_mode != tskm_immediate_exec) {
                __kmp_task_team_sync(this_thr, team);
            }
        }

#if USE_ITT_BUILD
        /* GEH: TODO: Move this under if-condition above and also include in
           __kmp_end_split_barrier(). This will more accurately represent the actual release time
           of the threads for split barriers.  */
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
            __kmp_itt_barrier_finished(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
    } else { // Team is serialized.
        status = 0;
        if (__kmp_tasking_mode != tskm_immediate_exec) {
#if OMP_41_ENABLED
            if ( this_thr->th.th_task_team != NULL ) {
                void *itt_sync_obj = NULL;
#if USE_ITT_NOTIFY
                if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
                    itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
                    __kmp_itt_barrier_starting(gtid, itt_sync_obj);
                }
#endif

                // Serialized team with a live task team implies proxy tasks are present.
                KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks == TRUE);
                __kmp_task_team_wait(this_thr, team
                                     USE_ITT_BUILD_ARG(itt_sync_obj));
                __kmp_task_team_setup(this_thr, team, 0, 0);

#if USE_ITT_BUILD
                if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
                    __kmp_itt_barrier_finished(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
            }
#else
            // The task team should be NULL for serialized code (tasks will be executed immediately)
            KMP_DEBUG_ASSERT(team->t.t_task_team[this_thr->th.th_task_state] == NULL);
            KMP_DEBUG_ASSERT(this_thr->th.th_task_team == NULL);
#endif
        }
    }
    KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
                  gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid), status));

#if OMPT_SUPPORT
    if (ompt_enabled) {
#if OMPT_BLAME
        if (ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
            ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
                my_parallel_id, my_task_id);
        }
#endif
        this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
    }
#endif

    return status;
}


/* Finish a split barrier: runs only the release phase, and only on the master
   thread of a non-serialized team (workers were already released when
   __kmp_barrier() returned 1 to them). */
void
__kmp_end_split_barrier(enum barrier_type bt, int gtid)
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_end_split_barrier);
    int tid = __kmp_tid_from_gtid(gtid);
    kmp_info_t *this_thr = __kmp_threads[gtid];
    kmp_team_t *team = this_thr->th.th_team;

    if (!team->t.t_serialized) {
        if (KMP_MASTER_GTID(gtid)) {
            switch (__kmp_barrier_release_pattern[bt]) {
            case bp_hyper_bar:
            {
                KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
                __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
                                            USE_ITT_BUILD_ARG(NULL) );
                break;
            }
            case bp_hierarchical_bar: {
                __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
                                                   USE_ITT_BUILD_ARG(NULL));
                break;
            }
            case bp_tree_bar: {
                KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
                __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
                                           USE_ITT_BUILD_ARG(NULL) );
                break;
            }
            default: {
                __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
                                             USE_ITT_BUILD_ARG(NULL) );
            }
            }
            if (__kmp_tasking_mode != tskm_immediate_exec) {
                __kmp_task_team_sync(this_thr, team);
            } // if
        }
    }
}


/* Barrier executed at the end of a parallel region (fork/join barrier, gather
   side).  All team threads gather here; after the gather completes, the team
   structure may be deallocated by the master at any time, so workers must not
   touch it past that point. */
void
__kmp_join_barrier(int gtid)
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_join_barrier);
    register kmp_info_t *this_thr = __kmp_threads[gtid];
    register kmp_team_t *team;
    register kmp_uint nproc;
    kmp_info_t *master_thread;
    int tid;
#ifdef KMP_DEBUG
    int team_id;
#endif /* KMP_DEBUG */
#if USE_ITT_BUILD
    void *itt_sync_obj = NULL;
# if USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
        // Get object created at fork_barrier
        itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
# endif
#endif /* USE_ITT_BUILD */
    KMP_MB();

    // Get current info
    team = this_thr->th.th_team;
    nproc = this_thr->th.th_team_nproc;
    KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
    tid = __kmp_tid_from_gtid(gtid);
#ifdef KMP_DEBUG
    team_id = team->t.t_id;
#endif /* KMP_DEBUG */
    master_thread = this_thr->th.th_team_master;
#ifdef KMP_DEBUG
    // Dump internal structures before asserting if master bookkeeping is inconsistent
    if (master_thread != team->t.t_threads[0]) {
        __kmp_print_structure();
    }
#endif /* KMP_DEBUG */
    KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
    KMP_MB();

    // Verify state
    KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
    KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
    KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
    KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
    KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n", gtid, team_id, tid));

#if OMPT_SUPPORT
#if OMPT_TRACE
    if (ompt_enabled &&
        ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
        ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
            team->t.ompt_team_info.parallel_id,
            team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
    }
#endif
    this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
#endif

    if (__kmp_tasking_mode == tskm_extra_barrier) {
        __kmp_tasking_barrier(team, this_thr, gtid);
        KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past taking barrier\n", gtid, team_id, tid));
    }
# ifdef KMP_DEBUG
    if (__kmp_tasking_mode != tskm_immediate_exec) {
        KA_TRACE(20, ( "__kmp_join_barrier: T#%d, old team = %d, old task_team = %p, th_task_team = %p\n",
                       __kmp_gtid_from_thread(this_thr), team_id, team->t.t_task_team[this_thr->th.th_task_state],
                       this_thr->th.th_task_team));
        KMP_DEBUG_ASSERT(this_thr->th.th_task_team == team->t.t_task_team[this_thr->th.th_task_state]);
    }
# endif /* KMP_DEBUG */

    /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when the
       team struct is not guaranteed to exist. Doing these loads causes a cache miss slows
       down EPCC parallel by 2x. As a workaround, we do not perform the copy if blocktime=infinite,
       since the values are not used by __kmp_wait_template() in that case.  */
    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
        this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
        this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
    }

#if USE_ITT_BUILD
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
        __kmp_itt_barrier_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

    // Gather phase using the pattern configured for the fork/join barrier; no reduction here.
    switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
    case bp_hyper_bar: {
        KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
        __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
                                   USE_ITT_BUILD_ARG(itt_sync_obj) );
        break;
    }
    case bp_hierarchical_bar: {
        __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
                                          USE_ITT_BUILD_ARG(itt_sync_obj) );
        break;
    }
    case bp_tree_bar: {
        KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
        __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
                                  USE_ITT_BUILD_ARG(itt_sync_obj) );
        break;
    }
    default: {
        __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
                                    USE_ITT_BUILD_ARG(itt_sync_obj) );
    }
    }

    /* From this point on, the team data structure may be deallocated at any time by the
       master thread - it is unsafe to reference it in any of the worker threads. Any per-team
       data items that need to be referenced before the end of the barrier should be moved to
       the kmp_task_team_t structs.  */
    if (KMP_MASTER_TID(tid)) {
        if (__kmp_tasking_mode != tskm_immediate_exec) {
            // Master shouldn't call decrease_load().         // TODO: enable master threads.
            // Master should have th_may_decrease_load == 0.  // TODO: enable master threads.
            __kmp_task_team_wait(this_thr, team
                                 USE_ITT_BUILD_ARG(itt_sync_obj) );
        }
#if USE_ITT_BUILD
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
            __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

# if USE_ITT_BUILD && USE_ITT_NOTIFY
        // Join barrier - report frame end
        if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode &&
#if OMP_40_ENABLED
            this_thr->th.th_teams_microtask == NULL &&
#endif
            team->t.t_active_level == 1)
        {
            kmp_uint64 cur_time = __itt_get_timestamp();
            ident_t * loc = team->t.t_ident;
            kmp_info_t **other_threads = team->t.t_threads;
            int nproc = this_thr->th.th_team_nproc;
            int i;
            switch(__kmp_forkjoin_frames_mode) {
            case 1:
                __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
                break;
            case 2:
                __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
                break;
            case 3:
                if( __itt_metadata_add_ptr ) {
                    // Initialize with master's wait time
                    kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
                    for (i=1; i<nproc; ++i) {
                        delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
                    }
                    __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, 0);
                }
                __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
                this_thr->th.th_frame_time = cur_time;
                break;
            }
        }
# endif /* USE_ITT_BUILD */
    }
#if USE_ITT_BUILD
    else {
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
            __kmp_itt_barrier_middle(gtid, itt_sync_obj);
    }
#endif /* USE_ITT_BUILD */

#if KMP_DEBUG
    if (KMP_MASTER_TID(tid)) {
        KA_TRACE(15, ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
                      gtid, team_id, tid, nproc));
    }
#endif /* KMP_DEBUG */

    // TODO now, mark worker threads as done so they may be disbanded
    KMP_MB(); // Flush all pending memory write invalidates.
    KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));

#if OMPT_SUPPORT
    if (ompt_enabled) {
#if OMPT_BLAME
        if (ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
            ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
                team->t.ompt_team_info.parallel_id,
                team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
        }
#endif

        // return to default state
        this_thr->th.ompt_thread_info.state = ompt_state_overhead;
    }
#endif
}


// TODO release worker threads' fork barriers as we are ready instead of all at once
void
__kmp_fork_barrier(int gtid, int tid)
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_fork_barrier);
    kmp_info_t *this_thr = __kmp_threads[gtid];
    // th_team is only trustworthy for the master (tid 0) on entry; workers get it after release.
    kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
#if USE_ITT_BUILD
    void * itt_sync_obj = NULL;
#endif /* USE_ITT_BUILD */

    KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n",
                  gtid, (team != NULL) ?
team->t.t_id : -1, tid)); 1547 1548 // th_team pointer only valid for master thread here 1549 if (KMP_MASTER_TID(tid)) { 1550 #if USE_ITT_BUILD && USE_ITT_NOTIFY 1551 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { 1552 // Create itt barrier object 1553 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1); 1554 __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing 1555 } 1556 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ 1557 1558 #ifdef KMP_DEBUG 1559 register kmp_info_t **other_threads = team->t.t_threads; 1560 register int i; 1561 1562 // Verify state 1563 KMP_MB(); 1564 1565 for(i=1; i<team->t.t_nproc; ++i) { 1566 KA_TRACE(500, ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go == %u.\n", 1567 gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid, 1568 team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid, 1569 other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go)); 1570 KMP_DEBUG_ASSERT((TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) 1571 & ~(KMP_BARRIER_SLEEP_STATE)) 1572 == KMP_INIT_BARRIER_STATE); 1573 KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team); 1574 } 1575 #endif 1576 1577 if (__kmp_tasking_mode != tskm_immediate_exec) { 1578 __kmp_task_team_setup(this_thr, team, 1, 0); // 1,0 indicates setup both task teams if nthreads > 1 1579 } 1580 1581 /* The master thread may have changed its blocktime between the join barrier and the 1582 fork barrier. Copy the blocktime info to the thread, where __kmp_wait_template() can 1583 access it when the team struct is not guaranteed to exist. 
*/ 1584 // See note about the corresponding code in __kmp_join_barrier() being performance-critical 1585 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { 1586 this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals; 1587 this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set; 1588 } 1589 } // master 1590 1591 switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) { 1592 case bp_hyper_bar: { 1593 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]); 1594 __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE 1595 USE_ITT_BUILD_ARG(itt_sync_obj) ); 1596 break; 1597 } 1598 case bp_hierarchical_bar: { 1599 __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE 1600 USE_ITT_BUILD_ARG(itt_sync_obj) ); 1601 break; 1602 } 1603 case bp_tree_bar: { 1604 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]); 1605 __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE 1606 USE_ITT_BUILD_ARG(itt_sync_obj) ); 1607 break; 1608 } 1609 default: { 1610 __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE 1611 USE_ITT_BUILD_ARG(itt_sync_obj) ); 1612 } 1613 } 1614 1615 // Early exit for reaping threads releasing forkjoin barrier 1616 if (TCR_4(__kmp_global.g.g_done)) { 1617 if (this_thr->th.th_task_team != NULL) { 1618 if (KMP_MASTER_TID(tid)) { 1619 TCW_PTR(this_thr->th.th_task_team, NULL); 1620 } 1621 else { 1622 __kmp_unref_task_team(this_thr->th.th_task_team, this_thr); 1623 } 1624 } 1625 1626 #if USE_ITT_BUILD && USE_ITT_NOTIFY 1627 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { 1628 if (!KMP_MASTER_TID(tid)) { 1629 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); 1630 if (itt_sync_obj) 1631 __kmp_itt_barrier_finished(gtid, itt_sync_obj); 1632 } 1633 } 1634 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ 1635 KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid)); 
1636 return; 1637 } 1638 1639 /* We can now assume that a valid team structure has been allocated by the master and 1640 propagated to all worker threads. The current thread, however, may not be part of the 1641 team, so we can't blindly assume that the team pointer is non-null. */ 1642 team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team); 1643 KMP_DEBUG_ASSERT(team != NULL); 1644 tid = __kmp_tid_from_gtid(gtid); 1645 1646 1647 #if KMP_BARRIER_ICV_PULL 1648 /* Master thread's copy of the ICVs was set up on the implicit taskdata in 1649 __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has 1650 this data before this function is called. We cannot modify __kmp_fork_call() to look at 1651 the fixed ICVs in the master's thread struct, because it is not always the case that the 1652 threads arrays have been allocated when __kmp_fork_call() is executed. */ 1653 { 1654 KMP_TIME_DEVELOPER_BLOCK(USER_icv_copy); 1655 if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs 1656 // Copy the initial ICVs from the master's thread struct to the implicit task for this tid. 
1657 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid)); 1658 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE); 1659 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, 1660 &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs); 1661 } 1662 } 1663 #endif // KMP_BARRIER_ICV_PULL 1664 1665 if (__kmp_tasking_mode != tskm_immediate_exec) { 1666 __kmp_task_team_sync(this_thr, team); 1667 } 1668 1669 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED 1670 kmp_proc_bind_t proc_bind = team->t.t_proc_bind; 1671 if (proc_bind == proc_bind_intel) { 1672 #endif 1673 #if KMP_AFFINITY_SUPPORTED 1674 // Call dynamic affinity settings 1675 if(__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) { 1676 __kmp_balanced_affinity(tid, team->t.t_nproc); 1677 } 1678 #endif // KMP_AFFINITY_SUPPORTED 1679 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED 1680 } 1681 else if (proc_bind != proc_bind_false) { 1682 if (this_thr->th.th_new_place == this_thr->th.th_current_place) { 1683 KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n", 1684 __kmp_gtid_from_thread(this_thr), this_thr->th.th_current_place)); 1685 } 1686 else { 1687 __kmp_affinity_set_place(gtid); 1688 } 1689 } 1690 #endif 1691 1692 #if USE_ITT_BUILD && USE_ITT_NOTIFY 1693 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { 1694 if (!KMP_MASTER_TID(tid)) { 1695 // Get correct barrier object 1696 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); 1697 __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired 1698 } // (prepare called inside barrier_release) 1699 } 1700 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ 1701 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid, team->t.t_id, tid)); 1702 } 1703 1704 1705 void 1706 __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, kmp_internal_control_t *new_icvs, ident_t *loc ) 1707 { 1708 KMP_TIME_DEVELOPER_BLOCK(KMP_setup_icv_copy); 
1709 1710 KMP_DEBUG_ASSERT(team && new_nproc && new_icvs); 1711 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc); 1712 1713 /* Master thread's copy of the ICVs was set up on the implicit taskdata in 1714 __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has 1715 this data before this function is called. */ 1716 #if KMP_BARRIER_ICV_PULL 1717 /* Copy ICVs to master's thread structure into th_fixed_icvs (which remains untouched), where 1718 all of the worker threads can access them and make their own copies after the barrier. */ 1719 KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point 1720 copy_icvs(&team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs, new_icvs); 1721 KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 1722 0, team->t.t_threads[0], team)); 1723 #elif KMP_BARRIER_ICV_PUSH 1724 // The ICVs will be propagated in the fork barrier, so nothing needs to be done here. 1725 KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 1726 0, team->t.t_threads[0], team)); 1727 #else 1728 // Copy the ICVs to each of the non-master threads. This takes O(nthreads) time. 1729 ngo_load(new_icvs); 1730 KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point 1731 for (int f=1; f<new_nproc; ++f) { // Skip the master thread 1732 // TODO: GEH - pass in better source location info since usually NULL here 1733 KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n", 1734 f, team->t.t_threads[f], team)); 1735 __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE); 1736 ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs); 1737 KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n", 1738 f, team->t.t_threads[f], team)); 1739 } 1740 ngo_sync(); 1741 #endif // KMP_BARRIER_ICV_PULL 1742 } 1743