/*
 * kmp_barrier.cpp
 */

//===----------------------------------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_wait_release.h"
#include "kmp_stats.h"
#include "kmp_itt.h"
#include "kmp_os.h"

#if KMP_MIC
#include <immintrin.h>
#define USE_NGO_STORES 1
#endif // KMP_MIC

#if KMP_MIC && USE_NGO_STORES
// ICV copying
#define ngo_load(src)            __m512d Vt = _mm512_load_pd((void *)(src))
#define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_store_go(dst, src)   _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_sync()               __asm__ volatile ("lock; addl $0,0(%%rsp)" ::: "memory")
#else
#define ngo_load(src)            ((void)0)
#define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
#define ngo_store_go(dst, src)   KMP_MEMCPY((dst), (src), CACHE_LINE)
#define ngo_sync()               ((void)0)
#endif /* KMP_MIC && USE_NGO_STORES */

void __kmp_print_structure(void); // Forward declaration

// ---------------------------- Barrier Algorithms ----------------------------

// Linear Barrier
static void
__kmp_linear_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                            void (*reduce)(void *, void *)
                            USE_ITT_BUILD_ARG(void * itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_linear_gather);
    register kmp_team_t *team = this_thr->th.th_team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_info_t **other_threads = team->t.t_threads;

    KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Barrier imbalance - save arrive time to the thread
    if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
    }
#endif
    // We now perform a linear reduction to signal that all of the threads have arrived.
    if (!KMP_MASTER_TID(tid)) {
        KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
                      "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
                      __kmp_gtid_from_tid(0, team), team->t.t_id, 0, &thr_bar->b_arrived,
                      thr_bar->b_arrived, thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
        // Mark arrival to master thread
        /* After performing this write, a worker thread may not assume that the team is valid
           any more - it could be deallocated by the master thread at any time. */
        kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
        flag.release();
    } else {
        register kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
        register int nproc = this_thr->th.th_team_nproc;
        register int i;
        // Don't have to worry about sleep bit here or atomic since team setting
        register kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;

        // Collect all the worker team member threads.
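        // (Put differently: each worker bumps its own b_arrived by KMP_BARRIER_STATE_BUMP
        //  when it arrives, so the master simply waits until
        //  other_threads[i]->th.th_bar[bt].bb.b_arrived reaches new_state for every
        //  i in [1, nproc).)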
        for (i=1; i<nproc; ++i) {
#if KMP_CACHE_MANAGE
            // Prefetch next thread's arrived count
            if (i+1 < nproc)
                KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
            KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                          "arrived(%p) == %llu\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(i, team), team->t.t_id, i,
                          &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));

            // Wait for worker thread to arrive
            kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
            flag.wait(this_thr, FALSE
                      USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
            // Barrier imbalance - write min of the thread time and the other thread time to the thread.
            if (__kmp_forkjoin_frames_mode == 2) {
                this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                                       other_threads[i]->th.th_bar_min_time);
            }
#endif
            if (reduce) {
                KA_TRACE(100, ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", gtid,
                               team->t.t_id, tid, __kmp_gtid_from_tid(i, team), team->t.t_id, i));
                (*reduce)(this_thr->th.th_local.reduce_data,
                          other_threads[i]->th.th_local.reduce_data);
            }
        }
        // Don't have to worry about sleep bit here or atomic since team setting
        team_bar->b_arrived = new_state;
        KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
                      gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived, new_state));
    }
    KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}
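/* In outline, the linear algorithm uses two 64-bit flags per thread: on the gather
   side each worker bumps its own b_arrived by KMP_BARRIER_STATE_BUMP and the master
   spin-waits on every worker's b_arrived, while on the release side below the master
   bumps each worker's b_go and each worker waits on its own b_go. The kmp_flag_64
   wrapper is also what handles waking a waiter that has gone to sleep, which is why
   the release constructs the flag with the waiting thread. */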

static void
__kmp_linear_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                             int propagate_icvs
                             USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_linear_release);
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_team_t *team;

    if (KMP_MASTER_TID(tid)) {
        register unsigned int i;
        register kmp_uint32 nproc = this_thr->th.th_team_nproc;
        register kmp_info_t **other_threads;

        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        other_threads = team->t.t_threads;

        KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
                      gtid, team->t.t_id, tid, bt));

        if (nproc > 1) {
#if KMP_BARRIER_ICV_PUSH
            {
                KMP_TIME_DEVELOPER_BLOCK(USER_icv_copy);
                if (propagate_icvs) {
                    ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
                    for (i=1; i<nproc; ++i) {
                        __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i], team, i, FALSE);
                        ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
                                       &team->t.t_implicit_task_taskdata[0].td_icvs);
                    }
                    ngo_sync();
                }
            }
#endif // KMP_BARRIER_ICV_PUSH

            // Now, release all of the worker threads
            for (i=1; i<nproc; ++i) {
#if KMP_CACHE_MANAGE
                // Prefetch next thread's go flag
                if (i+1 < nproc)
                    KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */
                KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
                              "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
                              other_threads[i]->th.th_info.ds.ds_gtid, team->t.t_id, i,
                              &other_threads[i]->th.th_bar[bt].bb.b_go,
                              other_threads[i]->th.th_bar[bt].bb.b_go,
                              other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
                kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go, other_threads[i]);
                flag.release();
            }
        }
    } else { // Wait for the MASTER thread to release us
        KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
                      gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
        kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
        flag.wait(this_thr, TRUE
                  USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
            // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is disabled)
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
            // Cancel wait on previous parallel region...
            __kmp_itt_task_starting(itt_sync_obj);

            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
            if (itt_sync_obj != NULL)
                // Call prepare as early as possible for "new" barrier
                __kmp_itt_task_finished(itt_sync_obj);
        } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
            // Early exit for reaping threads releasing forkjoin barrier
            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;
        // The worker thread may now assume that the team is valid.
#ifdef KMP_DEBUG
        tid = __kmp_tid_from_gtid(gtid);
        team = __kmp_threads[gtid]->th.th_team;
#endif
        KMP_DEBUG_ASSERT(team != NULL);
        TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
        KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
        KMP_MB(); // Flush all pending memory write invalidates.
    }
    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

// Tree barrier
static void
__kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                          void (*reduce)(void *, void *)
                          USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_tree_gather);
    register kmp_team_t *team = this_thr->th.th_team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_info_t **other_threads = team->t.t_threads;
    register kmp_uint32 nproc = this_thr->th.th_team_nproc;
    register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
    register kmp_uint32 branch_factor = 1 << branch_bits;
    register kmp_uint32 child;
    register kmp_uint32 child_tid;
    register kmp_uint64 new_state;

    KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Barrier imbalance - save arrive time to the thread
    if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
    }
#endif
    // Perform tree gather to wait until all threads have arrived; reduce any required data as we go
    child_tid = (tid << branch_bits) + 1;
    if (child_tid < nproc) {
        // Parent threads wait for all their children to arrive
        new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
        child = 1;
        do {
            register kmp_info_t *child_thr = other_threads[child_tid];
            register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
            // Prefetch next thread's arrived count
            if (child+1 <= branch_factor && child_tid+1 < nproc)
                KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
            KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                          "arrived(%p) == %llu\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
                          &child_bar->b_arrived, new_state));
            // Wait for child to arrive
            kmp_flag_64 flag(&child_bar->b_arrived, new_state);
            flag.wait(this_thr, FALSE
                      USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
            // Barrier imbalance - write min of the thread time and a child time to the thread.
            if (__kmp_forkjoin_frames_mode == 2) {
                this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                                       child_thr->th.th_bar_min_time);
            }
#endif
            if (reduce) {
                KA_TRACE(100, ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                               gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                               team->t.t_id, child_tid));
                (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
            }
            child++;
            child_tid++;
        }
        while (child <= branch_factor && child_tid < nproc);
    }

    if (!KMP_MASTER_TID(tid)) { // Worker threads
        register kmp_int32 parent_tid = (tid - 1) >> branch_bits;

        KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                      "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
                      __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
                      &thr_bar->b_arrived, thr_bar->b_arrived,
                      thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));

        // Mark arrival to parent thread
        /* After performing this write, a worker thread may not assume that the team is valid
           any more - it could be deallocated by the master thread at any time. */
        kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
        flag.release();
    } else {
        // Need to update the team arrived pointer if we are the master thread
        if (nproc > 1) // New value was already computed above
            team->t.t_bar[bt].b_arrived = new_state;
        else
            team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
        KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
                      gtid, team->t.t_id, tid, team->t.t_id,
                      &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
    }
    KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}
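/* For example, with branch_bits == 2 (branch_factor == 4) the tree used by the gather
   above and the release below gives tid t the children (t << 2) + 1 .. (t << 2) + 4 and
   the parent (t - 1) >> 2, so in a 16-thread team tid 0 gathers from tids 1-4, tid 1
   from 5-8, tid 2 from 9-12, and so on. */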

static void
__kmp_tree_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                           int propagate_icvs
                           USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_tree_release);
    register kmp_team_t *team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_uint32 nproc;
    register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
    register kmp_uint32 branch_factor = 1 << branch_bits;
    register kmp_uint32 child;
    register kmp_uint32 child_tid;

    // Perform a tree release for all of the threads that have been gathered
    if (!KMP_MASTER_TID(tid)) { // Handle fork barrier workers who aren't part of a team yet
        KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n",
                      gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
        // Wait for parent thread to release us
        kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
        flag.wait(this_thr, TRUE
                  USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
            // In fork barrier where we could not get the object reliably (or ITTNOTIFY is disabled)
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
            // Cancel wait on previous parallel region...
            __kmp_itt_task_starting(itt_sync_obj);

            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
            if (itt_sync_obj != NULL)
                // Call prepare as early as possible for "new" barrier
                __kmp_itt_task_finished(itt_sync_obj);
        } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
            // Early exit for reaping threads releasing forkjoin barrier
            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

        // The worker thread may now assume that the team is valid.
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        tid = __kmp_tid_from_gtid(gtid);

        TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
        KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
        KMP_MB(); // Flush all pending memory write invalidates.
    } else {
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
                      gtid, team->t.t_id, tid, bt));
    }
    nproc = this_thr->th.th_team_nproc;
    child_tid = (tid << branch_bits) + 1;

    if (child_tid < nproc) {
        register kmp_info_t **other_threads = team->t.t_threads;
        child = 1;
        // Parent threads release all their children
        do {
            register kmp_info_t *child_thr = other_threads[child_tid];
            register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
            // Prefetch next thread's go count
            if (child+1 <= branch_factor && child_tid+1 < nproc)
                KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */

#if KMP_BARRIER_ICV_PUSH
            {
                KMP_TIME_DEVELOPER_BLOCK(USER_icv_copy);
                if (propagate_icvs) {
                    __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[child_tid],
                                             team, child_tid, FALSE);
                    copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
                              &team->t.t_implicit_task_taskdata[0].td_icvs);
                }
            }
#endif // KMP_BARRIER_ICV_PUSH
            KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
                          "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                          child_tid, &child_bar->b_go, child_bar->b_go,
                          child_bar->b_go + KMP_BARRIER_STATE_BUMP));
            // Release child from barrier
            kmp_flag_64 flag(&child_bar->b_go, child_thr);
            flag.release();
            child++;
            child_tid++;
        }
        while (child <= branch_factor && child_tid < nproc);
    }
    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}


// Hyper Barrier
static void
__kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                           void (*reduce)(void *, void *)
                           USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_hyper_gather);
    register kmp_team_t *team = this_thr->th.th_team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_info_t **other_threads = team->t.t_threads;
    register kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
    register kmp_uint32 num_threads = this_thr->th.th_team_nproc;
    register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
    register kmp_uint32 branch_factor = 1 << branch_bits;
    register kmp_uint32 offset;
    register kmp_uint32 level;

    KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));

    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Barrier imbalance - save arrive time to the thread
    if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
    }
#endif
    /* Perform a hypercube-embedded tree gather to wait until all of the threads have
       arrived, and reduce any required data as we go. */
    kmp_flag_64 p_flag(&thr_bar->b_arrived);
    for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
    {
        register kmp_uint32 child;
        register kmp_uint32 child_tid;

        if (((tid >> level) & (branch_factor - 1)) != 0) {
            register kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) -1);

            KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                          "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
                          &thr_bar->b_arrived, thr_bar->b_arrived,
                          thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
            // Mark arrival to parent thread
            /* After performing this write (in the last iteration of the enclosing for loop),
               a worker thread may not assume that the team is valid any more - it could be
               deallocated by the master thread at any time. */
            p_flag.set_waiter(other_threads[parent_tid]);
            p_flag.release();
            break;
        }

        // Parent threads wait for children to arrive
        if (new_state == KMP_BARRIER_UNUSED_STATE)
            new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
        for (child=1, child_tid=tid+(1 << level); child<branch_factor && child_tid<num_threads;
             child++, child_tid+=(1 << level))
        {
            register kmp_info_t *child_thr = other_threads[child_tid];
            register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
            register kmp_uint32 next_child_tid = child_tid + (1 << level);
            // Prefetch next thread's arrived count
            if (child+1 < branch_factor && next_child_tid < num_threads)
                KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
            KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                          "arrived(%p) == %llu\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
                          &child_bar->b_arrived, new_state));
            // Wait for child to arrive
            kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
            c_flag.wait(this_thr, FALSE
                        USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
            // Barrier imbalance - write min of the thread time and a child time to the thread.
            if (__kmp_forkjoin_frames_mode == 2) {
                this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                                       child_thr->th.th_bar_min_time);
            }
#endif
            if (reduce) {
                KA_TRACE(100, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                               gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                               team->t.t_id, child_tid));
                (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
            }
        }
    }

    if (KMP_MASTER_TID(tid)) {
        // Need to update the team arrived pointer if we are the master thread
        if (new_state == KMP_BARRIER_UNUSED_STATE)
            team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
        else
            team->t.t_bar[bt].b_arrived = new_state;
        KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
                      gtid, team->t.t_id, tid, team->t.t_id,
                      &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
    }
    KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}
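/* As a concrete example of the gather above, with branch_bits == 1 (branch_factor == 2)
   and 8 threads: in round 0 (level 0) tids 1, 3, 5, 7 signal tids 0, 2, 4, 6 and drop
   out; in round 1 tids 2 and 6 signal 0 and 4; in round 2 tid 4 signals 0. The release
   below walks the same pairings, in reverse order when KMP_REVERSE_HYPER_BAR is
   defined. */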

// The reverse versions seem to beat the forward versions overall
#define KMP_REVERSE_HYPER_BAR
static void
__kmp_hyper_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                            int propagate_icvs
                            USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_hyper_release);
    register kmp_team_t *team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_info_t **other_threads;
    register kmp_uint32 num_threads;
    register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
    register kmp_uint32 branch_factor = 1 << branch_bits;
    register kmp_uint32 child;
    register kmp_uint32 child_tid;
    register kmp_uint32 offset;
    register kmp_uint32 level;

    /* Perform a hypercube-embedded tree release for all of the threads that have been gathered.
       If KMP_REVERSE_HYPER_BAR is defined (default) the threads are released in the reverse
       order of the corresponding gather, otherwise threads are released in the same order. */
    if (KMP_MASTER_TID(tid)) { // master
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
                      gtid, team->t.t_id, tid, bt));
#if KMP_BARRIER_ICV_PUSH
        if (propagate_icvs) { // master already has ICVs in final destination; copy
            copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
        }
#endif
    }
    else { // Handle fork barrier workers who aren't part of a team yet
        KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n",
                      gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
        // Wait for parent thread to release us
        kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
        flag.wait(this_thr, TRUE
                  USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
            // In fork barrier where we could not get the object reliably
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
            // Cancel wait on previous parallel region...
            __kmp_itt_task_starting(itt_sync_obj);

            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
            if (itt_sync_obj != NULL)
                // Call prepare as early as possible for "new" barrier
                __kmp_itt_task_finished(itt_sync_obj);
        } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
            // Early exit for reaping threads releasing forkjoin barrier
            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

        // The worker thread may now assume that the team is valid.
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        tid = __kmp_tid_from_gtid(gtid);

        TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
        KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
        KMP_MB(); // Flush all pending memory write invalidates.
    }
    num_threads = this_thr->th.th_team_nproc;
    other_threads = team->t.t_threads;

#ifdef KMP_REVERSE_HYPER_BAR
    // Count up to correct level for parent
    for (level=0, offset=1; offset<num_threads && (((tid>>level) & (branch_factor-1)) == 0);
         level+=branch_bits, offset<<=branch_bits);

    // Now go down from there
    for (level-=branch_bits, offset>>=branch_bits; offset != 0;
         level-=branch_bits, offset>>=branch_bits)
#else
    // Go down the tree, level by level
    for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
#endif // KMP_REVERSE_HYPER_BAR
    {
#ifdef KMP_REVERSE_HYPER_BAR
        /* Now go in reverse order through the children, highest to lowest.
           Initial setting of child is conservative here. */
        child = num_threads >> ((level==0)?level:level-1);
        for (child=(child<branch_factor-1) ? child : branch_factor-1, child_tid=tid+(child<<level);
             child>=1; child--, child_tid-=(1<<level))
#else
        if (((tid >> level) & (branch_factor - 1)) != 0)
            // No need to go lower than this, since this is the level parent would be notified
            break;
        // Iterate through children on this level of the tree
        for (child=1, child_tid=tid+(1<<level); child<branch_factor && child_tid<num_threads;
             child++, child_tid+=(1<<level))
#endif // KMP_REVERSE_HYPER_BAR
        {
            if (child_tid >= num_threads) continue; // Child doesn't exist so keep going
            else {
                register kmp_info_t *child_thr = other_threads[child_tid];
                register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
                register kmp_uint32 next_child_tid = child_tid - (1 << level);
                // Prefetch next thread's go count
# ifdef KMP_REVERSE_HYPER_BAR
                if (child-1 >= 1 && next_child_tid < num_threads)
# else
                if (child+1 < branch_factor && next_child_tid < num_threads)
# endif // KMP_REVERSE_HYPER_BAR
                    KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */

#if KMP_BARRIER_ICV_PUSH
                if (propagate_icvs) // push my fixed ICVs to my child
                    copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
#endif // KMP_BARRIER_ICV_PUSH

                KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
                              "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
                              __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                              child_tid, &child_bar->b_go, child_bar->b_go,
                              child_bar->b_go + KMP_BARRIER_STATE_BUMP));
                // Release child from barrier
                kmp_flag_64 flag(&child_bar->b_go, child_thr);
                flag.release();
            }
        }
    }
#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs && !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
        __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
    }
#endif
    KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

// Hierarchical Barrier

// Initialize thread barrier data
/* Initializes/re-initializes the hierarchical barrier data stored on a thread. Performs the
   minimum amount of initialization required based on how the team has changed. Returns true if
   leaf children will require both on-core and traditional wake-up mechanisms. For example, if the
   team size increases, threads already in the team will respond to on-core wakeup on their parent
   thread, but threads newly added to the team will only be listening on their local b_go. */
static bool
__kmp_init_hierarchical_barrier_thread(enum barrier_type bt, kmp_bstate_t *thr_bar, kmp_uint32 nproc,
                                       int gtid, int tid, kmp_team_t *team)
{
    // Checks to determine if (re-)initialization is needed
    bool uninitialized = thr_bar->team == NULL;
    bool team_changed = team != thr_bar->team;
    bool team_sz_changed = nproc != thr_bar->nproc;
    bool tid_changed = tid != thr_bar->old_tid;
    bool retval = false;

    if (uninitialized || team_sz_changed) {
        __kmp_get_hierarchy(nproc, thr_bar);
    }

    if (uninitialized || team_sz_changed || tid_changed) {
        thr_bar->my_level = thr_bar->depth-1; // default for master
        thr_bar->parent_tid = -1; // default for master
        if (!KMP_MASTER_TID(tid)) { // if not master, find parent thread in hierarchy
            kmp_uint32 d=0;
            while (d<thr_bar->depth) { // find parent based on level of thread in hierarchy, and note level
                kmp_uint32 rem;
                if (d == thr_bar->depth-2) { // reached level right below the master
                    thr_bar->parent_tid = 0;
                    thr_bar->my_level = d;
                    break;
                }
                else if ((rem = tid%thr_bar->skip_per_level[d+1]) != 0) { // TODO: can we make this op faster?
                    // thread is not a subtree root at next level, so this is max
                    thr_bar->parent_tid = tid - rem;
                    thr_bar->my_level = d;
                    break;
                }
                ++d;
            }
        }
        thr_bar->offset = 7-(tid-thr_bar->parent_tid-1);
        thr_bar->old_tid = tid;
        thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
        thr_bar->team = team;
        thr_bar->parent_bar = &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
    }
    if (uninitialized || team_changed || tid_changed) {
        thr_bar->team = team;
        thr_bar->parent_bar = &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
        retval = true;
    }
    if (uninitialized || team_sz_changed || tid_changed) {
        thr_bar->nproc = nproc;
        thr_bar->leaf_kids = thr_bar->base_leaf_kids;
        if (thr_bar->my_level == 0) thr_bar->leaf_kids=0;
        if (thr_bar->leaf_kids && (kmp_uint32)tid+thr_bar->leaf_kids+1 > nproc)
            thr_bar->leaf_kids = nproc - tid - 1;
        thr_bar->leaf_state = 0;
        for (int i=0; i<thr_bar->leaf_kids; ++i) ((char *)&(thr_bar->leaf_state))[7-i] = 1;
    }
    return retval;
}
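/* A sketch of the flag layout set up above: a parent's 64-bit b_arrived/b_go words double
   as arrays of 8 bytes for its on-core (leaf) children. A leaf whose tid is
   parent_tid + 1 + i is assigned offset 7-i, and leaf_state has byte (7-i) set for each
   such child, so the parent can wait for all of its leaves on those bytes at once while a
   leaf signals by touching only its own byte (see the kmp_flag_oncore uses in the
   gather/release code below). */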

static void
__kmp_hierarchical_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr,
                                  int gtid, int tid, void (*reduce) (void *, void *)
                                  USE_ITT_BUILD_ARG(void * itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_hier_gather);
    register kmp_team_t *team = this_thr->th.th_team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_uint32 nproc = this_thr->th.th_team_nproc;
    register kmp_info_t **other_threads = team->t.t_threads;
    register kmp_uint64 new_state;

    int level = team->t.t_level;
#if OMP_40_ENABLED
    if (other_threads[0]->th.th_teams_microtask) // are we inside the teams construct?
        if (this_thr->th.th_teams_size.nteams > 1)
            ++level; // level was not increased in teams construct for team_of_masters
#endif
    if (level == 1) thr_bar->use_oncore_barrier = 1;
    else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested

    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Barrier imbalance - save arrive time to the thread
    if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
    }
#endif

    (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);

    if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
        register kmp_int32 child_tid;
        new_state = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
        if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
            if (thr_bar->leaf_kids) { // First, wait for leaf children to check-in on my b_arrived flag
                kmp_uint64 leaf_state = KMP_MASTER_TID(tid) ?
                    thr_bar->b_arrived | thr_bar->leaf_state :
                    team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
                KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting for leaf kids\n",
                              gtid, team->t.t_id, tid));
                kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
                flag.wait(this_thr, FALSE
                          USE_ITT_BUILD_ARG(itt_sync_obj) );
                if (reduce) {
                    for (child_tid=tid+1; child_tid<=tid+thr_bar->leaf_kids; ++child_tid) {
                        KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
                                       gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                       team->t.t_id, child_tid));
                        (*reduce)(this_thr->th.th_local.reduce_data, other_threads[child_tid]->th.th_local.reduce_data);
                    }
                }
                (void) KMP_TEST_THEN_AND64((volatile kmp_int64 *)&thr_bar->b_arrived, ~(thr_bar->leaf_state)); // clear leaf_state bits
            }
            // Next, wait for higher level children on each child's b_arrived flag
            for (kmp_uint32 d=1; d<thr_bar->my_level; ++d) { // gather lowest level threads first, but skip 0
                kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
                if (last > nproc) last = nproc;
                for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
                    register kmp_info_t *child_thr = other_threads[child_tid];
                    register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
                    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                                  "arrived(%p) == %llu\n",
                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
                    kmp_flag_64 flag(&child_bar->b_arrived, new_state);
                    flag.wait(this_thr, FALSE
                              USE_ITT_BUILD_ARG(itt_sync_obj) );
                    if (reduce) {
                        KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
                                       gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                       team->t.t_id, child_tid));
                        (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
                    }
                }
            }
        }
        else { // Blocktime is not infinite
            for (kmp_uint32 d=0; d<thr_bar->my_level; ++d) { // Gather lowest level threads first
                kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
                if (last > nproc) last = nproc;
                for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
                    register kmp_info_t *child_thr = other_threads[child_tid];
                    register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
                    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                                  "arrived(%p) == %llu\n",
                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
                    kmp_flag_64 flag(&child_bar->b_arrived, new_state);
                    flag.wait(this_thr, FALSE
                              USE_ITT_BUILD_ARG(itt_sync_obj) );
                    if (reduce) {
                        KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
                                       gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                       team->t.t_id, child_tid));
                        (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
                    }
                }
            }
        }
    }
    // All subordinates are gathered; now release parent if not master thread

    if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
        KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                      "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
                      __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id, thr_bar->parent_tid,
                      &thr_bar->b_arrived, thr_bar->b_arrived, thr_bar->b_arrived+KMP_BARRIER_STATE_BUMP));
        /* Mark arrival to parent: After performing this write, a worker thread may not assume that
           the team is valid any more - it could be deallocated by the master thread at any time. */
        if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
            || !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived flag; release it
            kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
            flag.release();
        }
        else { // Leaf does special release on the "offset" bits of parent's b_arrived flag
            thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
            kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
            flag.set_waiter(other_threads[thr_bar->parent_tid]);
            flag.release();
        }
    } else { // Master thread needs to update the team's b_arrived value
        team->t.t_bar[bt].b_arrived = new_state;
        KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
                      gtid, team->t.t_id, tid, team->t.t_id, &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
    }
    // Is the team access below unsafe or just technically invalid?
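    /* (The trace below reads team->t.t_id after arrival has been signalled, when a worker
       is, strictly speaking, no longer allowed to touch the team structure; since KA_TRACE
       is only active in debug builds, this appears to be a benign rather than a functional
       issue.) */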
    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

static void
__kmp_hierarchical_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                                   int propagate_icvs
                                   USE_ITT_BUILD_ARG(void * itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_hier_release);
    register kmp_team_t *team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_uint32 nproc;
    bool team_change = false; // indicates on-core barrier shouldn't be used

    if (KMP_MASTER_TID(tid)) {
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master entered barrier type %d\n",
                      gtid, team->t.t_id, tid, bt));
    }
    else { // Worker threads
        // Wait for parent thread to release me
        if (!thr_bar->use_oncore_barrier || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
            || thr_bar->my_level != 0 || thr_bar->team == NULL) {
            // Use traditional method of waiting on my own b_go flag
            thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
            kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
            flag.wait(this_thr, TRUE
                      USE_ITT_BUILD_ARG(itt_sync_obj) );
            TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
        }
        else { // Thread barrier data is initialized, this is a leaf, blocktime is infinite, not nested
            // Wait on my "offset" bits on parent's b_go flag
            thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
            kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP, thr_bar->offset,
                                 bt, this_thr
                                 USE_ITT_BUILD_ARG(itt_sync_obj) );
            flag.wait(this_thr, TRUE);
            if (thr_bar->wait_flag == KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
                TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
            }
            else { // Reset my bits on parent's b_go flag
                ((char*)&(thr_bar->parent_bar->b_go))[thr_bar->offset] = 0;
            }
        }
        thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
        // Early exit for reaping threads releasing forkjoin barrier
        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
            return;
        // The worker thread may now assume that the team is valid.
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        tid = __kmp_tid_from_gtid(gtid);

        KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
        KMP_MB(); // Flush all pending memory write invalidates.
    }

    nproc = this_thr->th.th_team_nproc;
    int level = team->t.t_level;
#if OMP_40_ENABLED
    if (team->t.t_threads[0]->th.th_teams_microtask) { // are we inside the teams construct?
        if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && this_thr->th.th_teams_level == level)
            ++level; // level was not increased in teams construct for team_of_workers
        if (this_thr->th.th_teams_size.nteams > 1)
            ++level; // level was not increased in teams construct for team_of_masters
    }
#endif
    if (level == 1) thr_bar->use_oncore_barrier = 1;
    else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
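    /* Reminder: on the on-core path a leaf does not spin on its own b_go at all; it waits
       on its assigned byte of the parent's b_go (kmp_flag_oncore above), and a parent can
       release all of its leaves at once by OR-ing leaf_state into that word (see the
       release loop below). */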

    // If the team size has increased, we still communicate with old leaves via oncore barrier.
    unsigned short int old_leaf_kids = thr_bar->leaf_kids;
    kmp_uint64 old_leaf_state = thr_bar->leaf_state;
    team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);
    // But if the entire team changes, we won't use oncore barrier at all
    if (team_change) old_leaf_kids = 0;

#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs) {
        __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
        if (KMP_MASTER_TID(tid)) { // master already has copy in final destination; copy
            copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
        }
        else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) { // optimization for inf blocktime
            if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
                // leaves (on-core children) pull parent's fixed ICVs directly to local ICV store
                copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                          &thr_bar->parent_bar->th_fixed_icvs);
            // non-leaves will get ICVs piggybacked with b_go via NGO store
        }
        else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
            if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can access
                copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
            else // leaves copy parent's fixed ICVs directly to local ICV store
                copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                          &thr_bar->parent_bar->th_fixed_icvs);
        }
    }
#endif // KMP_BARRIER_ICV_PUSH

    // Now, release my children
    if (thr_bar->my_level) { // not a leaf
        register kmp_int32 child_tid;
        kmp_uint32 last;
        if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
            if (KMP_MASTER_TID(tid)) { // do a flat release
                // Set local b_go to bump children via NGO store of the cache line containing ICVs and b_go.
                thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
                // Use ngo stores if available; b_go piggybacks in the last 8 bytes of the cache line
                ngo_load(&thr_bar->th_fixed_icvs);
                // This loops over all the threads skipping only the leaf nodes in the hierarchy
                for (child_tid=thr_bar->skip_per_level[1]; child_tid<(int)nproc; child_tid+=thr_bar->skip_per_level[1]) {
                    register kmp_bstate_t *child_bar = &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
                    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
                                  " go(%p): %u => %u\n",
                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                                  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
                    // Use ngo store (if available) to both store ICVs and release child via child's b_go
                    ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
                }
                ngo_sync();
            }
            TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
            // Now, release leaf children
            if (thr_bar->leaf_kids) { // if there are any
                // We test team_change on the off-chance that the level 1 team changed.
                if (team_change || old_leaf_kids < thr_bar->leaf_kids) { // some old leaf_kids, some new
                    if (old_leaf_kids) { // release old leaf kids
                        thr_bar->b_go |= old_leaf_state;
                    }
                    // Release new leaf kids
                    last = tid+thr_bar->skip_per_level[1];
                    if (last > nproc) last = nproc;
                    for (child_tid=tid+1+old_leaf_kids; child_tid<(int)last; ++child_tid) { // skip_per_level[0]=1
                        register kmp_info_t *child_thr = team->t.t_threads[child_tid];
                        register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
                        KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
                                      " T#%d(%d:%d) go(%p): %u => %u\n",
                                      gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                      team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                                      child_bar->b_go + KMP_BARRIER_STATE_BUMP));
                        // Release child using child's b_go flag
                        kmp_flag_64 flag(&child_bar->b_go, child_thr);
                        flag.release();
                    }
                }
                else { // Release all children at once with leaf_state bits on my own b_go flag
                    thr_bar->b_go |= thr_bar->leaf_state;
                }
            }
        }
        else { // Blocktime is not infinite; do a simple hierarchical release
            for (int d=thr_bar->my_level-1; d>=0; --d) { // Release highest level threads first
                last = tid+thr_bar->skip_per_level[d+1];
                kmp_uint32 skip = thr_bar->skip_per_level[d];
                if (last > nproc) last = nproc;
                for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
                    register kmp_info_t *child_thr = team->t.t_threads[child_tid];
                    register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
                    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
                                  " go(%p): %u => %u\n",
                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                                  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
                    // Release child using child's b_go flag
                    kmp_flag_64 flag(&child_bar->b_go, child_thr);
                    flag.release();
                }
            }
        }
#if KMP_BARRIER_ICV_PUSH
        if (propagate_icvs && !KMP_MASTER_TID(tid)) // non-leaves copy ICVs from fixed ICVs to local dest
            copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
#endif // KMP_BARRIER_ICV_PUSH
    }
    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

// ---------------------------- End of Barrier Algorithms ----------------------------
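/* Rough usage sketch for the driver below (based on its parameters and return value): for a
   reduction, each thread stores its partial result in reduce_data and calls
   __kmp_barrier(bt, gtid, TRUE, reduce_size, reduce_data, reduce_fn). The gather phase
   combines the data via the reduce callback; the master (return value 0) finishes the
   reduction and later calls __kmp_end_split_barrier() to run the release pattern, while a
   worker (return value 1) goes straight into the release phase and waits there. */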

// Internal function to do a barrier.
/* If is_split is true, do a split barrier, otherwise, do a plain barrier
   If reduce is non-NULL, do a split reduction barrier, otherwise, do a regular barrier
   Returns 0 if master thread, 1 if worker thread. */
int
__kmp_barrier(enum barrier_type bt, int gtid, int is_split, size_t reduce_size,
              void *reduce_data, void (*reduce)(void *, void *))
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_barrier);
    KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
    KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
    register int tid = __kmp_tid_from_gtid(gtid);
    register kmp_info_t *this_thr = __kmp_threads[gtid];
    register kmp_team_t *team = this_thr->th.th_team;
    register int status = 0;
    ident_t *loc = __kmp_threads[gtid]->th.th_ident;
#if OMPT_SUPPORT
    ompt_task_id_t my_task_id;
    ompt_parallel_id_t my_parallel_id;
#endif

    KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n",
                  gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));

#if OMPT_SUPPORT
    if (ompt_enabled) {
#if OMPT_BLAME
        my_task_id = team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id;
        my_parallel_id = team->t.ompt_team_info.parallel_id;

#if OMPT_TRACE
        if (this_thr->th.ompt_thread_info.state == ompt_state_wait_single) {
            if (ompt_callbacks.ompt_callback(ompt_event_single_others_end)) {
                ompt_callbacks.ompt_callback(ompt_event_single_others_end)(
                    my_parallel_id, my_task_id);
            }
        }
#endif
        if (ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
            ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
                my_parallel_id, my_task_id);
        }
#endif
        // It is OK to report the barrier state after the barrier begin callback.
        // According to the OMPT specification, a compliant implementation may
        // even delay reporting this state until the barrier begins to wait.
        this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
    }
#endif

    if (! team->t.t_serialized) {
#if USE_ITT_BUILD
        // This value will be used in itt notify events below.
        void *itt_sync_obj = NULL;
# if USE_ITT_NOTIFY
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
# endif
#endif /* USE_ITT_BUILD */
        if (__kmp_tasking_mode == tskm_extra_barrier) {
            __kmp_tasking_barrier(team, this_thr, gtid);
            KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n",
                          gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
        }

        /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when
           the team struct is not guaranteed to exist. */
        // See note about the corresponding code in __kmp_join_barrier() being performance-critical.
        if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
#if KMP_USE_MONITOR
            this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
#endif
            this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
        }

#if USE_ITT_BUILD
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
            __kmp_itt_barrier_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
#if USE_DEBUGGER
        // Let the debugger know: the thread arrived to the barrier and waiting.
        if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
            team->t.t_bar[bt].b_master_arrived += 1;
        } else {
            this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
        } // if
#endif /* USE_DEBUGGER */
        if (reduce != NULL) {
            //KMP_DEBUG_ASSERT( is_split == TRUE );  // #C69956
            this_thr->th.th_local.reduce_data = reduce_data;
        }

        if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
            __kmp_task_team_setup(this_thr, team, 0); // use 0 to only setup the current team if nthreads > 1

        switch (__kmp_barrier_gather_pattern[bt]) {
        case bp_hyper_bar: {
            KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
            __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid, reduce
                                       USE_ITT_BUILD_ARG(itt_sync_obj) );
            break;
        }
        case bp_hierarchical_bar: {
            __kmp_hierarchical_barrier_gather(bt, this_thr, gtid, tid, reduce
                                              USE_ITT_BUILD_ARG(itt_sync_obj));
            break;
        }
        case bp_tree_bar: {
            KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
            __kmp_tree_barrier_gather(bt, this_thr, gtid, tid, reduce
                                      USE_ITT_BUILD_ARG(itt_sync_obj) );
            break;
        }
        default: {
            __kmp_linear_barrier_gather(bt, this_thr, gtid, tid, reduce
                                        USE_ITT_BUILD_ARG(itt_sync_obj) );
        }
        }

        KMP_MB();

        if (KMP_MASTER_TID(tid)) {
            status = 0;
            if (__kmp_tasking_mode != tskm_immediate_exec) {
                __kmp_task_team_wait(this_thr, team
                                     USE_ITT_BUILD_ARG(itt_sync_obj) );
            }
#if USE_DEBUGGER
            // Let the debugger know: All threads are arrived and starting leaving the barrier.
            team->t.t_bar[bt].b_team_arrived += 1;
#endif

#if USE_ITT_BUILD
            /* TODO: In case of split reduction barrier, master thread may send acquired event early,
               before the final summation into the shared variable is done (final summation can be a
               long operation for array reductions). */
            if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
                __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
#if USE_ITT_BUILD && USE_ITT_NOTIFY
            // Barrier - report frame end (only if active_level == 1)
            if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode &&
#if OMP_40_ENABLED
                this_thr->th.th_teams_microtask == NULL &&
#endif
                team->t.t_active_level == 1)
            {
                kmp_uint64 cur_time = __itt_get_timestamp();
                kmp_info_t **other_threads = team->t.t_threads;
                int nproc = this_thr->th.th_team_nproc;
                int i;
                switch (__kmp_forkjoin_frames_mode) {
                case 1:
                    __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
                    this_thr->th.th_frame_time = cur_time;
                    break;
                case 2: // AC 2015-01-19: currently does not work for hierarchical (to be fixed)
                    __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
                    break;
                case 3:
                    if (__itt_metadata_add_ptr) {
                        // Initialize with master's wait time
                        kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
                        // Set arrive time to zero to be able to check it in __kmp_invoke_task(); the same is done inside the loop below
                        this_thr->th.th_bar_arrive_time = 0;
                        for (i=1; i<nproc; ++i) {
                            delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
                            other_threads[i]->th.th_bar_arrive_time = 0;
                        }
                        __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, (kmp_uint64)(reduce != NULL));
                    }
                    __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
                    this_thr->th.th_frame_time = cur_time;
                    break;
                }
            }
#endif /* USE_ITT_BUILD */
        } else {
            status = 1;
#if USE_ITT_BUILD
            if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
                __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
        }
        if (status == 1 || ! is_split) {
            switch (__kmp_barrier_release_pattern[bt]) {
            case bp_hyper_bar: {
                KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
                __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
                                            USE_ITT_BUILD_ARG(itt_sync_obj) );
                break;
            }
            case bp_hierarchical_bar: {
                __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
                                                   USE_ITT_BUILD_ARG(itt_sync_obj) );
                break;
            }
            case bp_tree_bar: {
                KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
                __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
                                           USE_ITT_BUILD_ARG(itt_sync_obj) );
                break;
            }
            default: {
                __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
                                             USE_ITT_BUILD_ARG(itt_sync_obj) );
            }
            }
            if (__kmp_tasking_mode != tskm_immediate_exec) {
                __kmp_task_team_sync(this_thr, team);
            }
        }

#if USE_ITT_BUILD
        /* GEH: TODO: Move this under if-condition above and also include in
           __kmp_end_split_barrier(). This will more accurately represent the actual release time
           of the threads for split barriers. */
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
            __kmp_itt_barrier_finished(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
    } else { // Team is serialized.
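        /* Even a serialized team may still have a task team when proxy tasks are enabled
           (OMP_45_ENABLED below); in that case the thread has to wait for those tasks here.
           Otherwise the task team is expected to be NULL. */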
        status = 0;
        if (__kmp_tasking_mode != tskm_immediate_exec) {
#if OMP_45_ENABLED
            if (this_thr->th.th_task_team != NULL) {
                void *itt_sync_obj = NULL;
#if USE_ITT_NOTIFY
                if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
                    itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
                    __kmp_itt_barrier_starting(gtid, itt_sync_obj);
                }
#endif

                KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks == TRUE);
                __kmp_task_team_wait(this_thr, team
                                     USE_ITT_BUILD_ARG(itt_sync_obj));
                __kmp_task_team_setup(this_thr, team, 0);

#if USE_ITT_BUILD
                if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
                    __kmp_itt_barrier_finished(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
            }
#else
            // The task team should be NULL for serialized code (tasks will be executed immediately)
            KMP_DEBUG_ASSERT(team->t.t_task_team[this_thr->th.th_task_state] == NULL);
            KMP_DEBUG_ASSERT(this_thr->th.th_task_team == NULL);
#endif
        }
    }
    KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
                  gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid), status));

#if OMPT_SUPPORT
    if (ompt_enabled) {
#if OMPT_BLAME
        if (ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
            ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
                my_parallel_id, my_task_id);
        }
#endif
        this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
    }
#endif

    return status;
}


void
__kmp_end_split_barrier(enum barrier_type bt, int gtid)
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_end_split_barrier);
    int tid = __kmp_tid_from_gtid(gtid);
    kmp_info_t *this_thr = __kmp_threads[gtid];
    kmp_team_t *team = this_thr->th.th_team;

    if (!team->t.t_serialized) {
        if (KMP_MASTER_GTID(gtid)) {
            switch (__kmp_barrier_release_pattern[bt]) {
            case bp_hyper_bar: {
                KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
                __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
                                            USE_ITT_BUILD_ARG(NULL) );
                break;
            }
            case bp_hierarchical_bar: {
                __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
                                                   USE_ITT_BUILD_ARG(NULL));
                break;
            }
            case bp_tree_bar: {
                KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
                __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
                                           USE_ITT_BUILD_ARG(NULL) );
                break;
            }
            default: {
                __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
                                             USE_ITT_BUILD_ARG(NULL) );
            }
            }
            if (__kmp_tasking_mode != tskm_immediate_exec) {
                __kmp_task_team_sync(this_thr, team);
            } // if
        }
    }
}
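/* __kmp_end_split_barrier is the second half of a split barrier: when __kmp_barrier()
   returns 0 to the master with is_split == TRUE, only the gather has happened, so the
   master calls this once its post-gather work (for example, the final reduction) is done
   in order to run the release pattern and let the waiting workers go. */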
void
__kmp_join_barrier(int gtid)
{
    KMP_TIME_PARTITIONED_BLOCK(OMP_fork_join_barrier);
    KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
    KMP_TIME_DEVELOPER_BLOCK(KMP_join_barrier);
    register kmp_info_t *this_thr = __kmp_threads[gtid];
    register kmp_team_t *team;
    register kmp_uint nproc;
    kmp_info_t *master_thread;
    int tid;
#ifdef KMP_DEBUG
    int team_id;
#endif /* KMP_DEBUG */
#if USE_ITT_BUILD
    void *itt_sync_obj = NULL;
# if USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
        // Get object created at fork_barrier
        itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
# endif
#endif /* USE_ITT_BUILD */
    KMP_MB();

    // Get current info
    team = this_thr->th.th_team;
    nproc = this_thr->th.th_team_nproc;
    KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
    tid = __kmp_tid_from_gtid(gtid);
#ifdef KMP_DEBUG
    team_id = team->t.t_id;
#endif /* KMP_DEBUG */
    master_thread = this_thr->th.th_team_master;
#ifdef KMP_DEBUG
    if (master_thread != team->t.t_threads[0]) {
        __kmp_print_structure();
    }
#endif /* KMP_DEBUG */
    KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
    KMP_MB();

    // Verify state
    KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
    KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
    KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
    KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
    KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n", gtid, team_id, tid));

#if OMPT_SUPPORT
#if OMPT_TRACE
    if (ompt_enabled &&
        ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
        ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
            team->t.ompt_team_info.parallel_id,
            team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
    }
#endif
    this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
#endif

    if (__kmp_tasking_mode == tskm_extra_barrier) {
        __kmp_tasking_barrier(team, this_thr, gtid);
        KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n", gtid, team_id, tid));
    }
# ifdef KMP_DEBUG
    if (__kmp_tasking_mode != tskm_immediate_exec) {
        KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = %p, th_task_team = %p\n",
                      __kmp_gtid_from_thread(this_thr), team_id, team->t.t_task_team[this_thr->th.th_task_state],
                      this_thr->th.th_task_team));
        KMP_DEBUG_ASSERT(this_thr->th.th_task_team == team->t.t_task_team[this_thr->th.th_task_state]);
    }
# endif /* KMP_DEBUG */

    /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when the
       team struct is not guaranteed to exist. Doing these loads causes a cache miss that slows
       down EPCC parallel by 2x. As a workaround, we do not perform the copy if blocktime=infinite,
       since the values are not used by __kmp_wait_template() in that case. */
    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
#if KMP_USE_MONITOR
        this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
#endif
        this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
    }
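
    /* Side note (an assumption about the surrounding runtime, not something defined in this
       file): __kmp_dflt_blocktime normally originates from the KMP_BLOCKTIME environment
       variable or from kmp_set_blocktime().  For example,

           KMP_BLOCKTIME=0          // or kmp_set_blocktime(0);

       makes workers go to sleep almost immediately once they start waiting, while
       KMP_BLOCKTIME=infinite (KMP_MAX_BLOCKTIME) keeps them spinning, so the copied values are
       never read -- which is exactly the case the guard above skips. */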

#if USE_ITT_BUILD
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
        __kmp_itt_barrier_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

    switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
    case bp_hyper_bar: {
        KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
        __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
                                   USE_ITT_BUILD_ARG(itt_sync_obj) );
        break;
    }
    case bp_hierarchical_bar: {
        __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
                                          USE_ITT_BUILD_ARG(itt_sync_obj) );
        break;
    }
    case bp_tree_bar: {
        KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
        __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
                                  USE_ITT_BUILD_ARG(itt_sync_obj) );
        break;
    }
    default: {
        __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
                                    USE_ITT_BUILD_ARG(itt_sync_obj) );
    }
    }
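
    /* Which gather/release algorithm runs here is a runtime-wide choice rather than a decision
       made in this function: __kmp_barrier_gather_pattern[] / __kmp_barrier_release_pattern[]
       and the corresponding branch bits are filled in by the runtime's settings layer (for this
       barrier type typically via KMP_FORKJOIN_BARRIER_PATTERN and KMP_FORKJOIN_BARRIER --
       environment variable names mentioned as context, they are not defined in this file). */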

    /* From this point on, the team data structure may be deallocated at any time by the
       master thread - it is unsafe to reference it in any of the worker threads. Any per-team
       data items that need to be referenced before the end of the barrier should be moved to
       the kmp_task_team_t structs. */
    if (KMP_MASTER_TID(tid)) {
        if (__kmp_tasking_mode != tskm_immediate_exec) {
            __kmp_task_team_wait(this_thr, team
                                 USE_ITT_BUILD_ARG(itt_sync_obj) );
        }
#if KMP_STATS_ENABLED
        // Have the master thread flag the workers to indicate they are now waiting for the
        // next parallel region. Also wake them up so they switch their timers to idle.
        for (int i=0; i<team->t.t_nproc; ++i) {
            kmp_info_t* team_thread = team->t.t_threads[i];
            if (team_thread == this_thr)
                continue;
            team_thread->th.th_stats->setIdleFlag();
            if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && team_thread->th.th_sleep_loc != NULL)
                __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread), team_thread->th.th_sleep_loc);
        }
#endif
#if USE_ITT_BUILD
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
            __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

# if USE_ITT_BUILD && USE_ITT_NOTIFY
        // Join barrier - report frame end
        if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode &&
#if OMP_40_ENABLED
            this_thr->th.th_teams_microtask == NULL &&
#endif
            team->t.t_active_level == 1)
        {
            kmp_uint64 cur_time = __itt_get_timestamp();
            ident_t * loc = team->t.t_ident;
            kmp_info_t **other_threads = team->t.t_threads;
            int nproc = this_thr->th.th_team_nproc;
            int i;
            switch (__kmp_forkjoin_frames_mode) {
            case 1:
                __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
                break;
            case 2:
                __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
                break;
            case 3:
                if (__itt_metadata_add_ptr) {
                    // Initialize with master's wait time
                    kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
                    // Set arrive time to zero to be able to check it in __kmp_invoke_task();
                    // the same is done inside the loop below
                    this_thr->th.th_bar_arrive_time = 0;
                    for (i=1; i<nproc; ++i) {
                        delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
                        other_threads[i]->th.th_bar_arrive_time = 0;
                    }
                    __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, 0);
                }
                __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
                this_thr->th.th_frame_time = cur_time;
                break;
            }
        }
# endif /* USE_ITT_BUILD */
    }
#if USE_ITT_BUILD
    else {
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
            __kmp_itt_barrier_middle(gtid, itt_sync_obj);
    }
#endif /* USE_ITT_BUILD */

#if KMP_DEBUG
    if (KMP_MASTER_TID(tid)) {
        KA_TRACE(15, ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
                      gtid, team_id, tid, nproc));
    }
#endif /* KMP_DEBUG */

    // TODO now, mark worker threads as done so they may be disbanded
    KMP_MB(); // Flush all pending memory write invalidates.
    KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));

#if OMPT_SUPPORT
    if (ompt_enabled) {
#if OMPT_BLAME
        if (ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
            ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
                team->t.ompt_team_info.parallel_id,
                team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
        }
#endif

        // return to default state
        this_thr->th.ompt_thread_info.state = ompt_state_overhead;
    }
#endif
}


// TODO release worker threads' fork barriers as we are ready instead of all at once
void
__kmp_fork_barrier(int gtid, int tid)
{
    KMP_TIME_PARTITIONED_BLOCK(OMP_fork_join_barrier);
    KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
    KMP_TIME_DEVELOPER_BLOCK(KMP_fork_barrier);
    kmp_info_t *this_thr = __kmp_threads[gtid];
    kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
#if USE_ITT_BUILD
    void * itt_sync_obj = NULL;
#endif /* USE_ITT_BUILD */

    KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n",
                  gtid, (team != NULL) ? team->t.t_id : -1, tid));

    // th_team pointer only valid for master thread here
    if (KMP_MASTER_TID(tid)) {
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
            // Create itt barrier object
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
            __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
        }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */

#ifdef KMP_DEBUG
        register kmp_info_t **other_threads = team->t.t_threads;
        register int i;

        // Verify state
        KMP_MB();

        for (i=1; i<team->t.t_nproc; ++i) {
            KA_TRACE(500, ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go == %u.\n",
                           gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
                           team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
                           other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
            KMP_DEBUG_ASSERT((TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go)
                              & ~(KMP_BARRIER_SLEEP_STATE))
                             == KMP_INIT_BARRIER_STATE);
            KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
        }
#endif

        if (__kmp_tasking_mode != tskm_immediate_exec) {
            __kmp_task_team_setup(this_thr, team, 0); // 0 indicates setup current task team if nthreads > 1
        }
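
        /* Descriptive note: the task team set up above is the one the released workers will
           attach to via __kmp_task_team_sync() later in this function, after the fork-barrier
           release. */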

        /* The master thread may have changed its blocktime between the join barrier and the
           fork barrier. Copy the blocktime info to the thread, where __kmp_wait_template() can
           access it when the team struct is not guaranteed to exist. */
        // See the note about the corresponding code in __kmp_join_barrier() being performance-critical
        if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
#if KMP_USE_MONITOR
            this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
#endif
            this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
        }
    } // master

    switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
    case bp_hyper_bar: {
        KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
        __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
                                    USE_ITT_BUILD_ARG(itt_sync_obj) );
        break;
    }
    case bp_hierarchical_bar: {
        __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
                                           USE_ITT_BUILD_ARG(itt_sync_obj) );
        break;
    }
    case bp_tree_bar: {
        KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
        __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
                                   USE_ITT_BUILD_ARG(itt_sync_obj) );
        break;
    }
    default: {
        __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
                                     USE_ITT_BUILD_ARG(itt_sync_obj) );
    }
    }

    // Early exit for reaping threads releasing forkjoin barrier
    if (TCR_4(__kmp_global.g.g_done)) {
        this_thr->th.th_task_team = NULL;

#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
            if (!KMP_MASTER_TID(tid)) {
                itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
                if (itt_sync_obj)
                    __kmp_itt_barrier_finished(gtid, itt_sync_obj);
            }
        }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
        return;
    }

    /* We can now assume that a valid team structure has been allocated by the master and
       propagated to all worker threads. The current thread, however, may not be part of the
       team, so we can't blindly assume that the team pointer is non-null. */
    team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
    KMP_DEBUG_ASSERT(team != NULL);
    tid = __kmp_tid_from_gtid(gtid);


#if KMP_BARRIER_ICV_PULL
    /* The master thread's copy of the ICVs was set up on the implicit taskdata in
       __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
       this data before this function is called. We cannot modify __kmp_fork_call() to look at
       the fixed ICVs in the master's thread struct, because it is not always the case that the
       threads arrays have been allocated when __kmp_fork_call() is executed. */
    {
        KMP_TIME_DEVELOPER_BLOCK(USER_icv_copy);
        if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
            // Copy the initial ICVs from the master's thread struct to the implicit task for this tid.
            KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
            __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
            copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                      &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs);
        }
    }
#endif // KMP_BARRIER_ICV_PULL
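
    /* In the PULL configuration above, each worker copies the ICVs that __kmp_setup_icv_copy()
       (below) published into the master's th_fixed_icvs. copy_icvs() is, in effect, a structure
       assignment of kmp_internal_control_t -- a description of intent; see kmp.h for the actual
       definition. */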

    if (__kmp_tasking_mode != tskm_immediate_exec) {
        __kmp_task_team_sync(this_thr, team);
    }

#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
    kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
    if (proc_bind == proc_bind_intel) {
#endif
#if KMP_AFFINITY_SUPPORTED
        // Call dynamic affinity settings
        if (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
            __kmp_balanced_affinity(tid, team->t.t_nproc);
        }
#endif // KMP_AFFINITY_SUPPORTED
#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
    }
    else if (proc_bind != proc_bind_false) {
        if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
            KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
                           __kmp_gtid_from_thread(this_thr), this_thr->th.th_current_place));
        }
        else {
            __kmp_affinity_set_place(gtid);
        }
    }
#endif

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
        if (!KMP_MASTER_TID(tid)) {
            // Get correct barrier object
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
            __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
        } // (prepare called inside barrier_release)
    }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
    KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid, team->t.t_id, tid));
}


void
__kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, kmp_internal_control_t *new_icvs, ident_t *loc)
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_setup_icv_copy);

    KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
    KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);

    /* The master thread's copy of the ICVs was set up on the implicit taskdata in
       __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
       this data before this function is called. */
#if KMP_BARRIER_ICV_PULL
    /* Copy the ICVs into th_fixed_icvs in the master thread's structure (which remains
       untouched), where all of the worker threads can access them and make their own copies
       after the barrier. */
    KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point
    copy_icvs(&team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs, new_icvs);
    KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n",
                  0, team->t.t_threads[0], team));
#elif KMP_BARRIER_ICV_PUSH
    // The ICVs will be propagated in the fork barrier, so nothing needs to be done here.
    KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n",
                  0, team->t.t_threads[0], team));
#else
    // Copy the ICVs to each of the non-master threads. This takes O(nthreads) time.
    ngo_load(new_icvs);
    KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point
    for (int f=1; f<new_nproc; ++f) { // Skip the master thread
        // TODO: GEH - pass in better source location info since usually NULL here
        KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                      f, team->t.t_threads[f], team));
        __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
        ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
        KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                      f, team->t.t_threads[f], team));
    }
    ngo_sync();
#endif // KMP_BARRIER_ICV_PULL
}