1 /*
2  * kmp_barrier.cpp
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 //                     The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "kmp.h"
17 #include "kmp_wait_release.h"
18 #include "kmp_stats.h"
19 #include "kmp_itt.h"
20 
#if KMP_MIC
# include <immintrin.h>
# define USE_NGO_STORES 1
#endif // KMP_MIC

// Helpers used by the barrier release code to copy ICVs / go flags to other threads.
//
// With KMP_MIC && USE_NGO_STORES the helpers are built on 512-bit vector intrinsics:
// ngo_load() declares a local __m512d named Vt and fills it from `src`; the two store helpers
// write that same Vt to `dst` and ignore their own `src` argument, so ngo_load() must have been
// expanded earlier in the same scope.  ngo_sync() issues a locked add on the stack as a fence.
//
// Otherwise the helpers fall back to copy_icvs()/memcpy() and the load/sync steps are no-ops.
#if KMP_MIC && USE_NGO_STORES
# define ngo_load(src)            __m512d Vt = _mm512_load_pd((void *)(src))
# define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
# define ngo_store_go(dst, src)   _mm512_storenrngo_pd((void *)(dst), Vt)
# define ngo_sync()               __asm__ volatile ("lock; addl $0,0(%%rsp)" ::: "memory")
#else
# define ngo_load(src)            ((void)0)
# define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
# define ngo_store_go(dst, src)   memcpy((dst), (src), CACHE_LINE)
# define ngo_sync()               ((void)0)
#endif /* KMP_MIC && USE_NGO_STORES */

// Forward declaration; the definition is not in this part of the file.
void __kmp_print_structure(void);
40 
41 // ---------------------------- Barrier Algorithms ----------------------------
42 
43 // Linear Barrier
44 static void
45 __kmp_linear_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
46                             void (*reduce)(void *, void *)
47                             USE_ITT_BUILD_ARG(void * itt_sync_obj) )
48 {
49     KMP_TIME_BLOCK(KMP_linear_gather);
50     register kmp_team_t *team = this_thr->th.th_team;
51     register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
52     register kmp_info_t **other_threads = team->t.t_threads;
53 
54     KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
55                   gtid, team->t.t_id, tid, bt));
56     KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
57 
58 #if USE_ITT_BUILD && USE_ITT_NOTIFY
59     // Barrier imbalance - save arrive time to the thread
60     if(__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) {
61         this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
62     }
63 #endif
64     // We now perform a linear reduction to signal that all of the threads have arrived.
65     if (!KMP_MASTER_TID(tid)) {
66         KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
67                       "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
68                       __kmp_gtid_from_tid(0, team), team->t.t_id, 0, &thr_bar->b_arrived,
69                       thr_bar->b_arrived, thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
70         // Mark arrival to master thread
71         /* After performing this write, a worker thread may not assume that the team is valid
72            any more - it could be deallocated by the master thread at any time. */
73         kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
74         flag.release();
75     } else {
76         register kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
77         register int nproc = this_thr->th.th_team_nproc;
78         register int i;
79         // Don't have to worry about sleep bit here or atomic since team setting
80         register kmp_uint new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
81 
82         // Collect all the worker team member threads.
83         for (i=1; i<nproc; ++i) {
84 #if KMP_CACHE_MANAGE
85             // Prefetch next thread's arrived count
86             if (i+1 < nproc)
87                 KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_arrived);
88 #endif /* KMP_CACHE_MANAGE */
89             KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
90                           "arrived(%p) == %u\n", gtid, team->t.t_id, tid,
91                             __kmp_gtid_from_tid(i, team), team->t.t_id, i,
92                             &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
93 
94             // Wait for worker thread to arrive
95             kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
96             flag.wait(this_thr, FALSE
97                       USE_ITT_BUILD_ARG(itt_sync_obj) );
98 #if USE_ITT_BUILD && USE_ITT_NOTIFY
99             // Barrier imbalance - write min of the thread time and the other thread time to the thread.
100             if (__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) {
101                 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
102                                                           other_threads[i]->th.th_bar_min_time);
103             }
104 #endif
105             if (reduce) {
106                 KA_TRACE(100, ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", gtid,
107                                team->t.t_id, tid, __kmp_gtid_from_tid(i, team), team->t.t_id, i));
108                 (*reduce)(this_thr->th.th_local.reduce_data,
109                           other_threads[i]->th.th_local.reduce_data);
110             }
111         }
112         // Don't have to worry about sleep bit here or atomic since team setting
113         team_bar->b_arrived = new_state;
114         KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
115                       gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived, new_state));
116     }
117     KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
118                   gtid, team->t.t_id, tid, bt));
119 }
120 
121 static void
122 __kmp_linear_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
123                              int propagate_icvs
124                              USE_ITT_BUILD_ARG(void *itt_sync_obj) )
125 {
126     KMP_TIME_BLOCK(KMP_linear_release);
127     register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
128     register kmp_team_t *team;
129 
130     if (KMP_MASTER_TID(tid)) {
131         register unsigned int i;
132         register kmp_uint32 nproc = this_thr->th.th_team_nproc;
133         register kmp_info_t **other_threads;
134 
135         team = __kmp_threads[gtid]->th.th_team;
136         KMP_DEBUG_ASSERT(team != NULL);
137         other_threads = team->t.t_threads;
138 
139         KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
140                       gtid, team->t.t_id, tid, bt));
141 
142         if (nproc > 1) {
143 #if KMP_BARRIER_ICV_PUSH
144             KMP_START_EXPLICIT_TIMER(USER_icv_copy);
145             if (propagate_icvs) {
146                 ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
147                 for (i=1; i<nproc; ++i) {
148                     __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i], team, i, FALSE);
149                     ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
150                                    &team->t.t_implicit_task_taskdata[0].td_icvs);
151                 }
152                 ngo_sync();
153             }
154             KMP_STOP_EXPLICIT_TIMER(USER_icv_copy);
155 #endif // KMP_BARRIER_ICV_PUSH
156 
157             // Now, release all of the worker threads
158             for (i=1; i<nproc; ++i) {
159 #if KMP_CACHE_MANAGE
160                 // Prefetch next thread's go flag
161                 if (i+1 < nproc)
162                     KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_go);
163 #endif /* KMP_CACHE_MANAGE */
164                 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
165                               "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
166                               other_threads[i]->th.th_info.ds.ds_gtid, team->t.t_id, i,
167                               &other_threads[i]->th.th_bar[bt].bb.b_go,
168                               other_threads[i]->th.th_bar[bt].bb.b_go,
169                               other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
170                 kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go, other_threads[i]);
171                 flag.release();
172             }
173         }
174     } else { // Wait for the MASTER thread to release us
175         KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
176                       gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
177         kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
178         flag.wait(this_thr, TRUE
179                   USE_ITT_BUILD_ARG(itt_sync_obj) );
180 #if USE_ITT_BUILD && USE_ITT_NOTIFY
181         if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
182             // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is disabled)
183             itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
184             // Cancel wait on previous parallel region...
185             __kmp_itt_task_starting(itt_sync_obj);
186 
187             if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
188                 return;
189 
190             itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
191             if (itt_sync_obj != NULL)
192                 // Call prepare as early as possible for "new" barrier
193                 __kmp_itt_task_finished(itt_sync_obj);
194         } else
195 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
196         // Early exit for reaping threads releasing forkjoin barrier
197         if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) )
198             return;
199         // The worker thread may now assume that the team is valid.
200 #ifdef KMP_DEBUG
201         tid = __kmp_tid_from_gtid(gtid);
202         team = __kmp_threads[gtid]->th.th_team;
203 #endif
204         KMP_DEBUG_ASSERT(team != NULL);
205         TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
206         KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
207                       gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
208         KMP_MB();  // Flush all pending memory write invalidates.
209     }
210     KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
211                   gtid, team->t.t_id, tid, bt));
212 }
213 
214 // Tree barrier
215 static void
216 __kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
217                           void (*reduce)(void *, void *)
218                           USE_ITT_BUILD_ARG(void *itt_sync_obj) )
219 {
220     KMP_TIME_BLOCK(KMP_tree_gather);
221     register kmp_team_t *team = this_thr->th.th_team;
222     register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
223     register kmp_info_t **other_threads = team->t.t_threads;
224     register kmp_uint32 nproc = this_thr->th.th_team_nproc;
225     register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
226     register kmp_uint32 branch_factor = 1 << branch_bits;
227     register kmp_uint32 child;
228     register kmp_uint32 child_tid;
229     register kmp_uint new_state;
230 
231     KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
232                   gtid, team->t.t_id, tid, bt));
233     KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
234 
235 #if USE_ITT_BUILD && USE_ITT_NOTIFY
236     // Barrier imbalance - save arrive time to the thread
237     if(__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) {
238         this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
239     }
240 #endif
241     // Perform tree gather to wait until all threads have arrived; reduce any required data as we go
242     child_tid = (tid << branch_bits) + 1;
243     if (child_tid < nproc) {
244         // Parent threads wait for all their children to arrive
245         new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
246         child = 1;
247         do {
248             register kmp_info_t *child_thr = other_threads[child_tid];
249             register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
250 #if KMP_CACHE_MANAGE
251             // Prefetch next thread's arrived count
252             if (child+1 <= branch_factor && child_tid+1 < nproc)
253                 KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_arrived);
254 #endif /* KMP_CACHE_MANAGE */
255             KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
256                           "arrived(%p) == %u\n", gtid, team->t.t_id, tid,
257                             __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
258                             &child_bar->b_arrived, new_state));
259             // Wait for child to arrive
260             kmp_flag_64 flag(&child_bar->b_arrived, new_state);
261             flag.wait(this_thr, FALSE
262                       USE_ITT_BUILD_ARG(itt_sync_obj) );
263 #if USE_ITT_BUILD && USE_ITT_NOTIFY
264             // Barrier imbalance - write min of the thread time and a child time to the thread.
265             if (__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) {
266                 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
267                                                           child_thr->th.th_bar_min_time);
268             }
269 #endif
270             if (reduce) {
271                 KA_TRACE(100, ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
272                                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
273                                team->t.t_id, child_tid));
274                 (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
275             }
276             child++;
277             child_tid++;
278         }
279         while (child <= branch_factor && child_tid < nproc);
280     }
281 
282     if (!KMP_MASTER_TID(tid)) { // Worker threads
283         register kmp_int32 parent_tid = (tid - 1) >> branch_bits;
284 
285         KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
286                       "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
287                       __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
288                       &thr_bar->b_arrived, thr_bar->b_arrived,
289                       thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
290 
291         // Mark arrival to parent thread
292         /* After performing this write, a worker thread may not assume that the team is valid
293            any more - it could be deallocated by the master thread at any time.  */
294         kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
295         flag.release();
296     } else {
297         // Need to update the team arrived pointer if we are the master thread
298         if (nproc > 1) // New value was already computed above
299             team->t.t_bar[bt].b_arrived = new_state;
300         else
301             team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
302         KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
303                       gtid, team->t.t_id, tid, team->t.t_id,
304                       &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
305     }
306     KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
307                   gtid, team->t.t_id, tid, bt));
308 }
309 
310 static void
311 __kmp_tree_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
312                            int propagate_icvs
313                            USE_ITT_BUILD_ARG(void *itt_sync_obj) )
314 {
315     KMP_TIME_BLOCK(KMP_tree_release);
316     register kmp_team_t *team;
317     register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
318     register kmp_uint32 nproc;
319     register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
320     register kmp_uint32 branch_factor = 1 << branch_bits;
321     register kmp_uint32 child;
322     register kmp_uint32 child_tid;
323 
324     // Perform a tree release for all of the threads that have been gathered
325     if (!KMP_MASTER_TID(tid)) { // Handle fork barrier workers who aren't part of a team yet
326         KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n",
327                       gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
328         // Wait for parent thread to release us
329         kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
330         flag.wait(this_thr, TRUE
331                   USE_ITT_BUILD_ARG(itt_sync_obj) );
332 #if USE_ITT_BUILD && USE_ITT_NOTIFY
333         if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
334             // In fork barrier where we could not get the object reliably (or ITTNOTIFY is disabled)
335             itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
336             // Cancel wait on previous parallel region...
337             __kmp_itt_task_starting(itt_sync_obj);
338 
339             if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
340                 return;
341 
342             itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
343             if (itt_sync_obj != NULL)
344                 // Call prepare as early as possible for "new" barrier
345                 __kmp_itt_task_finished(itt_sync_obj);
346         } else
347 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
348         // Early exit for reaping threads releasing forkjoin barrier
349         if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
350             return;
351 
352         // The worker thread may now assume that the team is valid.
353         team = __kmp_threads[gtid]->th.th_team;
354         KMP_DEBUG_ASSERT(team != NULL);
355         tid = __kmp_tid_from_gtid(gtid);
356 
357         TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
358         KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
359                       gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
360         KMP_MB();  // Flush all pending memory write invalidates.
361     } else {
362         team = __kmp_threads[gtid]->th.th_team;
363         KMP_DEBUG_ASSERT(team != NULL);
364         KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
365                       gtid, team->t.t_id, tid, bt));
366     }
367     nproc = this_thr->th.th_team_nproc;
368     child_tid = (tid << branch_bits) + 1;
369 
370     if (child_tid < nproc) {
371         register kmp_info_t **other_threads = team->t.t_threads;
372         child = 1;
373         // Parent threads release all their children
374         do {
375             register kmp_info_t *child_thr = other_threads[child_tid];
376             register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
377 #if KMP_CACHE_MANAGE
378             // Prefetch next thread's go count
379             if (child+1 <= branch_factor && child_tid+1 < nproc)
380                 KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_go);
381 #endif /* KMP_CACHE_MANAGE */
382 
383 #if KMP_BARRIER_ICV_PUSH
384             KMP_START_EXPLICIT_TIMER(USER_icv_copy);
385             if (propagate_icvs) {
386                 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[child_tid],
387                                          team, child_tid, FALSE);
388                 copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
389                           &team->t.t_implicit_task_taskdata[0].td_icvs);
390             }
391             KMP_STOP_EXPLICIT_TIMER(USER_icv_copy);
392 #endif // KMP_BARRIER_ICV_PUSH
393             KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
394                           "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
395                           __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
396                           child_tid, &child_bar->b_go, child_bar->b_go,
397                           child_bar->b_go + KMP_BARRIER_STATE_BUMP));
398             // Release child from barrier
399             kmp_flag_64 flag(&child_bar->b_go, child_thr);
400             flag.release();
401             child++;
402             child_tid++;
403         }
404         while (child <= branch_factor && child_tid < nproc);
405     }
406     KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
407                   gtid, team->t.t_id, tid, bt));
408 }
409 
410 
411 // Hyper Barrier
412 static void
413 __kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
414                            void (*reduce)(void *, void *)
415                            USE_ITT_BUILD_ARG(void *itt_sync_obj) )
416 {
417     KMP_TIME_BLOCK(KMP_hyper_gather);
418     register kmp_team_t *team = this_thr->th.th_team;
419     register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
420     register kmp_info_t **other_threads = team->t.t_threads;
421     register kmp_uint new_state = KMP_BARRIER_UNUSED_STATE;
422     register kmp_uint32 num_threads = this_thr->th.th_team_nproc;
423     register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
424     register kmp_uint32 branch_factor = 1 << branch_bits;
425     register kmp_uint32 offset;
426     register kmp_uint32 level;
427 
428     KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
429                   gtid, team->t.t_id, tid, bt));
430 
431     KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
432 
433 #if USE_ITT_BUILD && USE_ITT_NOTIFY
434     // Barrier imbalance - save arrive time to the thread
435     if(__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) {
436         this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
437     }
438 #endif
439     /* Perform a hypercube-embedded tree gather to wait until all of the threads have
440        arrived, and reduce any required data as we go.  */
441     kmp_flag_64 p_flag(&thr_bar->b_arrived);
442     for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
443     {
444         register kmp_uint32 child;
445         register kmp_uint32 child_tid;
446 
447         if (((tid >> level) & (branch_factor - 1)) != 0) {
448             register kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) -1);
449 
450             KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
451                           "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
452                           __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
453                           &thr_bar->b_arrived, thr_bar->b_arrived,
454                           thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
455             // Mark arrival to parent thread
456             /* After performing this write (in the last iteration of the enclosing for loop),
457                a worker thread may not assume that the team is valid any more - it could be
458                deallocated by the master thread at any time.  */
459             p_flag.set_waiter(other_threads[parent_tid]);
460 	    p_flag.release();
461             break;
462         }
463 
464         // Parent threads wait for children to arrive
465         if (new_state == KMP_BARRIER_UNUSED_STATE)
466             new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
467         for (child=1, child_tid=tid+(1 << level); child<branch_factor && child_tid<num_threads;
468              child++, child_tid+=(1 << level))
469         {
470             register kmp_info_t *child_thr = other_threads[child_tid];
471             register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
472 #if KMP_CACHE_MANAGE
473             register kmp_uint32 next_child_tid = child_tid + (1 << level);
474             // Prefetch next thread's arrived count
475             if (child+1 < branch_factor && next_child_tid < num_threads)
476                 KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
477 #endif /* KMP_CACHE_MANAGE */
478             KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
479                           "arrived(%p) == %u\n", gtid, team->t.t_id, tid,
480                           __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
481                           &child_bar->b_arrived, new_state));
482             // Wait for child to arrive
483             kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
484             c_flag.wait(this_thr, FALSE
485                         USE_ITT_BUILD_ARG(itt_sync_obj) );
486 #if USE_ITT_BUILD && USE_ITT_NOTIFY
487             // Barrier imbalance - write min of the thread time and a child time to the thread.
488             if (__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) {
489                 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
490                                                           child_thr->th.th_bar_min_time);
491             }
492 #endif
493             if (reduce) {
494                 KA_TRACE(100, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
495                                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
496                                team->t.t_id, child_tid));
497                 (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
498             }
499         }
500     }
501 
502     if (KMP_MASTER_TID(tid)) {
503         // Need to update the team arrived pointer if we are the master thread
504         if (new_state == KMP_BARRIER_UNUSED_STATE)
505             team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
506         else
507             team->t.t_bar[bt].b_arrived = new_state;
508         KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
509                       gtid, team->t.t_id, tid, team->t.t_id,
510                       &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
511     }
512     KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
513                   gtid, team->t.t_id, tid, bt));
514 }
515 
516 // The reverse versions seem to beat the forward versions overall
517 #define KMP_REVERSE_HYPER_BAR
518 static void
519 __kmp_hyper_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
520                             int propagate_icvs
521                             USE_ITT_BUILD_ARG(void *itt_sync_obj) )
522 {
523     KMP_TIME_BLOCK(KMP_hyper_release);
524     register kmp_team_t    *team;
525     register kmp_bstate_t  *thr_bar       = & this_thr -> th.th_bar[ bt ].bb;
526     register kmp_info_t   **other_threads;
527     register kmp_uint32     num_threads;
528     register kmp_uint32     branch_bits   = __kmp_barrier_release_branch_bits[ bt ];
529     register kmp_uint32     branch_factor = 1 << branch_bits;
530     register kmp_uint32     child;
531     register kmp_uint32     child_tid;
532     register kmp_uint32     offset;
533     register kmp_uint32     level;
534 
535     /* Perform a hypercube-embedded tree release for all of the threads that have been gathered.
536        If KMP_REVERSE_HYPER_BAR is defined (default) the threads are released in the reverse
537        order of the corresponding gather, otherwise threads are released in the same order. */
538     if (KMP_MASTER_TID(tid)) { // master
539         team = __kmp_threads[gtid]->th.th_team;
540         KMP_DEBUG_ASSERT(team != NULL);
541         KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
542                       gtid, team->t.t_id, tid, bt));
543 #if KMP_BARRIER_ICV_PUSH
544         if (propagate_icvs) { // master already has ICVs in final destination; copy
545             copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
546         }
547 #endif
548     }
549     else  { // Handle fork barrier workers who aren't part of a team yet
550         KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n",
551                       gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
552         // Wait for parent thread to release us
553         kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
554         flag.wait(this_thr, TRUE
555                   USE_ITT_BUILD_ARG(itt_sync_obj) );
556 #if USE_ITT_BUILD && USE_ITT_NOTIFY
557         if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
558             // In fork barrier where we could not get the object reliably
559             itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
560             // Cancel wait on previous parallel region...
561             __kmp_itt_task_starting(itt_sync_obj);
562 
563             if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
564                 return;
565 
566             itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
567             if (itt_sync_obj != NULL)
568                 // Call prepare as early as possible for "new" barrier
569                 __kmp_itt_task_finished(itt_sync_obj);
570         } else
571 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
572         // Early exit for reaping threads releasing forkjoin barrier
573         if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
574             return;
575 
576         // The worker thread may now assume that the team is valid.
577         team = __kmp_threads[gtid]->th.th_team;
578         KMP_DEBUG_ASSERT(team != NULL);
579         tid = __kmp_tid_from_gtid(gtid);
580 
581         TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
582         KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
583                       gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
584         KMP_MB();  // Flush all pending memory write invalidates.
585     }
586     num_threads = this_thr->th.th_team_nproc;
587     other_threads = team->t.t_threads;
588 
589 #ifdef KMP_REVERSE_HYPER_BAR
590     // Count up to correct level for parent
591     for (level=0, offset=1; offset<num_threads && (((tid>>level) & (branch_factor-1)) == 0);
592          level+=branch_bits, offset<<=branch_bits);
593 
594     // Now go down from there
595     for (level-=branch_bits, offset>>=branch_bits; offset != 0;
596          level-=branch_bits, offset>>=branch_bits)
597 #else
598     // Go down the tree, level by level
599     for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
600 #endif // KMP_REVERSE_HYPER_BAR
601     {
602 #ifdef KMP_REVERSE_HYPER_BAR
603         /* Now go in reverse order through the children, highest to lowest.
604            Initial setting of child is conservative here. */
605         child = num_threads >> ((level==0)?level:level-1);
606         for (child=(child<branch_factor-1) ? child : branch_factor-1, child_tid=tid+(child<<level);
607              child>=1; child--, child_tid-=(1<<level))
608 #else
609         if (((tid >> level) & (branch_factor - 1)) != 0)
610             // No need to go lower than this, since this is the level parent would be notified
611             break;
612         // Iterate through children on this level of the tree
613         for (child=1, child_tid=tid+(1<<level); child<branch_factor && child_tid<num_threads;
614              child++, child_tid+=(1<<level))
615 #endif // KMP_REVERSE_HYPER_BAR
616         {
617             if (child_tid >= num_threads) continue;  // Child doesn't exist so keep going
618             else {
619                 register kmp_info_t *child_thr = other_threads[child_tid];
620                 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
621 #if KMP_CACHE_MANAGE
622                 register kmp_uint32 next_child_tid = child_tid - (1 << level);
623                 // Prefetch next thread's go count
624 # ifdef KMP_REVERSE_HYPER_BAR
625                 if (child-1 >= 1 && next_child_tid < num_threads)
626 # else
627                 if (child+1 < branch_factor && next_child_tid < num_threads)
628 # endif // KMP_REVERSE_HYPER_BAR
629                     KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
630 #endif /* KMP_CACHE_MANAGE */
631 
632 #if KMP_BARRIER_ICV_PUSH
633                 if (propagate_icvs) // push my fixed ICVs to my child
634                     copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
635 #endif // KMP_BARRIER_ICV_PUSH
636 
637                 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
638                               "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
639                               __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
640                               child_tid, &child_bar->b_go, child_bar->b_go,
641                               child_bar->b_go + KMP_BARRIER_STATE_BUMP));
642                 // Release child from barrier
643                 kmp_flag_64 flag(&child_bar->b_go, child_thr);
644                 flag.release();
645             }
646         }
647     }
648 #if KMP_BARRIER_ICV_PUSH
649     if (propagate_icvs && !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
650         __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
651         copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
652     }
653 #endif
654     KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
655                   gtid, team->t.t_id, tid, bt));
656 }
657 
658 // Hierarchical Barrier
659 
660 // Initialize thread barrier data
661 /* Initializes/re-initializes the hierarchical barrier data stored on a thread.  Performs the
662    minimum amount of initialization required based on how the team has changed.  Returns true if
663    leaf children will require both on-core and traditional wake-up mechanisms.  For example, if the
664    team size increases, threads already in the team will respond to on-core wakeup on their parent
665    thread, but threads newly added to the team will only be listening on the their local b_go. */
666 static bool
667 __kmp_init_hierarchical_barrier_thread(enum barrier_type bt, kmp_bstate_t *thr_bar, kmp_uint32 nproc,
668                                        int gtid, int tid, kmp_team_t *team)
669 {
670     // Checks to determine if (re-)initialization is needed
671     bool uninitialized = thr_bar->team == NULL;
672     bool team_changed = team != thr_bar->team;
673     bool team_sz_changed = nproc != thr_bar->nproc;
674     bool tid_changed = tid != thr_bar->old_tid;
675     bool retval = false;
676 
677     if (uninitialized || team_sz_changed) {
678         __kmp_get_hierarchy(nproc, thr_bar);
679     }
680 
681     if (uninitialized || team_sz_changed || tid_changed) {
682         thr_bar->my_level = thr_bar->depth-1; // default for master
683         thr_bar->parent_tid = -1; // default for master
684         if (!KMP_MASTER_TID(tid)) { // if not master, find parent thread in hierarchy
685             kmp_uint32 d=0;
686             while (d<thr_bar->depth) { // find parent based on level of thread in hierarchy, and note level
687                 kmp_uint32 rem;
688                 if (d == thr_bar->depth-2) { // reached level right below the master
689                     thr_bar->parent_tid = 0;
690                     thr_bar->my_level = d;
691                     break;
692                 }
693                 else if ((rem = tid%thr_bar->skip_per_level[d+1]) != 0) { // TODO: can we make this op faster?
694                     // thread is not a subtree root at next level, so this is max
695                     thr_bar->parent_tid = tid - rem;
696                     thr_bar->my_level = d;
697                     break;
698                 }
699                 ++d;
700             }
701         }
702         thr_bar->offset = 7-(tid-thr_bar->parent_tid-1);
703         thr_bar->old_tid = tid;
704         thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
705     }
706     if (uninitialized || team_changed || tid_changed) {
707         thr_bar->team = team;
708         thr_bar->parent_bar = &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
709         retval = true;
710     }
711     if (uninitialized || team_sz_changed || tid_changed) {
712         thr_bar->nproc = nproc;
713         thr_bar->leaf_kids = thr_bar->base_leaf_kids;
714         if (thr_bar->my_level == 0) thr_bar->leaf_kids=0;
715         if (thr_bar->leaf_kids && (kmp_uint32)tid+thr_bar->leaf_kids+1 > nproc)
716             thr_bar->leaf_kids = nproc - tid - 1;
717         thr_bar->leaf_state = 0;
718         for (int i=0; i<thr_bar->leaf_kids; ++i) ((char *)&(thr_bar->leaf_state))[7-i] = 1;
719     }
720     return retval;
721 }
722 
723 static void
724 __kmp_hierarchical_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr,
725                                   int gtid, int tid, void (*reduce) (void *, void *)
726                                   USE_ITT_BUILD_ARG(void * itt_sync_obj) )
727 {
728     KMP_TIME_BLOCK(KMP_hier_gather);
729     register kmp_team_t *team = this_thr->th.th_team;
730     register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
731     register kmp_uint32 nproc = this_thr->th.th_team_nproc;
732     register kmp_info_t **other_threads = team->t.t_threads;
733     register kmp_uint64 new_state;
734 
735     int level = team->t.t_level;
736     if (other_threads[0]->th.th_teams_microtask)    // are we inside the teams construct?
737         if (this_thr->th.th_teams_size.nteams > 1)
738             ++level; // level was not increased in teams construct for team_of_masters
739     if (level == 1) thr_bar->use_oncore_barrier = 1;
740     else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
741 
742     KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
743                   gtid, team->t.t_id, tid, bt));
744     KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
745 
746     (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);
747 
748     if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
749         register kmp_int32 child_tid;
750         new_state = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
751         if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
752             if (thr_bar->leaf_kids) { // First, wait for leaf children to check-in on my b_arrived flag
753                 kmp_uint64 leaf_state = KMP_MASTER_TID(tid) ? thr_bar->b_arrived | thr_bar->leaf_state : (kmp_uint64)team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
754                 kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
755                 flag.wait(this_thr, FALSE
756                           USE_ITT_BUILD_ARG(itt_sync_obj) );
757                 if (reduce) {
758                     for (child_tid=tid+1; child_tid<=tid+thr_bar->leaf_kids; ++child_tid) {
759                         KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
760                                        gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
761                                        team->t.t_id, child_tid));
762                         (*reduce)(this_thr->th.th_local.reduce_data, other_threads[child_tid]->th.th_local.reduce_data);
763                     }
764                 }
765                 (void) KMP_TEST_THEN_AND64((volatile kmp_int64 *)&thr_bar->b_arrived, ~(thr_bar->leaf_state)); // clear leaf_state bits
766             }
767             // Next, wait for higher level children on each child's b_arrived flag
768             for (kmp_uint32 d=1; d<thr_bar->my_level; ++d) { // gather lowest level threads first, but skip 0
769                 kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
770                 if (last > nproc) last = nproc;
771                 for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
772                     register kmp_info_t *child_thr = other_threads[child_tid];
773                     register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
774                     KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
775                                   "arrived(%p) == %u\n",
776                                   gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
777                                   team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
778                     kmp_flag_64 flag(&child_bar->b_arrived, new_state);
779                     flag.wait(this_thr, FALSE
780                               USE_ITT_BUILD_ARG(itt_sync_obj) );
781                     if (reduce) {
782                         KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
783                                        gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
784                                        team->t.t_id, child_tid));
785                         (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
786                     }
787                 }
788             }
789         }
790         else { // Blocktime is not infinite
791             for (kmp_uint32 d=0; d<thr_bar->my_level; ++d) { // Gather lowest level threads first
792                 kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
793                 if (last > nproc) last = nproc;
794                 for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
795                     register kmp_info_t *child_thr = other_threads[child_tid];
796                     register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
797                     KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
798                                   "arrived(%p) == %u\n",
799                                   gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
800                                   team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
801                     kmp_flag_64 flag(&child_bar->b_arrived, new_state);
802                     flag.wait(this_thr, FALSE
803                               USE_ITT_BUILD_ARG(itt_sync_obj) );
804                     if (reduce) {
805                         KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
806                                        gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
807                                        team->t.t_id, child_tid));
808                         (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
809                     }
810                 }
811             }
812         }
813     }
814     // All subordinates are gathered; now release parent if not master thread
815 
816     if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
817         KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
818                       "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
819                       __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id, thr_bar->parent_tid,
820                       &thr_bar->b_arrived, thr_bar->b_arrived, thr_bar->b_arrived+KMP_BARRIER_STATE_BUMP));
821         /* Mark arrival to parent: After performing this write, a worker thread may not assume that
822            the team is valid any more - it could be deallocated by the master thread at any time. */
823         if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
824             || !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived flag; release it
825             kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
826             flag.release();
827         }
828         else { // Leaf does special release on the "offset" bits of parent's b_arrived flag
829             thr_bar->b_arrived = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
830             kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
831             flag.set_waiter(other_threads[thr_bar->parent_tid]);
832             flag.release();
833         }
834     } else { // Master thread needs to update the team's b_arrived value
835         team->t.t_bar[bt].b_arrived = (kmp_uint32)new_state;
836         KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
837                       gtid, team->t.t_id, tid, team->t.t_id, &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
838     }
839     // Is the team access below unsafe or just technically invalid?
840     KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
841                   gtid, team->t.t_id, tid, bt));
842 }
843 
844 static void
845 __kmp_hierarchical_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
846                                    int propagate_icvs
847                                    USE_ITT_BUILD_ARG(void * itt_sync_obj) )
848 {
849     KMP_TIME_BLOCK(KMP_hier_release);
850     register kmp_team_t *team;
851     register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
852     register kmp_uint32 nproc;
853     bool team_change = false; // indicates on-core barrier shouldn't be used
854 
855     if (KMP_MASTER_TID(tid)) {
856         team = __kmp_threads[gtid]->th.th_team;
857         KMP_DEBUG_ASSERT(team != NULL);
858         KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master entered barrier type %d\n",
859                       gtid, team->t.t_id, tid, bt));
860     }
861     else { // Worker threads
862         // Wait for parent thread to release me
863         if (!thr_bar->use_oncore_barrier || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
864             || thr_bar->my_level != 0 || thr_bar->team == NULL) {
865             // Use traditional method of waiting on my own b_go flag
866             thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
867             kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
868             flag.wait(this_thr, TRUE
869                       USE_ITT_BUILD_ARG(itt_sync_obj) );
870             TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
871         }
872         else { // Thread barrier data is initialized, this is a leaf, blocktime is infinite, not nested
873             // Wait on my "offset" bits on parent's b_go flag
874             thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
875             kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP, thr_bar->offset,
876                                  bt, this_thr
877                                  USE_ITT_BUILD_ARG(itt_sync_obj) );
878             flag.wait(this_thr, TRUE
879                       USE_ITT_BUILD_ARG(itt_sync_obj) );
880             if (thr_bar->wait_flag == KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
881                 TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
882             }
883             else { // Reset my bits on parent's b_go flag
884                 ((char*)&(thr_bar->parent_bar->b_go))[thr_bar->offset] = 0;
885             }
886         }
887         thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
888         // Early exit for reaping threads releasing forkjoin barrier
889         if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
890             return;
891         // The worker thread may now assume that the team is valid.
892         team = __kmp_threads[gtid]->th.th_team;
893         KMP_DEBUG_ASSERT(team != NULL);
894         tid = __kmp_tid_from_gtid(gtid);
895 
896         KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
897                       gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
898         KMP_MB();  // Flush all pending memory write invalidates.
899     }
900 
901     int level = team->t.t_level;
902     if (team->t.t_threads[0]->th.th_teams_microtask ) {    // are we inside the teams construct?
903         if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && this_thr->th.th_teams_level == level)
904             ++level; // level was not increased in teams construct for team_of_workers
905         if( this_thr->th.th_teams_size.nteams > 1 )
906             ++level; // level was not increased in teams construct for team_of_masters
907     }
908     if (level == 1) thr_bar->use_oncore_barrier = 1;
909     else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
910     nproc = this_thr->th.th_team_nproc;
911 
912     // If the team size has increased, we still communicate with old leaves via oncore barrier.
913     unsigned short int old_leaf_kids = thr_bar->leaf_kids;
914     kmp_uint64 old_leaf_state = thr_bar->leaf_state;
915     team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);
916     // But if the entire team changes, we won't use oncore barrier at all
917     if (team_change) old_leaf_kids = 0;
918 
919 #if KMP_BARRIER_ICV_PUSH
920     if (propagate_icvs) {
921         if (KMP_MASTER_TID(tid)) { // master already has copy in final destination; copy
922             copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
923         }
924         else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) { // optimization for inf blocktime
925             if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
926                 // leaves (on-core children) pull parent's fixed ICVs directly to local ICV store
927                 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
928                           &thr_bar->parent_bar->th_fixed_icvs);
929             // non-leaves will get ICVs piggybacked with b_go via NGO store
930         }
931         else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
932             if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can access
933                 copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
934             else // leaves copy parent's fixed ICVs directly to local ICV store
935                 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
936                           &thr_bar->parent_bar->th_fixed_icvs);
937         }
938     }
939 #endif // KMP_BARRIER_ICV_PUSH
940 
941     // Now, release my children
942     if (thr_bar->my_level) { // not a leaf
943         register kmp_int32 child_tid;
944         kmp_uint32 last;
945         if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
946             if (KMP_MASTER_TID(tid)) { // do a flat release
947                 // Set local b_go to bump children via NGO store of the cache line containing IVCs and b_go.
948                 thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
949                 // Use ngo stores if available; b_go piggybacks in the last 8 bytes of the cache line
950                 ngo_load(&thr_bar->th_fixed_icvs);
951                 // This loops over all the threads skipping only the leaf nodes in the hierarchy
952                 for (child_tid=thr_bar->skip_per_level[1]; child_tid<(int)nproc; child_tid+=thr_bar->skip_per_level[1]) {
953                     register kmp_bstate_t *child_bar = &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
954                     KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
955                                   " go(%p): %u => %u\n",
956                                   gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
957                                   team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
958                                   child_bar->b_go + KMP_BARRIER_STATE_BUMP));
959                     // Use ngo store (if available) to both store ICVs and release child via child's b_go
960                     ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
961                 }
962                 ngo_sync();
963             }
964             TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
965             // Now, release leaf children
966             if (thr_bar->leaf_kids) { // if there are any
967                 // We test team_change on the off-chance that the level 1 team changed.
968                 if (team_change || old_leaf_kids < thr_bar->leaf_kids) { // some old leaf_kids, some new
969                     if (old_leaf_kids) { // release old leaf kids
970                         thr_bar->b_go |= old_leaf_state;
971                     }
972                     // Release new leaf kids
973                     last = tid+thr_bar->skip_per_level[1];
974                     if (last > nproc) last = nproc;
975                     for (child_tid=tid+1+old_leaf_kids; child_tid<(int)last; ++child_tid) { // skip_per_level[0]=1
976                         register kmp_info_t   *child_thr = team->t.t_threads[child_tid];
977                         register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
978                         KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
979                                       " T#%d(%d:%d) go(%p): %u => %u\n",
980                                       gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
981                                       team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
982                                       child_bar->b_go + KMP_BARRIER_STATE_BUMP));
983                         // Release child using child's b_go flag
984                         kmp_flag_64 flag(&child_bar->b_go, child_thr);
985                         flag.release();
986                     }
987                 }
988                 else { // Release all children at once with leaf_state bits on my own b_go flag
989                     thr_bar->b_go |= thr_bar->leaf_state;
990                 }
991             }
992         }
993         else { // Blocktime is not infinite; do a simple hierarchical release
994             for (int d=thr_bar->my_level-1; d>=0; --d) { // Release highest level threads first
995                 last = tid+thr_bar->skip_per_level[d+1];
996                 kmp_uint32 skip = thr_bar->skip_per_level[d];
997                 if (last > nproc) last = nproc;
998                 for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
999                     register kmp_info_t   *child_thr = team->t.t_threads[child_tid];
1000                     register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1001                     KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
1002                                   " go(%p): %u => %u\n",
1003                                   gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1004                                   team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1005                                   child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1006                     // Release child using child's b_go flag
1007                     kmp_flag_64 flag(&child_bar->b_go, child_thr);
1008                     flag.release();
1009                 }
1010             }
1011         }
1012 #if KMP_BARRIER_ICV_PUSH
1013         if (propagate_icvs && !KMP_MASTER_TID(tid)) // non-leaves copy ICVs from fixed ICVs to local dest
1014             copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
1015 #endif // KMP_BARRIER_ICV_PUSH
1016     }
1017     KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1018                   gtid, team->t.t_id, tid, bt));
1019 }
1020 
1021 // ---------------------------- End of Barrier Algorithms ----------------------------
1022 
1023 // Internal function to do a barrier.
1024 /* If is_split is true, do a split barrier, otherwise, do a plain barrier
1025    If reduce is non-NULL, do a split reduction barrier, otherwise, do a split barrier
1026    Returns 0 if master thread, 1 if worker thread.  */
1027 int
1028 __kmp_barrier(enum barrier_type bt, int gtid, int is_split, size_t reduce_size,
1029               void *reduce_data, void (*reduce)(void *, void *))
1030 {
1031     KMP_TIME_BLOCK(KMP_barrier);
1032     register int tid = __kmp_tid_from_gtid(gtid);
1033     register kmp_info_t *this_thr = __kmp_threads[gtid];
1034     register kmp_team_t *team = this_thr->th.th_team;
1035     register int status = 0;
1036     ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1037 #if OMPT_SUPPORT
1038     ompt_task_id_t my_task_id;
1039     ompt_parallel_id_t my_parallel_id;
1040 #endif
1041 
1042     KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n",
1043                   gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1044 
1045 #if OMPT_SUPPORT && OMPT_TRACE
1046     if (ompt_status & ompt_status_track) {
1047         if (ompt_status == ompt_status_track_callback) {
1048             my_task_id = team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id;
1049             my_parallel_id = team->t.ompt_team_info.parallel_id;
1050 
1051             if (this_thr->th.ompt_thread_info.state == ompt_state_wait_single) {
1052                 if (ompt_callbacks.ompt_callback(ompt_event_single_others_end)) {
1053                     ompt_callbacks.ompt_callback(ompt_event_single_others_end)(
1054                         my_parallel_id, my_task_id);
1055                 }
1056             }
1057             this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1058             if (ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
1059                 ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
1060                     my_parallel_id, my_task_id);
1061             }
1062         } else {
1063             this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1064         }
1065     }
1066 #endif
1067 
1068     if (! team->t.t_serialized) {
1069 #if USE_ITT_BUILD
1070         // This value will be used in itt notify events below.
1071         void *itt_sync_obj = NULL;
1072 # if USE_ITT_NOTIFY
1073         if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1074             itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1075 # endif
1076 #endif /* USE_ITT_BUILD */
1077         if (__kmp_tasking_mode == tskm_extra_barrier) {
1078             __kmp_tasking_barrier(team, this_thr, gtid);
1079             KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n",
1080                           gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1081         }
1082 
1083         /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when
1084            the team struct is not guaranteed to exist. */
1085         // See note about the corresponding code in __kmp_join_barrier() being performance-critical.
1086         if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1087             this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1088             this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1089         }
1090 
1091 #if USE_ITT_BUILD
1092         if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1093             __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1094 #endif /* USE_ITT_BUILD */
1095 
1096         if (reduce != NULL) {
1097             //KMP_DEBUG_ASSERT( is_split == TRUE );  // #C69956
1098             this_thr->th.th_local.reduce_data = reduce_data;
1099         }
1100         switch (__kmp_barrier_gather_pattern[bt]) {
1101         case bp_hyper_bar: {
1102             KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
1103             __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid, reduce
1104                                        USE_ITT_BUILD_ARG(itt_sync_obj) );
1105             break;
1106         }
1107         case bp_hierarchical_bar: {
1108             __kmp_hierarchical_barrier_gather(bt, this_thr, gtid, tid, reduce
1109                                               USE_ITT_BUILD_ARG(itt_sync_obj));
1110             break;
1111         }
1112         case bp_tree_bar: {
1113             KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
1114             __kmp_tree_barrier_gather(bt, this_thr, gtid, tid, reduce
1115                                       USE_ITT_BUILD_ARG(itt_sync_obj) );
1116             break;
1117         }
1118         default: {
1119             __kmp_linear_barrier_gather(bt, this_thr, gtid, tid, reduce
1120                                         USE_ITT_BUILD_ARG(itt_sync_obj) );
1121         }
1122         }
1123 
1124         KMP_MB();
1125 
1126         if (KMP_MASTER_TID(tid)) {
1127             status = 0;
1128             if (__kmp_tasking_mode != tskm_immediate_exec) {
1129                 __kmp_task_team_wait(this_thr, team
1130                                      USE_ITT_BUILD_ARG(itt_sync_obj) );
1131                 __kmp_task_team_setup(this_thr, team, 0); // use 0 to only setup the current team
1132             }
1133 
1134 
1135 #if USE_ITT_BUILD
1136             /* TODO: In case of split reduction barrier, master thread may send acquired event early,
1137                before the final summation into the shared variable is done (final summation can be a
1138                long operation for array reductions).  */
1139             if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1140                 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1141 #endif /* USE_ITT_BUILD */
1142 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1143             // Barrier - report frame end
1144             if (__itt_frame_submit_v3_ptr && __kmp_forkjoin_frames_mode) {
1145                 kmp_uint64 cur_time = __itt_get_timestamp();
1146                 kmp_info_t **other_threads = this_thr->th.th_team->t.t_threads;
1147                 int nproc = this_thr->th.th_team_nproc;
1148                 int i;
1149                 // Initialize with master's wait time
1150                 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1151                 switch(__kmp_forkjoin_frames_mode) {
1152                 case 1:
1153                     __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1154                     this_thr->th.th_frame_time = cur_time;
1155                     break;
1156                 case 2:
1157                     __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
1158                     break;
1159                 case 3:
1160                     if( __itt_metadata_add_ptr ) {
1161                         for (i=1; i<nproc; ++i) {
1162                             delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
1163                         }
1164                         __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, (kmp_uint64)( reduce != NULL));
1165                     }
1166                     __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1167                     this_thr->th.th_frame_time = cur_time;
1168                     break;
1169                 }
1170             }
1171 #endif /* USE_ITT_BUILD */
1172         } else {
1173             status = 1;
1174 #if USE_ITT_BUILD
1175             if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1176                 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1177 #endif /* USE_ITT_BUILD */
1178         }
1179         if (status == 1 || ! is_split) {
1180             switch (__kmp_barrier_release_pattern[bt]) {
1181             case bp_hyper_bar: {
1182                 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1183                 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
1184                                             USE_ITT_BUILD_ARG(itt_sync_obj) );
1185                 break;
1186             }
1187             case bp_hierarchical_bar: {
1188                 __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
1189                                                    USE_ITT_BUILD_ARG(itt_sync_obj) );
1190                 break;
1191             }
1192             case bp_tree_bar: {
1193                 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1194                 __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
1195                                            USE_ITT_BUILD_ARG(itt_sync_obj) );
1196                 break;
1197             }
1198             default: {
1199                 __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
1200                                              USE_ITT_BUILD_ARG(itt_sync_obj) );
1201             }
1202             }
1203             if (__kmp_tasking_mode != tskm_immediate_exec) {
1204                 __kmp_task_team_sync(this_thr, team);
1205             }
1206         }
1207 
1208 #if USE_ITT_BUILD
1209         /* GEH: TODO: Move this under if-condition above and also include in
1210            __kmp_end_split_barrier(). This will more accurately represent the actual release time
1211            of the threads for split barriers.  */
1212         if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1213             __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1214 #endif /* USE_ITT_BUILD */
1215     } else { // Team is serialized.
1216         status = 0;
1217         if (__kmp_tasking_mode != tskm_immediate_exec) {
1218             // The task team should be NULL for serialized code (tasks will be executed immediately)
1219             KMP_DEBUG_ASSERT(team->t.t_task_team[this_thr->th.th_task_state] == NULL);
1220             KMP_DEBUG_ASSERT(this_thr->th.th_task_team == NULL);
1221         }
1222     }
1223     KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1224                   gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid), status));
1225 
1226 #if OMPT_SUPPORT
1227     if (ompt_status & ompt_status_track) {
1228 #if OMPT_TRACE
1229         if ((ompt_status == ompt_status_track_callback) &&
1230             ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
1231             ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
1232                 my_parallel_id, my_task_id);
1233         }
1234 #endif
1235         this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1236     }
1237 #endif
1238 
1239     return status;
1240 }
1241 
1242 
1243 void
1244 __kmp_end_split_barrier(enum barrier_type bt, int gtid)
1245 {
1246     KMP_TIME_BLOCK(KMP_end_split_barrier);
1247     int tid = __kmp_tid_from_gtid(gtid);
1248     kmp_info_t *this_thr = __kmp_threads[gtid];
1249     kmp_team_t *team = this_thr->th.th_team;
1250 
1251     if (!team->t.t_serialized) {
1252         if (KMP_MASTER_GTID(gtid)) {
1253             switch (__kmp_barrier_release_pattern[bt]) {
1254             case bp_hyper_bar: {
1255                 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1256                 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
1257                                             USE_ITT_BUILD_ARG(NULL) );
1258                 break;
1259             }
1260             case bp_hierarchical_bar: {
1261                 __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
1262                                                    USE_ITT_BUILD_ARG(NULL));
1263                 break;
1264             }
1265             case bp_tree_bar: {
1266                 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1267                 __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
1268                                            USE_ITT_BUILD_ARG(NULL) );
1269                 break;
1270             }
1271             default: {
1272                 __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
1273                                              USE_ITT_BUILD_ARG(NULL) );
1274             }
1275             }
1276             if (__kmp_tasking_mode != tskm_immediate_exec) {
1277                 __kmp_task_team_sync(this_thr, team);
1278             } // if
1279         }
1280     }
1281 }
1282 
1283 
1284 void
1285 __kmp_join_barrier(int gtid)
1286 {
1287     KMP_TIME_BLOCK(KMP_join_barrier);
1288     register kmp_info_t *this_thr = __kmp_threads[gtid];
1289     register kmp_team_t *team;
1290     register kmp_uint nproc;
1291     kmp_info_t *master_thread;
1292     int tid;
1293 #ifdef KMP_DEBUG
1294     int team_id;
1295 #endif /* KMP_DEBUG */
1296 #if USE_ITT_BUILD
1297     void *itt_sync_obj = NULL;
1298 # if USE_ITT_NOTIFY
1299     if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
1300         // Get object created at fork_barrier
1301         itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1302 # endif
1303 #endif /* USE_ITT_BUILD */
1304     KMP_MB();
1305 
1306     // Get current info
1307     team = this_thr->th.th_team;
1308     nproc = this_thr->th.th_team_nproc;
1309     KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
1310     tid = __kmp_tid_from_gtid(gtid);
1311 #ifdef KMP_DEBUG
1312     team_id = team->t.t_id;
1313 #endif /* KMP_DEBUG */
1314     master_thread = this_thr->th.th_team_master;
1315 #ifdef KMP_DEBUG
1316     if (master_thread != team->t.t_threads[0]) {
1317         __kmp_print_structure();
1318     }
1319 #endif /* KMP_DEBUG */
1320     KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1321     KMP_MB();
1322 
1323     // Verify state
1324     KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1325     KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1326     KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1327     KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1328     KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n", gtid, team_id, tid));
1329 
1330 #if OMPT_SUPPORT && OMPT_TRACE
1331     if ((ompt_status == ompt_status_track_callback) &&
1332         ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
1333         ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
1334             team->t.ompt_team_info.parallel_id,
1335             team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1336     }
1337     this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1338 #endif
1339 
1340     if (__kmp_tasking_mode == tskm_extra_barrier) {
1341         __kmp_tasking_barrier(team, this_thr, gtid);
1342         KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past taking barrier\n", gtid, team_id, tid));
1343     }
1344 # ifdef KMP_DEBUG
1345     if (__kmp_tasking_mode != tskm_immediate_exec) {
1346         KA_TRACE(20, ( "__kmp_join_barrier: T#%d, old team = %d, old task_team = %p, th_task_team = %p\n",
1347                        __kmp_gtid_from_thread(this_thr), team_id, team->t.t_task_team[this_thr->th.th_task_state],
1348                        this_thr->th.th_task_team));
1349         KMP_DEBUG_ASSERT(this_thr->th.th_task_team == team->t.t_task_team[this_thr->th.th_task_state]);
1350     }
1351 # endif /* KMP_DEBUG */
1352 
1353     /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when the
1354        team struct is not guaranteed to exist. Doing these loads causes a cache miss slows
1355        down EPCC parallel by 2x. As a workaround, we do not perform the copy if blocktime=infinite,
1356        since the values are not used by __kmp_wait_template() in that case. */
1357     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1358         this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1359         this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1360     }
1361 
1362 #if USE_ITT_BUILD
1363     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1364         __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1365 #endif /* USE_ITT_BUILD */
1366 
1367     switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1368     case bp_hyper_bar: {
1369         KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1370         __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1371                                    USE_ITT_BUILD_ARG(itt_sync_obj) );
1372         break;
1373     }
1374     case bp_hierarchical_bar: {
1375         __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1376                                           USE_ITT_BUILD_ARG(itt_sync_obj) );
1377         break;
1378     }
1379     case bp_tree_bar: {
1380         KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1381         __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1382                                   USE_ITT_BUILD_ARG(itt_sync_obj) );
1383         break;
1384     }
1385     default: {
1386         __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1387                                     USE_ITT_BUILD_ARG(itt_sync_obj) );
1388     }
1389     }
1390 
1391     /* From this point on, the team data structure may be deallocated at any time by the
1392        master thread - it is unsafe to reference it in any of the worker threads. Any per-team
1393        data items that need to be referenced before the end of the barrier should be moved to
1394        the kmp_task_team_t structs.  */
1395     if (KMP_MASTER_TID(tid)) {
1396         if (__kmp_tasking_mode != tskm_immediate_exec) {
1397             // Master shouldn't call decrease_load().         // TODO: enable master threads.
1398             // Master should have th_may_decrease_load == 0.  // TODO: enable master threads.
1399             __kmp_task_team_wait(this_thr, team
1400                                  USE_ITT_BUILD_ARG(itt_sync_obj) );
1401         }
1402 #if USE_ITT_BUILD
1403         if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1404             __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1405 #endif /* USE_ITT_BUILD */
1406 
1407 # if USE_ITT_BUILD && USE_ITT_NOTIFY
1408         // Join barrier - report frame end
1409         if (__itt_frame_submit_v3_ptr && __kmp_forkjoin_frames_mode) {
1410             kmp_uint64 cur_time = __itt_get_timestamp();
1411             ident_t * loc = team->t.t_ident;
1412             kmp_info_t **other_threads = this_thr->th.th_team->t.t_threads;
1413             int nproc = this_thr->th.th_team_nproc;
1414             int i;
1415             // Initialize with master's wait time
1416             kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1417             switch(__kmp_forkjoin_frames_mode) {
1418             case 1:
1419                 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1420                 break;
1421             case 2:
1422                 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
1423                 break;
1424             case 3:
1425                 if( __itt_metadata_add_ptr ) {
1426                     for (i=1; i<nproc; ++i) {
1427                         delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
1428                     }
1429                     __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, 0);
1430                 }
1431                 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1432                 this_thr->th.th_frame_time = cur_time;
1433                 break;
1434             }
1435         }
1436 # endif /* USE_ITT_BUILD */
1437     }
1438 #if USE_ITT_BUILD
1439     else {
1440         if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1441             __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1442     }
1443 #endif /* USE_ITT_BUILD */
1444 
1445 #if KMP_DEBUG
1446     if (KMP_MASTER_TID(tid)) {
1447         KA_TRACE(15, ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1448                       gtid, team_id, tid, nproc));
1449     }
1450 #endif /* KMP_DEBUG */
1451 
1452     // TODO now, mark worker threads as done so they may be disbanded
1453     KMP_MB(); // Flush all pending memory write invalidates.
1454     KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
1455 
1456 #if OMPT_SUPPORT
1457     if (ompt_status == ompt_status_track) {
1458 #if OMPT_TRACE
1459         if ((ompt_status == ompt_status_track_callback) &&
1460             ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
1461             ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
1462                 team->t.ompt_team_info.parallel_id,
1463                 team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1464        }
1465 #endif
1466 
1467         // return to default state
1468         this_thr->th.ompt_thread_info.state = ompt_state_overhead;
1469     }
1470 #endif
1471 }
1472 
1473 
1474 // TODO release worker threads' fork barriers as we are ready instead of all at once
1475 void
1476 __kmp_fork_barrier(int gtid, int tid)
1477 {
1478     KMP_TIME_BLOCK(KMP_fork_barrier);
1479     kmp_info_t *this_thr = __kmp_threads[gtid];
1480     kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
1481 #if USE_ITT_BUILD
1482     void * itt_sync_obj = NULL;
1483 #endif /* USE_ITT_BUILD */
1484 
1485     KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n",
1486                   gtid, (team != NULL) ? team->t.t_id : -1, tid));
1487 
1488     // th_team pointer only valid for master thread here
1489     if (KMP_MASTER_TID(tid)) {
1490 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1491         if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1492             // Create itt barrier object
1493             itt_sync_obj  = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
1494             __kmp_itt_barrier_middle(gtid, itt_sync_obj);  // Call acquired/releasing
1495         }
1496 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1497 
1498 #ifdef KMP_DEBUG
1499         register kmp_info_t **other_threads = team->t.t_threads;
1500         register int i;
1501 
1502         // Verify state
1503         KMP_MB();
1504 
1505         for(i=1; i<team->t.t_nproc; ++i) {
1506             KA_TRACE(500, ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go == %u.\n",
1507                            gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
1508                            team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
1509                            other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
1510             KMP_DEBUG_ASSERT((TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go)
1511                               & ~(KMP_BARRIER_SLEEP_STATE))
1512                              == KMP_INIT_BARRIER_STATE);
1513             KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
1514         }
1515 #endif
1516 
1517         if (__kmp_tasking_mode != tskm_immediate_exec) {
1518             __kmp_task_team_setup(this_thr, team, 1);  // 1 indicates setup both task teams
1519         }
1520 
1521         /* The master thread may have changed its blocktime between the join barrier and the
1522            fork barrier. Copy the blocktime info to the thread, where __kmp_wait_template() can
1523            access it when the team struct is not guaranteed to exist. */
1524         // See note about the corresponding code in __kmp_join_barrier() being performance-critical
1525         if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1526             this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1527             this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1528         }
1529     } // master
1530 
1531     switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
1532     case bp_hyper_bar: {
1533         KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1534         __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1535                                     USE_ITT_BUILD_ARG(itt_sync_obj) );
1536         break;
1537     }
1538     case bp_hierarchical_bar: {
1539         __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1540                                            USE_ITT_BUILD_ARG(itt_sync_obj) );
1541         break;
1542     }
1543     case bp_tree_bar: {
1544         KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1545         __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1546                                    USE_ITT_BUILD_ARG(itt_sync_obj) );
1547         break;
1548     }
1549     default: {
1550         __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1551                                      USE_ITT_BUILD_ARG(itt_sync_obj) );
1552     }
1553     }
1554 
1555     // Early exit for reaping threads releasing forkjoin barrier
1556     if (TCR_4(__kmp_global.g.g_done)) {
1557         if (this_thr->th.th_task_team != NULL) {
1558             if (KMP_MASTER_TID(tid)) {
1559                 TCW_PTR(this_thr->th.th_task_team, NULL);
1560             }
1561             else {
1562                 __kmp_unref_task_team(this_thr->th.th_task_team, this_thr);
1563             }
1564         }
1565 
1566 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1567         if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1568             if (!KMP_MASTER_TID(tid)) {
1569                 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1570                 if (itt_sync_obj)
1571                     __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1572             }
1573         }
1574 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1575         KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
1576         return;
1577     }
1578 
1579     /* We can now assume that a valid team structure has been allocated by the master and
1580        propagated to all worker threads. The current thread, however, may not be part of the
1581        team, so we can't blindly assume that the team pointer is non-null.  */
1582     team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
1583     KMP_DEBUG_ASSERT(team != NULL);
1584     tid = __kmp_tid_from_gtid(gtid);
1585 
1586 
1587 #if KMP_BARRIER_ICV_PULL
1588     /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1589        __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
1590        this data before this function is called. We cannot modify __kmp_fork_call() to look at
1591        the fixed ICVs in the master's thread struct, because it is not always the case that the
1592        threads arrays have been allocated when __kmp_fork_call() is executed. */
1593     KMP_START_EXPLICIT_TIMER(USER_icv_copy);
1594     if (!KMP_MASTER_TID(tid)) {  // master thread already has ICVs
1595         // Copy the initial ICVs from the master's thread struct to the implicit task for this tid.
1596         KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
1597         __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
1598         copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1599                   &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs);
1600     }
1601     KMP_STOP_EXPLICIT_TIMER(USER_icv_copy);
1602 #endif // KMP_BARRIER_ICV_PULL
1603 
1604     if (__kmp_tasking_mode != tskm_immediate_exec) {
1605         __kmp_task_team_sync(this_thr, team);
1606     }
1607 
1608 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1609     kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
1610     if (proc_bind == proc_bind_intel) {
1611 #endif
1612 #if KMP_AFFINITY_SUPPORTED
1613         // Call dynamic affinity settings
1614         if(__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
1615             __kmp_balanced_affinity(tid, team->t.t_nproc);
1616         }
1617 #endif // KMP_AFFINITY_SUPPORTED
1618 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1619     }
1620     else if (proc_bind != proc_bind_false) {
1621         if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
1622             KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
1623                            __kmp_gtid_from_thread(this_thr), this_thr->th.th_current_place));
1624         }
1625         else {
1626             __kmp_affinity_set_place(gtid);
1627         }
1628     }
1629 #endif
1630 
1631 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1632     if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1633         if (!KMP_MASTER_TID(tid)) {
1634             // Get correct barrier object
1635             itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1636             __kmp_itt_barrier_finished(gtid, itt_sync_obj);  // Workers call acquired
1637         } // (prepare called inside barrier_release)
1638     }
1639 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1640     KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid, team->t.t_id, tid));
1641 }
1642 
1643 
1644 void
1645 __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, kmp_internal_control_t *new_icvs, ident_t *loc )
1646 {
1647     KMP_TIME_BLOCK(KMP_setup_icv_copy);
1648     int f;
1649 
1650     KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
1651     KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
1652 
1653     /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1654        __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
1655        this data before this function is called. */
1656 #if KMP_BARRIER_ICV_PULL
1657     /* Copy ICVs to master's thread structure into th_fixed_icvs (which remains untouched), where
1658        all of the worker threads can access them and make their own copies after the barrier. */
1659     KMP_DEBUG_ASSERT(team->t.t_threads[0]);  // The threads arrays should be allocated at this point
1660     copy_icvs(&team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs, new_icvs);
1661     KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n",
1662                   0, team->t.t_threads[0], team));
1663 #elif KMP_BARRIER_ICV_PUSH
1664     // The ICVs will be propagated in the fork barrier, so nothing needs to be done here.
1665     KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n",
1666                   0, team->t.t_threads[0], team));
1667 #else
1668     // Copy the ICVs to each of the non-master threads.  This takes O(nthreads) time.
1669     ngo_load(new_icvs);
1670     KMP_DEBUG_ASSERT(team->t.t_threads[0]);  // The threads arrays should be allocated at this point
1671     for (f=1; f<new_nproc; ++f) { // Skip the master thread
1672         // TODO: GEH - pass in better source location info since usually NULL here
1673         KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
1674                       f, team->t.t_threads[f], team));
1675         __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
1676         ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
1677         KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
1678                       f, team->t.t_threads[f], team));
1679     }
1680     ngo_sync();
1681 #endif // KMP_BARRIER_ICV_PULL
1682 }
1683