1 /*
2  * kmp_barrier.cpp
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 //                     The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "kmp.h"
17 #include "kmp_wait_release.h"
18 #include "kmp_itt.h"
19 #include "kmp_os.h"
20 #include "kmp_stats.h"
21 
22 
23 #if KMP_MIC
24 #include <immintrin.h>
25 #define USE_NGO_STORES 1
26 #endif // KMP_MIC
27 
28 #include "tsan_annotations.h"
29 
30 #if KMP_MIC && USE_NGO_STORES
31 // ICV copying
32 #define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
33 #define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
34 #define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
35 #define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
36 #else
37 #define ngo_load(src) ((void)0)
38 #define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
39 #define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
40 #define ngo_sync() ((void)0)
41 #endif /* KMP_MIC && USE_NGO_STORES */
42 
43 void __kmp_print_structure(void); // Forward declaration
44 
45 // ---------------------------- Barrier Algorithms ----------------------------
46 
47 // Linear Barrier
48 static void __kmp_linear_barrier_gather(
49     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
50     void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
51   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
52   register kmp_team_t *team = this_thr->th.th_team;
53   register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
54   register kmp_info_t **other_threads = team->t.t_threads;
55 
56   KA_TRACE(
57       20,
58       ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
59        gtid, team->t.t_id, tid, bt));
60   KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
61 
62 #if USE_ITT_BUILD && USE_ITT_NOTIFY
63   // Barrier imbalance - save arrive time to the thread
64   if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
65     this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
66         __itt_get_timestamp();
67   }
68 #endif
69   // We now perform a linear reduction to signal that all of the threads have
70   // arrived.
71   if (!KMP_MASTER_TID(tid)) {
72     KA_TRACE(20,
73              ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
74               "arrived(%p): %llu => %llu\n",
75               gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
76               team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
77               thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
78     // Mark arrival to master thread
79     /* After performing this write, a worker thread may not assume that the team
80        is valid any more - it could be deallocated by the master thread at any
81        time. */
82     ANNOTATE_BARRIER_BEGIN(this_thr);
83     kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
84     flag.release();
85   } else {
86     register kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
87     register int nproc = this_thr->th.th_team_nproc;
88     register int i;
89     // Don't have to worry about sleep bit here or atomic since team setting
90     register kmp_uint64 new_state =
91         team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
92 
93     // Collect all the worker team member threads.
94     for (i = 1; i < nproc; ++i) {
95 #if KMP_CACHE_MANAGE
96       // Prefetch next thread's arrived count
97       if (i + 1 < nproc)
98         KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
99 #endif /* KMP_CACHE_MANAGE */
100       KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
101                     "arrived(%p) == %llu\n",
102                     gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
103                     team->t.t_id, i,
104                     &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
105 
106       // Wait for worker thread to arrive
107       kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
108                        new_state);
109       flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
110       ANNOTATE_BARRIER_END(other_threads[i]);
111 #if USE_ITT_BUILD && USE_ITT_NOTIFY
112       // Barrier imbalance - write min of the thread time and the other thread
113       // time to the thread.
114       if (__kmp_forkjoin_frames_mode == 2) {
115         this_thr->th.th_bar_min_time = KMP_MIN(
116             this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
117       }
118 #endif
119       if (reduce) {
120         KA_TRACE(100,
121                  ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
122                   gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
123                   team->t.t_id, i));
124         ANNOTATE_REDUCE_AFTER(reduce);
125         (*reduce)(this_thr->th.th_local.reduce_data,
126                   other_threads[i]->th.th_local.reduce_data);
127         ANNOTATE_REDUCE_BEFORE(reduce);
128         ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
129       }
130     }
131     // Don't have to worry about sleep bit here or atomic since team setting
132     team_bar->b_arrived = new_state;
133     KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
134                   "arrived(%p) = %llu\n",
135                   gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
136                   new_state));
137   }
138   KA_TRACE(
139       20,
140       ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
141        gtid, team->t.t_id, tid, bt));
142 }
143 
144 static void __kmp_linear_barrier_release(
145     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
146     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
147   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
148   register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
149   register kmp_team_t *team;
150 
151   if (KMP_MASTER_TID(tid)) {
152     register unsigned int i;
153     register kmp_uint32 nproc = this_thr->th.th_team_nproc;
154     register kmp_info_t **other_threads;
155 
156     team = __kmp_threads[gtid]->th.th_team;
157     KMP_DEBUG_ASSERT(team != NULL);
158     other_threads = team->t.t_threads;
159 
160     KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for "
161                   "barrier type %d\n",
162                   gtid, team->t.t_id, tid, bt));
163 
164     if (nproc > 1) {
165 #if KMP_BARRIER_ICV_PUSH
166       {
167         KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
168         if (propagate_icvs) {
169           ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
170           for (i = 1; i < nproc; ++i) {
171             __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
172                                      team, i, FALSE);
173             ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
174                            &team->t.t_implicit_task_taskdata[0].td_icvs);
175           }
176           ngo_sync();
177         }
178       }
179 #endif // KMP_BARRIER_ICV_PUSH
180 
181       // Now, release all of the worker threads
182       for (i = 1; i < nproc; ++i) {
183 #if KMP_CACHE_MANAGE
184         // Prefetch next thread's go flag
185         if (i + 1 < nproc)
186           KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
187 #endif /* KMP_CACHE_MANAGE */
188         KA_TRACE(
189             20,
190             ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
191              "go(%p): %u => %u\n",
192              gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
193              team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
194              other_threads[i]->th.th_bar[bt].bb.b_go,
195              other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
196         ANNOTATE_BARRIER_BEGIN(other_threads[i]);
197         kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
198                          other_threads[i]);
199         flag.release();
200       }
201     }
202   } else { // Wait for the MASTER thread to release us
203     KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
204                   gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
205     kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
206     flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
207     ANNOTATE_BARRIER_END(this_thr);
208 #if USE_ITT_BUILD && USE_ITT_NOTIFY
209     if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
210       // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
211       // disabled)
212       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
213       // Cancel wait on previous parallel region...
214       __kmp_itt_task_starting(itt_sync_obj);
215 
216       if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
217         return;
218 
219       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
220       if (itt_sync_obj != NULL)
221         // Call prepare as early as possible for "new" barrier
222         __kmp_itt_task_finished(itt_sync_obj);
223     } else
224 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
225         // Early exit for reaping threads releasing forkjoin barrier
226         if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
227       return;
228 // The worker thread may now assume that the team is valid.
229 #ifdef KMP_DEBUG
230     tid = __kmp_tid_from_gtid(gtid);
231     team = __kmp_threads[gtid]->th.th_team;
232 #endif
233     KMP_DEBUG_ASSERT(team != NULL);
234     TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
235     KA_TRACE(20,
236              ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
237               gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
238     KMP_MB(); // Flush all pending memory write invalidates.
239   }
240   KA_TRACE(
241       20,
242       ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
243        gtid, team->t.t_id, tid, bt));
244 }
245 
246 // Tree barrier
247 static void
248 __kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
249                           int tid, void (*reduce)(void *, void *)
250                                        USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
251   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
252   register kmp_team_t *team = this_thr->th.th_team;
253   register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
254   register kmp_info_t **other_threads = team->t.t_threads;
255   register kmp_uint32 nproc = this_thr->th.th_team_nproc;
256   register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
257   register kmp_uint32 branch_factor = 1 << branch_bits;
258   register kmp_uint32 child;
259   register kmp_uint32 child_tid;
260   register kmp_uint64 new_state;
261 
262   KA_TRACE(
263       20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
264            gtid, team->t.t_id, tid, bt));
265   KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
266 
267 #if USE_ITT_BUILD && USE_ITT_NOTIFY
268   // Barrier imbalance - save arrive time to the thread
269   if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
270     this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
271         __itt_get_timestamp();
272   }
273 #endif
274   // Perform tree gather to wait until all threads have arrived; reduce any
275   // required data as we go
276   child_tid = (tid << branch_bits) + 1;
277   if (child_tid < nproc) {
278     // Parent threads wait for all their children to arrive
279     new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
280     child = 1;
281     do {
282       register kmp_info_t *child_thr = other_threads[child_tid];
283       register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
284 #if KMP_CACHE_MANAGE
285       // Prefetch next thread's arrived count
286       if (child + 1 <= branch_factor && child_tid + 1 < nproc)
287         KMP_CACHE_PREFETCH(
288             &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
289 #endif /* KMP_CACHE_MANAGE */
290       KA_TRACE(20,
291                ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
292                 "arrived(%p) == %llu\n",
293                 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
294                 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
295       // Wait for child to arrive
296       kmp_flag_64 flag(&child_bar->b_arrived, new_state);
297       flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
298       ANNOTATE_BARRIER_END(child_thr);
299 #if USE_ITT_BUILD && USE_ITT_NOTIFY
300       // Barrier imbalance - write min of the thread time and a child time to
301       // the thread.
302       if (__kmp_forkjoin_frames_mode == 2) {
303         this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
304                                                child_thr->th.th_bar_min_time);
305       }
306 #endif
307       if (reduce) {
308         KA_TRACE(100,
309                  ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
310                   gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
311                   team->t.t_id, child_tid));
312         ANNOTATE_REDUCE_AFTER(reduce);
313         (*reduce)(this_thr->th.th_local.reduce_data,
314                   child_thr->th.th_local.reduce_data);
315         ANNOTATE_REDUCE_BEFORE(reduce);
316         ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
317       }
318       child++;
319       child_tid++;
320     } while (child <= branch_factor && child_tid < nproc);
321   }
322 
323   if (!KMP_MASTER_TID(tid)) { // Worker threads
324     register kmp_int32 parent_tid = (tid - 1) >> branch_bits;
325 
326     KA_TRACE(20,
327              ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
328               "arrived(%p): %llu => %llu\n",
329               gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
330               team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
331               thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
332 
333     // Mark arrival to parent thread
334     /* After performing this write, a worker thread may not assume that the team
335        is valid any more - it could be deallocated by the master thread at any
336        time.  */
337     ANNOTATE_BARRIER_BEGIN(this_thr);
338     kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
339     flag.release();
340   } else {
341     // Need to update the team arrived pointer if we are the master thread
342     if (nproc > 1) // New value was already computed above
343       team->t.t_bar[bt].b_arrived = new_state;
344     else
345       team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
346     KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
347                   "arrived(%p) = %llu\n",
348                   gtid, team->t.t_id, tid, team->t.t_id,
349                   &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
350   }
351   KA_TRACE(20,
352            ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
353             gtid, team->t.t_id, tid, bt));
354 }
355 
356 static void __kmp_tree_barrier_release(
357     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
358     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
359   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
360   register kmp_team_t *team;
361   register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
362   register kmp_uint32 nproc;
363   register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
364   register kmp_uint32 branch_factor = 1 << branch_bits;
365   register kmp_uint32 child;
366   register kmp_uint32 child_tid;
367 
368   // Perform a tree release for all of the threads that have been gathered
369   if (!KMP_MASTER_TID(
370           tid)) { // Handle fork barrier workers who aren't part of a team yet
371     KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
372                   &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
373     // Wait for parent thread to release us
374     kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
375     flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
376     ANNOTATE_BARRIER_END(this_thr);
377 #if USE_ITT_BUILD && USE_ITT_NOTIFY
378     if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
379       // In fork barrier where we could not get the object reliably (or
380       // ITTNOTIFY is disabled)
381       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
382       // Cancel wait on previous parallel region...
383       __kmp_itt_task_starting(itt_sync_obj);
384 
385       if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
386         return;
387 
388       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
389       if (itt_sync_obj != NULL)
390         // Call prepare as early as possible for "new" barrier
391         __kmp_itt_task_finished(itt_sync_obj);
392     } else
393 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
394         // Early exit for reaping threads releasing forkjoin barrier
395         if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
396       return;
397 
398     // The worker thread may now assume that the team is valid.
399     team = __kmp_threads[gtid]->th.th_team;
400     KMP_DEBUG_ASSERT(team != NULL);
401     tid = __kmp_tid_from_gtid(gtid);
402 
403     TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
404     KA_TRACE(20,
405              ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
406               team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
407     KMP_MB(); // Flush all pending memory write invalidates.
408   } else {
409     team = __kmp_threads[gtid]->th.th_team;
410     KMP_DEBUG_ASSERT(team != NULL);
411     KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for "
412                   "barrier type %d\n",
413                   gtid, team->t.t_id, tid, bt));
414   }
415   nproc = this_thr->th.th_team_nproc;
416   child_tid = (tid << branch_bits) + 1;
417 
418   if (child_tid < nproc) {
419     register kmp_info_t **other_threads = team->t.t_threads;
420     child = 1;
421     // Parent threads release all their children
422     do {
423       register kmp_info_t *child_thr = other_threads[child_tid];
424       register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
425 #if KMP_CACHE_MANAGE
426       // Prefetch next thread's go count
427       if (child + 1 <= branch_factor && child_tid + 1 < nproc)
428         KMP_CACHE_PREFETCH(
429             &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
430 #endif /* KMP_CACHE_MANAGE */
431 
432 #if KMP_BARRIER_ICV_PUSH
433       {
434         KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
435         if (propagate_icvs) {
436           __kmp_init_implicit_task(team->t.t_ident,
437                                    team->t.t_threads[child_tid], team,
438                                    child_tid, FALSE);
439           copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
440                     &team->t.t_implicit_task_taskdata[0].td_icvs);
441         }
442       }
443 #endif // KMP_BARRIER_ICV_PUSH
444       KA_TRACE(20,
445                ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
446                 "go(%p): %u => %u\n",
447                 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
448                 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
449                 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
450       // Release child from barrier
451       ANNOTATE_BARRIER_BEGIN(child_thr);
452       kmp_flag_64 flag(&child_bar->b_go, child_thr);
453       flag.release();
454       child++;
455       child_tid++;
456     } while (child <= branch_factor && child_tid < nproc);
457   }
458   KA_TRACE(
459       20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
460            gtid, team->t.t_id, tid, bt));
461 }
462 
463 // Hyper Barrier
464 static void
465 __kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
466                            int tid, void (*reduce)(void *, void *)
467                                         USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
468   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
469   register kmp_team_t *team = this_thr->th.th_team;
470   register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
471   register kmp_info_t **other_threads = team->t.t_threads;
472   register kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
473   register kmp_uint32 num_threads = this_thr->th.th_team_nproc;
474   register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
475   register kmp_uint32 branch_factor = 1 << branch_bits;
476   register kmp_uint32 offset;
477   register kmp_uint32 level;
478 
479   KA_TRACE(
480       20,
481       ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
482        gtid, team->t.t_id, tid, bt));
483   KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
484 
485 #if USE_ITT_BUILD && USE_ITT_NOTIFY
486   // Barrier imbalance - save arrive time to the thread
487   if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
488     this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
489         __itt_get_timestamp();
490   }
491 #endif
492   /* Perform a hypercube-embedded tree gather to wait until all of the threads
493      have arrived, and reduce any required data as we go.  */
494   kmp_flag_64 p_flag(&thr_bar->b_arrived);
495   for (level = 0, offset = 1; offset < num_threads;
496        level += branch_bits, offset <<= branch_bits) {
497     register kmp_uint32 child;
498     register kmp_uint32 child_tid;
499 
500     if (((tid >> level) & (branch_factor - 1)) != 0) {
501       register kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);
502 
503       KA_TRACE(20,
504                ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
505                 "arrived(%p): %llu => %llu\n",
506                 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
507                 team->t.t_id, parent_tid, &thr_bar->b_arrived,
508                 thr_bar->b_arrived,
509                 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
510       // Mark arrival to parent thread
511       /* After performing this write (in the last iteration of the enclosing for
512          loop), a worker thread may not assume that the team is valid any more
513          - it could be deallocated by the master thread at any time.  */
514       ANNOTATE_BARRIER_BEGIN(this_thr);
515       p_flag.set_waiter(other_threads[parent_tid]);
516       p_flag.release();
517       break;
518     }
519 
520     // Parent threads wait for children to arrive
521     if (new_state == KMP_BARRIER_UNUSED_STATE)
522       new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
523     for (child = 1, child_tid = tid + (1 << level);
524          child < branch_factor && child_tid < num_threads;
525          child++, child_tid += (1 << level)) {
526       register kmp_info_t *child_thr = other_threads[child_tid];
527       register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
528 #if KMP_CACHE_MANAGE
529       register kmp_uint32 next_child_tid = child_tid + (1 << level);
530       // Prefetch next thread's arrived count
531       if (child + 1 < branch_factor && next_child_tid < num_threads)
532         KMP_CACHE_PREFETCH(
533             &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
534 #endif /* KMP_CACHE_MANAGE */
535       KA_TRACE(20,
536                ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
537                 "arrived(%p) == %llu\n",
538                 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
539                 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
540       // Wait for child to arrive
541       kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
542       c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
543       ANNOTATE_BARRIER_END(child_thr);
544 #if USE_ITT_BUILD && USE_ITT_NOTIFY
545       // Barrier imbalance - write min of the thread time and a child time to
546       // the thread.
547       if (__kmp_forkjoin_frames_mode == 2) {
548         this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
549                                                child_thr->th.th_bar_min_time);
550       }
551 #endif
552       if (reduce) {
553         KA_TRACE(100,
554                  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
555                   gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
556                   team->t.t_id, child_tid));
557         ANNOTATE_REDUCE_AFTER(reduce);
558         (*reduce)(this_thr->th.th_local.reduce_data,
559                   child_thr->th.th_local.reduce_data);
560         ANNOTATE_REDUCE_BEFORE(reduce);
561         ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
562       }
563     }
564   }
565 
566   if (KMP_MASTER_TID(tid)) {
567     // Need to update the team arrived pointer if we are the master thread
568     if (new_state == KMP_BARRIER_UNUSED_STATE)
569       team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
570     else
571       team->t.t_bar[bt].b_arrived = new_state;
572     KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
573                   "arrived(%p) = %llu\n",
574                   gtid, team->t.t_id, tid, team->t.t_id,
575                   &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
576   }
577   KA_TRACE(
578       20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
579            gtid, team->t.t_id, tid, bt));
580 }
581 
582 // The reverse versions seem to beat the forward versions overall
583 #define KMP_REVERSE_HYPER_BAR
584 static void __kmp_hyper_barrier_release(
585     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
586     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
587   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
588   register kmp_team_t *team;
589   register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
590   register kmp_info_t **other_threads;
591   register kmp_uint32 num_threads;
592   register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
593   register kmp_uint32 branch_factor = 1 << branch_bits;
594   register kmp_uint32 child;
595   register kmp_uint32 child_tid;
596   register kmp_uint32 offset;
597   register kmp_uint32 level;
598 
599   /* Perform a hypercube-embedded tree release for all of the threads that have
600      been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
601      are released in the reverse order of the corresponding gather, otherwise
602      threads are released in the same order. */
603   if (KMP_MASTER_TID(tid)) { // master
604     team = __kmp_threads[gtid]->th.th_team;
605     KMP_DEBUG_ASSERT(team != NULL);
606     KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for "
607                   "barrier type %d\n",
608                   gtid, team->t.t_id, tid, bt));
609 #if KMP_BARRIER_ICV_PUSH
610     if (propagate_icvs) { // master already has ICVs in final destination; copy
611       copy_icvs(&thr_bar->th_fixed_icvs,
612                 &team->t.t_implicit_task_taskdata[tid].td_icvs);
613     }
614 #endif
615   } else { // Handle fork barrier workers who aren't part of a team yet
616     KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
617                   &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
618     // Wait for parent thread to release us
619     kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
620     flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
621     ANNOTATE_BARRIER_END(this_thr);
622 #if USE_ITT_BUILD && USE_ITT_NOTIFY
623     if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
624       // In fork barrier where we could not get the object reliably
625       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
626       // Cancel wait on previous parallel region...
627       __kmp_itt_task_starting(itt_sync_obj);
628 
629       if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
630         return;
631 
632       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
633       if (itt_sync_obj != NULL)
634         // Call prepare as early as possible for "new" barrier
635         __kmp_itt_task_finished(itt_sync_obj);
636     } else
637 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
638         // Early exit for reaping threads releasing forkjoin barrier
639         if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
640       return;
641 
642     // The worker thread may now assume that the team is valid.
643     team = __kmp_threads[gtid]->th.th_team;
644     KMP_DEBUG_ASSERT(team != NULL);
645     tid = __kmp_tid_from_gtid(gtid);
646 
647     TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
648     KA_TRACE(20,
649              ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
650               gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
651     KMP_MB(); // Flush all pending memory write invalidates.
652   }
653   num_threads = this_thr->th.th_team_nproc;
654   other_threads = team->t.t_threads;
655 
656 #ifdef KMP_REVERSE_HYPER_BAR
657   // Count up to correct level for parent
658   for (level = 0, offset = 1;
659        offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
660        level += branch_bits, offset <<= branch_bits)
661     ;
662 
663   // Now go down from there
664   for (level -= branch_bits, offset >>= branch_bits; offset != 0;
665        level -= branch_bits, offset >>= branch_bits)
666 #else
667   // Go down the tree, level by level
668   for (level = 0, offset = 1; offset < num_threads;
669        level += branch_bits, offset <<= branch_bits)
670 #endif // KMP_REVERSE_HYPER_BAR
671   {
672 #ifdef KMP_REVERSE_HYPER_BAR
673     /* Now go in reverse order through the children, highest to lowest.
674        Initial setting of child is conservative here. */
675     child = num_threads >> ((level == 0) ? level : level - 1);
676     for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
677         child_tid = tid + (child << level);
678          child >= 1; child--, child_tid -= (1 << level))
679 #else
680     if (((tid >> level) & (branch_factor - 1)) != 0)
681       // No need to go lower than this, since this is the level parent would be
682       // notified
683       break;
684     // Iterate through children on this level of the tree
685     for (child = 1, child_tid = tid + (1 << level);
686          child < branch_factor && child_tid < num_threads;
687          child++, child_tid += (1 << level))
688 #endif // KMP_REVERSE_HYPER_BAR
689     {
690       if (child_tid >= num_threads)
691         continue; // Child doesn't exist so keep going
692       else {
693         register kmp_info_t *child_thr = other_threads[child_tid];
694         register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
695 #if KMP_CACHE_MANAGE
696         register kmp_uint32 next_child_tid = child_tid - (1 << level);
697 // Prefetch next thread's go count
698 #ifdef KMP_REVERSE_HYPER_BAR
699         if (child - 1 >= 1 && next_child_tid < num_threads)
700 #else
701         if (child + 1 < branch_factor && next_child_tid < num_threads)
702 #endif // KMP_REVERSE_HYPER_BAR
703           KMP_CACHE_PREFETCH(
704               &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
705 #endif /* KMP_CACHE_MANAGE */
706 
707 #if KMP_BARRIER_ICV_PUSH
708         if (propagate_icvs) // push my fixed ICVs to my child
709           copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
710 #endif // KMP_BARRIER_ICV_PUSH
711 
712         KA_TRACE(
713             20,
714             ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
715              "go(%p): %u => %u\n",
716              gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
717              team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
718              child_bar->b_go + KMP_BARRIER_STATE_BUMP));
719         // Release child from barrier
720         ANNOTATE_BARRIER_BEGIN(child_thr);
721         kmp_flag_64 flag(&child_bar->b_go, child_thr);
722         flag.release();
723       }
724     }
725   }
726 #if KMP_BARRIER_ICV_PUSH
727   if (propagate_icvs &&
728       !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
729     __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
730                              FALSE);
731     copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
732               &thr_bar->th_fixed_icvs);
733   }
734 #endif
735   KA_TRACE(
736       20,
737       ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
738        gtid, team->t.t_id, tid, bt));
739 }
740 
741 // Hierarchical Barrier
742 
743 // Initialize thread barrier data
744 /* Initializes/re-initializes the hierarchical barrier data stored on a thread.
745    Performs the minimum amount of initialization required based on how the team
746    has changed. Returns true if leaf children will require both on-core and
747    traditional wake-up mechanisms. For example, if the team size increases,
748    threads already in the team will respond to on-core wakeup on their parent
749    thread, but threads newly added to the team will only be listening on the
750    their local b_go. */
751 static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
752                                                    kmp_bstate_t *thr_bar,
753                                                    kmp_uint32 nproc, int gtid,
754                                                    int tid, kmp_team_t *team) {
755   // Checks to determine if (re-)initialization is needed
756   bool uninitialized = thr_bar->team == NULL;
757   bool team_changed = team != thr_bar->team;
758   bool team_sz_changed = nproc != thr_bar->nproc;
759   bool tid_changed = tid != thr_bar->old_tid;
760   bool retval = false;
761 
762   if (uninitialized || team_sz_changed) {
763     __kmp_get_hierarchy(nproc, thr_bar);
764   }
765 
766   if (uninitialized || team_sz_changed || tid_changed) {
767     thr_bar->my_level = thr_bar->depth - 1; // default for master
768     thr_bar->parent_tid = -1; // default for master
769     if (!KMP_MASTER_TID(
770             tid)) { // if not master, find parent thread in hierarchy
771       kmp_uint32 d = 0;
772       while (d < thr_bar->depth) { // find parent based on level of thread in
773         // hierarchy, and note level
774         kmp_uint32 rem;
775         if (d == thr_bar->depth - 2) { // reached level right below the master
776           thr_bar->parent_tid = 0;
777           thr_bar->my_level = d;
778           break;
779         } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) !=
780                    0) { // TODO: can we make this op faster?
781           // thread is not a subtree root at next level, so this is max
782           thr_bar->parent_tid = tid - rem;
783           thr_bar->my_level = d;
784           break;
785         }
786         ++d;
787       }
788     }
789     thr_bar->offset = 7 - (tid - thr_bar->parent_tid - 1);
790     thr_bar->old_tid = tid;
791     thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
792     thr_bar->team = team;
793     thr_bar->parent_bar =
794         &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
795   }
796   if (uninitialized || team_changed || tid_changed) {
797     thr_bar->team = team;
798     thr_bar->parent_bar =
799         &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
800     retval = true;
801   }
802   if (uninitialized || team_sz_changed || tid_changed) {
803     thr_bar->nproc = nproc;
804     thr_bar->leaf_kids = thr_bar->base_leaf_kids;
805     if (thr_bar->my_level == 0)
806       thr_bar->leaf_kids = 0;
807     if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
808       thr_bar->leaf_kids = nproc - tid - 1;
809     thr_bar->leaf_state = 0;
810     for (int i = 0; i < thr_bar->leaf_kids; ++i)
811       ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
812   }
813   return retval;
814 }
815 
816 static void __kmp_hierarchical_barrier_gather(
817     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
818     void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
819   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
820   register kmp_team_t *team = this_thr->th.th_team;
821   register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
822   register kmp_uint32 nproc = this_thr->th.th_team_nproc;
823   register kmp_info_t **other_threads = team->t.t_threads;
824   register kmp_uint64 new_state;
825 
826   int level = team->t.t_level;
827 #if OMP_40_ENABLED
828   if (other_threads[0]
829           ->th.th_teams_microtask) // are we inside the teams construct?
830     if (this_thr->th.th_teams_size.nteams > 1)
831       ++level; // level was not increased in teams construct for team_of_masters
832 #endif
833   if (level == 1)
834     thr_bar->use_oncore_barrier = 1;
835   else
836     thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
837 
838   KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
839                 "barrier type %d\n",
840                 gtid, team->t.t_id, tid, bt));
841   KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
842 
843 #if USE_ITT_BUILD && USE_ITT_NOTIFY
844   // Barrier imbalance - save arrive time to the thread
845   if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
846     this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
847   }
848 #endif
849 
850   (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
851                                                team);
852 
853   if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
854     register kmp_int32 child_tid;
855     new_state =
856         (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
857     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
858         thr_bar->use_oncore_barrier) {
859       if (thr_bar->leaf_kids) { // First, wait for leaf children to check-in on
860         // my b_arrived flag
861         kmp_uint64 leaf_state =
862             KMP_MASTER_TID(tid)
863                 ? thr_bar->b_arrived | thr_bar->leaf_state
864                 : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
865         KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
866                       "for leaf kids\n",
867                       gtid, team->t.t_id, tid));
868         kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
869         flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
870         if (reduce) {
871           ANNOTATE_REDUCE_AFTER(reduce);
872           for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
873                ++child_tid) {
874             KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
875                            "T#%d(%d:%d)\n",
876                            gtid, team->t.t_id, tid,
877                            __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
878                            child_tid));
879             ANNOTATE_BARRIER_END(other_threads[child_tid]);
880             (*reduce)(this_thr->th.th_local.reduce_data,
881                       other_threads[child_tid]->th.th_local.reduce_data);
882           }
883           ANNOTATE_REDUCE_BEFORE(reduce);
884           ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
885         }
886         (void)KMP_TEST_THEN_AND64(
887             (volatile kmp_int64 *)&thr_bar->b_arrived,
888             ~(thr_bar->leaf_state)); // clear leaf_state bits
889       }
890       // Next, wait for higher level children on each child's b_arrived flag
891       for (kmp_uint32 d = 1; d < thr_bar->my_level;
892            ++d) { // gather lowest level threads first, but skip 0
893         kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
894                    skip = thr_bar->skip_per_level[d];
895         if (last > nproc)
896           last = nproc;
897         for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
898           register kmp_info_t *child_thr = other_threads[child_tid];
899           register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
900           KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
901                         "T#%d(%d:%d) "
902                         "arrived(%p) == %llu\n",
903                         gtid, team->t.t_id, tid,
904                         __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
905                         child_tid, &child_bar->b_arrived, new_state));
906           kmp_flag_64 flag(&child_bar->b_arrived, new_state);
907           flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
908           ANNOTATE_BARRIER_END(child_thr);
909           if (reduce) {
910             KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
911                            "T#%d(%d:%d)\n",
912                            gtid, team->t.t_id, tid,
913                            __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
914                            child_tid));
915             ANNOTATE_REDUCE_AFTER(reduce);
916             (*reduce)(this_thr->th.th_local.reduce_data,
917                       child_thr->th.th_local.reduce_data);
918             ANNOTATE_REDUCE_BEFORE(reduce);
919             ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
920           }
921         }
922       }
923     } else { // Blocktime is not infinite
924       for (kmp_uint32 d = 0; d < thr_bar->my_level;
925            ++d) { // Gather lowest level threads first
926         kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
927                    skip = thr_bar->skip_per_level[d];
928         if (last > nproc)
929           last = nproc;
930         for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
931           register kmp_info_t *child_thr = other_threads[child_tid];
932           register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
933           KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
934                         "T#%d(%d:%d) "
935                         "arrived(%p) == %llu\n",
936                         gtid, team->t.t_id, tid,
937                         __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
938                         child_tid, &child_bar->b_arrived, new_state));
939           kmp_flag_64 flag(&child_bar->b_arrived, new_state);
940           flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
941           ANNOTATE_BARRIER_END(child_thr);
942           if (reduce) {
943             KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
944                            "T#%d(%d:%d)\n",
945                            gtid, team->t.t_id, tid,
946                            __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
947                            child_tid));
948             ANNOTATE_REDUCE_AFTER(reduce);
949             (*reduce)(this_thr->th.th_local.reduce_data,
950                       child_thr->th.th_local.reduce_data);
951             ANNOTATE_REDUCE_BEFORE(reduce);
952             ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
953           }
954         }
955       }
956     }
957   }
958   // All subordinates are gathered; now release parent if not master thread
959 
960   if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
961     KA_TRACE(
962         20,
963         ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
964          "arrived(%p): %llu => %llu\n",
965          gtid, team->t.t_id, tid,
966          __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
967          thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
968          thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
969     /* Mark arrival to parent: After performing this write, a worker thread may
970        not assume that the team is valid any more - it could be deallocated by
971        the master thread at any time. */
972     if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
973         !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
974       // flag; release it
975       ANNOTATE_BARRIER_BEGIN(this_thr);
976       kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
977       flag.release();
978     } else { // Leaf does special release on the "offset" bits of parent's
979       // b_arrived flag
980       thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
981       kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
982       flag.set_waiter(other_threads[thr_bar->parent_tid]);
983       flag.release();
984     }
985   } else { // Master thread needs to update the team's b_arrived value
986     team->t.t_bar[bt].b_arrived = new_state;
987     KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
988                   "arrived(%p) = %llu\n",
989                   gtid, team->t.t_id, tid, team->t.t_id,
990                   &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
991   }
992   // Is the team access below unsafe or just technically invalid?
993   KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
994                 "barrier type %d\n",
995                 gtid, team->t.t_id, tid, bt));
996 }
997 
998 static void __kmp_hierarchical_barrier_release(
999     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1000     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1001   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
1002   register kmp_team_t *team;
1003   register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1004   register kmp_uint32 nproc;
1005   bool team_change = false; // indicates on-core barrier shouldn't be used
1006 
1007   if (KMP_MASTER_TID(tid)) {
1008     team = __kmp_threads[gtid]->th.th_team;
1009     KMP_DEBUG_ASSERT(team != NULL);
1010     KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master "
1011                   "entered barrier type %d\n",
1012                   gtid, team->t.t_id, tid, bt));
1013   } else { // Worker threads
1014     // Wait for parent thread to release me
1015     if (!thr_bar->use_oncore_barrier ||
1016         __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
1017         thr_bar->team == NULL) {
1018       // Use traditional method of waiting on my own b_go flag
1019       thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
1020       kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1021       flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1022       ANNOTATE_BARRIER_END(this_thr);
1023       TCW_8(thr_bar->b_go,
1024             KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1025     } else { // Thread barrier data is initialized, this is a leaf, blocktime is
1026       // infinite, not nested
1027       // Wait on my "offset" bits on parent's b_go flag
1028       thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
1029       kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
1030                            thr_bar->offset, bt,
1031                            this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
1032       flag.wait(this_thr, TRUE);
1033       if (thr_bar->wait_flag ==
1034           KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
1035         TCW_8(thr_bar->b_go,
1036               KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1037       } else { // Reset my bits on parent's b_go flag
1038         ((char *)&(thr_bar->parent_bar->b_go))[thr_bar->offset] = 0;
1039       }
1040     }
1041     thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1042     // Early exit for reaping threads releasing forkjoin barrier
1043     if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1044       return;
1045     // The worker thread may now assume that the team is valid.
1046     team = __kmp_threads[gtid]->th.th_team;
1047     KMP_DEBUG_ASSERT(team != NULL);
1048     tid = __kmp_tid_from_gtid(gtid);
1049 
1050     KA_TRACE(
1051         20,
1052         ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1053          gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1054     KMP_MB(); // Flush all pending memory write invalidates.
1055   }
1056 
1057   nproc = this_thr->th.th_team_nproc;
1058   int level = team->t.t_level;
1059 #if OMP_40_ENABLED
1060   if (team->t.t_threads[0]
1061           ->th.th_teams_microtask) { // are we inside the teams construct?
1062     if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1063         this_thr->th.th_teams_level == level)
1064       ++level; // level was not increased in teams construct for team_of_workers
1065     if (this_thr->th.th_teams_size.nteams > 1)
1066       ++level; // level was not increased in teams construct for team_of_masters
1067   }
1068 #endif
1069   if (level == 1)
1070     thr_bar->use_oncore_barrier = 1;
1071   else
1072     thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
1073 
1074   // If the team size has increased, we still communicate with old leaves via
1075   // oncore barrier.
1076   unsigned short int old_leaf_kids = thr_bar->leaf_kids;
1077   kmp_uint64 old_leaf_state = thr_bar->leaf_state;
1078   team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
1079                                                        tid, team);
1080   // But if the entire team changes, we won't use oncore barrier at all
1081   if (team_change)
1082     old_leaf_kids = 0;
1083 
1084 #if KMP_BARRIER_ICV_PUSH
1085   if (propagate_icvs) {
1086     __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1087                              FALSE);
1088     if (KMP_MASTER_TID(
1089             tid)) { // master already has copy in final destination; copy
1090       copy_icvs(&thr_bar->th_fixed_icvs,
1091                 &team->t.t_implicit_task_taskdata[tid].td_icvs);
1092     } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1093                thr_bar->use_oncore_barrier) { // optimization for inf blocktime
1094       if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
1095         // leaves (on-core children) pull parent's fixed ICVs directly to local
1096         // ICV store
1097         copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1098                   &thr_bar->parent_bar->th_fixed_icvs);
1099       // non-leaves will get ICVs piggybacked with b_go via NGO store
1100     } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
1101       if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can
1102         // access
1103         copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
1104       else // leaves copy parent's fixed ICVs directly to local ICV store
1105         copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1106                   &thr_bar->parent_bar->th_fixed_icvs);
1107     }
1108   }
1109 #endif // KMP_BARRIER_ICV_PUSH
1110 
1111   // Now, release my children
1112   if (thr_bar->my_level) { // not a leaf
1113     register kmp_int32 child_tid;
1114     kmp_uint32 last;
1115     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1116         thr_bar->use_oncore_barrier) {
1117       if (KMP_MASTER_TID(tid)) { // do a flat release
1118         // Set local b_go to bump children via NGO store of the cache line
1119         // containing IVCs and b_go.
1120         thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
1121         // Use ngo stores if available; b_go piggybacks in the last 8 bytes of
1122         // the cache line
1123         ngo_load(&thr_bar->th_fixed_icvs);
1124         // This loops over all the threads skipping only the leaf nodes in the
1125         // hierarchy
1126         for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
1127              child_tid += thr_bar->skip_per_level[1]) {
1128           register kmp_bstate_t *child_bar =
1129               &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
1130           KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1131                         "releasing T#%d(%d:%d)"
1132                         " go(%p): %u => %u\n",
1133                         gtid, team->t.t_id, tid,
1134                         __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1135                         child_tid, &child_bar->b_go, child_bar->b_go,
1136                         child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1137           // Use ngo store (if available) to both store ICVs and release child
1138           // via child's b_go
1139           ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1140         }
1141         ngo_sync();
1142       }
1143       TCW_8(thr_bar->b_go,
1144             KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1145       // Now, release leaf children
1146       if (thr_bar->leaf_kids) { // if there are any
1147         // We test team_change on the off-chance that the level 1 team changed.
1148         if (team_change ||
1149             old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
1150           if (old_leaf_kids) { // release old leaf kids
1151             thr_bar->b_go |= old_leaf_state;
1152           }
1153           // Release new leaf kids
1154           last = tid + thr_bar->skip_per_level[1];
1155           if (last > nproc)
1156             last = nproc;
1157           for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
1158                ++child_tid) { // skip_per_level[0]=1
1159             register kmp_info_t *child_thr = team->t.t_threads[child_tid];
1160             register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1161             KA_TRACE(
1162                 20,
1163                 ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
1164                  " T#%d(%d:%d) go(%p): %u => %u\n",
1165                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1166                  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1167                  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1168             // Release child using child's b_go flag
1169             ANNOTATE_BARRIER_BEGIN(child_thr);
1170             kmp_flag_64 flag(&child_bar->b_go, child_thr);
1171             flag.release();
1172           }
1173         } else { // Release all children at once with leaf_state bits on my own
1174           // b_go flag
1175           thr_bar->b_go |= thr_bar->leaf_state;
1176         }
1177       }
1178     } else { // Blocktime is not infinite; do a simple hierarchical release
1179       for (int d = thr_bar->my_level - 1; d >= 0;
1180            --d) { // Release highest level threads first
1181         last = tid + thr_bar->skip_per_level[d + 1];
1182         kmp_uint32 skip = thr_bar->skip_per_level[d];
1183         if (last > nproc)
1184           last = nproc;
1185         for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1186           register kmp_info_t *child_thr = team->t.t_threads[child_tid];
1187           register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1188           KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1189                         "releasing T#%d(%d:%d) go(%p): %u => %u\n",
1190                         gtid, team->t.t_id, tid,
1191                         __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1192                         child_tid, &child_bar->b_go, child_bar->b_go,
1193                         child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1194           // Release child using child's b_go flag
1195           ANNOTATE_BARRIER_BEGIN(child_thr);
1196           kmp_flag_64 flag(&child_bar->b_go, child_thr);
1197           flag.release();
1198         }
1199       }
1200     }
1201 #if KMP_BARRIER_ICV_PUSH
1202     if (propagate_icvs && !KMP_MASTER_TID(tid))
1203       // non-leaves copy ICVs from fixed ICVs to local dest
1204       copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1205                 &thr_bar->th_fixed_icvs);
1206 #endif // KMP_BARRIER_ICV_PUSH
1207   }
1208   KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
1209                 "barrier type %d\n",
1210                 gtid, team->t.t_id, tid, bt));
1211 }
1212 
1213 
1214 // End of Barrier Algorithms
1215 
1216 // Internal function to do a barrier.
1217 /* If is_split is true, do a split barrier, otherwise, do a plain barrier
1218    If reduce is non-NULL, do a split reduction barrier, otherwise, do a split
1219    barrier
1220    Returns 0 if master thread, 1 if worker thread.  */
1221 int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
1222                   size_t reduce_size, void *reduce_data,
1223                   void (*reduce)(void *, void *)) {
1224   KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
1225   KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1226   register int tid = __kmp_tid_from_gtid(gtid);
1227   register kmp_info_t *this_thr = __kmp_threads[gtid];
1228   register kmp_team_t *team = this_thr->th.th_team;
1229   register int status = 0;
1230   ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1231 #if OMPT_SUPPORT
1232   ompt_task_id_t my_task_id;
1233   ompt_parallel_id_t my_parallel_id;
1234 #endif
1235 
1236   KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
1237                 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1238 
1239   ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1240 #if OMPT_SUPPORT
1241   if (ompt_enabled) {
1242 #if OMPT_BLAME
1243     my_task_id = team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id;
1244     my_parallel_id = team->t.ompt_team_info.parallel_id;
1245 
1246 #if OMPT_TRACE
1247     if (this_thr->th.ompt_thread_info.state == ompt_state_wait_single) {
1248       if (ompt_callbacks.ompt_callback(ompt_event_single_others_end)) {
1249         ompt_callbacks.ompt_callback(ompt_event_single_others_end)(
1250             my_parallel_id, my_task_id);
1251       }
1252     }
1253 #endif
1254     if (ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
1255       ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(my_parallel_id,
1256                                                              my_task_id);
1257     }
1258 #endif
1259     // It is OK to report the barrier state after the barrier begin callback.
1260     // According to the OMPT specification, a compliant implementation may
1261     // even delay reporting this state until the barrier begins to wait.
1262     this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1263   }
1264 #endif
1265 
1266   if (!team->t.t_serialized) {
1267 #if USE_ITT_BUILD
1268     // This value will be used in itt notify events below.
1269     void *itt_sync_obj = NULL;
1270 #if USE_ITT_NOTIFY
1271     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1272       itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1273 #endif
1274 #endif /* USE_ITT_BUILD */
1275     if (__kmp_tasking_mode == tskm_extra_barrier) {
1276       __kmp_tasking_barrier(team, this_thr, gtid);
1277       KA_TRACE(15,
1278                ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1279                 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1280     }
1281 
1282     /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1283        access it when the team struct is not guaranteed to exist. */
1284     // See note about the corresponding code in __kmp_join_barrier() being
1285     // performance-critical.
1286     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1287 #if KMP_USE_MONITOR
1288       this_thr->th.th_team_bt_intervals =
1289           team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1290       this_thr->th.th_team_bt_set =
1291           team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1292 #else
1293       this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL();
1294 #endif
1295     }
1296 
1297 #if USE_ITT_BUILD
1298     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1299       __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1300 #endif /* USE_ITT_BUILD */
1301 #if USE_DEBUGGER
1302     // Let the debugger know: the thread arrived to the barrier and waiting.
1303     if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
1304       team->t.t_bar[bt].b_master_arrived += 1;
1305     } else {
1306       this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1307     } // if
1308 #endif /* USE_DEBUGGER */
1309     if (reduce != NULL) {
1310       // KMP_DEBUG_ASSERT( is_split == TRUE );  // #C69956
1311       this_thr->th.th_local.reduce_data = reduce_data;
1312     }
1313 
1314     if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
1315       __kmp_task_team_setup(
1316           this_thr, team,
1317           0); // use 0 to only setup the current team if nthreads > 1
1318 
1319     switch (__kmp_barrier_gather_pattern[bt]) {
1320     case bp_hyper_bar: {
1321       KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits
1322       // to 0; use linear
1323       __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
1324                                  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1325       break;
1326     }
1327     case bp_hierarchical_bar: {
1328       __kmp_hierarchical_barrier_gather(bt, this_thr, gtid, tid,
1329                                         reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1330       break;
1331     }
1332     case bp_tree_bar: {
1333       KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits
1334       // to 0; use linear
1335       __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
1336                                 reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1337       break;
1338     }
1339     default: {
1340       __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
1341                                   reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1342     }
1343     }
1344 
1345     KMP_MB();
1346 
1347     if (KMP_MASTER_TID(tid)) {
1348       status = 0;
1349       if (__kmp_tasking_mode != tskm_immediate_exec) {
1350           __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1351       }
1352 #if USE_DEBUGGER
1353       // Let the debugger know: All threads are arrived and starting leaving the
1354       // barrier.
1355       team->t.t_bar[bt].b_team_arrived += 1;
1356 #endif
1357 
1358 #if OMP_40_ENABLED
1359       // Reset cancellation flag for worksharing constructs
1360       if (team->t.t_cancel_request == cancel_loop ||
1361           team->t.t_cancel_request == cancel_sections) {
1362         team->t.t_cancel_request = cancel_noreq;
1363       }
1364 #endif
1365 #if USE_ITT_BUILD
1366       /* TODO: In case of split reduction barrier, master thread may send
1367          acquired event early, before the final summation into the shared
1368          variable is done (final summation can be a long operation for array
1369          reductions).  */
1370       if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1371         __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1372 #endif /* USE_ITT_BUILD */
1373 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1374       // Barrier - report frame end (only if active_level == 1)
1375       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1376           __kmp_forkjoin_frames_mode &&
1377 #if OMP_40_ENABLED
1378           this_thr->th.th_teams_microtask == NULL &&
1379 #endif
1380           team->t.t_active_level == 1) {
1381         kmp_uint64 cur_time = __itt_get_timestamp();
1382         kmp_info_t **other_threads = team->t.t_threads;
1383         int nproc = this_thr->th.th_team_nproc;
1384         int i;
1385         switch (__kmp_forkjoin_frames_mode) {
1386         case 1:
1387           __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1388                                  loc, nproc);
1389           this_thr->th.th_frame_time = cur_time;
1390           break;
1391         case 2: // AC 2015-01-19: currently does not work for hierarchical (to
1392           // be fixed)
1393           __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
1394                                  1, loc, nproc);
1395           break;
1396         case 3:
1397           if (__itt_metadata_add_ptr) {
1398             // Initialize with master's wait time
1399             kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1400             // Set arrive time to zero to be able to check it in
1401             // __kmp_invoke_task(); the same is done inside the loop below
1402             this_thr->th.th_bar_arrive_time = 0;
1403             for (i = 1; i < nproc; ++i) {
1404               delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1405               other_threads[i]->th.th_bar_arrive_time = 0;
1406             }
1407             __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1408                                          cur_time, delta,
1409                                          (kmp_uint64)(reduce != NULL));
1410           }
1411           __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1412                                  loc, nproc);
1413           this_thr->th.th_frame_time = cur_time;
1414           break;
1415         }
1416       }
1417 #endif /* USE_ITT_BUILD */
1418     } else {
1419       status = 1;
1420 #if USE_ITT_BUILD
1421       if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1422         __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1423 #endif /* USE_ITT_BUILD */
1424     }
1425     if (status == 1 || !is_split) {
1426       switch (__kmp_barrier_release_pattern[bt]) {
1427       case bp_hyper_bar: {
1428         KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1429         __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1430                                     FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1431         break;
1432       }
1433       case bp_hierarchical_bar: {
1434         __kmp_hierarchical_barrier_release(
1435             bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1436         break;
1437       }
1438       case bp_tree_bar: {
1439         KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1440         __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1441                                    FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1442         break;
1443       }
1444       default: {
1445         __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1446                                      FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1447       }
1448       }
1449       if (__kmp_tasking_mode != tskm_immediate_exec) {
1450         __kmp_task_team_sync(this_thr, team);
1451       }
1452     }
1453 
1454 #if USE_ITT_BUILD
1455     /* GEH: TODO: Move this under if-condition above and also include in
1456        __kmp_end_split_barrier(). This will more accurately represent the actual
1457        release time of the threads for split barriers.  */
1458     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1459       __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1460 #endif /* USE_ITT_BUILD */
1461   } else { // Team is serialized.
1462     status = 0;
1463     if (__kmp_tasking_mode != tskm_immediate_exec) {
1464 #if OMP_45_ENABLED
1465       if (this_thr->th.th_task_team != NULL) {
1466         void *itt_sync_obj = NULL;
1467 #if USE_ITT_NOTIFY
1468         if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1469           itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1470           __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1471         }
1472 #endif
1473 
1474         KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks ==
1475                          TRUE);
1476         __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1477         __kmp_task_team_setup(this_thr, team, 0);
1478 
1479 #if USE_ITT_BUILD
1480         if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1481           __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1482 #endif /* USE_ITT_BUILD */
1483       }
1484 #else
1485       // The task team should be NULL for serialized code (tasks will be
1486       // executed immediately)
1487       KMP_DEBUG_ASSERT(team->t.t_task_team[this_thr->th.th_task_state] == NULL);
1488       KMP_DEBUG_ASSERT(this_thr->th.th_task_team == NULL);
1489 #endif
1490     }
1491   }
1492   KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1493                 gtid, __kmp_team_from_gtid(gtid)->t.t_id,
1494                 __kmp_tid_from_gtid(gtid), status));
1495 
1496 #if OMPT_SUPPORT
1497   if (ompt_enabled) {
1498 #if OMPT_BLAME
1499     if (ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
1500       ompt_callbacks.ompt_callback(ompt_event_barrier_end)(my_parallel_id,
1501                                                            my_task_id);
1502     }
1503 #endif
1504     this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1505   }
1506 #endif
1507   ANNOTATE_BARRIER_END(&team->t.t_bar);
1508 
1509   return status;
1510 }
1511 
1512 void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
1513   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
1514   KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1515   int tid = __kmp_tid_from_gtid(gtid);
1516   kmp_info_t *this_thr = __kmp_threads[gtid];
1517   kmp_team_t *team = this_thr->th.th_team;
1518 
1519   ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1520   if (!team->t.t_serialized) {
1521     if (KMP_MASTER_GTID(gtid)) {
1522       switch (__kmp_barrier_release_pattern[bt]) {
1523       case bp_hyper_bar: {
1524         KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1525         __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1526                                     FALSE USE_ITT_BUILD_ARG(NULL));
1527         break;
1528       }
1529       case bp_hierarchical_bar: {
1530         __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
1531                                            FALSE USE_ITT_BUILD_ARG(NULL));
1532         break;
1533       }
1534       case bp_tree_bar: {
1535         KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1536         __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1537                                    FALSE USE_ITT_BUILD_ARG(NULL));
1538         break;
1539       }
1540       default: {
1541         __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1542                                      FALSE USE_ITT_BUILD_ARG(NULL));
1543       }
1544       }
1545       if (__kmp_tasking_mode != tskm_immediate_exec) {
1546         __kmp_task_team_sync(this_thr, team);
1547       } // if
1548     }
1549   }
1550   ANNOTATE_BARRIER_END(&team->t.t_bar);
1551 }
1552 
1553 void __kmp_join_barrier(int gtid) {
1554   KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
1555   KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1556   register kmp_info_t *this_thr = __kmp_threads[gtid];
1557   register kmp_team_t *team;
1558   register kmp_uint nproc;
1559   kmp_info_t *master_thread;
1560   int tid;
1561 #ifdef KMP_DEBUG
1562   int team_id;
1563 #endif /* KMP_DEBUG */
1564 #if USE_ITT_BUILD
1565   void *itt_sync_obj = NULL;
1566 #if USE_ITT_NOTIFY
1567   if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
1568     // Get object created at fork_barrier
1569     itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1570 #endif
1571 #endif /* USE_ITT_BUILD */
1572   KMP_MB();
1573 
1574   // Get current info
1575   team = this_thr->th.th_team;
1576   nproc = this_thr->th.th_team_nproc;
1577   KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
1578   tid = __kmp_tid_from_gtid(gtid);
1579 #ifdef KMP_DEBUG
1580   team_id = team->t.t_id;
1581 #endif /* KMP_DEBUG */
1582   master_thread = this_thr->th.th_team_master;
1583 #ifdef KMP_DEBUG
1584   if (master_thread != team->t.t_threads[0]) {
1585     __kmp_print_structure();
1586   }
1587 #endif /* KMP_DEBUG */
1588   KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1589   KMP_MB();
1590 
1591   // Verify state
1592   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1593   KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1594   KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1595   KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1596   KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
1597                 gtid, team_id, tid));
1598 
1599   ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1600 #if OMPT_SUPPORT
1601 #if OMPT_TRACE
1602   if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
1603     ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
1604         team->t.ompt_team_info.parallel_id,
1605         team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1606   }
1607 #endif
1608   this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1609 #endif
1610 
1611   if (__kmp_tasking_mode == tskm_extra_barrier) {
1612     __kmp_tasking_barrier(team, this_thr, gtid);
1613     KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past taking barrier\n", gtid,
1614                   team_id, tid));
1615   }
1616 #ifdef KMP_DEBUG
1617   if (__kmp_tasking_mode != tskm_immediate_exec) {
1618     KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
1619                   "%p, th_task_team = %p\n",
1620                   __kmp_gtid_from_thread(this_thr), team_id,
1621                   team->t.t_task_team[this_thr->th.th_task_state],
1622                   this_thr->th.th_task_team));
1623     KMP_DEBUG_ASSERT(this_thr->th.th_task_team ==
1624                      team->t.t_task_team[this_thr->th.th_task_state]);
1625   }
1626 #endif /* KMP_DEBUG */
1627 
1628   /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1629      access it when the team struct is not guaranteed to exist. Doing these
1630      loads causes a cache miss slows down EPCC parallel by 2x. As a workaround,
1631      we do not perform the copy if blocktime=infinite, since the values are not
1632      used by __kmp_wait_template() in that case. */
1633   if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1634 #if KMP_USE_MONITOR
1635     this_thr->th.th_team_bt_intervals =
1636         team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1637     this_thr->th.th_team_bt_set =
1638         team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1639 #else
1640     this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL();
1641 #endif
1642   }
1643 
1644 #if USE_ITT_BUILD
1645   if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1646     __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1647 #endif /* USE_ITT_BUILD */
1648 
1649   switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1650   case bp_hyper_bar: {
1651     KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1652     __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1653                                NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1654     break;
1655   }
1656   case bp_hierarchical_bar: {
1657     __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1658                                       NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1659     break;
1660   }
1661   case bp_tree_bar: {
1662     KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1663     __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1664                               NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1665     break;
1666   }
1667   default: {
1668     __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1669                                 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1670   }
1671   }
1672 
1673   /* From this point on, the team data structure may be deallocated at any time
1674      by the master thread - it is unsafe to reference it in any of the worker
1675      threads. Any per-team data items that need to be referenced before the
1676      end of the barrier should be moved to the kmp_task_team_t structs.  */
1677   if (KMP_MASTER_TID(tid)) {
1678     if (__kmp_tasking_mode != tskm_immediate_exec) {
1679       __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1680     }
1681 #if KMP_STATS_ENABLED
1682     // Have master thread flag the workers to indicate they are now waiting for
1683     // next parallel region, Also wake them up so they switch their timers to
1684     // idle.
1685     for (int i = 0; i < team->t.t_nproc; ++i) {
1686       kmp_info_t *team_thread = team->t.t_threads[i];
1687       if (team_thread == this_thr)
1688         continue;
1689       team_thread->th.th_stats->setIdleFlag();
1690       if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
1691           team_thread->th.th_sleep_loc != NULL)
1692         __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread),
1693                                   team_thread->th.th_sleep_loc);
1694     }
1695 #endif
1696 #if USE_ITT_BUILD
1697     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1698       __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1699 #endif /* USE_ITT_BUILD */
1700 
1701 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1702     // Join barrier - report frame end
1703     if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1704         __kmp_forkjoin_frames_mode &&
1705 #if OMP_40_ENABLED
1706         this_thr->th.th_teams_microtask == NULL &&
1707 #endif
1708         team->t.t_active_level == 1) {
1709       kmp_uint64 cur_time = __itt_get_timestamp();
1710       ident_t *loc = team->t.t_ident;
1711       kmp_info_t **other_threads = team->t.t_threads;
1712       int nproc = this_thr->th.th_team_nproc;
1713       int i;
1714       switch (__kmp_forkjoin_frames_mode) {
1715       case 1:
1716         __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1717                                loc, nproc);
1718         break;
1719       case 2:
1720         __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
1721                                loc, nproc);
1722         break;
1723       case 3:
1724         if (__itt_metadata_add_ptr) {
1725           // Initialize with master's wait time
1726           kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1727           // Set arrive time to zero to be able to check it in
1728           // __kmp_invoke_task(); the same is done inside the loop below
1729           this_thr->th.th_bar_arrive_time = 0;
1730           for (i = 1; i < nproc; ++i) {
1731             delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1732             other_threads[i]->th.th_bar_arrive_time = 0;
1733           }
1734           __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1735                                        cur_time, delta, 0);
1736         }
1737         __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1738                                loc, nproc);
1739         this_thr->th.th_frame_time = cur_time;
1740         break;
1741       }
1742     }
1743 #endif /* USE_ITT_BUILD */
1744   }
1745 #if USE_ITT_BUILD
1746   else {
1747     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1748       __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1749   }
1750 #endif /* USE_ITT_BUILD */
1751 
1752 #if KMP_DEBUG
1753   if (KMP_MASTER_TID(tid)) {
1754     KA_TRACE(
1755         15,
1756         ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1757          gtid, team_id, tid, nproc));
1758   }
1759 #endif /* KMP_DEBUG */
1760 
1761   // TODO now, mark worker threads as done so they may be disbanded
1762   KMP_MB(); // Flush all pending memory write invalidates.
1763   KA_TRACE(10,
1764            ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
1765 
1766 #if OMPT_SUPPORT
1767   if (ompt_enabled) {
1768 #if OMPT_BLAME
1769     if (ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
1770       ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
1771           team->t.ompt_team_info.parallel_id,
1772           team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1773     }
1774 #endif
1775 
1776     // return to default state
1777     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
1778   }
1779 #endif
1780   ANNOTATE_BARRIER_END(&team->t.t_bar);
1781 }
1782 
1783 // TODO release worker threads' fork barriers as we are ready instead of all at
1784 // once
1785 void __kmp_fork_barrier(int gtid, int tid) {
1786   KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
1787   KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1788   kmp_info_t *this_thr = __kmp_threads[gtid];
1789   kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
1790 #if USE_ITT_BUILD
1791   void *itt_sync_obj = NULL;
1792 #endif /* USE_ITT_BUILD */
1793   if (team)
1794     ANNOTATE_BARRIER_END(&team->t.t_bar);
1795 
1796   KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
1797                 (team != NULL) ? team->t.t_id : -1, tid));
1798 
1799   // th_team pointer only valid for master thread here
1800   if (KMP_MASTER_TID(tid)) {
1801 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1802     if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1803       // Create itt barrier object
1804       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
1805       __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
1806     }
1807 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1808 
1809 #ifdef KMP_DEBUG
1810     register kmp_info_t **other_threads = team->t.t_threads;
1811     register int i;
1812 
1813     // Verify state
1814     KMP_MB();
1815 
1816     for (i = 1; i < team->t.t_nproc; ++i) {
1817       KA_TRACE(500,
1818                ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
1819                 "== %u.\n",
1820                 gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
1821                 team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
1822                 other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
1823       KMP_DEBUG_ASSERT(
1824           (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
1825            ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
1826       KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
1827     }
1828 #endif
1829 
1830     if (__kmp_tasking_mode != tskm_immediate_exec) {
1831       // 0 indicates setup current task team if nthreads > 1
1832       __kmp_task_team_setup(this_thr, team, 0);
1833     }
1834 
1835     /* The master thread may have changed its blocktime between the join barrier
1836        and the fork barrier. Copy the blocktime info to the thread, where
1837        __kmp_wait_template() can access it when the team struct is not
1838        guaranteed to exist. */
1839     // See note about the corresponding code in __kmp_join_barrier() being
1840     // performance-critical
1841     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1842 #if KMP_USE_MONITOR
1843       this_thr->th.th_team_bt_intervals =
1844           team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1845       this_thr->th.th_team_bt_set =
1846           team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1847 #else
1848       this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL();
1849 #endif
1850     }
1851   } // master
1852 
1853   switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
1854   case bp_hyper_bar: {
1855     KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1856     __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1857                                 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1858     break;
1859   }
1860   case bp_hierarchical_bar: {
1861     __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1862                                        TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1863     break;
1864   }
1865   case bp_tree_bar: {
1866     KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1867     __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1868                                TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1869     break;
1870   }
1871   default: {
1872     __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1873                                  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1874   }
1875   }
1876 
1877   // Early exit for reaping threads releasing forkjoin barrier
1878   if (TCR_4(__kmp_global.g.g_done)) {
1879     this_thr->th.th_task_team = NULL;
1880 
1881 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1882     if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1883       if (!KMP_MASTER_TID(tid)) {
1884         itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1885         if (itt_sync_obj)
1886           __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1887       }
1888     }
1889 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1890     KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
1891     return;
1892   }
1893 
1894   /* We can now assume that a valid team structure has been allocated by the
1895      master and propagated to all worker threads. The current thread, however,
1896      may not be part of the team, so we can't blindly assume that the team
1897      pointer is non-null.  */
1898   team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
1899   KMP_DEBUG_ASSERT(team != NULL);
1900   tid = __kmp_tid_from_gtid(gtid);
1901 
1902 #if KMP_BARRIER_ICV_PULL
1903   /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1904      __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
1905      implicit task has this data before this function is called. We cannot
1906      modify __kmp_fork_call() to look at the fixed ICVs in the master's thread
1907      struct, because it is not always the case that the threads arrays have
1908      been allocated when __kmp_fork_call() is executed. */
1909   {
1910     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
1911     if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
1912       // Copy the initial ICVs from the master's thread struct to the implicit
1913       // task for this tid.
1914       KA_TRACE(10,
1915                ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
1916       __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
1917                                tid, FALSE);
1918       copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1919                 &team->t.t_threads[0]
1920                      ->th.th_bar[bs_forkjoin_barrier]
1921                      .bb.th_fixed_icvs);
1922     }
1923   }
1924 #endif // KMP_BARRIER_ICV_PULL
1925 
1926   if (__kmp_tasking_mode != tskm_immediate_exec) {
1927     __kmp_task_team_sync(this_thr, team);
1928   }
1929 
1930 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1931   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
1932   if (proc_bind == proc_bind_intel) {
1933 #endif
1934 #if KMP_AFFINITY_SUPPORTED
1935     // Call dynamic affinity settings
1936     if (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
1937       __kmp_balanced_affinity(tid, team->t.t_nproc);
1938     }
1939 #endif // KMP_AFFINITY_SUPPORTED
1940 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1941   } else if (proc_bind != proc_bind_false) {
1942     if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
1943       KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
1944                      __kmp_gtid_from_thread(this_thr),
1945                      this_thr->th.th_current_place));
1946     } else {
1947       __kmp_affinity_set_place(gtid);
1948     }
1949   }
1950 #endif
1951 
1952 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1953   if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1954     if (!KMP_MASTER_TID(tid)) {
1955       // Get correct barrier object
1956       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1957       __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
1958     } // (prepare called inside barrier_release)
1959   }
1960 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1961   ANNOTATE_BARRIER_END(&team->t.t_bar);
1962   KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
1963                 team->t.t_id, tid));
1964 }
1965 
1966 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
1967                           kmp_internal_control_t *new_icvs, ident_t *loc) {
1968   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);
1969 
1970   KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
1971   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
1972 
1973 /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1974    __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
1975    implicit task has this data before this function is called. */
1976 #if KMP_BARRIER_ICV_PULL
1977   /* Copy ICVs to master's thread structure into th_fixed_icvs (which remains
1978      untouched), where all of the worker threads can access them and make their
1979      own copies after the barrier. */
1980   KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
1981   // allocated at this point
1982   copy_icvs(
1983       &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
1984       new_icvs);
1985   KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
1986                 team->t.t_threads[0], team));
1987 #elif KMP_BARRIER_ICV_PUSH
1988   // The ICVs will be propagated in the fork barrier, so nothing needs to be
1989   // done here.
1990   KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
1991                 team->t.t_threads[0], team));
1992 #else
1993   // Copy the ICVs to each of the non-master threads.  This takes O(nthreads)
1994   // time.
1995   ngo_load(new_icvs);
1996   KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
1997   // allocated at this point
1998   for (int f = 1; f < new_nproc; ++f) { // Skip the master thread
1999     // TODO: GEH - pass in better source location info since usually NULL here
2000     KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2001                   f, team->t.t_threads[f], team));
2002     __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
2003     ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
2004     KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2005                   f, team->t.t_threads[f], team));
2006   }
2007   ngo_sync();
2008 #endif // KMP_BARRIER_ICV_PULL
2009 }
2010