1 /*
2  * kmp_barrier.cpp
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 //                     The LLVM Compiler Infrastructure
8 //
9 // This file is dual licensed under the MIT and the University of Illinois Open
10 // Source Licenses. See LICENSE.txt for details.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "kmp.h"
15 #include "kmp_wait_release.h"
16 #include "kmp_itt.h"
17 #include "kmp_os.h"
18 #include "kmp_stats.h"
19 #if OMPT_SUPPORT
20 #include "ompt-specific.h"
21 #endif
22 
23 #if KMP_MIC
24 #include <immintrin.h>
25 #define USE_NGO_STORES 1
26 #endif // KMP_MIC
27 
28 #include "tsan_annotations.h"
29 
30 #if KMP_MIC && USE_NGO_STORES
31 // ICV copying
32 #define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
33 #define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
34 #define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
35 #define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
36 #else
37 #define ngo_load(src) ((void)0)
38 #define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
39 #define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
40 #define ngo_sync() ((void)0)
41 #endif /* KMP_MIC && USE_NGO_STORES */
42 
43 void __kmp_print_structure(void); // Forward declaration
44 
45 // ---------------------------- Barrier Algorithms ----------------------------
46 
47 // Linear Barrier
48 static void __kmp_linear_barrier_gather(
49     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
50     void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
51   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
52   kmp_team_t *team = this_thr->th.th_team;
53   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
54   kmp_info_t **other_threads = team->t.t_threads;
55 
56   KA_TRACE(
57       20,
58       ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
59        gtid, team->t.t_id, tid, bt));
60   KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
61 
62 #if USE_ITT_BUILD && USE_ITT_NOTIFY
63   // Barrier imbalance - save arrive time to the thread
64   if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
65     this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
66         __itt_get_timestamp();
67   }
68 #endif
69   // We now perform a linear reduction to signal that all of the threads have
70   // arrived.
71   if (!KMP_MASTER_TID(tid)) {
72     KA_TRACE(20,
73              ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
74               "arrived(%p): %llu => %llu\n",
75               gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
76               team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
77               thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
78     // Mark arrival to master thread
79     /* After performing this write, a worker thread may not assume that the team
80        is valid any more - it could be deallocated by the master thread at any
81        time. */
82     ANNOTATE_BARRIER_BEGIN(this_thr);
83     kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
84     flag.release();
85   } else {
86     kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
87     int nproc = this_thr->th.th_team_nproc;
88     int i;
89     // Don't have to worry about sleep bit here or atomic since team setting
90     kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
91 
92     // Collect all the worker team member threads.
93     for (i = 1; i < nproc; ++i) {
94 #if KMP_CACHE_MANAGE
95       // Prefetch next thread's arrived count
96       if (i + 1 < nproc)
97         KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
98 #endif /* KMP_CACHE_MANAGE */
99       KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
100                     "arrived(%p) == %llu\n",
101                     gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
102                     team->t.t_id, i,
103                     &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
104 
105       // Wait for worker thread to arrive
106       kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
107                        new_state);
108       flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
109       ANNOTATE_BARRIER_END(other_threads[i]);
110 #if USE_ITT_BUILD && USE_ITT_NOTIFY
111       // Barrier imbalance - write min of the thread time and the other thread
112       // time to the thread.
113       if (__kmp_forkjoin_frames_mode == 2) {
114         this_thr->th.th_bar_min_time = KMP_MIN(
115             this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
116       }
117 #endif
118       if (reduce) {
119         KA_TRACE(100,
120                  ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
121                   gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
122                   team->t.t_id, i));
123         ANNOTATE_REDUCE_AFTER(reduce);
124         (*reduce)(this_thr->th.th_local.reduce_data,
125                   other_threads[i]->th.th_local.reduce_data);
126         ANNOTATE_REDUCE_BEFORE(reduce);
127         ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
128       }
129     }
130     // Don't have to worry about sleep bit here or atomic since team setting
131     team_bar->b_arrived = new_state;
132     KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
133                   "arrived(%p) = %llu\n",
134                   gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
135                   new_state));
136   }
137   KA_TRACE(
138       20,
139       ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
140        gtid, team->t.t_id, tid, bt));
141 }
142 
143 static void __kmp_linear_barrier_release(
144     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
145     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
146   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
147   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
148   kmp_team_t *team;
149 
150   if (KMP_MASTER_TID(tid)) {
151     unsigned int i;
152     kmp_uint32 nproc = this_thr->th.th_team_nproc;
153     kmp_info_t **other_threads;
154 
155     team = __kmp_threads[gtid]->th.th_team;
156     KMP_DEBUG_ASSERT(team != NULL);
157     other_threads = team->t.t_threads;
158 
159     KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for "
160                   "barrier type %d\n",
161                   gtid, team->t.t_id, tid, bt));
162 
163     if (nproc > 1) {
164 #if KMP_BARRIER_ICV_PUSH
165       {
166         KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
167         if (propagate_icvs) {
168           ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
169           for (i = 1; i < nproc; ++i) {
170             __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
171                                      team, i, FALSE);
172             ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
173                            &team->t.t_implicit_task_taskdata[0].td_icvs);
174           }
175           ngo_sync();
176         }
177       }
178 #endif // KMP_BARRIER_ICV_PUSH
179 
180       // Now, release all of the worker threads
181       for (i = 1; i < nproc; ++i) {
182 #if KMP_CACHE_MANAGE
183         // Prefetch next thread's go flag
184         if (i + 1 < nproc)
185           KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
186 #endif /* KMP_CACHE_MANAGE */
187         KA_TRACE(
188             20,
189             ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
190              "go(%p): %u => %u\n",
191              gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
192              team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
193              other_threads[i]->th.th_bar[bt].bb.b_go,
194              other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
195         ANNOTATE_BARRIER_BEGIN(other_threads[i]);
196         kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
197                          other_threads[i]);
198         flag.release();
199       }
200     }
201   } else { // Wait for the MASTER thread to release us
202     KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
203                   gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
204     kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
205     flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
206     ANNOTATE_BARRIER_END(this_thr);
207 #if USE_ITT_BUILD && USE_ITT_NOTIFY
208     if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
209       // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
210       // disabled)
211       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
212       // Cancel wait on previous parallel region...
213       __kmp_itt_task_starting(itt_sync_obj);
214 
215       if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
216         return;
217 
218       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
219       if (itt_sync_obj != NULL)
220         // Call prepare as early as possible for "new" barrier
221         __kmp_itt_task_finished(itt_sync_obj);
222     } else
223 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
224         // Early exit for reaping threads releasing forkjoin barrier
225         if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
226       return;
227 // The worker thread may now assume that the team is valid.
228 #ifdef KMP_DEBUG
229     tid = __kmp_tid_from_gtid(gtid);
230     team = __kmp_threads[gtid]->th.th_team;
231 #endif
232     KMP_DEBUG_ASSERT(team != NULL);
233     TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
234     KA_TRACE(20,
235              ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
236               gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
237     KMP_MB(); // Flush all pending memory write invalidates.
238   }
239   KA_TRACE(
240       20,
241       ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
242        gtid, team->t.t_id, tid, bt));
243 }
244 
245 // Tree barrier
246 static void
247 __kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
248                           int tid, void (*reduce)(void *, void *)
249                                        USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
250   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
251   kmp_team_t *team = this_thr->th.th_team;
252   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
253   kmp_info_t **other_threads = team->t.t_threads;
254   kmp_uint32 nproc = this_thr->th.th_team_nproc;
255   kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
256   kmp_uint32 branch_factor = 1 << branch_bits;
257   kmp_uint32 child;
258   kmp_uint32 child_tid;
259   kmp_uint64 new_state;
260 
261   KA_TRACE(
262       20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
263            gtid, team->t.t_id, tid, bt));
264   KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
265 
266 #if USE_ITT_BUILD && USE_ITT_NOTIFY
267   // Barrier imbalance - save arrive time to the thread
268   if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
269     this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
270         __itt_get_timestamp();
271   }
272 #endif
273   // Perform tree gather to wait until all threads have arrived; reduce any
274   // required data as we go
275   child_tid = (tid << branch_bits) + 1;
276   if (child_tid < nproc) {
277     // Parent threads wait for all their children to arrive
278     new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
279     child = 1;
280     do {
281       kmp_info_t *child_thr = other_threads[child_tid];
282       kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
283 #if KMP_CACHE_MANAGE
284       // Prefetch next thread's arrived count
285       if (child + 1 <= branch_factor && child_tid + 1 < nproc)
286         KMP_CACHE_PREFETCH(
287             &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
288 #endif /* KMP_CACHE_MANAGE */
289       KA_TRACE(20,
290                ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
291                 "arrived(%p) == %llu\n",
292                 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
293                 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
294       // Wait for child to arrive
295       kmp_flag_64 flag(&child_bar->b_arrived, new_state);
296       flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
297       ANNOTATE_BARRIER_END(child_thr);
298 #if USE_ITT_BUILD && USE_ITT_NOTIFY
299       // Barrier imbalance - write min of the thread time and a child time to
300       // the thread.
301       if (__kmp_forkjoin_frames_mode == 2) {
302         this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
303                                                child_thr->th.th_bar_min_time);
304       }
305 #endif
306       if (reduce) {
307         KA_TRACE(100,
308                  ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
309                   gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
310                   team->t.t_id, child_tid));
311         ANNOTATE_REDUCE_AFTER(reduce);
312         (*reduce)(this_thr->th.th_local.reduce_data,
313                   child_thr->th.th_local.reduce_data);
314         ANNOTATE_REDUCE_BEFORE(reduce);
315         ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
316       }
317       child++;
318       child_tid++;
319     } while (child <= branch_factor && child_tid < nproc);
320   }
321 
322   if (!KMP_MASTER_TID(tid)) { // Worker threads
323     kmp_int32 parent_tid = (tid - 1) >> branch_bits;
324 
325     KA_TRACE(20,
326              ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
327               "arrived(%p): %llu => %llu\n",
328               gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
329               team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
330               thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
331 
332     // Mark arrival to parent thread
333     /* After performing this write, a worker thread may not assume that the team
334        is valid any more - it could be deallocated by the master thread at any
335        time.  */
336     ANNOTATE_BARRIER_BEGIN(this_thr);
337     kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
338     flag.release();
339   } else {
340     // Need to update the team arrived pointer if we are the master thread
341     if (nproc > 1) // New value was already computed above
342       team->t.t_bar[bt].b_arrived = new_state;
343     else
344       team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
345     KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
346                   "arrived(%p) = %llu\n",
347                   gtid, team->t.t_id, tid, team->t.t_id,
348                   &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
349   }
350   KA_TRACE(20,
351            ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
352             gtid, team->t.t_id, tid, bt));
353 }
354 
355 static void __kmp_tree_barrier_release(
356     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
357     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
358   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
359   kmp_team_t *team;
360   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
361   kmp_uint32 nproc;
362   kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
363   kmp_uint32 branch_factor = 1 << branch_bits;
364   kmp_uint32 child;
365   kmp_uint32 child_tid;
366 
367   // Perform a tree release for all of the threads that have been gathered
368   if (!KMP_MASTER_TID(
369           tid)) { // Handle fork barrier workers who aren't part of a team yet
370     KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
371                   &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
372     // Wait for parent thread to release us
373     kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
374     flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
375     ANNOTATE_BARRIER_END(this_thr);
376 #if USE_ITT_BUILD && USE_ITT_NOTIFY
377     if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
378       // In fork barrier where we could not get the object reliably (or
379       // ITTNOTIFY is disabled)
380       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
381       // Cancel wait on previous parallel region...
382       __kmp_itt_task_starting(itt_sync_obj);
383 
384       if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
385         return;
386 
387       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
388       if (itt_sync_obj != NULL)
389         // Call prepare as early as possible for "new" barrier
390         __kmp_itt_task_finished(itt_sync_obj);
391     } else
392 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
393         // Early exit for reaping threads releasing forkjoin barrier
394         if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
395       return;
396 
397     // The worker thread may now assume that the team is valid.
398     team = __kmp_threads[gtid]->th.th_team;
399     KMP_DEBUG_ASSERT(team != NULL);
400     tid = __kmp_tid_from_gtid(gtid);
401 
402     TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
403     KA_TRACE(20,
404              ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
405               team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
406     KMP_MB(); // Flush all pending memory write invalidates.
407   } else {
408     team = __kmp_threads[gtid]->th.th_team;
409     KMP_DEBUG_ASSERT(team != NULL);
410     KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for "
411                   "barrier type %d\n",
412                   gtid, team->t.t_id, tid, bt));
413   }
414   nproc = this_thr->th.th_team_nproc;
415   child_tid = (tid << branch_bits) + 1;
416 
417   if (child_tid < nproc) {
418     kmp_info_t **other_threads = team->t.t_threads;
419     child = 1;
420     // Parent threads release all their children
421     do {
422       kmp_info_t *child_thr = other_threads[child_tid];
423       kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
424 #if KMP_CACHE_MANAGE
425       // Prefetch next thread's go count
426       if (child + 1 <= branch_factor && child_tid + 1 < nproc)
427         KMP_CACHE_PREFETCH(
428             &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
429 #endif /* KMP_CACHE_MANAGE */
430 
431 #if KMP_BARRIER_ICV_PUSH
432       {
433         KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
434         if (propagate_icvs) {
435           __kmp_init_implicit_task(team->t.t_ident,
436                                    team->t.t_threads[child_tid], team,
437                                    child_tid, FALSE);
438           copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
439                     &team->t.t_implicit_task_taskdata[0].td_icvs);
440         }
441       }
442 #endif // KMP_BARRIER_ICV_PUSH
443       KA_TRACE(20,
444                ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
445                 "go(%p): %u => %u\n",
446                 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
447                 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
448                 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
449       // Release child from barrier
450       ANNOTATE_BARRIER_BEGIN(child_thr);
451       kmp_flag_64 flag(&child_bar->b_go, child_thr);
452       flag.release();
453       child++;
454       child_tid++;
455     } while (child <= branch_factor && child_tid < nproc);
456   }
457   KA_TRACE(
458       20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
459            gtid, team->t.t_id, tid, bt));
460 }
461 
462 // Hyper Barrier
463 static void
464 __kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
465                            int tid, void (*reduce)(void *, void *)
466                                         USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
467   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
468   kmp_team_t *team = this_thr->th.th_team;
469   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
470   kmp_info_t **other_threads = team->t.t_threads;
471   kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
472   kmp_uint32 num_threads = this_thr->th.th_team_nproc;
473   kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
474   kmp_uint32 branch_factor = 1 << branch_bits;
475   kmp_uint32 offset;
476   kmp_uint32 level;
477 
478   KA_TRACE(
479       20,
480       ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
481        gtid, team->t.t_id, tid, bt));
482   KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
483 
484 #if USE_ITT_BUILD && USE_ITT_NOTIFY
485   // Barrier imbalance - save arrive time to the thread
486   if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
487     this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
488         __itt_get_timestamp();
489   }
490 #endif
491   /* Perform a hypercube-embedded tree gather to wait until all of the threads
492      have arrived, and reduce any required data as we go.  */
493   kmp_flag_64 p_flag(&thr_bar->b_arrived);
494   for (level = 0, offset = 1; offset < num_threads;
495        level += branch_bits, offset <<= branch_bits) {
496     kmp_uint32 child;
497     kmp_uint32 child_tid;
498 
499     if (((tid >> level) & (branch_factor - 1)) != 0) {
500       kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);
501 
502       KA_TRACE(20,
503                ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
504                 "arrived(%p): %llu => %llu\n",
505                 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
506                 team->t.t_id, parent_tid, &thr_bar->b_arrived,
507                 thr_bar->b_arrived,
508                 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
509       // Mark arrival to parent thread
510       /* After performing this write (in the last iteration of the enclosing for
511          loop), a worker thread may not assume that the team is valid any more
512          - it could be deallocated by the master thread at any time.  */
513       ANNOTATE_BARRIER_BEGIN(this_thr);
514       p_flag.set_waiter(other_threads[parent_tid]);
515       p_flag.release();
516       break;
517     }
518 
519     // Parent threads wait for children to arrive
520     if (new_state == KMP_BARRIER_UNUSED_STATE)
521       new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
522     for (child = 1, child_tid = tid + (1 << level);
523          child < branch_factor && child_tid < num_threads;
524          child++, child_tid += (1 << level)) {
525       kmp_info_t *child_thr = other_threads[child_tid];
526       kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
527 #if KMP_CACHE_MANAGE
528       kmp_uint32 next_child_tid = child_tid + (1 << level);
529       // Prefetch next thread's arrived count
530       if (child + 1 < branch_factor && next_child_tid < num_threads)
531         KMP_CACHE_PREFETCH(
532             &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
533 #endif /* KMP_CACHE_MANAGE */
534       KA_TRACE(20,
535                ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
536                 "arrived(%p) == %llu\n",
537                 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
538                 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
539       // Wait for child to arrive
540       kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
541       c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
542       ANNOTATE_BARRIER_END(child_thr);
543 #if USE_ITT_BUILD && USE_ITT_NOTIFY
544       // Barrier imbalance - write min of the thread time and a child time to
545       // the thread.
546       if (__kmp_forkjoin_frames_mode == 2) {
547         this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
548                                                child_thr->th.th_bar_min_time);
549       }
550 #endif
551       if (reduce) {
552         KA_TRACE(100,
553                  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
554                   gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
555                   team->t.t_id, child_tid));
556         ANNOTATE_REDUCE_AFTER(reduce);
557         (*reduce)(this_thr->th.th_local.reduce_data,
558                   child_thr->th.th_local.reduce_data);
559         ANNOTATE_REDUCE_BEFORE(reduce);
560         ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
561       }
562     }
563   }
564 
565   if (KMP_MASTER_TID(tid)) {
566     // Need to update the team arrived pointer if we are the master thread
567     if (new_state == KMP_BARRIER_UNUSED_STATE)
568       team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
569     else
570       team->t.t_bar[bt].b_arrived = new_state;
571     KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
572                   "arrived(%p) = %llu\n",
573                   gtid, team->t.t_id, tid, team->t.t_id,
574                   &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
575   }
576   KA_TRACE(
577       20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
578            gtid, team->t.t_id, tid, bt));
579 }
580 
581 // The reverse versions seem to beat the forward versions overall
582 #define KMP_REVERSE_HYPER_BAR
583 static void __kmp_hyper_barrier_release(
584     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
585     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
586   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
587   kmp_team_t *team;
588   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
589   kmp_info_t **other_threads;
590   kmp_uint32 num_threads;
591   kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
592   kmp_uint32 branch_factor = 1 << branch_bits;
593   kmp_uint32 child;
594   kmp_uint32 child_tid;
595   kmp_uint32 offset;
596   kmp_uint32 level;
597 
598   /* Perform a hypercube-embedded tree release for all of the threads that have
599      been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
600      are released in the reverse order of the corresponding gather, otherwise
601      threads are released in the same order. */
602   if (KMP_MASTER_TID(tid)) { // master
603     team = __kmp_threads[gtid]->th.th_team;
604     KMP_DEBUG_ASSERT(team != NULL);
605     KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for "
606                   "barrier type %d\n",
607                   gtid, team->t.t_id, tid, bt));
608 #if KMP_BARRIER_ICV_PUSH
609     if (propagate_icvs) { // master already has ICVs in final destination; copy
610       copy_icvs(&thr_bar->th_fixed_icvs,
611                 &team->t.t_implicit_task_taskdata[tid].td_icvs);
612     }
613 #endif
614   } else { // Handle fork barrier workers who aren't part of a team yet
615     KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
616                   &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
617     // Wait for parent thread to release us
618     kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
619     flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
620     ANNOTATE_BARRIER_END(this_thr);
621 #if USE_ITT_BUILD && USE_ITT_NOTIFY
622     if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
623       // In fork barrier where we could not get the object reliably
624       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
625       // Cancel wait on previous parallel region...
626       __kmp_itt_task_starting(itt_sync_obj);
627 
628       if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
629         return;
630 
631       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
632       if (itt_sync_obj != NULL)
633         // Call prepare as early as possible for "new" barrier
634         __kmp_itt_task_finished(itt_sync_obj);
635     } else
636 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
637         // Early exit for reaping threads releasing forkjoin barrier
638         if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
639       return;
640 
641     // The worker thread may now assume that the team is valid.
642     team = __kmp_threads[gtid]->th.th_team;
643     KMP_DEBUG_ASSERT(team != NULL);
644     tid = __kmp_tid_from_gtid(gtid);
645 
646     TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
647     KA_TRACE(20,
648              ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
649               gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
650     KMP_MB(); // Flush all pending memory write invalidates.
651   }
652   num_threads = this_thr->th.th_team_nproc;
653   other_threads = team->t.t_threads;
654 
655 #ifdef KMP_REVERSE_HYPER_BAR
656   // Count up to correct level for parent
657   for (level = 0, offset = 1;
658        offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
659        level += branch_bits, offset <<= branch_bits)
660     ;
661 
662   // Now go down from there
663   for (level -= branch_bits, offset >>= branch_bits; offset != 0;
664        level -= branch_bits, offset >>= branch_bits)
665 #else
666   // Go down the tree, level by level
667   for (level = 0, offset = 1; offset < num_threads;
668        level += branch_bits, offset <<= branch_bits)
669 #endif // KMP_REVERSE_HYPER_BAR
670   {
671 #ifdef KMP_REVERSE_HYPER_BAR
672     /* Now go in reverse order through the children, highest to lowest.
673        Initial setting of child is conservative here. */
674     child = num_threads >> ((level == 0) ? level : level - 1);
675     for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
676         child_tid = tid + (child << level);
677          child >= 1; child--, child_tid -= (1 << level))
678 #else
679     if (((tid >> level) & (branch_factor - 1)) != 0)
680       // No need to go lower than this, since this is the level parent would be
681       // notified
682       break;
683     // Iterate through children on this level of the tree
684     for (child = 1, child_tid = tid + (1 << level);
685          child < branch_factor && child_tid < num_threads;
686          child++, child_tid += (1 << level))
687 #endif // KMP_REVERSE_HYPER_BAR
688     {
689       if (child_tid >= num_threads)
690         continue; // Child doesn't exist so keep going
691       else {
692         kmp_info_t *child_thr = other_threads[child_tid];
693         kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
694 #if KMP_CACHE_MANAGE
695         kmp_uint32 next_child_tid = child_tid - (1 << level);
696 // Prefetch next thread's go count
697 #ifdef KMP_REVERSE_HYPER_BAR
698         if (child - 1 >= 1 && next_child_tid < num_threads)
699 #else
700         if (child + 1 < branch_factor && next_child_tid < num_threads)
701 #endif // KMP_REVERSE_HYPER_BAR
702           KMP_CACHE_PREFETCH(
703               &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
704 #endif /* KMP_CACHE_MANAGE */
705 
706 #if KMP_BARRIER_ICV_PUSH
707         if (propagate_icvs) // push my fixed ICVs to my child
708           copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
709 #endif // KMP_BARRIER_ICV_PUSH
710 
711         KA_TRACE(
712             20,
713             ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
714              "go(%p): %u => %u\n",
715              gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
716              team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
717              child_bar->b_go + KMP_BARRIER_STATE_BUMP));
718         // Release child from barrier
719         ANNOTATE_BARRIER_BEGIN(child_thr);
720         kmp_flag_64 flag(&child_bar->b_go, child_thr);
721         flag.release();
722       }
723     }
724   }
725 #if KMP_BARRIER_ICV_PUSH
726   if (propagate_icvs &&
727       !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
728     __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
729                              FALSE);
730     copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
731               &thr_bar->th_fixed_icvs);
732   }
733 #endif
734   KA_TRACE(
735       20,
736       ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
737        gtid, team->t.t_id, tid, bt));
738 }
739 
740 // Hierarchical Barrier
741 
742 // Initialize thread barrier data
743 /* Initializes/re-initializes the hierarchical barrier data stored on a thread.
744    Performs the minimum amount of initialization required based on how the team
745    has changed. Returns true if leaf children will require both on-core and
746    traditional wake-up mechanisms. For example, if the team size increases,
747    threads already in the team will respond to on-core wakeup on their parent
748    thread, but threads newly added to the team will only be listening on the
749    their local b_go. */
750 static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
751                                                    kmp_bstate_t *thr_bar,
752                                                    kmp_uint32 nproc, int gtid,
753                                                    int tid, kmp_team_t *team) {
754   // Checks to determine if (re-)initialization is needed
755   bool uninitialized = thr_bar->team == NULL;
756   bool team_changed = team != thr_bar->team;
757   bool team_sz_changed = nproc != thr_bar->nproc;
758   bool tid_changed = tid != thr_bar->old_tid;
759   bool retval = false;
760 
761   if (uninitialized || team_sz_changed) {
762     __kmp_get_hierarchy(nproc, thr_bar);
763   }
764 
765   if (uninitialized || team_sz_changed || tid_changed) {
766     thr_bar->my_level = thr_bar->depth - 1; // default for master
767     thr_bar->parent_tid = -1; // default for master
768     if (!KMP_MASTER_TID(
769             tid)) { // if not master, find parent thread in hierarchy
770       kmp_uint32 d = 0;
771       while (d < thr_bar->depth) { // find parent based on level of thread in
772         // hierarchy, and note level
773         kmp_uint32 rem;
774         if (d == thr_bar->depth - 2) { // reached level right below the master
775           thr_bar->parent_tid = 0;
776           thr_bar->my_level = d;
777           break;
778         } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) !=
779                    0) { // TODO: can we make this op faster?
780           // thread is not a subtree root at next level, so this is max
781           thr_bar->parent_tid = tid - rem;
782           thr_bar->my_level = d;
783           break;
784         }
785         ++d;
786       }
787     }
788     thr_bar->offset = 7 - (tid - thr_bar->parent_tid - 1);
789     thr_bar->old_tid = tid;
790     thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
791     thr_bar->team = team;
792     thr_bar->parent_bar =
793         &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
794   }
795   if (uninitialized || team_changed || tid_changed) {
796     thr_bar->team = team;
797     thr_bar->parent_bar =
798         &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
799     retval = true;
800   }
801   if (uninitialized || team_sz_changed || tid_changed) {
802     thr_bar->nproc = nproc;
803     thr_bar->leaf_kids = thr_bar->base_leaf_kids;
804     if (thr_bar->my_level == 0)
805       thr_bar->leaf_kids = 0;
806     if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
807       thr_bar->leaf_kids = nproc - tid - 1;
808     thr_bar->leaf_state = 0;
809     for (int i = 0; i < thr_bar->leaf_kids; ++i)
810       ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
811   }
812   return retval;
813 }
814 
815 static void __kmp_hierarchical_barrier_gather(
816     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
817     void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
818   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
819   kmp_team_t *team = this_thr->th.th_team;
820   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
821   kmp_uint32 nproc = this_thr->th.th_team_nproc;
822   kmp_info_t **other_threads = team->t.t_threads;
823   kmp_uint64 new_state;
824 
825   int level = team->t.t_level;
826 #if OMP_40_ENABLED
827   if (other_threads[0]
828           ->th.th_teams_microtask) // are we inside the teams construct?
829     if (this_thr->th.th_teams_size.nteams > 1)
830       ++level; // level was not increased in teams construct for team_of_masters
831 #endif
832   if (level == 1)
833     thr_bar->use_oncore_barrier = 1;
834   else
835     thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
836 
837   KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
838                 "barrier type %d\n",
839                 gtid, team->t.t_id, tid, bt));
840   KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
841 
842 #if USE_ITT_BUILD && USE_ITT_NOTIFY
843   // Barrier imbalance - save arrive time to the thread
844   if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
845     this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
846   }
847 #endif
848 
849   (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
850                                                team);
851 
852   if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
853     kmp_int32 child_tid;
854     new_state =
855         (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
856     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
857         thr_bar->use_oncore_barrier) {
858       if (thr_bar->leaf_kids) {
859         // First, wait for leaf children to check-in on my b_arrived flag
860         kmp_uint64 leaf_state =
861             KMP_MASTER_TID(tid)
862                 ? thr_bar->b_arrived | thr_bar->leaf_state
863                 : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
864         KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
865                       "for leaf kids\n",
866                       gtid, team->t.t_id, tid));
867         kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
868         flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
869         if (reduce) {
870           ANNOTATE_REDUCE_AFTER(reduce);
871           for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
872                ++child_tid) {
873             KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
874                            "T#%d(%d:%d)\n",
875                            gtid, team->t.t_id, tid,
876                            __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
877                            child_tid));
878             ANNOTATE_BARRIER_END(other_threads[child_tid]);
879             (*reduce)(this_thr->th.th_local.reduce_data,
880                       other_threads[child_tid]->th.th_local.reduce_data);
881           }
882           ANNOTATE_REDUCE_BEFORE(reduce);
883           ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
884         }
885         // clear leaf_state bits
886         KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
887       }
888       // Next, wait for higher level children on each child's b_arrived flag
889       for (kmp_uint32 d = 1; d < thr_bar->my_level;
890            ++d) { // gather lowest level threads first, but skip 0
891         kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
892                    skip = thr_bar->skip_per_level[d];
893         if (last > nproc)
894           last = nproc;
895         for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
896           kmp_info_t *child_thr = other_threads[child_tid];
897           kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
898           KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
899                         "T#%d(%d:%d) "
900                         "arrived(%p) == %llu\n",
901                         gtid, team->t.t_id, tid,
902                         __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
903                         child_tid, &child_bar->b_arrived, new_state));
904           kmp_flag_64 flag(&child_bar->b_arrived, new_state);
905           flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
906           ANNOTATE_BARRIER_END(child_thr);
907           if (reduce) {
908             KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
909                            "T#%d(%d:%d)\n",
910                            gtid, team->t.t_id, tid,
911                            __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
912                            child_tid));
913             ANNOTATE_REDUCE_AFTER(reduce);
914             (*reduce)(this_thr->th.th_local.reduce_data,
915                       child_thr->th.th_local.reduce_data);
916             ANNOTATE_REDUCE_BEFORE(reduce);
917             ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
918           }
919         }
920       }
921     } else { // Blocktime is not infinite
922       for (kmp_uint32 d = 0; d < thr_bar->my_level;
923            ++d) { // Gather lowest level threads first
924         kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
925                    skip = thr_bar->skip_per_level[d];
926         if (last > nproc)
927           last = nproc;
928         for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
929           kmp_info_t *child_thr = other_threads[child_tid];
930           kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
931           KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
932                         "T#%d(%d:%d) "
933                         "arrived(%p) == %llu\n",
934                         gtid, team->t.t_id, tid,
935                         __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
936                         child_tid, &child_bar->b_arrived, new_state));
937           kmp_flag_64 flag(&child_bar->b_arrived, new_state);
938           flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
939           ANNOTATE_BARRIER_END(child_thr);
940           if (reduce) {
941             KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
942                            "T#%d(%d:%d)\n",
943                            gtid, team->t.t_id, tid,
944                            __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
945                            child_tid));
946             ANNOTATE_REDUCE_AFTER(reduce);
947             (*reduce)(this_thr->th.th_local.reduce_data,
948                       child_thr->th.th_local.reduce_data);
949             ANNOTATE_REDUCE_BEFORE(reduce);
950             ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
951           }
952         }
953       }
954     }
955   }
956   // All subordinates are gathered; now release parent if not master thread
957 
958   if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
959     KA_TRACE(
960         20,
961         ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
962          "arrived(%p): %llu => %llu\n",
963          gtid, team->t.t_id, tid,
964          __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
965          thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
966          thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
967     /* Mark arrival to parent: After performing this write, a worker thread may
968        not assume that the team is valid any more - it could be deallocated by
969        the master thread at any time. */
970     if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
971         !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
972       // flag; release it
973       ANNOTATE_BARRIER_BEGIN(this_thr);
974       kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
975       flag.release();
976     } else { // Leaf does special release on the "offset" bits of parent's
977       // b_arrived flag
978       thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
979       kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
980       flag.set_waiter(other_threads[thr_bar->parent_tid]);
981       flag.release();
982     }
983   } else { // Master thread needs to update the team's b_arrived value
984     team->t.t_bar[bt].b_arrived = new_state;
985     KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
986                   "arrived(%p) = %llu\n",
987                   gtid, team->t.t_id, tid, team->t.t_id,
988                   &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
989   }
990   // Is the team access below unsafe or just technically invalid?
991   KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
992                 "barrier type %d\n",
993                 gtid, team->t.t_id, tid, bt));
994 }
995 
996 static void __kmp_hierarchical_barrier_release(
997     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
998     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
999   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
1000   kmp_team_t *team;
1001   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1002   kmp_uint32 nproc;
1003   bool team_change = false; // indicates on-core barrier shouldn't be used
1004 
1005   if (KMP_MASTER_TID(tid)) {
1006     team = __kmp_threads[gtid]->th.th_team;
1007     KMP_DEBUG_ASSERT(team != NULL);
1008     KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master "
1009                   "entered barrier type %d\n",
1010                   gtid, team->t.t_id, tid, bt));
1011   } else { // Worker threads
1012     // Wait for parent thread to release me
1013     if (!thr_bar->use_oncore_barrier ||
1014         __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
1015         thr_bar->team == NULL) {
1016       // Use traditional method of waiting on my own b_go flag
1017       thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
1018       kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1019       flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1020       ANNOTATE_BARRIER_END(this_thr);
1021       TCW_8(thr_bar->b_go,
1022             KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1023     } else { // Thread barrier data is initialized, this is a leaf, blocktime is
1024       // infinite, not nested
1025       // Wait on my "offset" bits on parent's b_go flag
1026       thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
1027       kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
1028                            thr_bar->offset, bt,
1029                            this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
1030       flag.wait(this_thr, TRUE);
1031       if (thr_bar->wait_flag ==
1032           KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
1033         TCW_8(thr_bar->b_go,
1034               KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1035       } else { // Reset my bits on parent's b_go flag
1036         (RCAST(volatile char *,
1037                &(thr_bar->parent_bar->b_go)))[thr_bar->offset] = 0;
1038       }
1039     }
1040     thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1041     // Early exit for reaping threads releasing forkjoin barrier
1042     if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1043       return;
1044     // The worker thread may now assume that the team is valid.
1045     team = __kmp_threads[gtid]->th.th_team;
1046     KMP_DEBUG_ASSERT(team != NULL);
1047     tid = __kmp_tid_from_gtid(gtid);
1048 
1049     KA_TRACE(
1050         20,
1051         ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1052          gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1053     KMP_MB(); // Flush all pending memory write invalidates.
1054   }
1055 
1056   nproc = this_thr->th.th_team_nproc;
1057   int level = team->t.t_level;
1058 #if OMP_40_ENABLED
1059   if (team->t.t_threads[0]
1060           ->th.th_teams_microtask) { // are we inside the teams construct?
1061     if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1062         this_thr->th.th_teams_level == level)
1063       ++level; // level was not increased in teams construct for team_of_workers
1064     if (this_thr->th.th_teams_size.nteams > 1)
1065       ++level; // level was not increased in teams construct for team_of_masters
1066   }
1067 #endif
1068   if (level == 1)
1069     thr_bar->use_oncore_barrier = 1;
1070   else
1071     thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
1072 
1073   // If the team size has increased, we still communicate with old leaves via
1074   // oncore barrier.
1075   unsigned short int old_leaf_kids = thr_bar->leaf_kids;
1076   kmp_uint64 old_leaf_state = thr_bar->leaf_state;
1077   team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
1078                                                        tid, team);
1079   // But if the entire team changes, we won't use oncore barrier at all
1080   if (team_change)
1081     old_leaf_kids = 0;
1082 
1083 #if KMP_BARRIER_ICV_PUSH
1084   if (propagate_icvs) {
1085     __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1086                              FALSE);
1087     if (KMP_MASTER_TID(
1088             tid)) { // master already has copy in final destination; copy
1089       copy_icvs(&thr_bar->th_fixed_icvs,
1090                 &team->t.t_implicit_task_taskdata[tid].td_icvs);
1091     } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1092                thr_bar->use_oncore_barrier) { // optimization for inf blocktime
1093       if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
1094         // leaves (on-core children) pull parent's fixed ICVs directly to local
1095         // ICV store
1096         copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1097                   &thr_bar->parent_bar->th_fixed_icvs);
1098       // non-leaves will get ICVs piggybacked with b_go via NGO store
1099     } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
1100       if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can
1101         // access
1102         copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
1103       else // leaves copy parent's fixed ICVs directly to local ICV store
1104         copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1105                   &thr_bar->parent_bar->th_fixed_icvs);
1106     }
1107   }
1108 #endif // KMP_BARRIER_ICV_PUSH
1109 
1110   // Now, release my children
1111   if (thr_bar->my_level) { // not a leaf
1112     kmp_int32 child_tid;
1113     kmp_uint32 last;
1114     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1115         thr_bar->use_oncore_barrier) {
1116       if (KMP_MASTER_TID(tid)) { // do a flat release
1117         // Set local b_go to bump children via NGO store of the cache line
1118         // containing IVCs and b_go.
1119         thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
1120         // Use ngo stores if available; b_go piggybacks in the last 8 bytes of
1121         // the cache line
1122         ngo_load(&thr_bar->th_fixed_icvs);
1123         // This loops over all the threads skipping only the leaf nodes in the
1124         // hierarchy
1125         for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
1126              child_tid += thr_bar->skip_per_level[1]) {
1127           kmp_bstate_t *child_bar =
1128               &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
1129           KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1130                         "releasing T#%d(%d:%d)"
1131                         " go(%p): %u => %u\n",
1132                         gtid, team->t.t_id, tid,
1133                         __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1134                         child_tid, &child_bar->b_go, child_bar->b_go,
1135                         child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1136           // Use ngo store (if available) to both store ICVs and release child
1137           // via child's b_go
1138           ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1139         }
1140         ngo_sync();
1141       }
1142       TCW_8(thr_bar->b_go,
1143             KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1144       // Now, release leaf children
1145       if (thr_bar->leaf_kids) { // if there are any
1146         // We test team_change on the off-chance that the level 1 team changed.
1147         if (team_change ||
1148             old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
1149           if (old_leaf_kids) { // release old leaf kids
1150             thr_bar->b_go |= old_leaf_state;
1151           }
1152           // Release new leaf kids
1153           last = tid + thr_bar->skip_per_level[1];
1154           if (last > nproc)
1155             last = nproc;
1156           for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
1157                ++child_tid) { // skip_per_level[0]=1
1158             kmp_info_t *child_thr = team->t.t_threads[child_tid];
1159             kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1160             KA_TRACE(
1161                 20,
1162                 ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
1163                  " T#%d(%d:%d) go(%p): %u => %u\n",
1164                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1165                  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1166                  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1167             // Release child using child's b_go flag
1168             ANNOTATE_BARRIER_BEGIN(child_thr);
1169             kmp_flag_64 flag(&child_bar->b_go, child_thr);
1170             flag.release();
1171           }
1172         } else { // Release all children at once with leaf_state bits on my own
1173           // b_go flag
1174           thr_bar->b_go |= thr_bar->leaf_state;
1175         }
1176       }
1177     } else { // Blocktime is not infinite; do a simple hierarchical release
1178       for (int d = thr_bar->my_level - 1; d >= 0;
1179            --d) { // Release highest level threads first
1180         last = tid + thr_bar->skip_per_level[d + 1];
1181         kmp_uint32 skip = thr_bar->skip_per_level[d];
1182         if (last > nproc)
1183           last = nproc;
1184         for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1185           kmp_info_t *child_thr = team->t.t_threads[child_tid];
1186           kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1187           KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1188                         "releasing T#%d(%d:%d) go(%p): %u => %u\n",
1189                         gtid, team->t.t_id, tid,
1190                         __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1191                         child_tid, &child_bar->b_go, child_bar->b_go,
1192                         child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1193           // Release child using child's b_go flag
1194           ANNOTATE_BARRIER_BEGIN(child_thr);
1195           kmp_flag_64 flag(&child_bar->b_go, child_thr);
1196           flag.release();
1197         }
1198       }
1199     }
1200 #if KMP_BARRIER_ICV_PUSH
1201     if (propagate_icvs && !KMP_MASTER_TID(tid))
1202       // non-leaves copy ICVs from fixed ICVs to local dest
1203       copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1204                 &thr_bar->th_fixed_icvs);
1205 #endif // KMP_BARRIER_ICV_PUSH
1206   }
1207   KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
1208                 "barrier type %d\n",
1209                 gtid, team->t.t_id, tid, bt));
1210 }
1211 
1212 // End of Barrier Algorithms
1213 
1214 // Internal function to do a barrier.
1215 /* If is_split is true, do a split barrier, otherwise, do a plain barrier
1216    If reduce is non-NULL, do a split reduction barrier, otherwise, do a split
1217    barrier
1218    Returns 0 if master thread, 1 if worker thread.  */
1219 int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
1220                   size_t reduce_size, void *reduce_data,
1221                   void (*reduce)(void *, void *)) {
1222   KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
1223   KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1224   int tid = __kmp_tid_from_gtid(gtid);
1225   kmp_info_t *this_thr = __kmp_threads[gtid];
1226   kmp_team_t *team = this_thr->th.th_team;
1227   int status = 0;
1228   ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1229 #if OMPT_SUPPORT
1230   ompt_data_t *my_task_data;
1231   ompt_data_t *my_parallel_data;
1232   void *return_address;
1233 #endif
1234 
1235   KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
1236                 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1237 
1238   ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1239 #if OMPT_SUPPORT
1240   if (ompt_enabled.enabled) {
1241 #if OMPT_OPTIONAL
1242     my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1243     my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1244     return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1245     if (ompt_enabled.ompt_callback_sync_region) {
1246       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1247           ompt_sync_region_barrier, ompt_scope_begin, my_parallel_data,
1248           my_task_data, return_address);
1249     }
1250     if (ompt_enabled.ompt_callback_sync_region_wait) {
1251       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1252           ompt_sync_region_barrier, ompt_scope_begin, my_parallel_data,
1253           my_task_data, return_address);
1254     }
1255 #endif
1256     // It is OK to report the barrier state after the barrier begin callback.
1257     // According to the OMPT specification, a compliant implementation may
1258     // even delay reporting this state until the barrier begins to wait.
1259     this_thr->th.ompt_thread_info.state = omp_state_wait_barrier;
1260   }
1261 #endif
1262 
1263   if (!team->t.t_serialized) {
1264 #if USE_ITT_BUILD
1265     // This value will be used in itt notify events below.
1266     void *itt_sync_obj = NULL;
1267 #if USE_ITT_NOTIFY
1268     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1269       itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1270 #endif
1271 #endif /* USE_ITT_BUILD */
1272     if (__kmp_tasking_mode == tskm_extra_barrier) {
1273       __kmp_tasking_barrier(team, this_thr, gtid);
1274       KA_TRACE(15,
1275                ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1276                 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1277     }
1278 
1279     /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1280        access it when the team struct is not guaranteed to exist. */
1281     // See note about the corresponding code in __kmp_join_barrier() being
1282     // performance-critical.
1283     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1284 #if KMP_USE_MONITOR
1285       this_thr->th.th_team_bt_intervals =
1286           team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1287       this_thr->th.th_team_bt_set =
1288           team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1289 #else
1290       this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1291 #endif
1292     }
1293 
1294 #if USE_ITT_BUILD
1295     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1296       __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1297 #endif /* USE_ITT_BUILD */
1298 #if USE_DEBUGGER
1299     // Let the debugger know: the thread arrived to the barrier and waiting.
1300     if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
1301       team->t.t_bar[bt].b_master_arrived += 1;
1302     } else {
1303       this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1304     } // if
1305 #endif /* USE_DEBUGGER */
1306     if (reduce != NULL) {
1307       // KMP_DEBUG_ASSERT( is_split == TRUE );  // #C69956
1308       this_thr->th.th_local.reduce_data = reduce_data;
1309     }
1310 
1311     if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
1312       __kmp_task_team_setup(
1313           this_thr, team,
1314           0); // use 0 to only setup the current team if nthreads > 1
1315 
1316     switch (__kmp_barrier_gather_pattern[bt]) {
1317     case bp_hyper_bar: {
1318       KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits
1319       // to 0; use linear
1320       __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
1321                                  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1322       break;
1323     }
1324     case bp_hierarchical_bar: {
1325       __kmp_hierarchical_barrier_gather(bt, this_thr, gtid, tid,
1326                                         reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1327       break;
1328     }
1329     case bp_tree_bar: {
1330       KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits
1331       // to 0; use linear
1332       __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
1333                                 reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1334       break;
1335     }
1336     default: {
1337       __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
1338                                   reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1339     }
1340     }
1341 
1342     KMP_MB();
1343 
1344     if (KMP_MASTER_TID(tid)) {
1345       status = 0;
1346       if (__kmp_tasking_mode != tskm_immediate_exec) {
1347         __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1348       }
1349 #if USE_DEBUGGER
1350       // Let the debugger know: All threads are arrived and starting leaving the
1351       // barrier.
1352       team->t.t_bar[bt].b_team_arrived += 1;
1353 #endif
1354 
1355 #if OMP_40_ENABLED
1356       // Reset cancellation flag for worksharing constructs
1357       if (team->t.t_cancel_request == cancel_loop ||
1358           team->t.t_cancel_request == cancel_sections) {
1359         team->t.t_cancel_request = cancel_noreq;
1360       }
1361 #endif
1362 #if USE_ITT_BUILD
1363       /* TODO: In case of split reduction barrier, master thread may send
1364          acquired event early, before the final summation into the shared
1365          variable is done (final summation can be a long operation for array
1366          reductions).  */
1367       if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1368         __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1369 #endif /* USE_ITT_BUILD */
1370 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1371       // Barrier - report frame end (only if active_level == 1)
1372       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1373           __kmp_forkjoin_frames_mode &&
1374 #if OMP_40_ENABLED
1375           this_thr->th.th_teams_microtask == NULL &&
1376 #endif
1377           team->t.t_active_level == 1) {
1378         kmp_uint64 cur_time = __itt_get_timestamp();
1379         kmp_info_t **other_threads = team->t.t_threads;
1380         int nproc = this_thr->th.th_team_nproc;
1381         int i;
1382         switch (__kmp_forkjoin_frames_mode) {
1383         case 1:
1384           __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1385                                  loc, nproc);
1386           this_thr->th.th_frame_time = cur_time;
1387           break;
1388         case 2: // AC 2015-01-19: currently does not work for hierarchical (to
1389           // be fixed)
1390           __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
1391                                  1, loc, nproc);
1392           break;
1393         case 3:
1394           if (__itt_metadata_add_ptr) {
1395             // Initialize with master's wait time
1396             kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1397             // Set arrive time to zero to be able to check it in
1398             // __kmp_invoke_task(); the same is done inside the loop below
1399             this_thr->th.th_bar_arrive_time = 0;
1400             for (i = 1; i < nproc; ++i) {
1401               delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1402               other_threads[i]->th.th_bar_arrive_time = 0;
1403             }
1404             __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1405                                          cur_time, delta,
1406                                          (kmp_uint64)(reduce != NULL));
1407           }
1408           __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1409                                  loc, nproc);
1410           this_thr->th.th_frame_time = cur_time;
1411           break;
1412         }
1413       }
1414 #endif /* USE_ITT_BUILD */
1415     } else {
1416       status = 1;
1417 #if USE_ITT_BUILD
1418       if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1419         __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1420 #endif /* USE_ITT_BUILD */
1421     }
1422     if (status == 1 || !is_split) {
1423       switch (__kmp_barrier_release_pattern[bt]) {
1424       case bp_hyper_bar: {
1425         KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1426         __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1427                                     FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1428         break;
1429       }
1430       case bp_hierarchical_bar: {
1431         __kmp_hierarchical_barrier_release(
1432             bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1433         break;
1434       }
1435       case bp_tree_bar: {
1436         KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1437         __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1438                                    FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1439         break;
1440       }
1441       default: {
1442         __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1443                                      FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1444       }
1445       }
1446       if (__kmp_tasking_mode != tskm_immediate_exec) {
1447         __kmp_task_team_sync(this_thr, team);
1448       }
1449     }
1450 
1451 #if USE_ITT_BUILD
1452     /* GEH: TODO: Move this under if-condition above and also include in
1453        __kmp_end_split_barrier(). This will more accurately represent the actual
1454        release time of the threads for split barriers.  */
1455     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1456       __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1457 #endif /* USE_ITT_BUILD */
1458   } else { // Team is serialized.
1459     status = 0;
1460     if (__kmp_tasking_mode != tskm_immediate_exec) {
1461 #if OMP_45_ENABLED
1462       if (this_thr->th.th_task_team != NULL) {
1463         void *itt_sync_obj = NULL;
1464 #if USE_ITT_NOTIFY
1465         if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1466           itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1467           __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1468         }
1469 #endif
1470 
1471         KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks ==
1472                          TRUE);
1473         __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1474         __kmp_task_team_setup(this_thr, team, 0);
1475 
1476 #if USE_ITT_BUILD
1477         if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1478           __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1479 #endif /* USE_ITT_BUILD */
1480       }
1481 #else
1482       // The task team should be NULL for serialized code (tasks will be
1483       // executed immediately)
1484       KMP_DEBUG_ASSERT(team->t.t_task_team[this_thr->th.th_task_state] == NULL);
1485       KMP_DEBUG_ASSERT(this_thr->th.th_task_team == NULL);
1486 #endif
1487     }
1488   }
1489   KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1490                 gtid, __kmp_team_from_gtid(gtid)->t.t_id,
1491                 __kmp_tid_from_gtid(gtid), status));
1492 
1493 #if OMPT_SUPPORT
1494   if (ompt_enabled.enabled) {
1495 #if OMPT_OPTIONAL
1496     if (ompt_enabled.ompt_callback_sync_region_wait) {
1497       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1498           ompt_sync_region_barrier, ompt_scope_end, my_parallel_data,
1499           my_task_data, return_address);
1500     }
1501     if (ompt_enabled.ompt_callback_sync_region) {
1502       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1503           ompt_sync_region_barrier, ompt_scope_end, my_parallel_data,
1504           my_task_data, return_address);
1505     }
1506 #endif
1507     this_thr->th.ompt_thread_info.state = omp_state_work_parallel;
1508   }
1509 #endif
1510   ANNOTATE_BARRIER_END(&team->t.t_bar);
1511 
1512   return status;
1513 }
1514 
1515 void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
1516   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
1517   KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1518   int tid = __kmp_tid_from_gtid(gtid);
1519   kmp_info_t *this_thr = __kmp_threads[gtid];
1520   kmp_team_t *team = this_thr->th.th_team;
1521 
1522   ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1523   if (!team->t.t_serialized) {
1524     if (KMP_MASTER_GTID(gtid)) {
1525       switch (__kmp_barrier_release_pattern[bt]) {
1526       case bp_hyper_bar: {
1527         KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1528         __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1529                                     FALSE USE_ITT_BUILD_ARG(NULL));
1530         break;
1531       }
1532       case bp_hierarchical_bar: {
1533         __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
1534                                            FALSE USE_ITT_BUILD_ARG(NULL));
1535         break;
1536       }
1537       case bp_tree_bar: {
1538         KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1539         __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1540                                    FALSE USE_ITT_BUILD_ARG(NULL));
1541         break;
1542       }
1543       default: {
1544         __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1545                                      FALSE USE_ITT_BUILD_ARG(NULL));
1546       }
1547       }
1548       if (__kmp_tasking_mode != tskm_immediate_exec) {
1549         __kmp_task_team_sync(this_thr, team);
1550       } // if
1551     }
1552   }
1553   ANNOTATE_BARRIER_END(&team->t.t_bar);
1554 }
1555 
1556 void __kmp_join_barrier(int gtid) {
1557   KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
1558   KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1559   kmp_info_t *this_thr = __kmp_threads[gtid];
1560   kmp_team_t *team;
1561   kmp_uint nproc;
1562   kmp_info_t *master_thread;
1563   int tid;
1564 #ifdef KMP_DEBUG
1565   int team_id;
1566 #endif /* KMP_DEBUG */
1567 #if USE_ITT_BUILD
1568   void *itt_sync_obj = NULL;
1569 #if USE_ITT_NOTIFY
1570   if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
1571     // Get object created at fork_barrier
1572     itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1573 #endif
1574 #endif /* USE_ITT_BUILD */
1575   KMP_MB();
1576 
1577   // Get current info
1578   team = this_thr->th.th_team;
1579   nproc = this_thr->th.th_team_nproc;
1580   KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
1581   tid = __kmp_tid_from_gtid(gtid);
1582 #ifdef KMP_DEBUG
1583   team_id = team->t.t_id;
1584 #endif /* KMP_DEBUG */
1585   master_thread = this_thr->th.th_team_master;
1586 #ifdef KMP_DEBUG
1587   if (master_thread != team->t.t_threads[0]) {
1588     __kmp_print_structure();
1589   }
1590 #endif /* KMP_DEBUG */
1591   KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1592   KMP_MB();
1593 
1594   // Verify state
1595   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1596   KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1597   KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1598   KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1599   KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
1600                 gtid, team_id, tid));
1601 
1602   ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1603 #if OMPT_SUPPORT
1604   ompt_data_t *my_task_data;
1605   ompt_data_t *my_parallel_data;
1606   if (ompt_enabled.enabled) {
1607 #if OMPT_OPTIONAL
1608     void *codeptr = NULL;
1609     int ds_tid = this_thr->th.th_info.ds.ds_tid;
1610     if (KMP_MASTER_TID(ds_tid) &&
1611         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
1612          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
1613       codeptr = team->t.ompt_team_info.master_return_address;
1614     my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1615     my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1616     if (ompt_enabled.ompt_callback_sync_region) {
1617       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1618           ompt_sync_region_barrier, ompt_scope_begin, my_parallel_data,
1619           my_task_data, codeptr);
1620     }
1621     if (ompt_enabled.ompt_callback_sync_region_wait) {
1622       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1623           ompt_sync_region_barrier, ompt_scope_begin, my_parallel_data,
1624           my_task_data, codeptr);
1625     }
1626     if (!KMP_MASTER_TID(ds_tid))
1627       this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
1628 #endif
1629     this_thr->th.ompt_thread_info.state = omp_state_wait_barrier_implicit;
1630   }
1631 #endif
1632 
1633   if (__kmp_tasking_mode == tskm_extra_barrier) {
1634     __kmp_tasking_barrier(team, this_thr, gtid);
1635     KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past taking barrier\n", gtid,
1636                   team_id, tid));
1637   }
1638 #ifdef KMP_DEBUG
1639   if (__kmp_tasking_mode != tskm_immediate_exec) {
1640     KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
1641                   "%p, th_task_team = %p\n",
1642                   __kmp_gtid_from_thread(this_thr), team_id,
1643                   team->t.t_task_team[this_thr->th.th_task_state],
1644                   this_thr->th.th_task_team));
1645     KMP_DEBUG_ASSERT(this_thr->th.th_task_team ==
1646                      team->t.t_task_team[this_thr->th.th_task_state]);
1647   }
1648 #endif /* KMP_DEBUG */
1649 
1650   /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1651      access it when the team struct is not guaranteed to exist. Doing these
1652      loads causes a cache miss slows down EPCC parallel by 2x. As a workaround,
1653      we do not perform the copy if blocktime=infinite, since the values are not
1654      used by __kmp_wait_template() in that case. */
1655   if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1656 #if KMP_USE_MONITOR
1657     this_thr->th.th_team_bt_intervals =
1658         team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1659     this_thr->th.th_team_bt_set =
1660         team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1661 #else
1662     this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1663 #endif
1664   }
1665 
1666 #if USE_ITT_BUILD
1667   if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1668     __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1669 #endif /* USE_ITT_BUILD */
1670 
1671   switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1672   case bp_hyper_bar: {
1673     KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1674     __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1675                                NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1676     break;
1677   }
1678   case bp_hierarchical_bar: {
1679     __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1680                                       NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1681     break;
1682   }
1683   case bp_tree_bar: {
1684     KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1685     __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1686                               NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1687     break;
1688   }
1689   default: {
1690     __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1691                                 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1692   }
1693   }
1694 
1695   /* From this point on, the team data structure may be deallocated at any time
1696      by the master thread - it is unsafe to reference it in any of the worker
1697      threads. Any per-team data items that need to be referenced before the
1698      end of the barrier should be moved to the kmp_task_team_t structs.  */
1699   if (KMP_MASTER_TID(tid)) {
1700     if (__kmp_tasking_mode != tskm_immediate_exec) {
1701       __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1702     }
1703 #if KMP_STATS_ENABLED
1704     // Have master thread flag the workers to indicate they are now waiting for
1705     // next parallel region, Also wake them up so they switch their timers to
1706     // idle.
1707     for (int i = 0; i < team->t.t_nproc; ++i) {
1708       kmp_info_t *team_thread = team->t.t_threads[i];
1709       if (team_thread == this_thr)
1710         continue;
1711       team_thread->th.th_stats->setIdleFlag();
1712       if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
1713           team_thread->th.th_sleep_loc != NULL)
1714         __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread),
1715                                   team_thread->th.th_sleep_loc);
1716     }
1717 #endif
1718 #if USE_ITT_BUILD
1719     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1720       __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1721 #endif /* USE_ITT_BUILD */
1722 
1723 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1724     // Join barrier - report frame end
1725     if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1726         __kmp_forkjoin_frames_mode &&
1727 #if OMP_40_ENABLED
1728         this_thr->th.th_teams_microtask == NULL &&
1729 #endif
1730         team->t.t_active_level == 1) {
1731       kmp_uint64 cur_time = __itt_get_timestamp();
1732       ident_t *loc = team->t.t_ident;
1733       kmp_info_t **other_threads = team->t.t_threads;
1734       int nproc = this_thr->th.th_team_nproc;
1735       int i;
1736       switch (__kmp_forkjoin_frames_mode) {
1737       case 1:
1738         __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1739                                loc, nproc);
1740         break;
1741       case 2:
1742         __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
1743                                loc, nproc);
1744         break;
1745       case 3:
1746         if (__itt_metadata_add_ptr) {
1747           // Initialize with master's wait time
1748           kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1749           // Set arrive time to zero to be able to check it in
1750           // __kmp_invoke_task(); the same is done inside the loop below
1751           this_thr->th.th_bar_arrive_time = 0;
1752           for (i = 1; i < nproc; ++i) {
1753             delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1754             other_threads[i]->th.th_bar_arrive_time = 0;
1755           }
1756           __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1757                                        cur_time, delta, 0);
1758         }
1759         __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1760                                loc, nproc);
1761         this_thr->th.th_frame_time = cur_time;
1762         break;
1763       }
1764     }
1765 #endif /* USE_ITT_BUILD */
1766   }
1767 #if USE_ITT_BUILD
1768   else {
1769     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1770       __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1771   }
1772 #endif /* USE_ITT_BUILD */
1773 
1774 #if KMP_DEBUG
1775   if (KMP_MASTER_TID(tid)) {
1776     KA_TRACE(
1777         15,
1778         ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1779          gtid, team_id, tid, nproc));
1780   }
1781 #endif /* KMP_DEBUG */
1782 
1783   // TODO now, mark worker threads as done so they may be disbanded
1784   KMP_MB(); // Flush all pending memory write invalidates.
1785   KA_TRACE(10,
1786            ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
1787 
1788   ANNOTATE_BARRIER_END(&team->t.t_bar);
1789 }
1790 
1791 // TODO release worker threads' fork barriers as we are ready instead of all at
1792 // once
1793 void __kmp_fork_barrier(int gtid, int tid) {
1794   KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
1795   KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1796   kmp_info_t *this_thr = __kmp_threads[gtid];
1797   kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
1798 #if USE_ITT_BUILD
1799   void *itt_sync_obj = NULL;
1800 #endif /* USE_ITT_BUILD */
1801   if (team)
1802     ANNOTATE_BARRIER_END(&team->t.t_bar);
1803 
1804   KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
1805                 (team != NULL) ? team->t.t_id : -1, tid));
1806 
1807   // th_team pointer only valid for master thread here
1808   if (KMP_MASTER_TID(tid)) {
1809 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1810     if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1811       // Create itt barrier object
1812       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
1813       __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
1814     }
1815 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1816 
1817 #ifdef KMP_DEBUG
1818     kmp_info_t **other_threads = team->t.t_threads;
1819     int i;
1820 
1821     // Verify state
1822     KMP_MB();
1823 
1824     for (i = 1; i < team->t.t_nproc; ++i) {
1825       KA_TRACE(500,
1826                ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
1827                 "== %u.\n",
1828                 gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
1829                 team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
1830                 other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
1831       KMP_DEBUG_ASSERT(
1832           (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
1833            ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
1834       KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
1835     }
1836 #endif
1837 
1838     if (__kmp_tasking_mode != tskm_immediate_exec) {
1839       // 0 indicates setup current task team if nthreads > 1
1840       __kmp_task_team_setup(this_thr, team, 0);
1841     }
1842 
1843     /* The master thread may have changed its blocktime between the join barrier
1844        and the fork barrier. Copy the blocktime info to the thread, where
1845        __kmp_wait_template() can access it when the team struct is not
1846        guaranteed to exist. */
1847     // See note about the corresponding code in __kmp_join_barrier() being
1848     // performance-critical
1849     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1850 #if KMP_USE_MONITOR
1851       this_thr->th.th_team_bt_intervals =
1852           team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1853       this_thr->th.th_team_bt_set =
1854           team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1855 #else
1856       this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1857 #endif
1858     }
1859   } // master
1860 
1861   switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
1862   case bp_hyper_bar: {
1863     KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1864     __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1865                                 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1866     break;
1867   }
1868   case bp_hierarchical_bar: {
1869     __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1870                                        TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1871     break;
1872   }
1873   case bp_tree_bar: {
1874     KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1875     __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1876                                TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1877     break;
1878   }
1879   default: {
1880     __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1881                                  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1882   }
1883   }
1884 
1885 #if OMPT_SUPPORT
1886   if (ompt_enabled.enabled &&
1887       this_thr->th.ompt_thread_info.state == omp_state_wait_barrier_implicit) {
1888     int ds_tid = this_thr->th.th_info.ds.ds_tid;
1889     ompt_data_t *task_data = (team)
1890                                  ? OMPT_CUR_TASK_DATA(this_thr)
1891                                  : &(this_thr->th.ompt_thread_info.task_data);
1892     this_thr->th.ompt_thread_info.state = omp_state_overhead;
1893 #if OMPT_OPTIONAL
1894     void *codeptr = NULL;
1895     if (KMP_MASTER_TID(ds_tid) &&
1896         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
1897          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
1898       codeptr = team->t.ompt_team_info.master_return_address;
1899     if (ompt_enabled.ompt_callback_sync_region_wait) {
1900       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1901           ompt_sync_region_barrier, ompt_scope_end, NULL, task_data, codeptr);
1902     }
1903     if (ompt_enabled.ompt_callback_sync_region) {
1904       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1905           ompt_sync_region_barrier, ompt_scope_end, NULL, task_data, codeptr);
1906     }
1907 #endif
1908     if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
1909       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1910           ompt_scope_end, NULL, task_data, 0, ds_tid);
1911     }
1912   }
1913 #endif
1914 
1915   // Early exit for reaping threads releasing forkjoin barrier
1916   if (TCR_4(__kmp_global.g.g_done)) {
1917     this_thr->th.th_task_team = NULL;
1918 
1919 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1920     if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1921       if (!KMP_MASTER_TID(tid)) {
1922         itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1923         if (itt_sync_obj)
1924           __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1925       }
1926     }
1927 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1928     KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
1929     return;
1930   }
1931 
1932   /* We can now assume that a valid team structure has been allocated by the
1933      master and propagated to all worker threads. The current thread, however,
1934      may not be part of the team, so we can't blindly assume that the team
1935      pointer is non-null.  */
1936   team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
1937   KMP_DEBUG_ASSERT(team != NULL);
1938   tid = __kmp_tid_from_gtid(gtid);
1939 
1940 #if KMP_BARRIER_ICV_PULL
1941   /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1942      __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
1943      implicit task has this data before this function is called. We cannot
1944      modify __kmp_fork_call() to look at the fixed ICVs in the master's thread
1945      struct, because it is not always the case that the threads arrays have
1946      been allocated when __kmp_fork_call() is executed. */
1947   {
1948     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
1949     if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
1950       // Copy the initial ICVs from the master's thread struct to the implicit
1951       // task for this tid.
1952       KA_TRACE(10,
1953                ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
1954       __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
1955                                tid, FALSE);
1956       copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1957                 &team->t.t_threads[0]
1958                      ->th.th_bar[bs_forkjoin_barrier]
1959                      .bb.th_fixed_icvs);
1960     }
1961   }
1962 #endif // KMP_BARRIER_ICV_PULL
1963 
1964   if (__kmp_tasking_mode != tskm_immediate_exec) {
1965     __kmp_task_team_sync(this_thr, team);
1966   }
1967 
1968 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1969   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
1970   if (proc_bind == proc_bind_intel) {
1971 #endif
1972 #if KMP_AFFINITY_SUPPORTED
1973     // Call dynamic affinity settings
1974     if (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
1975       __kmp_balanced_affinity(tid, team->t.t_nproc);
1976     }
1977 #endif // KMP_AFFINITY_SUPPORTED
1978 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1979   } else if (proc_bind != proc_bind_false) {
1980     if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
1981       KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
1982                      __kmp_gtid_from_thread(this_thr),
1983                      this_thr->th.th_current_place));
1984     } else {
1985       __kmp_affinity_set_place(gtid);
1986     }
1987   }
1988 #endif
1989 
1990 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1991   if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1992     if (!KMP_MASTER_TID(tid)) {
1993       // Get correct barrier object
1994       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1995       __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
1996     } // (prepare called inside barrier_release)
1997   }
1998 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1999   ANNOTATE_BARRIER_END(&team->t.t_bar);
2000   KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
2001                 team->t.t_id, tid));
2002 }
2003 
2004 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
2005                           kmp_internal_control_t *new_icvs, ident_t *loc) {
2006   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);
2007 
2008   KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
2009   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
2010 
2011 /* Master thread's copy of the ICVs was set up on the implicit taskdata in
2012    __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
2013    implicit task has this data before this function is called. */
2014 #if KMP_BARRIER_ICV_PULL
2015   /* Copy ICVs to master's thread structure into th_fixed_icvs (which remains
2016      untouched), where all of the worker threads can access them and make their
2017      own copies after the barrier. */
2018   KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2019   // allocated at this point
2020   copy_icvs(
2021       &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
2022       new_icvs);
2023   KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
2024                 team->t.t_threads[0], team));
2025 #elif KMP_BARRIER_ICV_PUSH
2026   // The ICVs will be propagated in the fork barrier, so nothing needs to be
2027   // done here.
2028   KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
2029                 team->t.t_threads[0], team));
2030 #else
2031   // Copy the ICVs to each of the non-master threads.  This takes O(nthreads)
2032   // time.
2033   ngo_load(new_icvs);
2034   KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2035   // allocated at this point
2036   for (int f = 1; f < new_nproc; ++f) { // Skip the master thread
2037     // TODO: GEH - pass in better source location info since usually NULL here
2038     KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2039                   f, team->t.t_threads[f], team));
2040     __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
2041     ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
2042     KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2043                   f, team->t.t_threads[f], team));
2044   }
2045   ngo_sync();
2046 #endif // KMP_BARRIER_ICV_PULL
2047 }
2048