1 /*
2  * kmp_barrier.cpp
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_wait_release.h"
15 #include "kmp_itt.h"
16 #include "kmp_os.h"
17 #include "kmp_stats.h"
18 #include "ompt-specific.h"
19 
20 #if KMP_MIC
21 #include <immintrin.h>
22 #define USE_NGO_STORES 1
23 #endif // KMP_MIC
24 
25 #include "tsan_annotations.h"
26 
27 #if KMP_MIC && USE_NGO_STORES
28 // ICV copying
29 #define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
30 #define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
31 #define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
32 #define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
33 #else
34 #define ngo_load(src) ((void)0)
35 #define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
36 #define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
37 #define ngo_sync() ((void)0)
38 #endif /* KMP_MIC && USE_NGO_STORES */
39 
40 void __kmp_print_structure(void); // Forward declaration
41 
42 // ---------------------------- Barrier Algorithms ----------------------------
43 
44 // Linear Barrier
// Gather (arrive) phase of the flat/linear barrier.
//
// Each worker thread signals its own b_arrived flag to the primary thread
// and is then done; the primary thread (tid 0) waits for every worker's
// b_arrived flag in turn, folding in the optional pairwise reduction
// callback as each worker arrives, and finally publishes the new arrived
// epoch on the team barrier state.
//
// bt       - which barrier instance (plain / forkjoin / reduction) to use
// this_thr - the calling thread's descriptor
// gtid/tid - the caller's global and team-local thread ids
// reduce   - optional callback combining two threads' reduce_data, or NULL
//
// Returns true only in the cancellable=true instantiation, when the wait
// was interrupted by cancellation; always returns false otherwise.
template <bool cancellable = false>
static bool __kmp_linear_barrier_gather_template(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads = team->t.t_threads;

  KA_TRACE(
      20,
      ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  }
#endif
  // We now perform a linear reduction to signal that all of the threads have
  // arrived.
  if (!KMP_MASTER_TID(tid)) {
    KA_TRACE(20,
             ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
              "arrived(%p): %llu => %llu\n",
              gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
              team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
              thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
    // Mark arrival to primary thread
    /* After performing this write, a worker thread may not assume that the team
       is valid any more - it could be deallocated by the primary thread at any
       time. */
    ANNOTATE_BARRIER_BEGIN(this_thr);
    kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[0]);
    flag.release();
  } else {
    // Primary thread: wait for all workers, then bump the team arrived epoch.
    kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
    int nproc = this_thr->th.th_team_nproc;
    int i;
    // Don't have to worry about sleep bit here or atomic since team setting
    kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;

    // Collect all the worker team member threads.
    for (i = 1; i < nproc; ++i) {
#if KMP_CACHE_MANAGE
      // Prefetch next thread's arrived count
      if (i + 1 < nproc)
        KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
      KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                    "arrived(%p) == %llu\n",
                    gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
                    team->t.t_id, i,
                    &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));

      // Wait for worker thread to arrive
      if (cancellable) {
        // Cancellable flag wait: propagate a cancelled wait to the caller.
        kmp_flag_64<true, false> flag(
            &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
        if (flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)))
          return true;
      } else {
        kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
                           new_state);
        flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
      }
      ANNOTATE_BARRIER_END(other_threads[i]);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier imbalance - write min of the thread time and the other thread
      // time to the thread.
      if (__kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_min_time = KMP_MIN(
            this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
      }
#endif
      if (reduce) {
        KA_TRACE(100,
                 ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
                  team->t.t_id, i));
        ANNOTATE_REDUCE_AFTER(reduce);
        OMPT_REDUCTION_DECL(this_thr, gtid);
        OMPT_REDUCTION_BEGIN;
        (*reduce)(this_thr->th.th_local.reduce_data,
                  other_threads[i]->th.th_local.reduce_data);
        OMPT_REDUCTION_END;
        ANNOTATE_REDUCE_BEFORE(reduce);
        ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
      }
    }
    // Don't have to worry about sleep bit here or atomic since team setting
    team_bar->b_arrived = new_state;
    KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
                  new_state));
  }
  KA_TRACE(
      20,
      ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  return false;
}
151 
// Release (depart) phase of the flat/linear barrier.
//
// The primary thread (tid 0) optionally pushes ICVs to each worker's
// implicit task, then sets each worker's b_go flag to release it; each
// worker spins on its own b_go flag until released, then resets the flag
// to KMP_INIT_BARRIER_STATE for the next barrier.
//
// bt             - which barrier instance (plain / forkjoin / reduction)
// this_thr       - the calling thread's descriptor
// gtid/tid       - the caller's global and team-local thread ids
// propagate_icvs - nonzero if the primary thread should copy ICVs to workers
//
// Returns true only in the cancellable=true instantiation, when the wait
// was interrupted by cancellation; always returns false otherwise.
template <bool cancellable = false>
static bool __kmp_linear_barrier_release_template(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_team_t *team;

  if (KMP_MASTER_TID(tid)) {
    unsigned int i;
    kmp_uint32 nproc = this_thr->th.th_team_nproc;
    kmp_info_t **other_threads;

    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    other_threads = team->t.t_threads;

    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) primary enter for "
                  "barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));

    if (nproc > 1) {
#if KMP_BARRIER_ICV_PUSH
      {
        KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
        if (propagate_icvs) {
          // Copy the primary's ICVs into each worker's implicit task before
          // the workers are released (NGO stores on MIC, plain copy otherwise).
          ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
          for (i = 1; i < nproc; ++i) {
            __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
                                     team, i, FALSE);
            ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
                           &team->t.t_implicit_task_taskdata[0].td_icvs);
          }
          ngo_sync();
        }
      }
#endif // KMP_BARRIER_ICV_PUSH

      // Now, release all of the worker threads
      for (i = 1; i < nproc; ++i) {
#if KMP_CACHE_MANAGE
        // Prefetch next thread's go flag
        if (i + 1 < nproc)
          KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */
        KA_TRACE(
            20,
            ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
             "go(%p): %u => %u\n",
             gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
             team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
             other_threads[i]->th.th_bar[bt].bb.b_go,
             other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
        ANNOTATE_BARRIER_BEGIN(other_threads[i]);
        kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
                           other_threads[i]);
        flag.release();
      }
    }
  } else { // Wait for the PRIMARY thread to release us
    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
                  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
    if (cancellable) {
      kmp_flag_64<true, false> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
      if (flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)))
        return true;
    } else {
      kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
      flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    }
    ANNOTATE_BARRIER_END(this_thr);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
      // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
      // disabled)
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
      // Cancel wait on previous parallel region...
      __kmp_itt_task_starting(itt_sync_obj);

      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return false;

      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      if (itt_sync_obj != NULL)
        // Call prepare as early as possible for "new" barrier
        __kmp_itt_task_finished(itt_sync_obj);
    } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        // Early exit for reaping threads releasing forkjoin barrier
        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
      return false;
// The worker thread may now assume that the team is valid.
#ifdef KMP_DEBUG
    tid = __kmp_tid_from_gtid(gtid);
    team = __kmp_threads[gtid]->th.th_team;
#endif
    KMP_DEBUG_ASSERT(team != NULL);
    // Reset our go flag so the next release can bump it again.
    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
    KA_TRACE(20,
             ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
              gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
  }
  KA_TRACE(
      20,
      ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  return false;
}
261 
262 static void __kmp_linear_barrier_gather(
263     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
264     void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
265   __kmp_linear_barrier_gather_template<false>(
266       bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
267 }
268 
269 static bool __kmp_linear_barrier_gather_cancellable(
270     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
271     void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
272   return __kmp_linear_barrier_gather_template<true>(
273       bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
274 }
275 
276 static void __kmp_linear_barrier_release(
277     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
278     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
279   __kmp_linear_barrier_release_template<false>(
280       bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
281 }
282 
283 static bool __kmp_linear_barrier_release_cancellable(
284     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
285     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
286   return __kmp_linear_barrier_release_template<true>(
287       bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
288 }
289 
290 // Tree barrier
// Gather (arrive) phase of the tree barrier.
//
// Threads form a tree of branching factor 2^branch_bits: thread `tid` has
// children (tid << branch_bits) + 1 .. + branch_factor. Each parent waits
// for all of its children's b_arrived flags (folding in the optional
// reduction), then signals its own parent; the primary thread (tid 0)
// finally publishes the new arrived epoch on the team barrier state.
//
// bt       - which barrier instance (plain / forkjoin / reduction)
// this_thr - the calling thread's descriptor
// gtid/tid - the caller's global and team-local thread ids
// reduce   - optional callback combining two threads' reduce_data, or NULL
static void __kmp_tree_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads = team->t.t_threads;
  kmp_uint32 nproc = this_thr->th.th_team_nproc;
  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 child;
  kmp_uint32 child_tid;
  kmp_uint64 new_state = 0;

  KA_TRACE(
      20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
           gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  }
#endif
  // Perform tree gather to wait until all threads have arrived; reduce any
  // required data as we go
  child_tid = (tid << branch_bits) + 1; // first child of this thread, if any
  if (child_tid < nproc) {
    // Parent threads wait for all their children to arrive
    new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
    child = 1;
    do {
      kmp_info_t *child_thr = other_threads[child_tid];
      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
      // Prefetch next thread's arrived count
      if (child + 1 <= branch_factor && child_tid + 1 < nproc)
        KMP_CACHE_PREFETCH(
            &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
      KA_TRACE(20,
               ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                "arrived(%p) == %llu\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
      // Wait for child to arrive
      kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
      flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
      ANNOTATE_BARRIER_END(child_thr);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier imbalance - write min of the thread time and a child time to
      // the thread.
      if (__kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                               child_thr->th.th_bar_min_time);
      }
#endif
      if (reduce) {
        KA_TRACE(100,
                 ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                  team->t.t_id, child_tid));
        ANNOTATE_REDUCE_AFTER(reduce);
        OMPT_REDUCTION_DECL(this_thr, gtid);
        OMPT_REDUCTION_BEGIN;
        (*reduce)(this_thr->th.th_local.reduce_data,
                  child_thr->th.th_local.reduce_data);
        OMPT_REDUCTION_END;
        ANNOTATE_REDUCE_BEFORE(reduce);
        ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
      }
      child++;
      child_tid++;
    } while (child <= branch_factor && child_tid < nproc);
  }

  if (!KMP_MASTER_TID(tid)) { // Worker threads
    kmp_int32 parent_tid = (tid - 1) >> branch_bits;

    KA_TRACE(20,
             ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
              "arrived(%p): %llu => %llu\n",
              gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
              team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
              thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));

    // Mark arrival to parent thread
    /* After performing this write, a worker thread may not assume that the team
       is valid any more - it could be deallocated by the primary thread at any
       time.  */
    ANNOTATE_BARRIER_BEGIN(this_thr);
    kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[parent_tid]);
    flag.release();
  } else {
    // Need to update the team arrived pointer if we are the primary thread
    if (nproc > 1) // New value was already computed above
      team->t.t_bar[bt].b_arrived = new_state;
    else
      team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
    KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id,
                  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
  }
  KA_TRACE(20,
           ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
            gtid, team->t.t_id, tid, bt));
}
401 
// Release (depart) phase of the tree barrier.
//
// Mirrors the tree used by the gather: each worker first waits on its own
// b_go flag, set by its parent; then every thread (primary included)
// releases its children in turn, optionally copying ICVs into each child's
// implicit task just before setting the child's b_go flag.
//
// bt             - which barrier instance (plain / forkjoin / reduction)
// this_thr       - the calling thread's descriptor
// gtid/tid       - the caller's global and team-local thread ids
// propagate_icvs - nonzero if ICVs should be pushed down the tree
static void __kmp_tree_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
  kmp_team_t *team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_uint32 nproc;
  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 child;
  kmp_uint32 child_tid;

  // Perform a tree release for all of the threads that have been gathered
  if (!KMP_MASTER_TID(
          tid)) { // Handle fork barrier workers who aren't part of a team yet
    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
                  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
    // Wait for parent thread to release us
    kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
    flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    ANNOTATE_BARRIER_END(this_thr);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
      // In fork barrier where we could not get the object reliably (or
      // ITTNOTIFY is disabled)
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
      // Cancel wait on previous parallel region...
      __kmp_itt_task_starting(itt_sync_obj);

      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return;

      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      if (itt_sync_obj != NULL)
        // Call prepare as early as possible for "new" barrier
        __kmp_itt_task_finished(itt_sync_obj);
    } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        // Early exit for reaping threads releasing forkjoin barrier
        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
      return;

    // The worker thread may now assume that the team is valid.
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    tid = __kmp_tid_from_gtid(gtid);

    // Reset our go flag so the next release can bump it again.
    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
    KA_TRACE(20,
             ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
              team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
  } else {
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) primary enter for "
                  "barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
  }
  nproc = this_thr->th.th_team_nproc;
  child_tid = (tid << branch_bits) + 1; // first child of this thread, if any

  if (child_tid < nproc) {
    kmp_info_t **other_threads = team->t.t_threads;
    child = 1;
    // Parent threads release all their children
    do {
      kmp_info_t *child_thr = other_threads[child_tid];
      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
      // Prefetch next thread's go count
      if (child + 1 <= branch_factor && child_tid + 1 < nproc)
        KMP_CACHE_PREFETCH(
            &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */

#if KMP_BARRIER_ICV_PUSH
      {
        KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
        if (propagate_icvs) {
          // Copy ICVs into the child's implicit task before releasing it.
          __kmp_init_implicit_task(team->t.t_ident,
                                   team->t.t_threads[child_tid], team,
                                   child_tid, FALSE);
          copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
                    &team->t.t_implicit_task_taskdata[0].td_icvs);
        }
      }
#endif // KMP_BARRIER_ICV_PUSH
      KA_TRACE(20,
               ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
                "go(%p): %u => %u\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                child_bar->b_go + KMP_BARRIER_STATE_BUMP));
      // Release child from barrier
      ANNOTATE_BARRIER_BEGIN(child_thr);
      kmp_flag_64<> flag(&child_bar->b_go, child_thr);
      flag.release();
      child++;
      child_tid++;
    } while (child <= branch_factor && child_tid < nproc);
  }
  KA_TRACE(
      20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
           gtid, team->t.t_id, tid, bt));
}
508 
509 // Hyper Barrier
// Gather (arrive) phase of the hypercube-embedded tree barrier.
//
// Threads pair up level by level: at each level a thread whose tid has a
// nonzero digit (base branch_factor) at that level signals its parent
// (the tid with those low bits cleared) and is done; otherwise it waits for
// its children at that level (tids offset by multiples of 1 << level),
// folding in the optional reduction. The primary thread (tid 0) finally
// publishes the new arrived epoch on the team barrier state.
//
// bt       - which barrier instance (plain / forkjoin / reduction)
// this_thr - the calling thread's descriptor
// gtid/tid - the caller's global and team-local thread ids
// reduce   - optional callback combining two threads' reduce_data, or NULL
static void __kmp_hyper_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads = team->t.t_threads;
  kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
  kmp_uint32 num_threads = this_thr->th.th_team_nproc;
  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 offset;
  kmp_uint32 level;

  KA_TRACE(
      20,
      ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  }
#endif
  /* Perform a hypercube-embedded tree gather to wait until all of the threads
     have arrived, and reduce any required data as we go.  */
  kmp_flag_64<> p_flag(&thr_bar->b_arrived);
  for (level = 0, offset = 1; offset < num_threads;
       level += branch_bits, offset <<= branch_bits) {
    kmp_uint32 child;
    kmp_uint32 child_tid;

    if (((tid >> level) & (branch_factor - 1)) != 0) {
      // This thread is a child at this level: signal its parent and stop.
      kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);

      KMP_MB(); // Synchronize parent and child threads.
      KA_TRACE(20,
               ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                "arrived(%p): %llu => %llu\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
                team->t.t_id, parent_tid, &thr_bar->b_arrived,
                thr_bar->b_arrived,
                thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
      // Mark arrival to parent thread
      /* After performing this write (in the last iteration of the enclosing for
         loop), a worker thread may not assume that the team is valid any more
         - it could be deallocated by the primary thread at any time.  */
      ANNOTATE_BARRIER_BEGIN(this_thr);
      p_flag.set_waiter(other_threads[parent_tid]);
      p_flag.release();
      break;
    }

    // Parent threads wait for children to arrive
    if (new_state == KMP_BARRIER_UNUSED_STATE)
      new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
    for (child = 1, child_tid = tid + (1 << level);
         child < branch_factor && child_tid < num_threads;
         child++, child_tid += (1 << level)) {
      kmp_info_t *child_thr = other_threads[child_tid];
      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
      kmp_uint32 next_child_tid = child_tid + (1 << level);
      // Prefetch next thread's arrived count
      if (child + 1 < branch_factor && next_child_tid < num_threads)
        KMP_CACHE_PREFETCH(
            &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
      KA_TRACE(20,
               ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                "arrived(%p) == %llu\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
      // Wait for child to arrive
      kmp_flag_64<> c_flag(&child_bar->b_arrived, new_state);
      c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
      ANNOTATE_BARRIER_END(child_thr);
      KMP_MB(); // Synchronize parent and child threads.
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier imbalance - write min of the thread time and a child time to
      // the thread.
      if (__kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                               child_thr->th.th_bar_min_time);
      }
#endif
      if (reduce) {
        KA_TRACE(100,
                 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                  team->t.t_id, child_tid));
        ANNOTATE_REDUCE_AFTER(reduce);
        OMPT_REDUCTION_DECL(this_thr, gtid);
        OMPT_REDUCTION_BEGIN;
        (*reduce)(this_thr->th.th_local.reduce_data,
                  child_thr->th.th_local.reduce_data);
        OMPT_REDUCTION_END;
        ANNOTATE_REDUCE_BEFORE(reduce);
        ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
      }
    }
  }

  if (KMP_MASTER_TID(tid)) {
    // Need to update the team arrived pointer if we are the primary thread
    if (new_state == KMP_BARRIER_UNUSED_STATE)
      team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
    else
      team->t.t_bar[bt].b_arrived = new_state;
    KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id,
                  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
  }
  KA_TRACE(
      20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
           gtid, team->t.t_id, tid, bt));
}
631 
632 // The reverse versions seem to beat the forward versions overall
633 #define KMP_REVERSE_HYPER_BAR
634 static void __kmp_hyper_barrier_release(
635     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
636     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
637   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
638   kmp_team_t *team;
639   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
640   kmp_info_t **other_threads;
641   kmp_uint32 num_threads;
642   kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
643   kmp_uint32 branch_factor = 1 << branch_bits;
644   kmp_uint32 child;
645   kmp_uint32 child_tid;
646   kmp_uint32 offset;
647   kmp_uint32 level;
648 
649   /* Perform a hypercube-embedded tree release for all of the threads that have
650      been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
651      are released in the reverse order of the corresponding gather, otherwise
652      threads are released in the same order. */
653   if (KMP_MASTER_TID(tid)) { // primary thread
654     team = __kmp_threads[gtid]->th.th_team;
655     KMP_DEBUG_ASSERT(team != NULL);
656     KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) primary enter for "
657                   "barrier type %d\n",
658                   gtid, team->t.t_id, tid, bt));
659 #if KMP_BARRIER_ICV_PUSH
660     if (propagate_icvs) { // primary already has ICVs in final destination; copy
661       copy_icvs(&thr_bar->th_fixed_icvs,
662                 &team->t.t_implicit_task_taskdata[tid].td_icvs);
663     }
664 #endif
665   } else { // Handle fork barrier workers who aren't part of a team yet
666     KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
667                   &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
668     // Wait for parent thread to release us
669     kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
670     flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
671     ANNOTATE_BARRIER_END(this_thr);
672 #if USE_ITT_BUILD && USE_ITT_NOTIFY
673     if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
674       // In fork barrier where we could not get the object reliably
675       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
676       // Cancel wait on previous parallel region...
677       __kmp_itt_task_starting(itt_sync_obj);
678 
679       if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
680         return;
681 
682       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
683       if (itt_sync_obj != NULL)
684         // Call prepare as early as possible for "new" barrier
685         __kmp_itt_task_finished(itt_sync_obj);
686     } else
687 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
688         // Early exit for reaping threads releasing forkjoin barrier
689         if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
690       return;
691 
692     // The worker thread may now assume that the team is valid.
693     team = __kmp_threads[gtid]->th.th_team;
694     KMP_DEBUG_ASSERT(team != NULL);
695     tid = __kmp_tid_from_gtid(gtid);
696 
697     TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
698     KA_TRACE(20,
699              ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
700               gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
701     KMP_MB(); // Flush all pending memory write invalidates.
702   }
703   num_threads = this_thr->th.th_team_nproc;
704   other_threads = team->t.t_threads;
705 
706 #ifdef KMP_REVERSE_HYPER_BAR
707   // Count up to correct level for parent
708   for (level = 0, offset = 1;
709        offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
710        level += branch_bits, offset <<= branch_bits)
711     ;
712 
713   // Now go down from there
714   for (level -= branch_bits, offset >>= branch_bits; offset != 0;
715        level -= branch_bits, offset >>= branch_bits)
716 #else
717   // Go down the tree, level by level
718   for (level = 0, offset = 1; offset < num_threads;
719        level += branch_bits, offset <<= branch_bits)
720 #endif // KMP_REVERSE_HYPER_BAR
721   {
722 #ifdef KMP_REVERSE_HYPER_BAR
723     /* Now go in reverse order through the children, highest to lowest.
724        Initial setting of child is conservative here. */
725     child = num_threads >> ((level == 0) ? level : level - 1);
726     for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
727         child_tid = tid + (child << level);
728          child >= 1; child--, child_tid -= (1 << level))
729 #else
730     if (((tid >> level) & (branch_factor - 1)) != 0)
731       // No need to go lower than this, since this is the level parent would be
732       // notified
733       break;
734     // Iterate through children on this level of the tree
735     for (child = 1, child_tid = tid + (1 << level);
736          child < branch_factor && child_tid < num_threads;
737          child++, child_tid += (1 << level))
738 #endif // KMP_REVERSE_HYPER_BAR
739     {
740       if (child_tid >= num_threads)
741         continue; // Child doesn't exist so keep going
742       else {
743         kmp_info_t *child_thr = other_threads[child_tid];
744         kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
745 #if KMP_CACHE_MANAGE
746         kmp_uint32 next_child_tid = child_tid - (1 << level);
747 // Prefetch next thread's go count
748 #ifdef KMP_REVERSE_HYPER_BAR
749         if (child - 1 >= 1 && next_child_tid < num_threads)
750 #else
751         if (child + 1 < branch_factor && next_child_tid < num_threads)
752 #endif // KMP_REVERSE_HYPER_BAR
753           KMP_CACHE_PREFETCH(
754               &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
755 #endif /* KMP_CACHE_MANAGE */
756 
757 #if KMP_BARRIER_ICV_PUSH
758         if (propagate_icvs) // push my fixed ICVs to my child
759           copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
760 #endif // KMP_BARRIER_ICV_PUSH
761 
762         KA_TRACE(
763             20,
764             ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
765              "go(%p): %u => %u\n",
766              gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
767              team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
768              child_bar->b_go + KMP_BARRIER_STATE_BUMP));
769         // Release child from barrier
770         ANNOTATE_BARRIER_BEGIN(child_thr);
771         kmp_flag_64<> flag(&child_bar->b_go, child_thr);
772         flag.release();
773       }
774     }
775   }
776 #if KMP_BARRIER_ICV_PUSH
777   if (propagate_icvs &&
778       !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
779     __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
780                              FALSE);
781     copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
782               &thr_bar->th_fixed_icvs);
783   }
784 #endif
785   KA_TRACE(
786       20,
787       ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
788        gtid, team->t.t_id, tid, bt));
789 }
790 
791 // Hierarchical Barrier
792 
793 // Initialize thread barrier data
794 /* Initializes/re-initializes the hierarchical barrier data stored on a thread.
795    Performs the minimum amount of initialization required based on how the team
796    has changed. Returns true if leaf children will require both on-core and
797    traditional wake-up mechanisms. For example, if the team size increases,
798    threads already in the team will respond to on-core wakeup on their parent
   thread, but threads newly added to the team will only be listening on
   their local b_go. */
static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
                                                   kmp_bstate_t *thr_bar,
                                                   kmp_uint32 nproc, int gtid,
                                                   int tid, kmp_team_t *team) {
  // Checks to determine if (re-)initialization is needed
  bool uninitialized = thr_bar->team == NULL;
  bool team_changed = team != thr_bar->team;
  bool team_sz_changed = nproc != thr_bar->nproc;
  bool tid_changed = tid != thr_bar->old_tid;
  bool retval = false;

  if (uninitialized || team_sz_changed) {
    __kmp_get_hierarchy(nproc, thr_bar); // (re-)derive depth/branching info
  }

  if (uninitialized || team_sz_changed || tid_changed) {
    thr_bar->my_level = thr_bar->depth - 1; // default for primary thread
    thr_bar->parent_tid = -1; // default for primary thread
    if (!KMP_MASTER_TID(tid)) {
      // if not primary thread, find parent thread in hierarchy
      kmp_uint32 d = 0;
      while (d < thr_bar->depth) { // find parent based on level of thread in
        // hierarchy, and note level
        kmp_uint32 rem;
        if (d == thr_bar->depth - 2) { // reached level right below the primary
          thr_bar->parent_tid = 0;
          thr_bar->my_level = d;
          break;
        } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) != 0) {
          // TODO: can we make the above op faster?
          // thread is not a subtree root at next level, so this is max
          thr_bar->parent_tid = tid - rem;
          thr_bar->my_level = d;
          break;
        }
        ++d;
      }
    }
    // offset = 7 - (rank among siblings); this selects which byte of the
    // parent's 64-bit flag this thread signals in the oncore barrier (the
    // release path indexes that flag with offset + 1, which mirrors the
    // [7 - i] leaf_state byte layout set up below).
    __kmp_type_convert(7 - ((tid - thr_bar->parent_tid) /
                            (thr_bar->skip_per_level[thr_bar->my_level])),
                       &(thr_bar->offset));
    thr_bar->old_tid = tid;
    thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
    thr_bar->team = team;
    thr_bar->parent_bar =
        &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
  }
  if (uninitialized || team_changed || tid_changed) {
    thr_bar->team = team;
    thr_bar->parent_bar =
        &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
    retval = true; // caller must use both wake-up mechanisms for leaf kids
  }
  if (uninitialized || team_sz_changed || tid_changed) {
    thr_bar->nproc = nproc;
    thr_bar->leaf_kids = thr_bar->base_leaf_kids;
    if (thr_bar->my_level == 0)
      thr_bar->leaf_kids = 0;
    // Clamp leaf_kids when the team is smaller than a full leaf group
    if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
      __kmp_type_convert(nproc - tid - 1, &(thr_bar->leaf_kids));
    thr_bar->leaf_state = 0;
    // One byte per leaf child, filled from the high end of the 64-bit flag
    for (int i = 0; i < thr_bar->leaf_kids; ++i)
      ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
  }
  return retval;
}
867 
// Gather phase of the hierarchical barrier: wait for every thread in my
// subtree to arrive (combining reduction data via *reduce if non-NULL), then
// signal my own arrival to my parent.  The primary thread instead publishes
// the team's new b_arrived value once all subordinates have checked in.
static void __kmp_hierarchical_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_uint32 nproc = this_thr->th.th_team_nproc;
  kmp_info_t **other_threads = team->t.t_threads;
  kmp_uint64 new_state = 0;

  int level = team->t.t_level;
  if (other_threads[0]
          ->th.th_teams_microtask) // are we inside the teams construct?
    if (this_thr->th.th_teams_size.nteams > 1)
      ++level; // level was not increased in teams construct for team_of_masters
  if (level == 1)
    thr_bar->use_oncore_barrier = 1;
  else
    thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested

  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
                "barrier type %d\n",
                gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
  }
#endif

  (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
                                               team);

  if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
    kmp_int32 child_tid;
    // Value each child's b_arrived flag reaches when that child arrives
    new_state =
        (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
    if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
        thr_bar->use_oncore_barrier) {
      if (thr_bar->leaf_kids) {
        // First, wait for leaf children to check-in on my b_arrived flag
        // leaf_state = my flag value with every leaf-child byte set
        kmp_uint64 leaf_state =
            KMP_MASTER_TID(tid)
                ? thr_bar->b_arrived | thr_bar->leaf_state
                : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
        KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
                      "for leaf kids\n",
                      gtid, team->t.t_id, tid));
        kmp_flag_64<> flag(&thr_bar->b_arrived, leaf_state);
        flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
        if (reduce) {
          ANNOTATE_REDUCE_AFTER(reduce);
          OMPT_REDUCTION_DECL(this_thr, gtid);
          OMPT_REDUCTION_BEGIN;
          // Leaf children are the contiguous tids right after mine
          for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
               ++child_tid) {
            KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
                           "T#%d(%d:%d)\n",
                           gtid, team->t.t_id, tid,
                           __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                           child_tid));
            ANNOTATE_BARRIER_END(other_threads[child_tid]);
            (*reduce)(this_thr->th.th_local.reduce_data,
                      other_threads[child_tid]->th.th_local.reduce_data);
          }
          OMPT_REDUCTION_END;
          ANNOTATE_REDUCE_BEFORE(reduce);
          ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
        }
        // clear leaf_state bits
        KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
      }
      // Next, wait for higher level children on each child's b_arrived flag
      for (kmp_uint32 d = 1; d < thr_bar->my_level;
           ++d) { // gather lowest level threads first, but skip 0
        kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
                   skip = thr_bar->skip_per_level[d];
        if (last > nproc)
          last = nproc;
        for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
          kmp_info_t *child_thr = other_threads[child_tid];
          kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
          KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
                        "T#%d(%d:%d) "
                        "arrived(%p) == %llu\n",
                        gtid, team->t.t_id, tid,
                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                        child_tid, &child_bar->b_arrived, new_state));
          kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
          flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          ANNOTATE_BARRIER_END(child_thr);
          if (reduce) {
            KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
                           "T#%d(%d:%d)\n",
                           gtid, team->t.t_id, tid,
                           __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                           child_tid));
            ANNOTATE_REDUCE_AFTER(reduce);
            (*reduce)(this_thr->th.th_local.reduce_data,
                      child_thr->th.th_local.reduce_data);
            ANNOTATE_REDUCE_BEFORE(reduce);
            ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
          }
        }
      }
    } else { // Blocktime is not infinite
      for (kmp_uint32 d = 0; d < thr_bar->my_level;
           ++d) { // Gather lowest level threads first
        kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
                   skip = thr_bar->skip_per_level[d];
        if (last > nproc)
          last = nproc;
        for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
          kmp_info_t *child_thr = other_threads[child_tid];
          kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
          KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
                        "T#%d(%d:%d) "
                        "arrived(%p) == %llu\n",
                        gtid, team->t.t_id, tid,
                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                        child_tid, &child_bar->b_arrived, new_state));
          kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
          flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          ANNOTATE_BARRIER_END(child_thr);
          if (reduce) {
            KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
                           "T#%d(%d:%d)\n",
                           gtid, team->t.t_id, tid,
                           __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                           child_tid));
            ANNOTATE_REDUCE_AFTER(reduce);
            (*reduce)(this_thr->th.th_local.reduce_data,
                      child_thr->th.th_local.reduce_data);
            ANNOTATE_REDUCE_BEFORE(reduce);
            ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
          }
        }
      }
    }
  }
  // All subordinates are gathered; now release parent if not primary thread

  if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
                  " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
                  gtid, team->t.t_id, tid,
                  __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
                  thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
                  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
    /* Mark arrival to parent: After performing this write, a worker thread may
       not assume that the team is valid any more - it could be deallocated by
       the primary thread at any time. */
    if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
        !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
      // flag; release it
      ANNOTATE_BARRIER_BEGIN(this_thr);
      kmp_flag_64<> flag(&thr_bar->b_arrived,
                         other_threads[thr_bar->parent_tid]);
      flag.release();
    } else {
      // Leaf does special release on "offset" bits of parent's b_arrived flag
      thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
      kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived,
                           thr_bar->offset + 1);
      flag.set_waiter(other_threads[thr_bar->parent_tid]);
      flag.release();
    }
  } else { // Primary thread needs to update the team's b_arrived value
    team->t.t_bar[bt].b_arrived = new_state;
    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id,
                  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
  }
  // NOTE(review): the team-> accesses in this trace may follow the
  // parent-release write above; on a worker that is technically a read of
  // possibly-freed memory (debug-only trace) — confirm before relying on it.
  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
                "barrier type %d\n",
                gtid, team->t.t_id, tid, bt));
}
1049 
// Release phase of the hierarchical barrier: workers wait to be released by
// their parent (either on their own b_go flag or on their byte of the
// parent's b_go for the oncore case), refresh hierarchy state for the
// (possibly changed) team, optionally push ICVs, then release their own
// children level by level.  With infinite blocktime + oncore, the primary
// does a flat NGO-store release of all non-leaf threads instead.
static void __kmp_hierarchical_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
  kmp_team_t *team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_uint32 nproc;
  bool team_change = false; // indicates on-core barrier shouldn't be used

  if (KMP_MASTER_TID(tid)) {
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) primary "
                  "entered barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
  } else { // Worker threads
    // Wait for parent thread to release me
    if (!thr_bar->use_oncore_barrier ||
        __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
        thr_bar->team == NULL) {
      // Use traditional method of waiting on my own b_go flag
      thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
      kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
      flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
      ANNOTATE_BARRIER_END(this_thr);
      TCW_8(thr_bar->b_go,
            KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
    } else { // Thread barrier data is initialized, this is a leaf, blocktime is
      // infinite, not nested
      // Wait on my "offset" bits on parent's b_go flag
      thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
      kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
                           thr_bar->offset + 1, bt,
                           this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
      flag.wait(this_thr, TRUE);
      if (thr_bar->wait_flag ==
          KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
        TCW_8(thr_bar->b_go,
              KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
      } else { // Reset my bits on parent's b_go flag
        (RCAST(volatile char *,
               &(thr_bar->parent_bar->b_go)))[thr_bar->offset + 1] = 0;
      }
    }
    thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
    // Early exit for reaping threads releasing forkjoin barrier
    if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
      return;
    // The worker thread may now assume that the team is valid.
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    tid = __kmp_tid_from_gtid(gtid);

    KA_TRACE(
        20,
        ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
         gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
  }

  nproc = this_thr->th.th_team_nproc;
  int level = team->t.t_level;
  if (team->t.t_threads[0]
          ->th.th_teams_microtask) { // are we inside the teams construct?
    if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
        this_thr->th.th_teams_level == level)
      ++level; // level was not increased in teams construct for team_of_workers
    if (this_thr->th.th_teams_size.nteams > 1)
      ++level; // level was not increased in teams construct for team_of_masters
  }
  if (level == 1)
    thr_bar->use_oncore_barrier = 1;
  else
    thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested

  // If the team size has increased, we still communicate with old leaves via
  // oncore barrier.
  unsigned short int old_leaf_kids = thr_bar->leaf_kids;
  kmp_uint64 old_leaf_state = thr_bar->leaf_state;
  team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
                                                       tid, team);
  // But if the entire team changes, we won't use oncore barrier at all
  if (team_change)
    old_leaf_kids = 0;

#if KMP_BARRIER_ICV_PUSH
  if (propagate_icvs) {
    __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
                             FALSE);
    if (KMP_MASTER_TID(
            tid)) { // primary already has copy in final destination; copy
      copy_icvs(&thr_bar->th_fixed_icvs,
                &team->t.t_implicit_task_taskdata[tid].td_icvs);
    } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
               thr_bar->use_oncore_barrier) { // optimization for inf blocktime
      if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
        // leaves (on-core children) pull parent's fixed ICVs directly to local
        // ICV store
        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                  &thr_bar->parent_bar->th_fixed_icvs);
      // non-leaves will get ICVs piggybacked with b_go via NGO store
    } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
      if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can
        // access
        copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
      else // leaves copy parent's fixed ICVs directly to local ICV store
        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                  &thr_bar->parent_bar->th_fixed_icvs);
    }
  }
#endif // KMP_BARRIER_ICV_PUSH

  // Now, release my children
  if (thr_bar->my_level) { // not a leaf
    kmp_int32 child_tid;
    kmp_uint32 last;
    if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
        thr_bar->use_oncore_barrier) {
      if (KMP_MASTER_TID(tid)) { // do a flat release
        // Set local b_go to bump children via NGO store of the cache line
        // containing IVCs and b_go.
        thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
        // Use ngo stores if available; b_go piggybacks in the last 8 bytes of
        // the cache line
        ngo_load(&thr_bar->th_fixed_icvs);
        // This loops over all the threads skipping only the leaf nodes in the
        // hierarchy
        for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
             child_tid += thr_bar->skip_per_level[1]) {
          kmp_bstate_t *child_bar =
              &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
          KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
                        "releasing T#%d(%d:%d)"
                        " go(%p): %u => %u\n",
                        gtid, team->t.t_id, tid,
                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                        child_tid, &child_bar->b_go, child_bar->b_go,
                        child_bar->b_go + KMP_BARRIER_STATE_BUMP));
          // Use ngo store (if available) to both store ICVs and release child
          // via child's b_go
          ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
        }
        ngo_sync();
      }
      TCW_8(thr_bar->b_go,
            KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
      // Now, release leaf children
      if (thr_bar->leaf_kids) { // if there are any
        // We test team_change on the off-chance that the level 1 team changed.
        if (team_change ||
            old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
          if (old_leaf_kids) { // release old leaf kids
            thr_bar->b_go |= old_leaf_state;
          }
          // Release new leaf kids
          last = tid + thr_bar->skip_per_level[1];
          if (last > nproc)
            last = nproc;
          for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
               ++child_tid) { // skip_per_level[0]=1
            kmp_info_t *child_thr = team->t.t_threads[child_tid];
            kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
            KA_TRACE(
                20,
                ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
                 " T#%d(%d:%d) go(%p): %u => %u\n",
                 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
            // Release child using child's b_go flag
            ANNOTATE_BARRIER_BEGIN(child_thr);
            kmp_flag_64<> flag(&child_bar->b_go, child_thr);
            flag.release();
          }
        } else { // Release all children at once with leaf_state bits on my own
          // b_go flag
          thr_bar->b_go |= thr_bar->leaf_state;
        }
      }
    } else { // Blocktime is not infinite; do a simple hierarchical release
      for (int d = thr_bar->my_level - 1; d >= 0;
           --d) { // Release highest level threads first
        last = tid + thr_bar->skip_per_level[d + 1];
        kmp_uint32 skip = thr_bar->skip_per_level[d];
        if (last > nproc)
          last = nproc;
        for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
          kmp_info_t *child_thr = team->t.t_threads[child_tid];
          kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
          KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
                        "releasing T#%d(%d:%d) go(%p): %u => %u\n",
                        gtid, team->t.t_id, tid,
                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                        child_tid, &child_bar->b_go, child_bar->b_go,
                        child_bar->b_go + KMP_BARRIER_STATE_BUMP));
          // Release child using child's b_go flag
          ANNOTATE_BARRIER_BEGIN(child_thr);
          kmp_flag_64<> flag(&child_bar->b_go, child_thr);
          flag.release();
        }
      }
    }
#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs && !KMP_MASTER_TID(tid))
      // non-leaves copy ICVs from fixed ICVs to local dest
      copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                &thr_bar->th_fixed_icvs);
#endif // KMP_BARRIER_ICV_PUSH
  }
  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
                "barrier type %d\n",
                gtid, team->t.t_id, tid, bt));
}
1263 
1264 // End of Barrier Algorithms
1265 
1266 // type traits for cancellable value
1267 // if cancellable is true, then is_cancellable is a normal boolean variable
1268 // if cancellable is false, then is_cancellable is a compile time constant
// Primary template is intentionally empty; only the two bool specializations
// below are ever instantiated.
template <bool cancellable> struct is_cancellable {};
// Cancellable variant: a runtime boolean flag, default-initialized to false.
template <> struct is_cancellable<true> {
  bool value;
  is_cancellable() : value(false) {}
  is_cancellable(bool b) : value(b) {}
  is_cancellable &operator=(bool b) {
    value = b;
    return *this;
  }
  operator bool() const { return value; }
};
// Non-cancellable variant: compile-time constant false; assignment is a no-op
// (parameter left unnamed to avoid an unused-parameter warning).
template <> struct is_cancellable<false> {
  is_cancellable &operator=(bool) { return *this; }
  constexpr operator bool() const { return false; }
};
1284 
1285 // Internal function to do a barrier.
1286 /* If is_split is true, do a split barrier, otherwise, do a plain barrier
1287    If reduce is non-NULL, do a split reduction barrier, otherwise, do a split
1288    barrier
1289    When cancellable = false,
1290      Returns 0 if primary thread, 1 if worker thread.
1291    When cancellable = true
1292      Returns 0 if not cancelled, 1 if cancelled.  */
1293 template <bool cancellable = false>
1294 static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
1295                                   size_t reduce_size, void *reduce_data,
1296                                   void (*reduce)(void *, void *)) {
1297   KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
1298   KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1299   int tid = __kmp_tid_from_gtid(gtid);
1300   kmp_info_t *this_thr = __kmp_threads[gtid];
1301   kmp_team_t *team = this_thr->th.th_team;
1302   int status = 0;
1303   is_cancellable<cancellable> cancelled;
1304 #if OMPT_SUPPORT && OMPT_OPTIONAL
1305   ompt_data_t *my_task_data;
1306   ompt_data_t *my_parallel_data;
1307   void *return_address;
1308   ompt_sync_region_t barrier_kind;
1309 #endif
1310 
1311   KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
1312                 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1313 
1314   ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1315 #if OMPT_SUPPORT
1316   if (ompt_enabled.enabled) {
1317 #if OMPT_OPTIONAL
1318     my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1319     my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1320     return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1321     barrier_kind = __ompt_get_barrier_kind(bt, this_thr);
1322     if (ompt_enabled.ompt_callback_sync_region) {
1323       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1324           barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1325           return_address);
1326     }
1327     if (ompt_enabled.ompt_callback_sync_region_wait) {
1328       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1329           barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1330           return_address);
1331     }
1332 #endif
1333     // It is OK to report the barrier state after the barrier begin callback.
1334     // According to the OMPT specification, a compliant implementation may
1335     // even delay reporting this state until the barrier begins to wait.
1336     this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1337   }
1338 #endif
1339 
1340   if (!team->t.t_serialized) {
1341 #if USE_ITT_BUILD
1342     // This value will be used in itt notify events below.
1343     void *itt_sync_obj = NULL;
1344 #if USE_ITT_NOTIFY
1345     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1346       itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1347 #endif
1348 #endif /* USE_ITT_BUILD */
1349     if (__kmp_tasking_mode == tskm_extra_barrier) {
1350       __kmp_tasking_barrier(team, this_thr, gtid);
1351       KA_TRACE(15,
1352                ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1353                 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1354     }
1355 
1356     /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1357        access it when the team struct is not guaranteed to exist. */
1358     // See note about the corresponding code in __kmp_join_barrier() being
1359     // performance-critical.
1360     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1361 #if KMP_USE_MONITOR
1362       this_thr->th.th_team_bt_intervals =
1363           team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1364       this_thr->th.th_team_bt_set =
1365           team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1366 #else
1367       this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1368 #endif
1369     }
1370 
1371 #if USE_ITT_BUILD
1372     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1373       __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1374 #endif /* USE_ITT_BUILD */
1375 #if USE_DEBUGGER
1376     // Let the debugger know: the thread arrived to the barrier and waiting.
1377     if (KMP_MASTER_TID(tid)) { // Primary thread counter stored in team struct
1378       team->t.t_bar[bt].b_master_arrived += 1;
1379     } else {
1380       this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1381     } // if
1382 #endif /* USE_DEBUGGER */
1383     if (reduce != NULL) {
1384       // KMP_DEBUG_ASSERT( is_split == TRUE );  // #C69956
1385       this_thr->th.th_local.reduce_data = reduce_data;
1386     }
1387 
1388     if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
1389       // use 0 to only setup the current team if nthreads > 1
1390       __kmp_task_team_setup(this_thr, team, 0);
1391 
1392     if (cancellable) {
1393       cancelled = __kmp_linear_barrier_gather_cancellable(
1394           bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1395     } else {
1396       switch (__kmp_barrier_gather_pattern[bt]) {
1397       case bp_hyper_bar: {
1398         // don't set branch bits to 0; use linear
1399         KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1400         __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
1401                                    reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1402         break;
1403       }
1404       case bp_hierarchical_bar: {
1405         __kmp_hierarchical_barrier_gather(
1406             bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1407         break;
1408       }
1409       case bp_tree_bar: {
1410         // don't set branch bits to 0; use linear
1411         KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1412         __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
1413                                   reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1414         break;
1415       }
1416       default: {
1417         __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
1418                                     reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1419       }
1420       }
1421     }
1422 
1423     KMP_MB();
1424 
1425     if (KMP_MASTER_TID(tid)) {
1426       status = 0;
1427       if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1428         __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1429       }
1430 #if USE_DEBUGGER
1431       // Let the debugger know: All threads are arrived and starting leaving the
1432       // barrier.
1433       team->t.t_bar[bt].b_team_arrived += 1;
1434 #endif
1435 
1436       if (__kmp_omp_cancellation) {
1437         kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
1438         // Reset cancellation flag for worksharing constructs
1439         if (cancel_request == cancel_loop ||
1440             cancel_request == cancel_sections) {
1441           KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
1442         }
1443       }
1444 #if USE_ITT_BUILD
1445       /* TODO: In case of split reduction barrier, primary thread may send
1446          acquired event early, before the final summation into the shared
1447          variable is done (final summation can be a long operation for array
1448          reductions).  */
1449       if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1450         __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1451 #endif /* USE_ITT_BUILD */
1452 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1453       // Barrier - report frame end (only if active_level == 1)
1454       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1455           __kmp_forkjoin_frames_mode &&
1456           (this_thr->th.th_teams_microtask == NULL || // either not in teams
1457            this_thr->th.th_teams_size.nteams == 1) && // or inside single team
1458           team->t.t_active_level == 1) {
1459         ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1460         kmp_uint64 cur_time = __itt_get_timestamp();
1461         kmp_info_t **other_threads = team->t.t_threads;
1462         int nproc = this_thr->th.th_team_nproc;
1463         int i;
1464         switch (__kmp_forkjoin_frames_mode) {
1465         case 1:
1466           __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1467                                  loc, nproc);
1468           this_thr->th.th_frame_time = cur_time;
1469           break;
1470         case 2: // AC 2015-01-19: currently does not work for hierarchical (to
1471           // be fixed)
1472           __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
1473                                  1, loc, nproc);
1474           break;
1475         case 3:
1476           if (__itt_metadata_add_ptr) {
1477             // Initialize with primary thread's wait time
1478             kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1479             // Set arrive time to zero to be able to check it in
1480             // __kmp_invoke_task(); the same is done inside the loop below
1481             this_thr->th.th_bar_arrive_time = 0;
1482             for (i = 1; i < nproc; ++i) {
1483               delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1484               other_threads[i]->th.th_bar_arrive_time = 0;
1485             }
1486             __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1487                                          cur_time, delta,
1488                                          (kmp_uint64)(reduce != NULL));
1489           }
1490           __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1491                                  loc, nproc);
1492           this_thr->th.th_frame_time = cur_time;
1493           break;
1494         }
1495       }
1496 #endif /* USE_ITT_BUILD */
1497     } else {
1498       status = 1;
1499 #if USE_ITT_BUILD
1500       if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1501         __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1502 #endif /* USE_ITT_BUILD */
1503     }
1504     if ((status == 1 || !is_split) && !cancelled) {
1505       if (cancellable) {
1506         cancelled = __kmp_linear_barrier_release_cancellable(
1507             bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1508       } else {
1509         switch (__kmp_barrier_release_pattern[bt]) {
1510         case bp_hyper_bar: {
1511           KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1512           __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1513                                       FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1514           break;
1515         }
1516         case bp_hierarchical_bar: {
1517           __kmp_hierarchical_barrier_release(
1518               bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1519           break;
1520         }
1521         case bp_tree_bar: {
1522           KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1523           __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1524                                      FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1525           break;
1526         }
1527         default: {
1528           __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1529                                        FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1530         }
1531         }
1532       }
1533       if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1534         __kmp_task_team_sync(this_thr, team);
1535       }
1536     }
1537 
1538 #if USE_ITT_BUILD
1539     /* GEH: TODO: Move this under if-condition above and also include in
1540        __kmp_end_split_barrier(). This will more accurately represent the actual
1541        release time of the threads for split barriers.  */
1542     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1543       __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1544 #endif /* USE_ITT_BUILD */
1545   } else { // Team is serialized.
1546     status = 0;
1547     if (__kmp_tasking_mode != tskm_immediate_exec) {
1548       if (this_thr->th.th_task_team != NULL) {
1549 #if USE_ITT_NOTIFY
1550         void *itt_sync_obj = NULL;
1551         if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1552           itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1553           __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1554         }
1555 #endif
1556 
1557         KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks ==
1558                          TRUE);
1559         __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1560         __kmp_task_team_setup(this_thr, team, 0);
1561 
1562 #if USE_ITT_BUILD
1563         if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1564           __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1565 #endif /* USE_ITT_BUILD */
1566       }
1567     }
1568   }
1569   KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1570                 gtid, __kmp_team_from_gtid(gtid)->t.t_id,
1571                 __kmp_tid_from_gtid(gtid), status));
1572 
1573 #if OMPT_SUPPORT
1574   if (ompt_enabled.enabled) {
1575 #if OMPT_OPTIONAL
1576     if (ompt_enabled.ompt_callback_sync_region_wait) {
1577       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1578           barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
1579           return_address);
1580     }
1581     if (ompt_enabled.ompt_callback_sync_region) {
1582       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1583           barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
1584           return_address);
1585     }
1586 #endif
1587     this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1588   }
1589 #endif
1590   ANNOTATE_BARRIER_END(&team->t.t_bar);
1591 
1592   if (cancellable)
1593     return (int)cancelled;
1594   return status;
1595 }
1596 
1597 // Returns 0 if primary thread, 1 if worker thread.
1598 int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
1599                   size_t reduce_size, void *reduce_data,
1600                   void (*reduce)(void *, void *)) {
1601   return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
1602                                   reduce);
1603 }
1604 
#if defined(KMP_GOMP_COMPAT)
// GOMP-compatibility cancellation barrier.
// Returns 1 if cancelled, 0 otherwise
int __kmp_barrier_gomp_cancel(int gtid) {
  if (!__kmp_omp_cancellation) {
    // Cancellation is disabled: behave as a plain, non-cancellable barrier.
    __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
    return FALSE;
  }
  int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE,
                                               0, NULL, NULL);
  if (cancelled) {
    kmp_info_t *this_thr = __kmp_threads[gtid];
    int tid = __kmp_tid_from_gtid(gtid);
    if (!KMP_MASTER_TID(tid)) {
      // Workers revert the bump of their private b_arrived flag made on the
      // way into the cancelled barrier; the primary thread reverts nothing.
      this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -=
          KMP_BARRIER_STATE_BUMP;
    }
  }
  return cancelled;
}
#endif
1628 
1629 void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
1630   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
1631   KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1632   KMP_DEBUG_ASSERT(bt < bs_last_barrier);
1633   int tid = __kmp_tid_from_gtid(gtid);
1634   kmp_info_t *this_thr = __kmp_threads[gtid];
1635   kmp_team_t *team = this_thr->th.th_team;
1636 
1637   ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1638   if (!team->t.t_serialized) {
1639     if (KMP_MASTER_GTID(gtid)) {
1640       switch (__kmp_barrier_release_pattern[bt]) {
1641       case bp_hyper_bar: {
1642         KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1643         __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1644                                     FALSE USE_ITT_BUILD_ARG(NULL));
1645         break;
1646       }
1647       case bp_hierarchical_bar: {
1648         __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
1649                                            FALSE USE_ITT_BUILD_ARG(NULL));
1650         break;
1651       }
1652       case bp_tree_bar: {
1653         KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1654         __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1655                                    FALSE USE_ITT_BUILD_ARG(NULL));
1656         break;
1657       }
1658       default: {
1659         __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1660                                      FALSE USE_ITT_BUILD_ARG(NULL));
1661       }
1662       }
1663       if (__kmp_tasking_mode != tskm_immediate_exec) {
1664         __kmp_task_team_sync(this_thr, team);
1665       } // if
1666     }
1667   }
1668   ANNOTATE_BARRIER_END(&team->t.t_bar);
1669 }
1670 
1671 void __kmp_join_barrier(int gtid) {
1672   KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
1673   KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1674 
1675   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1676 
1677   kmp_info_t *this_thr = __kmp_threads[gtid];
1678   kmp_team_t *team;
1679   kmp_uint nproc;
1680   kmp_info_t *master_thread;
1681   int tid;
1682 #ifdef KMP_DEBUG
1683   int team_id;
1684 #endif /* KMP_DEBUG */
1685 #if USE_ITT_BUILD
1686   void *itt_sync_obj = NULL;
1687 #if USE_ITT_NOTIFY
1688   if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
1689     // Get object created at fork_barrier
1690     itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1691 #endif
1692 #endif /* USE_ITT_BUILD */
1693   KMP_MB();
1694 
1695   // Get current info
1696   team = this_thr->th.th_team;
1697   nproc = this_thr->th.th_team_nproc;
1698   KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
1699   tid = __kmp_tid_from_gtid(gtid);
1700 #ifdef KMP_DEBUG
1701   team_id = team->t.t_id;
1702 #endif /* KMP_DEBUG */
1703   master_thread = this_thr->th.th_team_master;
1704 #ifdef KMP_DEBUG
1705   if (master_thread != team->t.t_threads[0]) {
1706     __kmp_print_structure();
1707   }
1708 #endif /* KMP_DEBUG */
1709   KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1710   KMP_MB();
1711 
1712   // Verify state
1713   KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1714   KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1715   KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1716   KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
1717                 gtid, team_id, tid));
1718 
1719   ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1720 #if OMPT_SUPPORT
1721   if (ompt_enabled.enabled) {
1722 #if OMPT_OPTIONAL
1723     ompt_data_t *my_task_data;
1724     ompt_data_t *my_parallel_data;
1725     void *codeptr = NULL;
1726     int ds_tid = this_thr->th.th_info.ds.ds_tid;
1727     if (KMP_MASTER_TID(ds_tid) &&
1728         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
1729          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
1730       codeptr = team->t.ompt_team_info.master_return_address;
1731     my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1732     my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1733     if (ompt_enabled.ompt_callback_sync_region) {
1734       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1735           ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
1736           my_task_data, codeptr);
1737     }
1738     if (ompt_enabled.ompt_callback_sync_region_wait) {
1739       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1740           ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
1741           my_task_data, codeptr);
1742     }
1743     if (!KMP_MASTER_TID(ds_tid))
1744       this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
1745 #endif
1746     this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier_implicit;
1747   }
1748 #endif
1749 
1750   if (__kmp_tasking_mode == tskm_extra_barrier) {
1751     __kmp_tasking_barrier(team, this_thr, gtid);
1752     KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past taking barrier\n", gtid,
1753                   team_id, tid));
1754   }
1755 #ifdef KMP_DEBUG
1756   if (__kmp_tasking_mode != tskm_immediate_exec) {
1757     KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
1758                   "%p, th_task_team = %p\n",
1759                   __kmp_gtid_from_thread(this_thr), team_id,
1760                   team->t.t_task_team[this_thr->th.th_task_state],
1761                   this_thr->th.th_task_team));
1762     KMP_DEBUG_ASSERT(this_thr->th.th_task_team ==
1763                      team->t.t_task_team[this_thr->th.th_task_state]);
1764   }
1765 #endif /* KMP_DEBUG */
1766 
1767   /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1768      access it when the team struct is not guaranteed to exist. Doing these
1769      loads causes a cache miss slows down EPCC parallel by 2x. As a workaround,
1770      we do not perform the copy if blocktime=infinite, since the values are not
1771      used by __kmp_wait_template() in that case. */
1772   if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1773 #if KMP_USE_MONITOR
1774     this_thr->th.th_team_bt_intervals =
1775         team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1776     this_thr->th.th_team_bt_set =
1777         team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1778 #else
1779     this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1780 #endif
1781   }
1782 
1783 #if USE_ITT_BUILD
1784   if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1785     __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1786 #endif /* USE_ITT_BUILD */
1787 
1788   switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1789   case bp_hyper_bar: {
1790     KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1791     __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1792                                NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1793     break;
1794   }
1795   case bp_hierarchical_bar: {
1796     __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1797                                       NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1798     break;
1799   }
1800   case bp_tree_bar: {
1801     KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1802     __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1803                               NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1804     break;
1805   }
1806   default: {
1807     __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1808                                 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1809   }
1810   }
1811 
1812   /* From this point on, the team data structure may be deallocated at any time
1813      by the primary thread - it is unsafe to reference it in any of the worker
1814      threads. Any per-team data items that need to be referenced before the
1815      end of the barrier should be moved to the kmp_task_team_t structs.  */
1816   if (KMP_MASTER_TID(tid)) {
1817     if (__kmp_tasking_mode != tskm_immediate_exec) {
1818       __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1819     }
1820     if (__kmp_display_affinity) {
1821       KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
1822     }
1823 #if KMP_STATS_ENABLED
1824     // Have primary thread flag the workers to indicate they are now waiting for
1825     // next parallel region, Also wake them up so they switch their timers to
1826     // idle.
1827     for (int i = 0; i < team->t.t_nproc; ++i) {
1828       kmp_info_t *team_thread = team->t.t_threads[i];
1829       if (team_thread == this_thr)
1830         continue;
1831       team_thread->th.th_stats->setIdleFlag();
1832       if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
1833           team_thread->th.th_sleep_loc != NULL)
1834         __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread),
1835                                   team_thread->th.th_sleep_loc);
1836     }
1837 #endif
1838 #if USE_ITT_BUILD
1839     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1840       __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1841 #endif /* USE_ITT_BUILD */
1842 
1843 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1844     // Join barrier - report frame end
1845     if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1846         __kmp_forkjoin_frames_mode &&
1847         (this_thr->th.th_teams_microtask == NULL || // either not in teams
1848          this_thr->th.th_teams_size.nteams == 1) && // or inside single team
1849         team->t.t_active_level == 1) {
1850       kmp_uint64 cur_time = __itt_get_timestamp();
1851       ident_t *loc = team->t.t_ident;
1852       kmp_info_t **other_threads = team->t.t_threads;
1853       int nproc = this_thr->th.th_team_nproc;
1854       int i;
1855       switch (__kmp_forkjoin_frames_mode) {
1856       case 1:
1857         __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1858                                loc, nproc);
1859         break;
1860       case 2:
1861         __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
1862                                loc, nproc);
1863         break;
1864       case 3:
1865         if (__itt_metadata_add_ptr) {
1866           // Initialize with primary thread's wait time
1867           kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1868           // Set arrive time to zero to be able to check it in
1869           // __kmp_invoke_task(); the same is done inside the loop below
1870           this_thr->th.th_bar_arrive_time = 0;
1871           for (i = 1; i < nproc; ++i) {
1872             delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1873             other_threads[i]->th.th_bar_arrive_time = 0;
1874           }
1875           __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1876                                        cur_time, delta, 0);
1877         }
1878         __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1879                                loc, nproc);
1880         this_thr->th.th_frame_time = cur_time;
1881         break;
1882       }
1883     }
1884 #endif /* USE_ITT_BUILD */
1885   }
1886 #if USE_ITT_BUILD
1887   else {
1888     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1889       __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1890   }
1891 #endif /* USE_ITT_BUILD */
1892 
1893 #if KMP_DEBUG
1894   if (KMP_MASTER_TID(tid)) {
1895     KA_TRACE(
1896         15,
1897         ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1898          gtid, team_id, tid, nproc));
1899   }
1900 #endif /* KMP_DEBUG */
1901 
1902   // TODO now, mark worker threads as done so they may be disbanded
1903   KMP_MB(); // Flush all pending memory write invalidates.
1904   KA_TRACE(10,
1905            ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
1906 
1907   ANNOTATE_BARRIER_END(&team->t.t_bar);
1908 }
1909 
1910 // TODO release worker threads' fork barriers as we are ready instead of all at
1911 // once
1912 void __kmp_fork_barrier(int gtid, int tid) {
1913   KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
1914   KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1915   kmp_info_t *this_thr = __kmp_threads[gtid];
1916   kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
1917 #if USE_ITT_BUILD
1918   void *itt_sync_obj = NULL;
1919 #endif /* USE_ITT_BUILD */
1920   if (team)
1921     ANNOTATE_BARRIER_END(&team->t.t_bar);
1922 
1923   KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
1924                 (team != NULL) ? team->t.t_id : -1, tid));
1925 
1926   // th_team pointer only valid for primary thread here
1927   if (KMP_MASTER_TID(tid)) {
1928 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1929     if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1930       // Create itt barrier object
1931       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
1932       __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
1933     }
1934 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1935 
1936 #ifdef KMP_DEBUG
1937     KMP_DEBUG_ASSERT(team);
1938     kmp_info_t **other_threads = team->t.t_threads;
1939     int i;
1940 
1941     // Verify state
1942     KMP_MB();
1943 
1944     for (i = 1; i < team->t.t_nproc; ++i) {
1945       KA_TRACE(500,
1946                ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
1947                 "== %u.\n",
1948                 gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
1949                 team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
1950                 other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
1951       KMP_DEBUG_ASSERT(
1952           (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
1953            ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
1954       KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
1955     }
1956 #endif
1957 
1958     if (__kmp_tasking_mode != tskm_immediate_exec) {
1959       // 0 indicates setup current task team if nthreads > 1
1960       __kmp_task_team_setup(this_thr, team, 0);
1961     }
1962 
1963     /* The primary thread may have changed its blocktime between join barrier
1964        and fork barrier. Copy the blocktime info to the thread, where
1965        __kmp_wait_template() can access it when the team struct is not
1966        guaranteed to exist. */
1967     // See note about the corresponding code in __kmp_join_barrier() being
1968     // performance-critical
1969     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1970 #if KMP_USE_MONITOR
1971       this_thr->th.th_team_bt_intervals =
1972           team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1973       this_thr->th.th_team_bt_set =
1974           team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1975 #else
1976       this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1977 #endif
1978     }
1979   } // primary thread
1980 
1981   switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
1982   case bp_hyper_bar: {
1983     KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1984     __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1985                                 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1986     break;
1987   }
1988   case bp_hierarchical_bar: {
1989     __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1990                                        TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1991     break;
1992   }
1993   case bp_tree_bar: {
1994     KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1995     __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1996                                TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1997     break;
1998   }
1999   default: {
2000     __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2001                                  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2002   }
2003   }
2004 
2005 #if OMPT_SUPPORT
2006   if (ompt_enabled.enabled &&
2007       this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
2008     int ds_tid = this_thr->th.th_info.ds.ds_tid;
2009     ompt_data_t *task_data = (team)
2010                                  ? OMPT_CUR_TASK_DATA(this_thr)
2011                                  : &(this_thr->th.ompt_thread_info.task_data);
2012     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
2013 #if OMPT_OPTIONAL
2014     void *codeptr = NULL;
2015     if (KMP_MASTER_TID(ds_tid) &&
2016         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
2017          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
2018       codeptr = team ? team->t.ompt_team_info.master_return_address : NULL;
2019     if (ompt_enabled.ompt_callback_sync_region_wait) {
2020       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2021           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
2022           codeptr);
2023     }
2024     if (ompt_enabled.ompt_callback_sync_region) {
2025       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2026           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
2027           codeptr);
2028     }
2029 #endif
2030     if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
2031       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2032           ompt_scope_end, NULL, task_data, 0, ds_tid,
2033           ompt_task_implicit); // TODO: Can this be ompt_task_initial?
2034     }
2035   }
2036 #endif
2037 
2038   // Early exit for reaping threads releasing forkjoin barrier
2039   if (TCR_4(__kmp_global.g.g_done)) {
2040     this_thr->th.th_task_team = NULL;
2041 
2042 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2043     if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2044       if (!KMP_MASTER_TID(tid)) {
2045         itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2046         if (itt_sync_obj)
2047           __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2048       }
2049     }
2050 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2051     KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
2052     return;
2053   }
2054 
2055   /* We can now assume that a valid team structure has been allocated by the
2056      primary thread and propagated to all worker threads. The current thread,
2057      however, may not be part of the team, so we can't blindly assume that the
2058      team pointer is non-null.  */
2059   team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
2060   KMP_DEBUG_ASSERT(team != NULL);
2061   tid = __kmp_tid_from_gtid(gtid);
2062 
2063 #if KMP_BARRIER_ICV_PULL
2064   /* Primary thread's copy of the ICVs was set up on the implicit taskdata in
2065      __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's
2066      implicit task has this data before this function is called. We cannot
2067      modify __kmp_fork_call() to look at the fixed ICVs in the primary thread's
2068      thread struct, because it is not always the case that the threads arrays
2069      have been allocated when __kmp_fork_call() is executed. */
2070   {
2071     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
2072     if (!KMP_MASTER_TID(tid)) { // primary thread already has ICVs
2073       // Copy the initial ICVs from the primary thread's thread struct to the
2074       // implicit task for this tid.
2075       KA_TRACE(10,
2076                ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
2077       __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
2078                                tid, FALSE);
2079       copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
2080                 &team->t.t_threads[0]
2081                      ->th.th_bar[bs_forkjoin_barrier]
2082                      .bb.th_fixed_icvs);
2083     }
2084   }
2085 #endif // KMP_BARRIER_ICV_PULL
2086 
2087   if (__kmp_tasking_mode != tskm_immediate_exec) {
2088     __kmp_task_team_sync(this_thr, team);
2089   }
2090 
2091 #if KMP_AFFINITY_SUPPORTED
2092   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
2093   if (proc_bind == proc_bind_intel) {
2094     // Call dynamic affinity settings
2095     if (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
2096       __kmp_balanced_affinity(this_thr, team->t.t_nproc);
2097     }
2098   } else if (proc_bind != proc_bind_false) {
2099     if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
2100       KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
2101                      __kmp_gtid_from_thread(this_thr),
2102                      this_thr->th.th_current_place));
2103     } else {
2104       __kmp_affinity_set_place(gtid);
2105     }
2106   }
2107 #endif // KMP_AFFINITY_SUPPORTED
2108   // Perform the display affinity functionality
2109   if (__kmp_display_affinity) {
2110     if (team->t.t_display_affinity
2111 #if KMP_AFFINITY_SUPPORTED
2112         || (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed)
2113 #endif
2114     ) {
2115       // NULL means use the affinity-format-var ICV
2116       __kmp_aux_display_affinity(gtid, NULL);
2117       this_thr->th.th_prev_num_threads = team->t.t_nproc;
2118       this_thr->th.th_prev_level = team->t.t_level;
2119     }
2120   }
2121   if (!KMP_MASTER_TID(tid))
2122     KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);
2123 
2124 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2125   if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2126     if (!KMP_MASTER_TID(tid)) {
2127       // Get correct barrier object
2128       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2129       __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
2130     } // (prepare called inside barrier_release)
2131   }
2132 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2133   ANNOTATE_BARRIER_END(&team->t.t_bar);
2134   KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
2135                 team->t.t_id, tid));
2136 }
2137 
// Stage or propagate a new set of internal control variables (ICVs) for the
// team's worker threads, according to the compiled-in ICV-copy scheme:
//   KMP_BARRIER_ICV_PULL: workers copy the ICVs themselves during the fork
//       barrier; here we only stage them in thread 0's th_fixed_icvs.
//   KMP_BARRIER_ICV_PUSH: ICVs are propagated in the fork barrier release;
//       nothing to do here beyond tracing.
//   neither: copy the ICVs linearly into each worker's implicit task here,
//       O(new_nproc), using the ngo_* store macros (non-globally-ordered
//       stores on KMP_MIC, plain copy_icvs/memcpy otherwise).
//
// team      - team to set up; its threads array must already be allocated
//             (asserted below for the schemes that touch it)
// new_nproc - team size; workers 1..new_nproc-1 are updated in the linear
//             scheme (thread 0, the primary, already has its ICVs)
// new_icvs  - source ICV block to propagate; must be non-NULL
// loc       - source location passed to __kmp_init_implicit_task; usually
//             NULL here (see TODO in the loop)
void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
                          kmp_internal_control_t *new_icvs, ident_t *loc) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);

  KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
  // Once parallel init has completed, a zero nproc ICV would be invalid.
  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);

/* Primary thread's copy of the ICVs was set up on the implicit taskdata in
   __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's
   implicit task has this data before this function is called. */
#if KMP_BARRIER_ICV_PULL
  /* Copy ICVs to primary thread's thread structure into th_fixed_icvs (which
     remains untouched), where all of the worker threads can access them and
     make their own copies after the barrier. */
  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
  // allocated at this point
  copy_icvs(
      &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
      new_icvs);
  KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
                team->t.t_threads[0], team));
#elif KMP_BARRIER_ICV_PUSH
  // The ICVs will be propagated in the fork barrier, so nothing needs to be
  // done here.
  KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
                team->t.t_threads[0], team));
#else
  // Copy the ICVs to each of the non-primary threads.  This takes O(nthreads)
  // time.
  // NOTE: on KMP_MIC, ngo_load declares the vector temporary that the
  // ngo_store_icvs calls below consume, so it must precede the loop.
  ngo_load(new_icvs);
  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
  // allocated at this point
  for (int f = 1; f < new_nproc; ++f) { // Skip the primary thread
    // TODO: GEH - pass in better source location info since usually NULL here
    KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                  f, team->t.t_threads[f], team));
    // Ensure the worker's implicit task exists before writing its ICVs.
    __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
    ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
    KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                  f, team->t.t_threads[f], team));
  }
  // Make the non-globally-ordered stores visible before workers are released
  // (no-op when the ngo_* macros fall back to plain copies).
  ngo_sync();
#endif // KMP_BARRIER_ICV_PULL
}
2182