1 /*
2  * kmp_barrier.cpp
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_wait_release.h"
15 #include "kmp_itt.h"
16 #include "kmp_os.h"
17 #include "kmp_stats.h"
18 #include "ompt-specific.h"
19 
20 #if KMP_MIC
21 #include <immintrin.h>
22 #define USE_NGO_STORES 1
23 #endif // KMP_MIC
24 
25 #include "tsan_annotations.h"
26 
27 #if KMP_MIC && USE_NGO_STORES
28 // ICV copying
29 #define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
30 #define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
31 #define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
32 #define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
33 #else
34 #define ngo_load(src) ((void)0)
35 #define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
36 #define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
37 #define ngo_sync() ((void)0)
38 #endif /* KMP_MIC && USE_NGO_STORES */
39 
40 void __kmp_print_structure(void); // Forward declaration
41 
42 // ---------------------------- Barrier Algorithms ----------------------------
43 
44 // Linear Barrier
// Gather (arrival) phase of the linear/centralized barrier.
// Workers: signal arrival to the master (tid 0) by releasing their own
// b_arrived flag. Master: waits on each worker's b_arrived flag in turn,
// optionally folding per-thread reduction data into its own via 'reduce',
// then bumps the team-level arrived counter.
// Returns true only when cancellable==true and the wait observed a
// cancellation; otherwise returns false once all threads have arrived.
template <bool cancellable = false>
static bool __kmp_linear_barrier_gather_template(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads = team->t.t_threads;

  KA_TRACE(
      20,
      ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  }
#endif
  // We now perform a linear reduction to signal that all of the threads have
  // arrived.
  if (!KMP_MASTER_TID(tid)) {
    KA_TRACE(20,
             ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
              "arrived(%p): %llu => %llu\n",
              gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
              team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
              thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
    // Mark arrival to master thread
    /* After performing this write, a worker thread may not assume that the team
       is valid any more - it could be deallocated by the master thread at any
       time. */
    ANNOTATE_BARRIER_BEGIN(this_thr);
    kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
    flag.release();
  } else {
    kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
    int nproc = this_thr->th.th_team_nproc;
    int i;
    // Don't have to worry about sleep bit here or atomic since team setting
    kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;

    // Collect all the worker team member threads.
    for (i = 1; i < nproc; ++i) {
#if KMP_CACHE_MANAGE
      // Prefetch next thread's arrived count
      if (i + 1 < nproc)
        KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
      KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                    "arrived(%p) == %llu\n",
                    gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
                    team->t.t_id, i,
                    &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));

      // Wait for worker thread to arrive
      kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
                       new_state);
      if (cancellable) {
        // Cancellable variant spins without sleeping so a cancellation
        // request can be observed promptly; bail out if one was seen.
        bool cancelled = flag.wait_cancellable_nosleep(
            this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
        if (cancelled)
          return true;
      } else {
        flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
      }
      ANNOTATE_BARRIER_END(other_threads[i]);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier imbalance - write min of the thread time and the other thread
      // time to the thread.
      if (__kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_min_time = KMP_MIN(
            this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
      }
#endif
      if (reduce) {
        // Fold worker i's reduction data into the master's as soon as that
        // worker has arrived.
        KA_TRACE(100,
                 ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
                  team->t.t_id, i));
        ANNOTATE_REDUCE_AFTER(reduce);
        OMPT_REDUCTION_DECL(this_thr, gtid);
        OMPT_REDUCTION_BEGIN;
        (*reduce)(this_thr->th.th_local.reduce_data,
                  other_threads[i]->th.th_local.reduce_data);
        OMPT_REDUCTION_END;
        ANNOTATE_REDUCE_BEFORE(reduce);
        ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
      }
    }
    // Don't have to worry about sleep bit here or atomic since team setting
    team_bar->b_arrived = new_state;
    KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
                  new_state));
  }
  KA_TRACE(
      20,
      ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  return false;
}
151 
// Release phase of the linear/centralized barrier.
// Master: optionally pushes its ICVs into each worker's implicit task
// (KMP_BARRIER_ICV_PUSH), then wakes each worker by releasing its b_go flag.
// Workers: wait on their own b_go flag, then reset it to
// KMP_INIT_BARRIER_STATE for the next barrier.
// Returns true only when cancellable==true and the wait observed a
// cancellation; otherwise returns false.
template <bool cancellable = false>
static bool __kmp_linear_barrier_release_template(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_team_t *team;

  if (KMP_MASTER_TID(tid)) {
    unsigned int i;
    kmp_uint32 nproc = this_thr->th.th_team_nproc;
    kmp_info_t **other_threads;

    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    other_threads = team->t.t_threads;

    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for "
                  "barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));

    if (nproc > 1) {
#if KMP_BARRIER_ICV_PUSH
      {
        KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
        if (propagate_icvs) {
          // Copy the master's ICVs into every worker's implicit task BEFORE
          // the workers are woken below, so they resume with current ICVs.
          ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
          for (i = 1; i < nproc; ++i) {
            __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
                                     team, i, FALSE);
            ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
                           &team->t.t_implicit_task_taskdata[0].td_icvs);
          }
          ngo_sync();
        }
      }
#endif // KMP_BARRIER_ICV_PUSH

      // Now, release all of the worker threads
      for (i = 1; i < nproc; ++i) {
#if KMP_CACHE_MANAGE
        // Prefetch next thread's go flag
        if (i + 1 < nproc)
          KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */
        KA_TRACE(
            20,
            ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
             "go(%p): %u => %u\n",
             gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
             team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
             other_threads[i]->th.th_bar[bt].bb.b_go,
             other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
        ANNOTATE_BARRIER_BEGIN(other_threads[i]);
        kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
                         other_threads[i]);
        flag.release();
      }
    }
  } else { // Wait for the MASTER thread to release us
    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
                  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
    kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
    if (cancellable) {
      // Cancellable variant spins without sleeping so a cancellation
      // request can be observed promptly; bail out if one was seen.
      bool cancelled = flag.wait_cancellable_nosleep(
          this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
      if (cancelled) {
        return true;
      }
    } else {
      flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    }
    ANNOTATE_BARRIER_END(this_thr);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
      // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
      // disabled)
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
      // Cancel wait on previous parallel region...
      __kmp_itt_task_starting(itt_sync_obj);

      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return false;

      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      if (itt_sync_obj != NULL)
        // Call prepare as early as possible for "new" barrier
        __kmp_itt_task_finished(itt_sync_obj);
    } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        // Early exit for reaping threads releasing forkjoin barrier
        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
      return false;
// The worker thread may now assume that the team is valid.
#ifdef KMP_DEBUG
    tid = __kmp_tid_from_gtid(gtid);
    team = __kmp_threads[gtid]->th.th_team;
#endif
    KMP_DEBUG_ASSERT(team != NULL);
    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
    KA_TRACE(20,
             ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
              gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
  }
  KA_TRACE(
      20,
      ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  return false;
}
263 
264 static void __kmp_linear_barrier_gather(
265     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
266     void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
267   __kmp_linear_barrier_gather_template<false>(
268       bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
269 }
270 
271 static bool __kmp_linear_barrier_gather_cancellable(
272     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
273     void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
274   return __kmp_linear_barrier_gather_template<true>(
275       bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
276 }
277 
278 static void __kmp_linear_barrier_release(
279     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
280     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
281   __kmp_linear_barrier_release_template<false>(
282       bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
283 }
284 
285 static bool __kmp_linear_barrier_release_cancellable(
286     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
287     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
288   return __kmp_linear_barrier_release_template<true>(
289       bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
290 }
291 
292 // Tree barrier
// Gather (arrival) phase of the tree barrier.
// Each thread first waits for its children (tids (tid<<branch_bits)+1 ..
// +branch_factor, bounded by nproc) to arrive, folding in reduction data,
// then signals its own arrival to its parent ((tid-1)>>branch_bits).
// The master (tid 0) has no parent and instead updates the team-level
// arrived counter.
static void
__kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
                          int tid, void (*reduce)(void *, void *)
                                       USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads = team->t.t_threads;
  kmp_uint32 nproc = this_thr->th.th_team_nproc;
  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 child;
  kmp_uint32 child_tid;
  kmp_uint64 new_state;

  KA_TRACE(
      20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
           gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  }
#endif
  // Perform tree gather to wait until all threads have arrived; reduce any
  // required data as we go
  child_tid = (tid << branch_bits) + 1;
  if (child_tid < nproc) {
    // Parent threads wait for all their children to arrive
    new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
    child = 1;
    do {
      kmp_info_t *child_thr = other_threads[child_tid];
      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
      // Prefetch next thread's arrived count
      if (child + 1 <= branch_factor && child_tid + 1 < nproc)
        KMP_CACHE_PREFETCH(
            &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
      KA_TRACE(20,
               ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                "arrived(%p) == %llu\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
      // Wait for child to arrive
      kmp_flag_64 flag(&child_bar->b_arrived, new_state);
      flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
      ANNOTATE_BARRIER_END(child_thr);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier imbalance - write min of the thread time and a child time to
      // the thread.
      if (__kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                               child_thr->th.th_bar_min_time);
      }
#endif
      if (reduce) {
        // Fold this child's reduction data into ours as soon as it arrives.
        KA_TRACE(100,
                 ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                  team->t.t_id, child_tid));
        ANNOTATE_REDUCE_AFTER(reduce);
        OMPT_REDUCTION_DECL(this_thr, gtid);
        OMPT_REDUCTION_BEGIN;
        (*reduce)(this_thr->th.th_local.reduce_data,
                  child_thr->th.th_local.reduce_data);
        OMPT_REDUCTION_END;
        ANNOTATE_REDUCE_BEFORE(reduce);
        ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
      }
      child++;
      child_tid++;
    } while (child <= branch_factor && child_tid < nproc);
  }

  if (!KMP_MASTER_TID(tid)) { // Worker threads
    kmp_int32 parent_tid = (tid - 1) >> branch_bits;

    KA_TRACE(20,
             ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
              "arrived(%p): %llu => %llu\n",
              gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
              team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
              thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));

    // Mark arrival to parent thread
    /* After performing this write, a worker thread may not assume that the team
       is valid any more - it could be deallocated by the master thread at any
       time.  */
    ANNOTATE_BARRIER_BEGIN(this_thr);
    kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
    flag.release();
  } else {
    // Need to update the team arrived pointer if we are the master thread
    if (nproc > 1) // New value was already computed above
      team->t.t_bar[bt].b_arrived = new_state;
    else
      team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
    KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id,
                  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
  }
  KA_TRACE(20,
           ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
            gtid, team->t.t_id, tid, bt));
}
404 
// Release phase of the tree barrier.
// Workers first wait on their own b_go flag (set by their parent) and reset
// it; the master enters directly. Every thread then releases its children
// (tids (tid<<branch_bits)+1 .. +branch_factor, bounded by nproc), optionally
// copying ICVs into each child's implicit task first.
static void __kmp_tree_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
  kmp_team_t *team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_uint32 nproc;
  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 child;
  kmp_uint32 child_tid;

  // Perform a tree release for all of the threads that have been gathered
  if (!KMP_MASTER_TID(
          tid)) { // Handle fork barrier workers who aren't part of a team yet
    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
                  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
    // Wait for parent thread to release us
    kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
    flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    ANNOTATE_BARRIER_END(this_thr);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
      // In fork barrier where we could not get the object reliably (or
      // ITTNOTIFY is disabled)
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
      // Cancel wait on previous parallel region...
      __kmp_itt_task_starting(itt_sync_obj);

      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return;

      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      if (itt_sync_obj != NULL)
        // Call prepare as early as possible for "new" barrier
        __kmp_itt_task_finished(itt_sync_obj);
    } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        // Early exit for reaping threads releasing forkjoin barrier
        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
      return;

    // The worker thread may now assume that the team is valid.
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    tid = __kmp_tid_from_gtid(gtid);

    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
    KA_TRACE(20,
             ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
              team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
  } else {
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for "
                  "barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
  }
  nproc = this_thr->th.th_team_nproc;
  child_tid = (tid << branch_bits) + 1;

  if (child_tid < nproc) {
    kmp_info_t **other_threads = team->t.t_threads;
    child = 1;
    // Parent threads release all their children
    do {
      kmp_info_t *child_thr = other_threads[child_tid];
      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
      // Prefetch next thread's go count
      if (child + 1 <= branch_factor && child_tid + 1 < nproc)
        KMP_CACHE_PREFETCH(
            &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */

#if KMP_BARRIER_ICV_PUSH
      {
        KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
        if (propagate_icvs) {
          // Copy ICVs into the child's implicit task before waking it, so
          // the child resumes with current ICVs.
          __kmp_init_implicit_task(team->t.t_ident,
                                   team->t.t_threads[child_tid], team,
                                   child_tid, FALSE);
          copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
                    &team->t.t_implicit_task_taskdata[0].td_icvs);
        }
      }
#endif // KMP_BARRIER_ICV_PUSH
      KA_TRACE(20,
               ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
                "go(%p): %u => %u\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                child_bar->b_go + KMP_BARRIER_STATE_BUMP));
      // Release child from barrier
      ANNOTATE_BARRIER_BEGIN(child_thr);
      kmp_flag_64 flag(&child_bar->b_go, child_thr);
      flag.release();
      child++;
      child_tid++;
    } while (child <= branch_factor && child_tid < nproc);
  }
  KA_TRACE(
      20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
           gtid, team->t.t_id, tid, bt));
}
511 
512 // Hyper Barrier
// Gather (arrival) phase of the hypercube-embedded tree barrier.
// The tree is walked level by level (offset doubles by branch_bits each
// iteration). At each level a thread either (a) is a "child" at that level:
// it signals its parent's flag and stops participating, or (b) is a "parent":
// it waits for up to branch_factor-1 children at that level, folding in any
// reduction data. The master finally updates the team arrived counter.
static void
__kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
                           int tid, void (*reduce)(void *, void *)
                                        USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads = team->t.t_threads;
  // Stays KMP_BARRIER_UNUSED_STATE until this thread acts as a parent at
  // some level; used below to decide how to update the team counter.
  kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
  kmp_uint32 num_threads = this_thr->th.th_team_nproc;
  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 offset;
  kmp_uint32 level;

  KA_TRACE(
      20,
      ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  }
#endif
  /* Perform a hypercube-embedded tree gather to wait until all of the threads
     have arrived, and reduce any required data as we go.  */
  kmp_flag_64 p_flag(&thr_bar->b_arrived);
  for (level = 0, offset = 1; offset < num_threads;
       level += branch_bits, offset <<= branch_bits) {
    kmp_uint32 child;
    kmp_uint32 child_tid;

    if (((tid >> level) & (branch_factor - 1)) != 0) {
      // This thread is a child at this level: compute its parent's tid by
      // clearing the low bits, signal the parent, and stop.
      kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);

      KMP_MB(); // Synchronize parent and child threads.
      KA_TRACE(20,
               ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                "arrived(%p): %llu => %llu\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
                team->t.t_id, parent_tid, &thr_bar->b_arrived,
                thr_bar->b_arrived,
                thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
      // Mark arrival to parent thread
      /* After performing this write (in the last iteration of the enclosing for
         loop), a worker thread may not assume that the team is valid any more
         - it could be deallocated by the master thread at any time.  */
      ANNOTATE_BARRIER_BEGIN(this_thr);
      p_flag.set_waiter(other_threads[parent_tid]);
      p_flag.release();
      break;
    }

    // Parent threads wait for children to arrive
    if (new_state == KMP_BARRIER_UNUSED_STATE)
      new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
    for (child = 1, child_tid = tid + (1 << level);
         child < branch_factor && child_tid < num_threads;
         child++, child_tid += (1 << level)) {
      kmp_info_t *child_thr = other_threads[child_tid];
      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
      kmp_uint32 next_child_tid = child_tid + (1 << level);
      // Prefetch next thread's arrived count
      if (child + 1 < branch_factor && next_child_tid < num_threads)
        KMP_CACHE_PREFETCH(
            &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
      KA_TRACE(20,
               ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                "arrived(%p) == %llu\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
      // Wait for child to arrive
      kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
      c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
      ANNOTATE_BARRIER_END(child_thr);
      KMP_MB(); // Synchronize parent and child threads.
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier imbalance - write min of the thread time and a child time to
      // the thread.
      if (__kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                               child_thr->th.th_bar_min_time);
      }
#endif
      if (reduce) {
        // Fold this child's reduction data into ours as soon as it arrives.
        KA_TRACE(100,
                 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                  team->t.t_id, child_tid));
        ANNOTATE_REDUCE_AFTER(reduce);
        OMPT_REDUCTION_DECL(this_thr, gtid);
        OMPT_REDUCTION_BEGIN;
        (*reduce)(this_thr->th.th_local.reduce_data,
                  child_thr->th.th_local.reduce_data);
        OMPT_REDUCTION_END;
        ANNOTATE_REDUCE_BEFORE(reduce);
        ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
      }
    }
  }

  if (KMP_MASTER_TID(tid)) {
    // Need to update the team arrived pointer if we are the master thread
    if (new_state == KMP_BARRIER_UNUSED_STATE)
      team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
    else
      team->t.t_bar[bt].b_arrived = new_state;
    KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id,
                  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
  }
  KA_TRACE(
      20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
           gtid, team->t.t_id, tid, bt));
}
635 
636 // The reverse versions seem to beat the forward versions overall
637 #define KMP_REVERSE_HYPER_BAR
638 static void __kmp_hyper_barrier_release(
639     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
640     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
641   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
642   kmp_team_t *team;
643   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
644   kmp_info_t **other_threads;
645   kmp_uint32 num_threads;
646   kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
647   kmp_uint32 branch_factor = 1 << branch_bits;
648   kmp_uint32 child;
649   kmp_uint32 child_tid;
650   kmp_uint32 offset;
651   kmp_uint32 level;
652 
653   /* Perform a hypercube-embedded tree release for all of the threads that have
654      been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
655      are released in the reverse order of the corresponding gather, otherwise
656      threads are released in the same order. */
657   if (KMP_MASTER_TID(tid)) { // master
658     team = __kmp_threads[gtid]->th.th_team;
659     KMP_DEBUG_ASSERT(team != NULL);
660     KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for "
661                   "barrier type %d\n",
662                   gtid, team->t.t_id, tid, bt));
663 #if KMP_BARRIER_ICV_PUSH
664     if (propagate_icvs) { // master already has ICVs in final destination; copy
665       copy_icvs(&thr_bar->th_fixed_icvs,
666                 &team->t.t_implicit_task_taskdata[tid].td_icvs);
667     }
668 #endif
669   } else { // Handle fork barrier workers who aren't part of a team yet
670     KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
671                   &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
672     // Wait for parent thread to release us
673     kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
674     flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
675     ANNOTATE_BARRIER_END(this_thr);
676 #if USE_ITT_BUILD && USE_ITT_NOTIFY
677     if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
678       // In fork barrier where we could not get the object reliably
679       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
680       // Cancel wait on previous parallel region...
681       __kmp_itt_task_starting(itt_sync_obj);
682 
683       if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
684         return;
685 
686       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
687       if (itt_sync_obj != NULL)
688         // Call prepare as early as possible for "new" barrier
689         __kmp_itt_task_finished(itt_sync_obj);
690     } else
691 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
692         // Early exit for reaping threads releasing forkjoin barrier
693         if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
694       return;
695 
696     // The worker thread may now assume that the team is valid.
697     team = __kmp_threads[gtid]->th.th_team;
698     KMP_DEBUG_ASSERT(team != NULL);
699     tid = __kmp_tid_from_gtid(gtid);
700 
701     TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
702     KA_TRACE(20,
703              ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
704               gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
705     KMP_MB(); // Flush all pending memory write invalidates.
706   }
707   num_threads = this_thr->th.th_team_nproc;
708   other_threads = team->t.t_threads;
709 
710 #ifdef KMP_REVERSE_HYPER_BAR
711   // Count up to correct level for parent
712   for (level = 0, offset = 1;
713        offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
714        level += branch_bits, offset <<= branch_bits)
715     ;
716 
717   // Now go down from there
718   for (level -= branch_bits, offset >>= branch_bits; offset != 0;
719        level -= branch_bits, offset >>= branch_bits)
720 #else
721   // Go down the tree, level by level
722   for (level = 0, offset = 1; offset < num_threads;
723        level += branch_bits, offset <<= branch_bits)
724 #endif // KMP_REVERSE_HYPER_BAR
725   {
726 #ifdef KMP_REVERSE_HYPER_BAR
727     /* Now go in reverse order through the children, highest to lowest.
728        Initial setting of child is conservative here. */
729     child = num_threads >> ((level == 0) ? level : level - 1);
730     for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
731         child_tid = tid + (child << level);
732          child >= 1; child--, child_tid -= (1 << level))
733 #else
734     if (((tid >> level) & (branch_factor - 1)) != 0)
735       // No need to go lower than this, since this is the level parent would be
736       // notified
737       break;
738     // Iterate through children on this level of the tree
739     for (child = 1, child_tid = tid + (1 << level);
740          child < branch_factor && child_tid < num_threads;
741          child++, child_tid += (1 << level))
742 #endif // KMP_REVERSE_HYPER_BAR
743     {
744       if (child_tid >= num_threads)
745         continue; // Child doesn't exist so keep going
746       else {
747         kmp_info_t *child_thr = other_threads[child_tid];
748         kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
749 #if KMP_CACHE_MANAGE
750         kmp_uint32 next_child_tid = child_tid - (1 << level);
751 // Prefetch next thread's go count
752 #ifdef KMP_REVERSE_HYPER_BAR
753         if (child - 1 >= 1 && next_child_tid < num_threads)
754 #else
755         if (child + 1 < branch_factor && next_child_tid < num_threads)
756 #endif // KMP_REVERSE_HYPER_BAR
757           KMP_CACHE_PREFETCH(
758               &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
759 #endif /* KMP_CACHE_MANAGE */
760 
761 #if KMP_BARRIER_ICV_PUSH
762         if (propagate_icvs) // push my fixed ICVs to my child
763           copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
764 #endif // KMP_BARRIER_ICV_PUSH
765 
766         KA_TRACE(
767             20,
768             ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
769              "go(%p): %u => %u\n",
770              gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
771              team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
772              child_bar->b_go + KMP_BARRIER_STATE_BUMP));
773         // Release child from barrier
774         ANNOTATE_BARRIER_BEGIN(child_thr);
775         kmp_flag_64 flag(&child_bar->b_go, child_thr);
776         flag.release();
777       }
778     }
779   }
780 #if KMP_BARRIER_ICV_PUSH
781   if (propagate_icvs &&
782       !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
783     __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
784                              FALSE);
785     copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
786               &thr_bar->th_fixed_icvs);
787   }
788 #endif
789   KA_TRACE(
790       20,
791       ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
792        gtid, team->t.t_id, tid, bt));
793 }
794 
795 // Hierarchical Barrier
796 
797 // Initialize thread barrier data
798 /* Initializes/re-initializes the hierarchical barrier data stored on a thread.
799    Performs the minimum amount of initialization required based on how the team
800    has changed. Returns true if leaf children will require both on-core and
801    traditional wake-up mechanisms. For example, if the team size increases,
802    threads already in the team will respond to on-core wakeup on their parent
   thread, but threads newly added to the team will only be listening on
   their local b_go. */
static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
                                                   kmp_bstate_t *thr_bar,
                                                   kmp_uint32 nproc, int gtid,
                                                   int tid, kmp_team_t *team) {
  // Checks to determine if (re-)initialization is needed
  bool uninitialized = thr_bar->team == NULL;
  bool team_changed = team != thr_bar->team;
  bool team_sz_changed = nproc != thr_bar->nproc;
  bool tid_changed = tid != thr_bar->old_tid;
  bool retval = false;

  // The hierarchy layout (depth, skip_per_level, base_leaf_kids) only depends
  // on the team size, so recompute it only when that has changed.
  if (uninitialized || team_sz_changed) {
    __kmp_get_hierarchy(nproc, thr_bar);
  }

  // Locate this thread's position (level and parent) within the hierarchy.
  if (uninitialized || team_sz_changed || tid_changed) {
    thr_bar->my_level = thr_bar->depth - 1; // default for master
    thr_bar->parent_tid = -1; // default for master
    if (!KMP_MASTER_TID(
            tid)) { // if not master, find parent thread in hierarchy
      kmp_uint32 d = 0;
      while (d < thr_bar->depth) { // find parent based on level of thread in
        // hierarchy, and note level
        kmp_uint32 rem;
        if (d == thr_bar->depth - 2) { // reached level right below the master
          thr_bar->parent_tid = 0;
          thr_bar->my_level = d;
          break;
        } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) !=
                   0) { // TODO: can we make this op faster?
          // thread is not a subtree root at next level, so this is max
          thr_bar->parent_tid = tid - rem;
          thr_bar->my_level = d;
          break;
        }
        ++d;
      }
    }
    // Byte index of this thread inside the parent's 64-bit flag word; leaf
    // child i of a parent occupies byte (7 - i) (see leaf_state setup below).
    thr_bar->offset = 7 - (tid - thr_bar->parent_tid - 1);
    thr_bar->old_tid = tid;
    thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
    thr_bar->team = team;
    thr_bar->parent_bar =
        &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
  }
  if (uninitialized || team_changed || tid_changed) {
    thr_bar->team = team;
    thr_bar->parent_bar =
        &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
    // Team or tid changed: report that to the caller (see function header
    // comment for how the return value is used).
    retval = true;
  }
  if (uninitialized || team_sz_changed || tid_changed) {
    thr_bar->nproc = nproc;
    thr_bar->leaf_kids = thr_bar->base_leaf_kids;
    if (thr_bar->my_level == 0)
      thr_bar->leaf_kids = 0;
    // Clamp leaf_kids so the last parent does not count children past nproc.
    if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
      thr_bar->leaf_kids = nproc - tid - 1;
    thr_bar->leaf_state = 0;
    // Mark one byte per leaf child; children clear their byte on arrival.
    for (int i = 0; i < thr_bar->leaf_kids; ++i)
      ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
  }
  return retval;
}
869 
870 static void __kmp_hierarchical_barrier_gather(
871     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
872     void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
873   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
874   kmp_team_t *team = this_thr->th.th_team;
875   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
876   kmp_uint32 nproc = this_thr->th.th_team_nproc;
877   kmp_info_t **other_threads = team->t.t_threads;
878   kmp_uint64 new_state;
879 
880   int level = team->t.t_level;
881   if (other_threads[0]
882           ->th.th_teams_microtask) // are we inside the teams construct?
883     if (this_thr->th.th_teams_size.nteams > 1)
884       ++level; // level was not increased in teams construct for team_of_masters
885   if (level == 1)
886     thr_bar->use_oncore_barrier = 1;
887   else
888     thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
889 
890   KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
891                 "barrier type %d\n",
892                 gtid, team->t.t_id, tid, bt));
893   KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
894 
895 #if USE_ITT_BUILD && USE_ITT_NOTIFY
896   // Barrier imbalance - save arrive time to the thread
897   if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
898     this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
899   }
900 #endif
901 
902   (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
903                                                team);
904 
905   if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
906     kmp_int32 child_tid;
907     new_state =
908         (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
909     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
910         thr_bar->use_oncore_barrier) {
911       if (thr_bar->leaf_kids) {
912         // First, wait for leaf children to check-in on my b_arrived flag
913         kmp_uint64 leaf_state =
914             KMP_MASTER_TID(tid)
915                 ? thr_bar->b_arrived | thr_bar->leaf_state
916                 : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
917         KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
918                       "for leaf kids\n",
919                       gtid, team->t.t_id, tid));
920         kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
921         flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
922         if (reduce) {
923           ANNOTATE_REDUCE_AFTER(reduce);
924           OMPT_REDUCTION_DECL(this_thr, gtid);
925           OMPT_REDUCTION_BEGIN;
926           for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
927                ++child_tid) {
928             KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
929                            "T#%d(%d:%d)\n",
930                            gtid, team->t.t_id, tid,
931                            __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
932                            child_tid));
933             ANNOTATE_BARRIER_END(other_threads[child_tid]);
934             (*reduce)(this_thr->th.th_local.reduce_data,
935                       other_threads[child_tid]->th.th_local.reduce_data);
936           }
937           OMPT_REDUCTION_END;
938           ANNOTATE_REDUCE_BEFORE(reduce);
939           ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
940         }
941         // clear leaf_state bits
942         KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
943       }
944       // Next, wait for higher level children on each child's b_arrived flag
945       for (kmp_uint32 d = 1; d < thr_bar->my_level;
946            ++d) { // gather lowest level threads first, but skip 0
947         kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
948                    skip = thr_bar->skip_per_level[d];
949         if (last > nproc)
950           last = nproc;
951         for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
952           kmp_info_t *child_thr = other_threads[child_tid];
953           kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
954           KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
955                         "T#%d(%d:%d) "
956                         "arrived(%p) == %llu\n",
957                         gtid, team->t.t_id, tid,
958                         __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
959                         child_tid, &child_bar->b_arrived, new_state));
960           kmp_flag_64 flag(&child_bar->b_arrived, new_state);
961           flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
962           ANNOTATE_BARRIER_END(child_thr);
963           if (reduce) {
964             KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
965                            "T#%d(%d:%d)\n",
966                            gtid, team->t.t_id, tid,
967                            __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
968                            child_tid));
969             ANNOTATE_REDUCE_AFTER(reduce);
970             (*reduce)(this_thr->th.th_local.reduce_data,
971                       child_thr->th.th_local.reduce_data);
972             ANNOTATE_REDUCE_BEFORE(reduce);
973             ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
974           }
975         }
976       }
977     } else { // Blocktime is not infinite
978       for (kmp_uint32 d = 0; d < thr_bar->my_level;
979            ++d) { // Gather lowest level threads first
980         kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
981                    skip = thr_bar->skip_per_level[d];
982         if (last > nproc)
983           last = nproc;
984         for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
985           kmp_info_t *child_thr = other_threads[child_tid];
986           kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
987           KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
988                         "T#%d(%d:%d) "
989                         "arrived(%p) == %llu\n",
990                         gtid, team->t.t_id, tid,
991                         __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
992                         child_tid, &child_bar->b_arrived, new_state));
993           kmp_flag_64 flag(&child_bar->b_arrived, new_state);
994           flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
995           ANNOTATE_BARRIER_END(child_thr);
996           if (reduce) {
997             KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
998                            "T#%d(%d:%d)\n",
999                            gtid, team->t.t_id, tid,
1000                            __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1001                            child_tid));
1002             ANNOTATE_REDUCE_AFTER(reduce);
1003             (*reduce)(this_thr->th.th_local.reduce_data,
1004                       child_thr->th.th_local.reduce_data);
1005             ANNOTATE_REDUCE_BEFORE(reduce);
1006             ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
1007           }
1008         }
1009       }
1010     }
1011   }
1012   // All subordinates are gathered; now release parent if not master thread
1013 
1014   if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
1015     KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
1016                   " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
1017                   gtid, team->t.t_id, tid,
1018                   __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
1019                   thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
1020                   thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
1021     /* Mark arrival to parent: After performing this write, a worker thread may
1022        not assume that the team is valid any more - it could be deallocated by
1023        the master thread at any time. */
1024     if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
1025         !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
1026       // flag; release it
1027       ANNOTATE_BARRIER_BEGIN(this_thr);
1028       kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
1029       flag.release();
1030     } else {
1031       // Leaf does special release on "offset" bits of parent's b_arrived flag
1032       thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1033       kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
1034       flag.set_waiter(other_threads[thr_bar->parent_tid]);
1035       flag.release();
1036     }
1037   } else { // Master thread needs to update the team's b_arrived value
1038     team->t.t_bar[bt].b_arrived = new_state;
1039     KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
1040                   "arrived(%p) = %llu\n",
1041                   gtid, team->t.t_id, tid, team->t.t_id,
1042                   &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1043   }
1044   // Is the team access below unsafe or just technically invalid?
1045   KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
1046                 "barrier type %d\n",
1047                 gtid, team->t.t_id, tid, bt));
1048 }
1049 
// Release phase of the hierarchical barrier.  The master (or a released
// non-leaf worker) pushes the go signal down the hierarchy: with infinite
// blocktime and the oncore optimization, ICVs and b_go are propagated
// together via (optionally NGO) cache-line stores and leaf children are
// released through bits of this thread's own b_go flag; otherwise each child
// is released individually through its own b_go flag.
static void __kmp_hierarchical_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
  kmp_team_t *team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_uint32 nproc;
  bool team_change = false; // indicates on-core barrier shouldn't be used

  if (KMP_MASTER_TID(tid)) {
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master "
                  "entered barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
  } else { // Worker threads
    // Wait for parent thread to release me
    if (!thr_bar->use_oncore_barrier ||
        __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
        thr_bar->team == NULL) {
      // Use traditional method of waiting on my own b_go flag
      thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
      kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
      flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
      ANNOTATE_BARRIER_END(this_thr);
      TCW_8(thr_bar->b_go,
            KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
    } else { // Thread barrier data is initialized, this is a leaf, blocktime is
      // infinite, not nested
      // Wait on my "offset" bits on parent's b_go flag
      thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
      kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
                           thr_bar->offset, bt,
                           this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
      flag.wait(this_thr, TRUE);
      if (thr_bar->wait_flag ==
          KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
        TCW_8(thr_bar->b_go,
              KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
      } else { // Reset my bits on parent's b_go flag
        // Clear only this thread's byte of the parent's 64-bit b_go word.
        (RCAST(volatile char *,
               &(thr_bar->parent_bar->b_go)))[thr_bar->offset] = 0;
      }
    }
    thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
    // Early exit for reaping threads releasing forkjoin barrier
    if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
      return;
    // The worker thread may now assume that the team is valid.
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    tid = __kmp_tid_from_gtid(gtid);

    KA_TRACE(
        20,
        ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
         gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
  }

  nproc = this_thr->th.th_team_nproc;
  int level = team->t.t_level;
  if (team->t.t_threads[0]
          ->th.th_teams_microtask) { // are we inside the teams construct?
    if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
        this_thr->th.th_teams_level == level)
      ++level; // level was not increased in teams construct for team_of_workers
    if (this_thr->th.th_teams_size.nteams > 1)
      ++level; // level was not increased in teams construct for team_of_masters
  }
  if (level == 1)
    thr_bar->use_oncore_barrier = 1;
  else
    thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested

  // If the team size has increased, we still communicate with old leaves via
  // oncore barrier.
  unsigned short int old_leaf_kids = thr_bar->leaf_kids;
  kmp_uint64 old_leaf_state = thr_bar->leaf_state;
  team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
                                                       tid, team);
  // But if the entire team changes, we won't use oncore barrier at all
  if (team_change)
    old_leaf_kids = 0;

#if KMP_BARRIER_ICV_PUSH
  if (propagate_icvs) {
    __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
                             FALSE);
    if (KMP_MASTER_TID(
            tid)) { // master already has copy in final destination; copy
      copy_icvs(&thr_bar->th_fixed_icvs,
                &team->t.t_implicit_task_taskdata[tid].td_icvs);
    } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
               thr_bar->use_oncore_barrier) { // optimization for inf blocktime
      if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
        // leaves (on-core children) pull parent's fixed ICVs directly to local
        // ICV store
        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                  &thr_bar->parent_bar->th_fixed_icvs);
      // non-leaves will get ICVs piggybacked with b_go via NGO store
    } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
      if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can
        // access
        copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
      else // leaves copy parent's fixed ICVs directly to local ICV store
        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                  &thr_bar->parent_bar->th_fixed_icvs);
    }
  }
#endif // KMP_BARRIER_ICV_PUSH

  // Now, release my children
  if (thr_bar->my_level) { // not a leaf
    kmp_int32 child_tid;
    kmp_uint32 last;
    if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
        thr_bar->use_oncore_barrier) {
      if (KMP_MASTER_TID(tid)) { // do a flat release
        // Set local b_go to bump children via NGO store of the cache line
        // containing IVCs and b_go.
        thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
        // Use ngo stores if available; b_go piggybacks in the last 8 bytes of
        // the cache line
        ngo_load(&thr_bar->th_fixed_icvs);
        // This loops over all the threads skipping only the leaf nodes in the
        // hierarchy
        for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
             child_tid += thr_bar->skip_per_level[1]) {
          kmp_bstate_t *child_bar =
              &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
          KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
                        "releasing T#%d(%d:%d)"
                        " go(%p): %u => %u\n",
                        gtid, team->t.t_id, tid,
                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                        child_tid, &child_bar->b_go, child_bar->b_go,
                        child_bar->b_go + KMP_BARRIER_STATE_BUMP));
          // Use ngo store (if available) to both store ICVs and release child
          // via child's b_go
          ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
        }
        ngo_sync();
      }
      TCW_8(thr_bar->b_go,
            KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
      // Now, release leaf children
      if (thr_bar->leaf_kids) { // if there are any
        // We test team_change on the off-chance that the level 1 team changed.
        if (team_change ||
            old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
          if (old_leaf_kids) { // release old leaf kids
            thr_bar->b_go |= old_leaf_state;
          }
          // Release new leaf kids
          last = tid + thr_bar->skip_per_level[1];
          if (last > nproc)
            last = nproc;
          for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
               ++child_tid) { // skip_per_level[0]=1
            kmp_info_t *child_thr = team->t.t_threads[child_tid];
            kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
            KA_TRACE(
                20,
                ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
                 " T#%d(%d:%d) go(%p): %u => %u\n",
                 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
            // Release child using child's b_go flag
            ANNOTATE_BARRIER_BEGIN(child_thr);
            kmp_flag_64 flag(&child_bar->b_go, child_thr);
            flag.release();
          }
        } else { // Release all children at once with leaf_state bits on my own
          // b_go flag
          thr_bar->b_go |= thr_bar->leaf_state;
        }
      }
    } else { // Blocktime is not infinite; do a simple hierarchical release
      for (int d = thr_bar->my_level - 1; d >= 0;
           --d) { // Release highest level threads first
        last = tid + thr_bar->skip_per_level[d + 1];
        kmp_uint32 skip = thr_bar->skip_per_level[d];
        if (last > nproc)
          last = nproc;
        for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
          kmp_info_t *child_thr = team->t.t_threads[child_tid];
          kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
          KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
                        "releasing T#%d(%d:%d) go(%p): %u => %u\n",
                        gtid, team->t.t_id, tid,
                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                        child_tid, &child_bar->b_go, child_bar->b_go,
                        child_bar->b_go + KMP_BARRIER_STATE_BUMP));
          // Release child using child's b_go flag
          ANNOTATE_BARRIER_BEGIN(child_thr);
          kmp_flag_64 flag(&child_bar->b_go, child_thr);
          flag.release();
        }
      }
    }
#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs && !KMP_MASTER_TID(tid))
      // non-leaves copy ICVs from fixed ICVs to local dest
      copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                &thr_bar->th_fixed_icvs);
#endif // KMP_BARRIER_ICV_PUSH
  }
  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
                "barrier type %d\n",
                gtid, team->t.t_id, tid, bt));
}
1263 
1264 // End of Barrier Algorithms
1265 
1266 // type traits for cancellable value
1267 // if cancellable is true, then is_cancellable is a normal boolean variable
1268 // if cancellable is false, then is_cancellable is a compile time constant
// Primary template; only the two explicit specializations below are used.
template <bool cancellable> struct is_cancellable {};

// cancellable == true: behaves as an ordinary runtime boolean flag,
// default-initialized to false.
template <> struct is_cancellable<true> {
  bool value;

  is_cancellable() : value(false) {}
  is_cancellable(bool initial) : value(initial) {}

  is_cancellable &operator=(bool rhs) {
    value = rhs;
    return *this;
  }

  operator bool() const { return value; }
};

// cancellable == false: a compile-time constant false.  Assignment is
// accepted but discarded so callers can be written uniformly for both cases.
template <> struct is_cancellable<false> {
  is_cancellable &operator=(bool) { return *this; }
  constexpr operator bool() const { return false; }
};
1284 
1285 // Internal function to do a barrier.
/* If is_split is true, do a split barrier, otherwise, do a plain barrier.
   If reduce is non-NULL, additionally perform the reduction across the team
   as part of the barrier.
1289    When cancellable = false,
1290      Returns 0 if master thread, 1 if worker thread.
1291    When cancellable = true
1292      Returns 0 if not cancelled, 1 if cancelled.  */
1293 template <bool cancellable = false>
1294 static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
1295                                   size_t reduce_size, void *reduce_data,
1296                                   void (*reduce)(void *, void *)) {
1297   KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
1298   KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1299   int tid = __kmp_tid_from_gtid(gtid);
1300   kmp_info_t *this_thr = __kmp_threads[gtid];
1301   kmp_team_t *team = this_thr->th.th_team;
1302   int status = 0;
1303   is_cancellable<cancellable> cancelled;
1304 #if OMPT_SUPPORT && OMPT_OPTIONAL
1305   ompt_data_t *my_task_data;
1306   ompt_data_t *my_parallel_data;
1307   void *return_address;
1308   ompt_sync_region_t barrier_kind;
1309 #endif
1310 
1311   KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
1312                 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1313 
1314   ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1315 #if OMPT_SUPPORT
1316   if (ompt_enabled.enabled) {
1317 #if OMPT_OPTIONAL
1318     my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1319     my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1320     return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1321     barrier_kind = __ompt_get_barrier_kind(bt, this_thr);
1322     if (ompt_enabled.ompt_callback_sync_region) {
1323       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1324           barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1325           return_address);
1326     }
1327     if (ompt_enabled.ompt_callback_sync_region_wait) {
1328       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1329           barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1330           return_address);
1331     }
1332 #endif
1333     // It is OK to report the barrier state after the barrier begin callback.
1334     // According to the OMPT specification, a compliant implementation may
1335     // even delay reporting this state until the barrier begins to wait.
1336     this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1337   }
1338 #endif
1339 
1340   if (!team->t.t_serialized) {
1341 #if USE_ITT_BUILD
1342     // This value will be used in itt notify events below.
1343     void *itt_sync_obj = NULL;
1344 #if USE_ITT_NOTIFY
1345     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1346       itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1347 #endif
1348 #endif /* USE_ITT_BUILD */
1349     if (__kmp_tasking_mode == tskm_extra_barrier) {
1350       __kmp_tasking_barrier(team, this_thr, gtid);
1351       KA_TRACE(15,
1352                ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1353                 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1354     }
1355 
1356     /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1357        access it when the team struct is not guaranteed to exist. */
1358     // See note about the corresponding code in __kmp_join_barrier() being
1359     // performance-critical.
1360     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1361 #if KMP_USE_MONITOR
1362       this_thr->th.th_team_bt_intervals =
1363           team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1364       this_thr->th.th_team_bt_set =
1365           team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1366 #else
1367       this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1368 #endif
1369     }
1370 
1371 #if USE_ITT_BUILD
1372     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1373       __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1374 #endif /* USE_ITT_BUILD */
1375 #if USE_DEBUGGER
1376     // Let the debugger know: the thread arrived to the barrier and waiting.
1377     if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
1378       team->t.t_bar[bt].b_master_arrived += 1;
1379     } else {
1380       this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1381     } // if
1382 #endif /* USE_DEBUGGER */
1383     if (reduce != NULL) {
1384       // KMP_DEBUG_ASSERT( is_split == TRUE );  // #C69956
1385       this_thr->th.th_local.reduce_data = reduce_data;
1386     }
1387 
1388     if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
1389       // use 0 to only setup the current team if nthreads > 1
1390       __kmp_task_team_setup(this_thr, team, 0);
1391 
1392     if (cancellable) {
1393       cancelled = __kmp_linear_barrier_gather_cancellable(
1394           bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1395     } else {
1396       switch (__kmp_barrier_gather_pattern[bt]) {
1397       case bp_hyper_bar: {
1398         // don't set branch bits to 0; use linear
1399         KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1400         __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
1401                                    reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1402         break;
1403       }
1404       case bp_hierarchical_bar: {
1405         __kmp_hierarchical_barrier_gather(
1406             bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1407         break;
1408       }
1409       case bp_tree_bar: {
1410         // don't set branch bits to 0; use linear
1411         KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1412         __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
1413                                   reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1414         break;
1415       }
1416       default: {
1417         __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
1418                                     reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1419       }
1420       }
1421     }
1422 
1423     KMP_MB();
1424 
1425     if (KMP_MASTER_TID(tid)) {
1426       status = 0;
1427       if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1428         __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1429       }
1430 #if USE_DEBUGGER
1431       // Let the debugger know: All threads are arrived and starting leaving the
1432       // barrier.
1433       team->t.t_bar[bt].b_team_arrived += 1;
1434 #endif
1435 
1436       if (__kmp_omp_cancellation) {
1437         kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
1438         // Reset cancellation flag for worksharing constructs
1439         if (cancel_request == cancel_loop ||
1440             cancel_request == cancel_sections) {
1441           KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
1442         }
1443       }
1444 #if USE_ITT_BUILD
1445       /* TODO: In case of split reduction barrier, master thread may send
1446          acquired event early, before the final summation into the shared
1447          variable is done (final summation can be a long operation for array
1448          reductions).  */
1449       if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1450         __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1451 #endif /* USE_ITT_BUILD */
1452 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1453       // Barrier - report frame end (only if active_level == 1)
1454       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1455           __kmp_forkjoin_frames_mode &&
1456           (this_thr->th.th_teams_microtask == NULL || // either not in teams
1457            this_thr->th.th_teams_size.nteams == 1) && // or inside single team
1458           team->t.t_active_level == 1) {
1459         ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1460         kmp_uint64 cur_time = __itt_get_timestamp();
1461         kmp_info_t **other_threads = team->t.t_threads;
1462         int nproc = this_thr->th.th_team_nproc;
1463         int i;
1464         switch (__kmp_forkjoin_frames_mode) {
1465         case 1:
1466           __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1467                                  loc, nproc);
1468           this_thr->th.th_frame_time = cur_time;
1469           break;
1470         case 2: // AC 2015-01-19: currently does not work for hierarchical (to
1471           // be fixed)
1472           __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
1473                                  1, loc, nproc);
1474           break;
1475         case 3:
1476           if (__itt_metadata_add_ptr) {
1477             // Initialize with master's wait time
1478             kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1479             // Set arrive time to zero to be able to check it in
1480             // __kmp_invoke_task(); the same is done inside the loop below
1481             this_thr->th.th_bar_arrive_time = 0;
1482             for (i = 1; i < nproc; ++i) {
1483               delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1484               other_threads[i]->th.th_bar_arrive_time = 0;
1485             }
1486             __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1487                                          cur_time, delta,
1488                                          (kmp_uint64)(reduce != NULL));
1489           }
1490           __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1491                                  loc, nproc);
1492           this_thr->th.th_frame_time = cur_time;
1493           break;
1494         }
1495       }
1496 #endif /* USE_ITT_BUILD */
1497     } else {
1498       status = 1;
1499 #if USE_ITT_BUILD
1500       if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1501         __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1502 #endif /* USE_ITT_BUILD */
1503     }
1504     if ((status == 1 || !is_split) && !cancelled) {
1505       if (cancellable) {
1506         cancelled = __kmp_linear_barrier_release_cancellable(
1507             bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1508       } else {
1509         switch (__kmp_barrier_release_pattern[bt]) {
1510         case bp_hyper_bar: {
1511           KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1512           __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1513                                       FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1514           break;
1515         }
1516         case bp_hierarchical_bar: {
1517           __kmp_hierarchical_barrier_release(
1518               bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1519           break;
1520         }
1521         case bp_tree_bar: {
1522           KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1523           __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1524                                      FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1525           break;
1526         }
1527         default: {
1528           __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1529                                        FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1530         }
1531         }
1532       }
1533       if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1534         __kmp_task_team_sync(this_thr, team);
1535       }
1536     }
1537 
1538 #if USE_ITT_BUILD
1539     /* GEH: TODO: Move this under if-condition above and also include in
1540        __kmp_end_split_barrier(). This will more accurately represent the actual
1541        release time of the threads for split barriers.  */
1542     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1543       __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1544 #endif /* USE_ITT_BUILD */
1545   } else { // Team is serialized.
1546     status = 0;
1547     if (__kmp_tasking_mode != tskm_immediate_exec) {
1548       if (this_thr->th.th_task_team != NULL) {
1549 #if USE_ITT_NOTIFY
1550         void *itt_sync_obj = NULL;
1551         if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1552           itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1553           __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1554         }
1555 #endif
1556 
1557         KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks ==
1558                          TRUE);
1559         __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1560         __kmp_task_team_setup(this_thr, team, 0);
1561 
1562 #if USE_ITT_BUILD
1563         if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1564           __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1565 #endif /* USE_ITT_BUILD */
1566       }
1567     }
1568   }
1569   KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1570                 gtid, __kmp_team_from_gtid(gtid)->t.t_id,
1571                 __kmp_tid_from_gtid(gtid), status));
1572 
1573 #if OMPT_SUPPORT
1574   if (ompt_enabled.enabled) {
1575 #if OMPT_OPTIONAL
1576     if (ompt_enabled.ompt_callback_sync_region_wait) {
1577       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1578           barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
1579           return_address);
1580     }
1581     if (ompt_enabled.ompt_callback_sync_region) {
1582       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1583           barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
1584           return_address);
1585     }
1586 #endif
1587     this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1588   }
1589 #endif
1590   ANNOTATE_BARRIER_END(&team->t.t_bar);
1591 
1592   if (cancellable)
1593     return (int)cancelled;
1594   return status;
1595 }
1596 
1597 // Returns 0 if master thread, 1 if worker thread.
1598 int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
1599                   size_t reduce_size, void *reduce_data,
1600                   void (*reduce)(void *, void *)) {
1601   return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
1602                                   reduce);
1603 }
1604 
#if defined(KMP_GOMP_COMPAT)
// GOMP-compatibility cancellation barrier.
// Returns 1 if cancelled, 0 otherwise
int __kmp_barrier_gomp_cancel(int gtid) {
  if (!__kmp_omp_cancellation) {
    // Cancellation support is disabled: behave like an ordinary,
    // non-cancellable plain barrier.
    __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
    return FALSE;
  }
  // Run the cancellable instantiation of the plain barrier.
  int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE,
                                               0, NULL, NULL);
  if (cancelled) {
    int tid = __kmp_tid_from_gtid(gtid);
    if (!KMP_MASTER_TID(tid)) {
      // A cancelled worker already bumped its private b_arrived flag on the
      // way in; roll that back so the next barrier starts consistent.
      // The master has nothing to revert.
      kmp_info_t *thr = __kmp_threads[gtid];
      thr->th.th_bar[bs_plain_barrier].bb.b_arrived -= KMP_BARRIER_STATE_BUMP;
    }
  }
  return cancelled;
}
#endif
1628 
1629 void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
1630   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
1631   KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1632   int tid = __kmp_tid_from_gtid(gtid);
1633   kmp_info_t *this_thr = __kmp_threads[gtid];
1634   kmp_team_t *team = this_thr->th.th_team;
1635 
1636   ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1637   if (!team->t.t_serialized) {
1638     if (KMP_MASTER_GTID(gtid)) {
1639       switch (__kmp_barrier_release_pattern[bt]) {
1640       case bp_hyper_bar: {
1641         KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1642         __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1643                                     FALSE USE_ITT_BUILD_ARG(NULL));
1644         break;
1645       }
1646       case bp_hierarchical_bar: {
1647         __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
1648                                            FALSE USE_ITT_BUILD_ARG(NULL));
1649         break;
1650       }
1651       case bp_tree_bar: {
1652         KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1653         __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1654                                    FALSE USE_ITT_BUILD_ARG(NULL));
1655         break;
1656       }
1657       default: {
1658         __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1659                                      FALSE USE_ITT_BUILD_ARG(NULL));
1660       }
1661       }
1662       if (__kmp_tasking_mode != tskm_immediate_exec) {
1663         __kmp_task_team_sync(this_thr, team);
1664       } // if
1665     }
1666   }
1667   ANNOTATE_BARRIER_END(&team->t.t_bar);
1668 }
1669 
// Join barrier: executed by every thread of a team at the end of a parallel
// region. This is the *gather* half of the fork/join barrier only; the
// matching release happens in __kmp_fork_barrier(). The master additionally
// drains the task team and reports ITT frame/imbalance data.
void __kmp_join_barrier(int gtid) {
  KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team;
  kmp_uint nproc;
  kmp_info_t *master_thread;
  int tid;
#ifdef KMP_DEBUG
  int team_id;
#endif /* KMP_DEBUG */
#if USE_ITT_BUILD
  void *itt_sync_obj = NULL;
#if USE_ITT_NOTIFY
  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
    // Get object created at fork_barrier
    itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
#endif
#endif /* USE_ITT_BUILD */
  KMP_MB();

  // Get current info
  team = this_thr->th.th_team;
  nproc = this_thr->th.th_team_nproc;
  KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
  tid = __kmp_tid_from_gtid(gtid);
#ifdef KMP_DEBUG
  team_id = team->t.t_id;
#endif /* KMP_DEBUG */
  master_thread = this_thr->th.th_team_master;
#ifdef KMP_DEBUG
  if (master_thread != team->t.t_threads[0]) {
    // Inconsistent team structure: dump it before the assert below fires.
    __kmp_print_structure();
  }
#endif /* KMP_DEBUG */
  KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
  KMP_MB();

  // Verify state
  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
  KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
                gtid, team_id, tid));

  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
#if OMPT_SUPPORT
  // Report barrier-begin to any attached OMPT tool before threads start
  // waiting.
  if (ompt_enabled.enabled) {
#if OMPT_OPTIONAL
    ompt_data_t *my_task_data;
    ompt_data_t *my_parallel_data;
    void *codeptr = NULL;
    int ds_tid = this_thr->th.th_info.ds.ds_tid;
    // Only the master resolves a return address, and only if some sync_region
    // callback is actually registered.
    if (KMP_MASTER_TID(ds_tid) &&
        (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
         ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
      codeptr = team->t.ompt_team_info.master_return_address;
    my_task_data = OMPT_CUR_TASK_DATA(this_thr);
    my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
          my_task_data, codeptr);
    }
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
          my_task_data, codeptr);
    }
    // Workers stash their implicit task data in thread-local storage;
    // __kmp_fork_barrier() reads it back once the team pointer is no longer
    // guaranteed to be valid (see the task_data selection there).
    if (!KMP_MASTER_TID(ds_tid))
      this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
#endif
    this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier_implicit;
  }
#endif

  // Extra-barrier tasking mode: execute remaining tasks before gathering.
  if (__kmp_tasking_mode == tskm_extra_barrier) {
    __kmp_tasking_barrier(team, this_thr, gtid);
    KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past taking barrier\n", gtid,
                  team_id, tid));
  }
#ifdef KMP_DEBUG
  if (__kmp_tasking_mode != tskm_immediate_exec) {
    KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
                  "%p, th_task_team = %p\n",
                  __kmp_gtid_from_thread(this_thr), team_id,
                  team->t.t_task_team[this_thr->th.th_task_state],
                  this_thr->th.th_task_team));
    KMP_DEBUG_ASSERT(this_thr->th.th_task_team ==
                     team->t.t_task_team[this_thr->th.th_task_state]);
  }
#endif /* KMP_DEBUG */

  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
     access it when the team struct is not guaranteed to exist. Doing these
     loads causes a cache miss slows down EPCC parallel by 2x. As a workaround,
     we do not perform the copy if blocktime=infinite, since the values are not
     used by __kmp_wait_template() in that case. */
  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
#if KMP_USE_MONITOR
    this_thr->th.th_team_bt_intervals =
        team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
    this_thr->th.th_team_bt_set =
        team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
#else
    this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
#endif
  }

#if USE_ITT_BUILD
  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
    __kmp_itt_barrier_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

  // Gather using the pattern configured for the fork/join barrier type.
  switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
  case bp_hyper_bar: {
    // Branch bits of 0 would mean "use linear"; they must be set here.
    KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
    __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                               NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_hierarchical_bar: {
    __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                                      NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_tree_bar: {
    // Branch bits of 0 would mean "use linear"; they must be set here.
    KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
    __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                              NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  default: {
    __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                                NULL USE_ITT_BUILD_ARG(itt_sync_obj));
  }
  }

  /* From this point on, the team data structure may be deallocated at any time
     by the master thread - it is unsafe to reference it in any of the worker
     threads. Any per-team data items that need to be referenced before the
     end of the barrier should be moved to the kmp_task_team_t structs.  */
  if (KMP_MASTER_TID(tid)) {
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      // Master waits for all outstanding tasks to complete before the team
      // can be torn down.
      __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
    }
    if (__kmp_display_affinity) {
      KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
    }
#if KMP_STATS_ENABLED
    // Have master thread flag the workers to indicate they are now waiting for
    // next parallel region, Also wake them up so they switch their timers to
    // idle.
    for (int i = 0; i < team->t.t_nproc; ++i) {
      kmp_info_t *team_thread = team->t.t_threads[i];
      if (team_thread == this_thr)
        continue;
      team_thread->th.th_stats->setIdleFlag();
      if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
          team_thread->th.th_sleep_loc != NULL)
        __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread),
                                  team_thread->th.th_sleep_loc);
    }
#endif
#if USE_ITT_BUILD
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Join barrier - report frame end
    // Only report when fork/join frame reporting is on, we are not inside a
    // multi-team teams construct, and the active level is 1.
    if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
        __kmp_forkjoin_frames_mode &&
        (this_thr->th.th_teams_microtask == NULL || // either not in teams
         this_thr->th.th_teams_size.nteams == 1) && // or inside single team
        team->t.t_active_level == 1) {
      kmp_uint64 cur_time = __itt_get_timestamp();
      ident_t *loc = team->t.t_ident;
      kmp_info_t **other_threads = team->t.t_threads;
      int nproc = this_thr->th.th_team_nproc;
      int i;
      switch (__kmp_forkjoin_frames_mode) {
      case 1:
        __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                               loc, nproc);
        break;
      case 2:
        __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
                               loc, nproc);
        break;
      case 3:
        if (__itt_metadata_add_ptr) {
          // Initialize with master's wait time
          kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
          // Set arrive time to zero to be able to check it in
          // __kmp_invoke_task(); the same is done inside the loop below
          this_thr->th.th_bar_arrive_time = 0;
          for (i = 1; i < nproc; ++i) {
            delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
            other_threads[i]->th.th_bar_arrive_time = 0;
          }
          __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
                                       cur_time, delta, 0);
        }
        __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                               loc, nproc);
        this_thr->th.th_frame_time = cur_time;
        break;
      }
    }
#endif /* USE_ITT_BUILD */
  }
#if USE_ITT_BUILD
  else {
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      __kmp_itt_barrier_middle(gtid, itt_sync_obj);
  }
#endif /* USE_ITT_BUILD */

#if KMP_DEBUG
  if (KMP_MASTER_TID(tid)) {
    KA_TRACE(
        15,
        ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
         gtid, team_id, tid, nproc));
  }
#endif /* KMP_DEBUG */

  // TODO now, mark worker threads as done so they may be disbanded
  KMP_MB(); // Flush all pending memory write invalidates.
  KA_TRACE(10,
           ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));

  ANNOTATE_BARRIER_END(&team->t.t_bar);
}
1906 
1907 // TODO release worker threads' fork barriers as we are ready instead of all at
1908 // once
1909 void __kmp_fork_barrier(int gtid, int tid) {
1910   KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
1911   KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1912   kmp_info_t *this_thr = __kmp_threads[gtid];
1913   kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
1914 #if USE_ITT_BUILD
1915   void *itt_sync_obj = NULL;
1916 #endif /* USE_ITT_BUILD */
1917   if (team)
1918     ANNOTATE_BARRIER_END(&team->t.t_bar);
1919 
1920   KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
1921                 (team != NULL) ? team->t.t_id : -1, tid));
1922 
1923   // th_team pointer only valid for master thread here
1924   if (KMP_MASTER_TID(tid)) {
1925 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1926     if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1927       // Create itt barrier object
1928       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
1929       __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
1930     }
1931 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1932 
1933 #ifdef KMP_DEBUG
1934     kmp_info_t **other_threads = team->t.t_threads;
1935     int i;
1936 
1937     // Verify state
1938     KMP_MB();
1939 
1940     for (i = 1; i < team->t.t_nproc; ++i) {
1941       KA_TRACE(500,
1942                ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
1943                 "== %u.\n",
1944                 gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
1945                 team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
1946                 other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
1947       KMP_DEBUG_ASSERT(
1948           (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
1949            ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
1950       KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
1951     }
1952 #endif
1953 
1954     if (__kmp_tasking_mode != tskm_immediate_exec) {
1955       // 0 indicates setup current task team if nthreads > 1
1956       __kmp_task_team_setup(this_thr, team, 0);
1957     }
1958 
1959     /* The master thread may have changed its blocktime between the join barrier
1960        and the fork barrier. Copy the blocktime info to the thread, where
1961        __kmp_wait_template() can access it when the team struct is not
1962        guaranteed to exist. */
1963     // See note about the corresponding code in __kmp_join_barrier() being
1964     // performance-critical
1965     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1966 #if KMP_USE_MONITOR
1967       this_thr->th.th_team_bt_intervals =
1968           team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1969       this_thr->th.th_team_bt_set =
1970           team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1971 #else
1972       this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1973 #endif
1974     }
1975   } // master
1976 
1977   switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
1978   case bp_hyper_bar: {
1979     KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1980     __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1981                                 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1982     break;
1983   }
1984   case bp_hierarchical_bar: {
1985     __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1986                                        TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1987     break;
1988   }
1989   case bp_tree_bar: {
1990     KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1991     __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1992                                TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1993     break;
1994   }
1995   default: {
1996     __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1997                                  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1998   }
1999   }
2000 
2001 #if OMPT_SUPPORT
2002   if (ompt_enabled.enabled &&
2003       this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
2004     int ds_tid = this_thr->th.th_info.ds.ds_tid;
2005     ompt_data_t *task_data = (team)
2006                                  ? OMPT_CUR_TASK_DATA(this_thr)
2007                                  : &(this_thr->th.ompt_thread_info.task_data);
2008     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
2009 #if OMPT_OPTIONAL
2010     void *codeptr = NULL;
2011     if (KMP_MASTER_TID(ds_tid) &&
2012         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
2013          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
2014       codeptr = team->t.ompt_team_info.master_return_address;
2015     if (ompt_enabled.ompt_callback_sync_region_wait) {
2016       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2017           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
2018           codeptr);
2019     }
2020     if (ompt_enabled.ompt_callback_sync_region) {
2021       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2022           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
2023           codeptr);
2024     }
2025 #endif
2026     if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
2027       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2028           ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
2029     }
2030   }
2031 #endif
2032 
2033   // Early exit for reaping threads releasing forkjoin barrier
2034   if (TCR_4(__kmp_global.g.g_done)) {
2035     this_thr->th.th_task_team = NULL;
2036 
2037 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2038     if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2039       if (!KMP_MASTER_TID(tid)) {
2040         itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2041         if (itt_sync_obj)
2042           __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2043       }
2044     }
2045 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2046     KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
2047     return;
2048   }
2049 
2050   /* We can now assume that a valid team structure has been allocated by the
2051      master and propagated to all worker threads. The current thread, however,
2052      may not be part of the team, so we can't blindly assume that the team
2053      pointer is non-null.  */
2054   team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
2055   KMP_DEBUG_ASSERT(team != NULL);
2056   tid = __kmp_tid_from_gtid(gtid);
2057 
2058 #if KMP_BARRIER_ICV_PULL
2059   /* Master thread's copy of the ICVs was set up on the implicit taskdata in
2060      __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
2061      implicit task has this data before this function is called. We cannot
2062      modify __kmp_fork_call() to look at the fixed ICVs in the master's thread
2063      struct, because it is not always the case that the threads arrays have
2064      been allocated when __kmp_fork_call() is executed. */
2065   {
2066     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
2067     if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
2068       // Copy the initial ICVs from the master's thread struct to the implicit
2069       // task for this tid.
2070       KA_TRACE(10,
2071                ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
2072       __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
2073                                tid, FALSE);
2074       copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
2075                 &team->t.t_threads[0]
2076                      ->th.th_bar[bs_forkjoin_barrier]
2077                      .bb.th_fixed_icvs);
2078     }
2079   }
2080 #endif // KMP_BARRIER_ICV_PULL
2081 
2082   if (__kmp_tasking_mode != tskm_immediate_exec) {
2083     __kmp_task_team_sync(this_thr, team);
2084   }
2085 
2086 #if KMP_AFFINITY_SUPPORTED
2087   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
2088   if (proc_bind == proc_bind_intel) {
2089     // Call dynamic affinity settings
2090     if (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
2091       __kmp_balanced_affinity(this_thr, team->t.t_nproc);
2092     }
2093   } else if (proc_bind != proc_bind_false) {
2094     if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
2095       KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
2096                      __kmp_gtid_from_thread(this_thr),
2097                      this_thr->th.th_current_place));
2098     } else {
2099       __kmp_affinity_set_place(gtid);
2100     }
2101   }
2102 #endif // KMP_AFFINITY_SUPPORTED
2103   // Perform the display affinity functionality
2104   if (__kmp_display_affinity) {
2105     if (team->t.t_display_affinity
2106 #if KMP_AFFINITY_SUPPORTED
2107         || (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed)
2108 #endif
2109             ) {
2110       // NULL means use the affinity-format-var ICV
2111       __kmp_aux_display_affinity(gtid, NULL);
2112       this_thr->th.th_prev_num_threads = team->t.t_nproc;
2113       this_thr->th.th_prev_level = team->t.t_level;
2114     }
2115   }
2116   if (!KMP_MASTER_TID(tid))
2117     KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);
2118 
2119 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2120   if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2121     if (!KMP_MASTER_TID(tid)) {
2122       // Get correct barrier object
2123       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2124       __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
2125     } // (prepare called inside barrier_release)
2126   }
2127 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2128   ANNOTATE_BARRIER_END(&team->t.t_bar);
2129   KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
2130                 team->t.t_id, tid));
2131 }
2132 
/* Stage the new internal control variables (ICVs) so that worker threads can
   pick them up at the next fork barrier. The propagation strategy is selected
   at compile time:
     - KMP_BARRIER_ICV_PULL: stash ICVs in the master thread's barrier struct;
       workers copy them out themselves after the fork barrier.
     - KMP_BARRIER_ICV_PUSH: nothing to do here; ICVs are pushed to workers
       during the fork-barrier release.
     - otherwise (linear): the master copies ICVs into every worker's implicit
       task right here, O(nthreads).
   team      - team whose threads receive the ICVs (threads array must be
               allocated for the PULL and linear paths)
   new_nproc - number of threads in the team; loop bound for the linear path
   new_icvs  - source ICV values to propagate
   loc       - source location used when (re)initializing implicit tasks in
               the linear path (often NULL; see TODO below) */
void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
                          kmp_internal_control_t *new_icvs, ident_t *loc) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);

  KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
  // Once the runtime is fully initialized, nproc must have a nonzero value.
  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);

/* Master thread's copy of the ICVs was set up on the implicit taskdata in
   __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
   implicit task has this data before this function is called. */
#if KMP_BARRIER_ICV_PULL
  /* Copy ICVs to master's thread structure into th_fixed_icvs (which remains
     untouched), where all of the worker threads can access them and make their
     own copies after the barrier. */
  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
  // allocated at this point
  copy_icvs(
      &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
      new_icvs);
  KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
                team->t.t_threads[0], team));
#elif KMP_BARRIER_ICV_PUSH
  // The ICVs will be propagated in the fork barrier, so nothing needs to be
  // done here.
  KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
                team->t.t_threads[0], team));
#else
  // Copy the ICVs to each of the non-master threads.  This takes O(nthreads)
  // time.
  // On KMP_MIC this loads new_icvs into a vector register (declares a local
  // used by ngo_store_icvs below); elsewhere it is a no-op.
  ngo_load(new_icvs);
  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
  // allocated at this point
  for (int f = 1; f < new_nproc; ++f) { // Skip the master thread
    // TODO: GEH - pass in better source location info since usually NULL here
    KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                  f, team->t.t_threads[f], team));
    // Ensure the worker's implicit task exists before writing its ICVs.
    __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
    // Non-temporal store of the ICVs on MIC; plain copy_icvs otherwise.
    ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
    KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                  f, team->t.t_threads[f], team));
  }
  // Fence so the non-temporal stores above are globally visible (MIC only;
  // no-op elsewhere).
  ngo_sync();
#endif // KMP_BARRIER_ICV_PULL
}
2177