1 /*
2  * kmp_barrier.cpp
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_wait_release.h"
15 #include "kmp_itt.h"
16 #include "kmp_os.h"
17 #include "kmp_stats.h"
18 #include "ompt-specific.h"
19 
20 #if KMP_MIC
21 #include <immintrin.h>
22 #define USE_NGO_STORES 1
23 #endif // KMP_MIC
24 
25 #include "tsan_annotations.h"
26 
27 #if KMP_MIC && USE_NGO_STORES
28 // ICV copying
29 #define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
30 #define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
31 #define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
32 #define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
33 #else
34 #define ngo_load(src) ((void)0)
35 #define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
36 #define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
37 #define ngo_sync() ((void)0)
38 #endif /* KMP_MIC && USE_NGO_STORES */
39 
40 void __kmp_print_structure(void); // Forward declaration
41 
42 // ---------------------------- Barrier Algorithms ----------------------------
43 
44 // Linear Barrier
45 template <bool cancellable = false>
46 static bool __kmp_linear_barrier_gather_template(
47     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
48     void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
49   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
50   kmp_team_t *team = this_thr->th.th_team;
51   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
52   kmp_info_t **other_threads = team->t.t_threads;
53 
54   KA_TRACE(
55       20,
56       ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
57        gtid, team->t.t_id, tid, bt));
58   KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
59 
60 #if USE_ITT_BUILD && USE_ITT_NOTIFY
61   // Barrier imbalance - save arrive time to the thread
62   if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
63     this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
64         __itt_get_timestamp();
65   }
66 #endif
67   // We now perform a linear reduction to signal that all of the threads have
68   // arrived.
69   if (!KMP_MASTER_TID(tid)) {
70     KA_TRACE(20,
71              ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
72               "arrived(%p): %llu => %llu\n",
73               gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
74               team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
75               thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
76     // Mark arrival to master thread
77     /* After performing this write, a worker thread may not assume that the team
78        is valid any more - it could be deallocated by the master thread at any
79        time. */
80     ANNOTATE_BARRIER_BEGIN(this_thr);
81     kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[0]);
82     flag.release();
83   } else {
84     kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
85     int nproc = this_thr->th.th_team_nproc;
86     int i;
87     // Don't have to worry about sleep bit here or atomic since team setting
88     kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
89 
90     // Collect all the worker team member threads.
91     for (i = 1; i < nproc; ++i) {
92 #if KMP_CACHE_MANAGE
93       // Prefetch next thread's arrived count
94       if (i + 1 < nproc)
95         KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
96 #endif /* KMP_CACHE_MANAGE */
97       KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
98                     "arrived(%p) == %llu\n",
99                     gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
100                     team->t.t_id, i,
101                     &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
102 
103       // Wait for worker thread to arrive
104       if (cancellable) {
105         kmp_flag_64<true, false> flag(
106             &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
107         if (flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)))
108           return true;
109       } else {
110         kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
111                            new_state);
112         flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
113       }
114       ANNOTATE_BARRIER_END(other_threads[i]);
115 #if USE_ITT_BUILD && USE_ITT_NOTIFY
116       // Barrier imbalance - write min of the thread time and the other thread
117       // time to the thread.
118       if (__kmp_forkjoin_frames_mode == 2) {
119         this_thr->th.th_bar_min_time = KMP_MIN(
120             this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
121       }
122 #endif
123       if (reduce) {
124         KA_TRACE(100,
125                  ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
126                   gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
127                   team->t.t_id, i));
128         ANNOTATE_REDUCE_AFTER(reduce);
129         OMPT_REDUCTION_DECL(this_thr, gtid);
130         OMPT_REDUCTION_BEGIN;
131         (*reduce)(this_thr->th.th_local.reduce_data,
132                   other_threads[i]->th.th_local.reduce_data);
133         OMPT_REDUCTION_END;
134         ANNOTATE_REDUCE_BEFORE(reduce);
135         ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
136       }
137     }
138     // Don't have to worry about sleep bit here or atomic since team setting
139     team_bar->b_arrived = new_state;
140     KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
141                   "arrived(%p) = %llu\n",
142                   gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
143                   new_state));
144   }
145   KA_TRACE(
146       20,
147       ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
148        gtid, team->t.t_id, tid, bt));
149   return false;
150 }
151 
152 template <bool cancellable = false>
153 static bool __kmp_linear_barrier_release_template(
154     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
155     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
156   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
157   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
158   kmp_team_t *team;
159 
160   if (KMP_MASTER_TID(tid)) {
161     unsigned int i;
162     kmp_uint32 nproc = this_thr->th.th_team_nproc;
163     kmp_info_t **other_threads;
164 
165     team = __kmp_threads[gtid]->th.th_team;
166     KMP_DEBUG_ASSERT(team != NULL);
167     other_threads = team->t.t_threads;
168 
169     KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for "
170                   "barrier type %d\n",
171                   gtid, team->t.t_id, tid, bt));
172 
173     if (nproc > 1) {
174 #if KMP_BARRIER_ICV_PUSH
175       {
176         KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
177         if (propagate_icvs) {
178           ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
179           for (i = 1; i < nproc; ++i) {
180             __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
181                                      team, i, FALSE);
182             ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
183                            &team->t.t_implicit_task_taskdata[0].td_icvs);
184           }
185           ngo_sync();
186         }
187       }
188 #endif // KMP_BARRIER_ICV_PUSH
189 
190       // Now, release all of the worker threads
191       for (i = 1; i < nproc; ++i) {
192 #if KMP_CACHE_MANAGE
193         // Prefetch next thread's go flag
194         if (i + 1 < nproc)
195           KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
196 #endif /* KMP_CACHE_MANAGE */
197         KA_TRACE(
198             20,
199             ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
200              "go(%p): %u => %u\n",
201              gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
202              team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
203              other_threads[i]->th.th_bar[bt].bb.b_go,
204              other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
205         ANNOTATE_BARRIER_BEGIN(other_threads[i]);
206         kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
207                            other_threads[i]);
208         flag.release();
209       }
210     }
211   } else { // Wait for the MASTER thread to release us
212     KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
213                   gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
214     if (cancellable) {
215       kmp_flag_64<true, false> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
216       if (flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)))
217         return true;
218     } else {
219       kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
220       flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
221     }
222     ANNOTATE_BARRIER_END(this_thr);
223 #if USE_ITT_BUILD && USE_ITT_NOTIFY
224     if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
225       // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
226       // disabled)
227       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
228       // Cancel wait on previous parallel region...
229       __kmp_itt_task_starting(itt_sync_obj);
230 
231       if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
232         return false;
233 
234       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
235       if (itt_sync_obj != NULL)
236         // Call prepare as early as possible for "new" barrier
237         __kmp_itt_task_finished(itt_sync_obj);
238     } else
239 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
240         // Early exit for reaping threads releasing forkjoin barrier
241         if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
242       return false;
243 // The worker thread may now assume that the team is valid.
244 #ifdef KMP_DEBUG
245     tid = __kmp_tid_from_gtid(gtid);
246     team = __kmp_threads[gtid]->th.th_team;
247 #endif
248     KMP_DEBUG_ASSERT(team != NULL);
249     TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
250     KA_TRACE(20,
251              ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
252               gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
253     KMP_MB(); // Flush all pending memory write invalidates.
254   }
255   KA_TRACE(
256       20,
257       ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
258        gtid, team->t.t_id, tid, bt));
259   return false;
260 }
261 
262 static void __kmp_linear_barrier_gather(
263     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
264     void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
265   __kmp_linear_barrier_gather_template<false>(
266       bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
267 }
268 
269 static bool __kmp_linear_barrier_gather_cancellable(
270     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
271     void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
272   return __kmp_linear_barrier_gather_template<true>(
273       bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
274 }
275 
276 static void __kmp_linear_barrier_release(
277     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
278     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
279   __kmp_linear_barrier_release_template<false>(
280       bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
281 }
282 
283 static bool __kmp_linear_barrier_release_cancellable(
284     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
285     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
286   return __kmp_linear_barrier_release_template<true>(
287       bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
288 }
289 
290 // Tree barrier
291 static void __kmp_tree_barrier_gather(
292     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
293     void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
294   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
295   kmp_team_t *team = this_thr->th.th_team;
296   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
297   kmp_info_t **other_threads = team->t.t_threads;
298   kmp_uint32 nproc = this_thr->th.th_team_nproc;
299   kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
300   kmp_uint32 branch_factor = 1 << branch_bits;
301   kmp_uint32 child;
302   kmp_uint32 child_tid;
303   kmp_uint64 new_state;
304 
305   KA_TRACE(
306       20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
307            gtid, team->t.t_id, tid, bt));
308   KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
309 
310 #if USE_ITT_BUILD && USE_ITT_NOTIFY
311   // Barrier imbalance - save arrive time to the thread
312   if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
313     this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
314         __itt_get_timestamp();
315   }
316 #endif
317   // Perform tree gather to wait until all threads have arrived; reduce any
318   // required data as we go
319   child_tid = (tid << branch_bits) + 1;
320   if (child_tid < nproc) {
321     // Parent threads wait for all their children to arrive
322     new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
323     child = 1;
324     do {
325       kmp_info_t *child_thr = other_threads[child_tid];
326       kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
327 #if KMP_CACHE_MANAGE
328       // Prefetch next thread's arrived count
329       if (child + 1 <= branch_factor && child_tid + 1 < nproc)
330         KMP_CACHE_PREFETCH(
331             &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
332 #endif /* KMP_CACHE_MANAGE */
333       KA_TRACE(20,
334                ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
335                 "arrived(%p) == %llu\n",
336                 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
337                 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
338       // Wait for child to arrive
339       kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
340       flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
341       ANNOTATE_BARRIER_END(child_thr);
342 #if USE_ITT_BUILD && USE_ITT_NOTIFY
343       // Barrier imbalance - write min of the thread time and a child time to
344       // the thread.
345       if (__kmp_forkjoin_frames_mode == 2) {
346         this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
347                                                child_thr->th.th_bar_min_time);
348       }
349 #endif
350       if (reduce) {
351         KA_TRACE(100,
352                  ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
353                   gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
354                   team->t.t_id, child_tid));
355         ANNOTATE_REDUCE_AFTER(reduce);
356         OMPT_REDUCTION_DECL(this_thr, gtid);
357         OMPT_REDUCTION_BEGIN;
358         (*reduce)(this_thr->th.th_local.reduce_data,
359                   child_thr->th.th_local.reduce_data);
360         OMPT_REDUCTION_END;
361         ANNOTATE_REDUCE_BEFORE(reduce);
362         ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
363       }
364       child++;
365       child_tid++;
366     } while (child <= branch_factor && child_tid < nproc);
367   }
368 
369   if (!KMP_MASTER_TID(tid)) { // Worker threads
370     kmp_int32 parent_tid = (tid - 1) >> branch_bits;
371 
372     KA_TRACE(20,
373              ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
374               "arrived(%p): %llu => %llu\n",
375               gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
376               team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
377               thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
378 
379     // Mark arrival to parent thread
380     /* After performing this write, a worker thread may not assume that the team
381        is valid any more - it could be deallocated by the master thread at any
382        time.  */
383     ANNOTATE_BARRIER_BEGIN(this_thr);
384     kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[parent_tid]);
385     flag.release();
386   } else {
387     // Need to update the team arrived pointer if we are the master thread
388     if (nproc > 1) // New value was already computed above
389       team->t.t_bar[bt].b_arrived = new_state;
390     else
391       team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
392     KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
393                   "arrived(%p) = %llu\n",
394                   gtid, team->t.t_id, tid, team->t.t_id,
395                   &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
396   }
397   KA_TRACE(20,
398            ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
399             gtid, team->t.t_id, tid, bt));
400 }
401 
402 static void __kmp_tree_barrier_release(
403     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
404     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
405   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
406   kmp_team_t *team;
407   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
408   kmp_uint32 nproc;
409   kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
410   kmp_uint32 branch_factor = 1 << branch_bits;
411   kmp_uint32 child;
412   kmp_uint32 child_tid;
413 
414   // Perform a tree release for all of the threads that have been gathered
415   if (!KMP_MASTER_TID(
416           tid)) { // Handle fork barrier workers who aren't part of a team yet
417     KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
418                   &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
419     // Wait for parent thread to release us
420     kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
421     flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
422     ANNOTATE_BARRIER_END(this_thr);
423 #if USE_ITT_BUILD && USE_ITT_NOTIFY
424     if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
425       // In fork barrier where we could not get the object reliably (or
426       // ITTNOTIFY is disabled)
427       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
428       // Cancel wait on previous parallel region...
429       __kmp_itt_task_starting(itt_sync_obj);
430 
431       if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
432         return;
433 
434       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
435       if (itt_sync_obj != NULL)
436         // Call prepare as early as possible for "new" barrier
437         __kmp_itt_task_finished(itt_sync_obj);
438     } else
439 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
440         // Early exit for reaping threads releasing forkjoin barrier
441         if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
442       return;
443 
444     // The worker thread may now assume that the team is valid.
445     team = __kmp_threads[gtid]->th.th_team;
446     KMP_DEBUG_ASSERT(team != NULL);
447     tid = __kmp_tid_from_gtid(gtid);
448 
449     TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
450     KA_TRACE(20,
451              ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
452               team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
453     KMP_MB(); // Flush all pending memory write invalidates.
454   } else {
455     team = __kmp_threads[gtid]->th.th_team;
456     KMP_DEBUG_ASSERT(team != NULL);
457     KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for "
458                   "barrier type %d\n",
459                   gtid, team->t.t_id, tid, bt));
460   }
461   nproc = this_thr->th.th_team_nproc;
462   child_tid = (tid << branch_bits) + 1;
463 
464   if (child_tid < nproc) {
465     kmp_info_t **other_threads = team->t.t_threads;
466     child = 1;
467     // Parent threads release all their children
468     do {
469       kmp_info_t *child_thr = other_threads[child_tid];
470       kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
471 #if KMP_CACHE_MANAGE
472       // Prefetch next thread's go count
473       if (child + 1 <= branch_factor && child_tid + 1 < nproc)
474         KMP_CACHE_PREFETCH(
475             &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
476 #endif /* KMP_CACHE_MANAGE */
477 
478 #if KMP_BARRIER_ICV_PUSH
479       {
480         KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
481         if (propagate_icvs) {
482           __kmp_init_implicit_task(team->t.t_ident,
483                                    team->t.t_threads[child_tid], team,
484                                    child_tid, FALSE);
485           copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
486                     &team->t.t_implicit_task_taskdata[0].td_icvs);
487         }
488       }
489 #endif // KMP_BARRIER_ICV_PUSH
490       KA_TRACE(20,
491                ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
492                 "go(%p): %u => %u\n",
493                 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
494                 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
495                 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
496       // Release child from barrier
497       ANNOTATE_BARRIER_BEGIN(child_thr);
498       kmp_flag_64<> flag(&child_bar->b_go, child_thr);
499       flag.release();
500       child++;
501       child_tid++;
502     } while (child <= branch_factor && child_tid < nproc);
503   }
504   KA_TRACE(
505       20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
506            gtid, team->t.t_id, tid, bt));
507 }
508 
509 // Hyper Barrier
510 static void __kmp_hyper_barrier_gather(
511     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
512     void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
513   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
514   kmp_team_t *team = this_thr->th.th_team;
515   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
516   kmp_info_t **other_threads = team->t.t_threads;
517   kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
518   kmp_uint32 num_threads = this_thr->th.th_team_nproc;
519   kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
520   kmp_uint32 branch_factor = 1 << branch_bits;
521   kmp_uint32 offset;
522   kmp_uint32 level;
523 
524   KA_TRACE(
525       20,
526       ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
527        gtid, team->t.t_id, tid, bt));
528   KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
529 
530 #if USE_ITT_BUILD && USE_ITT_NOTIFY
531   // Barrier imbalance - save arrive time to the thread
532   if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
533     this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
534         __itt_get_timestamp();
535   }
536 #endif
537   /* Perform a hypercube-embedded tree gather to wait until all of the threads
538      have arrived, and reduce any required data as we go.  */
539   kmp_flag_64<> p_flag(&thr_bar->b_arrived);
540   for (level = 0, offset = 1; offset < num_threads;
541        level += branch_bits, offset <<= branch_bits) {
542     kmp_uint32 child;
543     kmp_uint32 child_tid;
544 
545     if (((tid >> level) & (branch_factor - 1)) != 0) {
546       kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);
547 
548       KMP_MB(); // Synchronize parent and child threads.
549       KA_TRACE(20,
550                ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
551                 "arrived(%p): %llu => %llu\n",
552                 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
553                 team->t.t_id, parent_tid, &thr_bar->b_arrived,
554                 thr_bar->b_arrived,
555                 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
556       // Mark arrival to parent thread
557       /* After performing this write (in the last iteration of the enclosing for
558          loop), a worker thread may not assume that the team is valid any more
559          - it could be deallocated by the master thread at any time.  */
560       ANNOTATE_BARRIER_BEGIN(this_thr);
561       p_flag.set_waiter(other_threads[parent_tid]);
562       p_flag.release();
563       break;
564     }
565 
566     // Parent threads wait for children to arrive
567     if (new_state == KMP_BARRIER_UNUSED_STATE)
568       new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
569     for (child = 1, child_tid = tid + (1 << level);
570          child < branch_factor && child_tid < num_threads;
571          child++, child_tid += (1 << level)) {
572       kmp_info_t *child_thr = other_threads[child_tid];
573       kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
574 #if KMP_CACHE_MANAGE
575       kmp_uint32 next_child_tid = child_tid + (1 << level);
576       // Prefetch next thread's arrived count
577       if (child + 1 < branch_factor && next_child_tid < num_threads)
578         KMP_CACHE_PREFETCH(
579             &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
580 #endif /* KMP_CACHE_MANAGE */
581       KA_TRACE(20,
582                ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
583                 "arrived(%p) == %llu\n",
584                 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
585                 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
586       // Wait for child to arrive
587       kmp_flag_64<> c_flag(&child_bar->b_arrived, new_state);
588       c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
589       ANNOTATE_BARRIER_END(child_thr);
590       KMP_MB(); // Synchronize parent and child threads.
591 #if USE_ITT_BUILD && USE_ITT_NOTIFY
592       // Barrier imbalance - write min of the thread time and a child time to
593       // the thread.
594       if (__kmp_forkjoin_frames_mode == 2) {
595         this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
596                                                child_thr->th.th_bar_min_time);
597       }
598 #endif
599       if (reduce) {
600         KA_TRACE(100,
601                  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
602                   gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
603                   team->t.t_id, child_tid));
604         ANNOTATE_REDUCE_AFTER(reduce);
605         OMPT_REDUCTION_DECL(this_thr, gtid);
606         OMPT_REDUCTION_BEGIN;
607         (*reduce)(this_thr->th.th_local.reduce_data,
608                   child_thr->th.th_local.reduce_data);
609         OMPT_REDUCTION_END;
610         ANNOTATE_REDUCE_BEFORE(reduce);
611         ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
612       }
613     }
614   }
615 
616   if (KMP_MASTER_TID(tid)) {
617     // Need to update the team arrived pointer if we are the master thread
618     if (new_state == KMP_BARRIER_UNUSED_STATE)
619       team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
620     else
621       team->t.t_bar[bt].b_arrived = new_state;
622     KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
623                   "arrived(%p) = %llu\n",
624                   gtid, team->t.t_id, tid, team->t.t_id,
625                   &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
626   }
627   KA_TRACE(
628       20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
629            gtid, team->t.t_id, tid, bt));
630 }
631 
632 // The reverse versions seem to beat the forward versions overall
633 #define KMP_REVERSE_HYPER_BAR
634 static void __kmp_hyper_barrier_release(
635     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
636     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
637   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
638   kmp_team_t *team;
639   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
640   kmp_info_t **other_threads;
641   kmp_uint32 num_threads;
642   kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
643   kmp_uint32 branch_factor = 1 << branch_bits;
644   kmp_uint32 child;
645   kmp_uint32 child_tid;
646   kmp_uint32 offset;
647   kmp_uint32 level;
648 
649   /* Perform a hypercube-embedded tree release for all of the threads that have
650      been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
651      are released in the reverse order of the corresponding gather, otherwise
652      threads are released in the same order. */
653   if (KMP_MASTER_TID(tid)) { // master
654     team = __kmp_threads[gtid]->th.th_team;
655     KMP_DEBUG_ASSERT(team != NULL);
656     KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for "
657                   "barrier type %d\n",
658                   gtid, team->t.t_id, tid, bt));
659 #if KMP_BARRIER_ICV_PUSH
660     if (propagate_icvs) { // master already has ICVs in final destination; copy
661       copy_icvs(&thr_bar->th_fixed_icvs,
662                 &team->t.t_implicit_task_taskdata[tid].td_icvs);
663     }
664 #endif
665   } else { // Handle fork barrier workers who aren't part of a team yet
666     KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
667                   &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
668     // Wait for parent thread to release us
669     kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
670     flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
671     ANNOTATE_BARRIER_END(this_thr);
672 #if USE_ITT_BUILD && USE_ITT_NOTIFY
673     if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
674       // In fork barrier where we could not get the object reliably
675       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
676       // Cancel wait on previous parallel region...
677       __kmp_itt_task_starting(itt_sync_obj);
678 
679       if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
680         return;
681 
682       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
683       if (itt_sync_obj != NULL)
684         // Call prepare as early as possible for "new" barrier
685         __kmp_itt_task_finished(itt_sync_obj);
686     } else
687 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
688         // Early exit for reaping threads releasing forkjoin barrier
689         if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
690       return;
691 
692     // The worker thread may now assume that the team is valid.
693     team = __kmp_threads[gtid]->th.th_team;
694     KMP_DEBUG_ASSERT(team != NULL);
695     tid = __kmp_tid_from_gtid(gtid);
696 
697     TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
698     KA_TRACE(20,
699              ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
700               gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
701     KMP_MB(); // Flush all pending memory write invalidates.
702   }
703   num_threads = this_thr->th.th_team_nproc;
704   other_threads = team->t.t_threads;
705 
706 #ifdef KMP_REVERSE_HYPER_BAR
707   // Count up to correct level for parent
708   for (level = 0, offset = 1;
709        offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
710        level += branch_bits, offset <<= branch_bits)
711     ;
712 
713   // Now go down from there
714   for (level -= branch_bits, offset >>= branch_bits; offset != 0;
715        level -= branch_bits, offset >>= branch_bits)
716 #else
717   // Go down the tree, level by level
718   for (level = 0, offset = 1; offset < num_threads;
719        level += branch_bits, offset <<= branch_bits)
720 #endif // KMP_REVERSE_HYPER_BAR
721   {
722 #ifdef KMP_REVERSE_HYPER_BAR
723     /* Now go in reverse order through the children, highest to lowest.
724        Initial setting of child is conservative here. */
725     child = num_threads >> ((level == 0) ? level : level - 1);
726     for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
727         child_tid = tid + (child << level);
728          child >= 1; child--, child_tid -= (1 << level))
729 #else
730     if (((tid >> level) & (branch_factor - 1)) != 0)
731       // No need to go lower than this, since this is the level parent would be
732       // notified
733       break;
734     // Iterate through children on this level of the tree
735     for (child = 1, child_tid = tid + (1 << level);
736          child < branch_factor && child_tid < num_threads;
737          child++, child_tid += (1 << level))
738 #endif // KMP_REVERSE_HYPER_BAR
739     {
740       if (child_tid >= num_threads)
741         continue; // Child doesn't exist so keep going
742       else {
743         kmp_info_t *child_thr = other_threads[child_tid];
744         kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
745 #if KMP_CACHE_MANAGE
746         kmp_uint32 next_child_tid = child_tid - (1 << level);
747 // Prefetch next thread's go count
748 #ifdef KMP_REVERSE_HYPER_BAR
749         if (child - 1 >= 1 && next_child_tid < num_threads)
750 #else
751         if (child + 1 < branch_factor && next_child_tid < num_threads)
752 #endif // KMP_REVERSE_HYPER_BAR
753           KMP_CACHE_PREFETCH(
754               &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
755 #endif /* KMP_CACHE_MANAGE */
756 
757 #if KMP_BARRIER_ICV_PUSH
758         if (propagate_icvs) // push my fixed ICVs to my child
759           copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
760 #endif // KMP_BARRIER_ICV_PUSH
761 
762         KA_TRACE(
763             20,
764             ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
765              "go(%p): %u => %u\n",
766              gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
767              team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
768              child_bar->b_go + KMP_BARRIER_STATE_BUMP));
769         // Release child from barrier
770         ANNOTATE_BARRIER_BEGIN(child_thr);
771         kmp_flag_64<> flag(&child_bar->b_go, child_thr);
772         flag.release();
773       }
774     }
775   }
776 #if KMP_BARRIER_ICV_PUSH
777   if (propagate_icvs &&
778       !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
779     __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
780                              FALSE);
781     copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
782               &thr_bar->th_fixed_icvs);
783   }
784 #endif
785   KA_TRACE(
786       20,
787       ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
788        gtid, team->t.t_id, tid, bt));
789 }
790 
791 // Hierarchical Barrier
792 
793 // Initialize thread barrier data
794 /* Initializes/re-initializes the hierarchical barrier data stored on a thread.
795    Performs the minimum amount of initialization required based on how the team
796    has changed. Returns true if leaf children will require both on-core and
797    traditional wake-up mechanisms. For example, if the team size increases,
798    threads already in the team will respond to on-core wakeup on their parent
799    thread, but threads newly added to the team will only be listening on the
800    their local b_go. */
801 static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
802                                                    kmp_bstate_t *thr_bar,
803                                                    kmp_uint32 nproc, int gtid,
804                                                    int tid, kmp_team_t *team) {
805   // Checks to determine if (re-)initialization is needed
806   bool uninitialized = thr_bar->team == NULL;
807   bool team_changed = team != thr_bar->team;
808   bool team_sz_changed = nproc != thr_bar->nproc;
809   bool tid_changed = tid != thr_bar->old_tid;
810   bool retval = false;
811 
812   if (uninitialized || team_sz_changed) {
813     __kmp_get_hierarchy(nproc, thr_bar);
814   }
815 
816   if (uninitialized || team_sz_changed || tid_changed) {
817     thr_bar->my_level = thr_bar->depth - 1; // default for master
818     thr_bar->parent_tid = -1; // default for master
819     if (!KMP_MASTER_TID(
820             tid)) { // if not master, find parent thread in hierarchy
821       kmp_uint32 d = 0;
822       while (d < thr_bar->depth) { // find parent based on level of thread in
823         // hierarchy, and note level
824         kmp_uint32 rem;
825         if (d == thr_bar->depth - 2) { // reached level right below the master
826           thr_bar->parent_tid = 0;
827           thr_bar->my_level = d;
828           break;
829         } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) != 0) {
830           // TODO: can we make the above op faster?
831           // thread is not a subtree root at next level, so this is max
832           thr_bar->parent_tid = tid - rem;
833           thr_bar->my_level = d;
834           break;
835         }
836         ++d;
837       }
838     }
839     __kmp_type_convert(7 - ((tid - thr_bar->parent_tid) /
840                             (thr_bar->skip_per_level[thr_bar->my_level])),
841                        &(thr_bar->offset));
842     thr_bar->old_tid = tid;
843     thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
844     thr_bar->team = team;
845     thr_bar->parent_bar =
846         &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
847   }
848   if (uninitialized || team_changed || tid_changed) {
849     thr_bar->team = team;
850     thr_bar->parent_bar =
851         &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
852     retval = true;
853   }
854   if (uninitialized || team_sz_changed || tid_changed) {
855     thr_bar->nproc = nproc;
856     thr_bar->leaf_kids = thr_bar->base_leaf_kids;
857     if (thr_bar->my_level == 0)
858       thr_bar->leaf_kids = 0;
859     if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
860       __kmp_type_convert(nproc - tid - 1, &(thr_bar->leaf_kids));
861     thr_bar->leaf_state = 0;
862     for (int i = 0; i < thr_bar->leaf_kids; ++i)
863       ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
864   }
865   return retval;
866 }
867 
868 static void __kmp_hierarchical_barrier_gather(
869     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
870     void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
871   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
872   kmp_team_t *team = this_thr->th.th_team;
873   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
874   kmp_uint32 nproc = this_thr->th.th_team_nproc;
875   kmp_info_t **other_threads = team->t.t_threads;
876   kmp_uint64 new_state;
877 
878   int level = team->t.t_level;
879   if (other_threads[0]
880           ->th.th_teams_microtask) // are we inside the teams construct?
881     if (this_thr->th.th_teams_size.nteams > 1)
882       ++level; // level was not increased in teams construct for team_of_masters
883   if (level == 1)
884     thr_bar->use_oncore_barrier = 1;
885   else
886     thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
887 
888   KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
889                 "barrier type %d\n",
890                 gtid, team->t.t_id, tid, bt));
891   KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
892 
893 #if USE_ITT_BUILD && USE_ITT_NOTIFY
894   // Barrier imbalance - save arrive time to the thread
895   if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
896     this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
897   }
898 #endif
899 
900   (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
901                                                team);
902 
903   if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
904     kmp_int32 child_tid;
905     new_state =
906         (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
907     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
908         thr_bar->use_oncore_barrier) {
909       if (thr_bar->leaf_kids) {
910         // First, wait for leaf children to check-in on my b_arrived flag
911         kmp_uint64 leaf_state =
912             KMP_MASTER_TID(tid)
913                 ? thr_bar->b_arrived | thr_bar->leaf_state
914                 : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
915         KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
916                       "for leaf kids\n",
917                       gtid, team->t.t_id, tid));
918         kmp_flag_64<> flag(&thr_bar->b_arrived, leaf_state);
919         flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
920         if (reduce) {
921           ANNOTATE_REDUCE_AFTER(reduce);
922           OMPT_REDUCTION_DECL(this_thr, gtid);
923           OMPT_REDUCTION_BEGIN;
924           for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
925                ++child_tid) {
926             KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
927                            "T#%d(%d:%d)\n",
928                            gtid, team->t.t_id, tid,
929                            __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
930                            child_tid));
931             ANNOTATE_BARRIER_END(other_threads[child_tid]);
932             (*reduce)(this_thr->th.th_local.reduce_data,
933                       other_threads[child_tid]->th.th_local.reduce_data);
934           }
935           OMPT_REDUCTION_END;
936           ANNOTATE_REDUCE_BEFORE(reduce);
937           ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
938         }
939         // clear leaf_state bits
940         KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
941       }
942       // Next, wait for higher level children on each child's b_arrived flag
943       for (kmp_uint32 d = 1; d < thr_bar->my_level;
944            ++d) { // gather lowest level threads first, but skip 0
945         kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
946                    skip = thr_bar->skip_per_level[d];
947         if (last > nproc)
948           last = nproc;
949         for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
950           kmp_info_t *child_thr = other_threads[child_tid];
951           kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
952           KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
953                         "T#%d(%d:%d) "
954                         "arrived(%p) == %llu\n",
955                         gtid, team->t.t_id, tid,
956                         __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
957                         child_tid, &child_bar->b_arrived, new_state));
958           kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
959           flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
960           ANNOTATE_BARRIER_END(child_thr);
961           if (reduce) {
962             KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
963                            "T#%d(%d:%d)\n",
964                            gtid, team->t.t_id, tid,
965                            __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
966                            child_tid));
967             ANNOTATE_REDUCE_AFTER(reduce);
968             (*reduce)(this_thr->th.th_local.reduce_data,
969                       child_thr->th.th_local.reduce_data);
970             ANNOTATE_REDUCE_BEFORE(reduce);
971             ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
972           }
973         }
974       }
975     } else { // Blocktime is not infinite
976       for (kmp_uint32 d = 0; d < thr_bar->my_level;
977            ++d) { // Gather lowest level threads first
978         kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
979                    skip = thr_bar->skip_per_level[d];
980         if (last > nproc)
981           last = nproc;
982         for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
983           kmp_info_t *child_thr = other_threads[child_tid];
984           kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
985           KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
986                         "T#%d(%d:%d) "
987                         "arrived(%p) == %llu\n",
988                         gtid, team->t.t_id, tid,
989                         __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
990                         child_tid, &child_bar->b_arrived, new_state));
991           kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
992           flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
993           ANNOTATE_BARRIER_END(child_thr);
994           if (reduce) {
995             KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
996                            "T#%d(%d:%d)\n",
997                            gtid, team->t.t_id, tid,
998                            __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
999                            child_tid));
1000             ANNOTATE_REDUCE_AFTER(reduce);
1001             (*reduce)(this_thr->th.th_local.reduce_data,
1002                       child_thr->th.th_local.reduce_data);
1003             ANNOTATE_REDUCE_BEFORE(reduce);
1004             ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
1005           }
1006         }
1007       }
1008     }
1009   }
1010   // All subordinates are gathered; now release parent if not master thread
1011 
1012   if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
1013     KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
1014                   " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
1015                   gtid, team->t.t_id, tid,
1016                   __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
1017                   thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
1018                   thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
1019     /* Mark arrival to parent: After performing this write, a worker thread may
1020        not assume that the team is valid any more - it could be deallocated by
1021        the master thread at any time. */
1022     if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
1023         !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
1024       // flag; release it
1025       ANNOTATE_BARRIER_BEGIN(this_thr);
1026       kmp_flag_64<> flag(&thr_bar->b_arrived,
1027                          other_threads[thr_bar->parent_tid]);
1028       flag.release();
1029     } else {
1030       // Leaf does special release on "offset" bits of parent's b_arrived flag
1031       thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1032       kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived,
1033                            thr_bar->offset + 1);
1034       flag.set_waiter(other_threads[thr_bar->parent_tid]);
1035       flag.release();
1036     }
1037   } else { // Master thread needs to update the team's b_arrived value
1038     team->t.t_bar[bt].b_arrived = new_state;
1039     KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
1040                   "arrived(%p) = %llu\n",
1041                   gtid, team->t.t_id, tid, team->t.t_id,
1042                   &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1043   }
1044   // Is the team access below unsafe or just technically invalid?
1045   KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
1046                 "barrier type %d\n",
1047                 gtid, team->t.t_id, tid, bt));
1048 }
1049 
1050 static void __kmp_hierarchical_barrier_release(
1051     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1052     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1053   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
1054   kmp_team_t *team;
1055   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1056   kmp_uint32 nproc;
1057   bool team_change = false; // indicates on-core barrier shouldn't be used
1058 
1059   if (KMP_MASTER_TID(tid)) {
1060     team = __kmp_threads[gtid]->th.th_team;
1061     KMP_DEBUG_ASSERT(team != NULL);
1062     KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master "
1063                   "entered barrier type %d\n",
1064                   gtid, team->t.t_id, tid, bt));
1065   } else { // Worker threads
1066     // Wait for parent thread to release me
1067     if (!thr_bar->use_oncore_barrier ||
1068         __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
1069         thr_bar->team == NULL) {
1070       // Use traditional method of waiting on my own b_go flag
1071       thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
1072       kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1073       flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1074       ANNOTATE_BARRIER_END(this_thr);
1075       TCW_8(thr_bar->b_go,
1076             KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1077     } else { // Thread barrier data is initialized, this is a leaf, blocktime is
1078       // infinite, not nested
1079       // Wait on my "offset" bits on parent's b_go flag
1080       thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
1081       kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
1082                            thr_bar->offset + 1, bt,
1083                            this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
1084       flag.wait(this_thr, TRUE);
1085       if (thr_bar->wait_flag ==
1086           KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
1087         TCW_8(thr_bar->b_go,
1088               KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1089       } else { // Reset my bits on parent's b_go flag
1090         (RCAST(volatile char *,
1091                &(thr_bar->parent_bar->b_go)))[thr_bar->offset + 1] = 0;
1092       }
1093     }
1094     thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1095     // Early exit for reaping threads releasing forkjoin barrier
1096     if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1097       return;
1098     // The worker thread may now assume that the team is valid.
1099     team = __kmp_threads[gtid]->th.th_team;
1100     KMP_DEBUG_ASSERT(team != NULL);
1101     tid = __kmp_tid_from_gtid(gtid);
1102 
1103     KA_TRACE(
1104         20,
1105         ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1106          gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1107     KMP_MB(); // Flush all pending memory write invalidates.
1108   }
1109 
1110   nproc = this_thr->th.th_team_nproc;
1111   int level = team->t.t_level;
1112   if (team->t.t_threads[0]
1113           ->th.th_teams_microtask) { // are we inside the teams construct?
1114     if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1115         this_thr->th.th_teams_level == level)
1116       ++level; // level was not increased in teams construct for team_of_workers
1117     if (this_thr->th.th_teams_size.nteams > 1)
1118       ++level; // level was not increased in teams construct for team_of_masters
1119   }
1120   if (level == 1)
1121     thr_bar->use_oncore_barrier = 1;
1122   else
1123     thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
1124 
1125   // If the team size has increased, we still communicate with old leaves via
1126   // oncore barrier.
1127   unsigned short int old_leaf_kids = thr_bar->leaf_kids;
1128   kmp_uint64 old_leaf_state = thr_bar->leaf_state;
1129   team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
1130                                                        tid, team);
1131   // But if the entire team changes, we won't use oncore barrier at all
1132   if (team_change)
1133     old_leaf_kids = 0;
1134 
1135 #if KMP_BARRIER_ICV_PUSH
1136   if (propagate_icvs) {
1137     __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1138                              FALSE);
1139     if (KMP_MASTER_TID(
1140             tid)) { // master already has copy in final destination; copy
1141       copy_icvs(&thr_bar->th_fixed_icvs,
1142                 &team->t.t_implicit_task_taskdata[tid].td_icvs);
1143     } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1144                thr_bar->use_oncore_barrier) { // optimization for inf blocktime
1145       if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
1146         // leaves (on-core children) pull parent's fixed ICVs directly to local
1147         // ICV store
1148         copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1149                   &thr_bar->parent_bar->th_fixed_icvs);
1150       // non-leaves will get ICVs piggybacked with b_go via NGO store
1151     } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
1152       if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can
1153         // access
1154         copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
1155       else // leaves copy parent's fixed ICVs directly to local ICV store
1156         copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1157                   &thr_bar->parent_bar->th_fixed_icvs);
1158     }
1159   }
1160 #endif // KMP_BARRIER_ICV_PUSH
1161 
1162   // Now, release my children
1163   if (thr_bar->my_level) { // not a leaf
1164     kmp_int32 child_tid;
1165     kmp_uint32 last;
1166     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1167         thr_bar->use_oncore_barrier) {
1168       if (KMP_MASTER_TID(tid)) { // do a flat release
1169         // Set local b_go to bump children via NGO store of the cache line
1170         // containing IVCs and b_go.
1171         thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
1172         // Use ngo stores if available; b_go piggybacks in the last 8 bytes of
1173         // the cache line
1174         ngo_load(&thr_bar->th_fixed_icvs);
1175         // This loops over all the threads skipping only the leaf nodes in the
1176         // hierarchy
1177         for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
1178              child_tid += thr_bar->skip_per_level[1]) {
1179           kmp_bstate_t *child_bar =
1180               &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
1181           KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1182                         "releasing T#%d(%d:%d)"
1183                         " go(%p): %u => %u\n",
1184                         gtid, team->t.t_id, tid,
1185                         __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1186                         child_tid, &child_bar->b_go, child_bar->b_go,
1187                         child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1188           // Use ngo store (if available) to both store ICVs and release child
1189           // via child's b_go
1190           ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1191         }
1192         ngo_sync();
1193       }
1194       TCW_8(thr_bar->b_go,
1195             KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1196       // Now, release leaf children
1197       if (thr_bar->leaf_kids) { // if there are any
1198         // We test team_change on the off-chance that the level 1 team changed.
1199         if (team_change ||
1200             old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
1201           if (old_leaf_kids) { // release old leaf kids
1202             thr_bar->b_go |= old_leaf_state;
1203           }
1204           // Release new leaf kids
1205           last = tid + thr_bar->skip_per_level[1];
1206           if (last > nproc)
1207             last = nproc;
1208           for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
1209                ++child_tid) { // skip_per_level[0]=1
1210             kmp_info_t *child_thr = team->t.t_threads[child_tid];
1211             kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1212             KA_TRACE(
1213                 20,
1214                 ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
1215                  " T#%d(%d:%d) go(%p): %u => %u\n",
1216                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1217                  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1218                  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1219             // Release child using child's b_go flag
1220             ANNOTATE_BARRIER_BEGIN(child_thr);
1221             kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1222             flag.release();
1223           }
1224         } else { // Release all children at once with leaf_state bits on my own
1225           // b_go flag
1226           thr_bar->b_go |= thr_bar->leaf_state;
1227         }
1228       }
1229     } else { // Blocktime is not infinite; do a simple hierarchical release
1230       for (int d = thr_bar->my_level - 1; d >= 0;
1231            --d) { // Release highest level threads first
1232         last = tid + thr_bar->skip_per_level[d + 1];
1233         kmp_uint32 skip = thr_bar->skip_per_level[d];
1234         if (last > nproc)
1235           last = nproc;
1236         for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1237           kmp_info_t *child_thr = team->t.t_threads[child_tid];
1238           kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1239           KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1240                         "releasing T#%d(%d:%d) go(%p): %u => %u\n",
1241                         gtid, team->t.t_id, tid,
1242                         __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1243                         child_tid, &child_bar->b_go, child_bar->b_go,
1244                         child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1245           // Release child using child's b_go flag
1246           ANNOTATE_BARRIER_BEGIN(child_thr);
1247           kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1248           flag.release();
1249         }
1250       }
1251     }
1252 #if KMP_BARRIER_ICV_PUSH
1253     if (propagate_icvs && !KMP_MASTER_TID(tid))
1254       // non-leaves copy ICVs from fixed ICVs to local dest
1255       copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1256                 &thr_bar->th_fixed_icvs);
1257 #endif // KMP_BARRIER_ICV_PUSH
1258   }
1259   KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
1260                 "barrier type %d\n",
1261                 gtid, team->t.t_id, tid, bt));
1262 }
1263 
1264 // End of Barrier Algorithms
1265 
// Type trait for the cancellable template parameter:
// if cancellable is true, is_cancellable wraps a normal boolean variable;
// if cancellable is false, is_cancellable is a compile-time constant false.
1269 template <bool cancellable> struct is_cancellable {};
1270 template <> struct is_cancellable<true> {
1271   bool value;
1272   is_cancellable() : value(false) {}
1273   is_cancellable(bool b) : value(b) {}
1274   is_cancellable &operator=(bool b) {
1275     value = b;
1276     return *this;
1277   }
1278   operator bool() const { return value; }
1279 };
1280 template <> struct is_cancellable<false> {
1281   is_cancellable &operator=(bool b) { return *this; }
1282   constexpr operator bool() const { return false; }
1283 };
1284 
1285 // Internal function to do a barrier.
/* If is_split is true, do a split barrier; otherwise, do a plain barrier.
   If reduce is non-NULL, perform a reduction during the gather phase of the
   barrier.
   When cancellable = false,
     Returns 0 if master thread, 1 if worker thread.
   When cancellable = true,
     Returns 0 if not cancelled, 1 if cancelled.  */
1293 template <bool cancellable = false>
1294 static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
1295                                   size_t reduce_size, void *reduce_data,
1296                                   void (*reduce)(void *, void *)) {
1297   KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
1298   KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1299   int tid = __kmp_tid_from_gtid(gtid);
1300   kmp_info_t *this_thr = __kmp_threads[gtid];
1301   kmp_team_t *team = this_thr->th.th_team;
1302   int status = 0;
1303   is_cancellable<cancellable> cancelled;
1304 #if OMPT_SUPPORT && OMPT_OPTIONAL
1305   ompt_data_t *my_task_data;
1306   ompt_data_t *my_parallel_data;
1307   void *return_address;
1308   ompt_sync_region_t barrier_kind;
1309 #endif
1310 
1311   KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
1312                 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1313 
1314   ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1315 #if OMPT_SUPPORT
1316   if (ompt_enabled.enabled) {
1317 #if OMPT_OPTIONAL
1318     my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1319     my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1320     return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1321     barrier_kind = __ompt_get_barrier_kind(bt, this_thr);
1322     if (ompt_enabled.ompt_callback_sync_region) {
1323       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1324           barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1325           return_address);
1326     }
1327     if (ompt_enabled.ompt_callback_sync_region_wait) {
1328       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1329           barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1330           return_address);
1331     }
1332 #endif
1333     // It is OK to report the barrier state after the barrier begin callback.
1334     // According to the OMPT specification, a compliant implementation may
1335     // even delay reporting this state until the barrier begins to wait.
1336     this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1337   }
1338 #endif
1339 
1340   if (!team->t.t_serialized) {
1341 #if USE_ITT_BUILD
1342     // This value will be used in itt notify events below.
1343     void *itt_sync_obj = NULL;
1344 #if USE_ITT_NOTIFY
1345     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1346       itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1347 #endif
1348 #endif /* USE_ITT_BUILD */
1349     if (__kmp_tasking_mode == tskm_extra_barrier) {
1350       __kmp_tasking_barrier(team, this_thr, gtid);
1351       KA_TRACE(15,
1352                ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1353                 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1354     }
1355 
1356     /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1357        access it when the team struct is not guaranteed to exist. */
1358     // See note about the corresponding code in __kmp_join_barrier() being
1359     // performance-critical.
1360     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1361 #if KMP_USE_MONITOR
1362       this_thr->th.th_team_bt_intervals =
1363           team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1364       this_thr->th.th_team_bt_set =
1365           team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1366 #else
1367       this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1368 #endif
1369     }
1370 
1371 #if USE_ITT_BUILD
1372     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1373       __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1374 #endif /* USE_ITT_BUILD */
1375 #if USE_DEBUGGER
    // Let the debugger know: the thread arrived at the barrier and is waiting.
1377     if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
1378       team->t.t_bar[bt].b_master_arrived += 1;
1379     } else {
1380       this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1381     } // if
1382 #endif /* USE_DEBUGGER */
1383     if (reduce != NULL) {
1384       // KMP_DEBUG_ASSERT( is_split == TRUE );  // #C69956
1385       this_thr->th.th_local.reduce_data = reduce_data;
1386     }
1387 
1388     if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
      // use 0 to only set up the current team if nthreads > 1
1390       __kmp_task_team_setup(this_thr, team, 0);
1391 
1392     if (cancellable) {
1393       cancelled = __kmp_linear_barrier_gather_cancellable(
1394           bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1395     } else {
1396       switch (__kmp_barrier_gather_pattern[bt]) {
1397       case bp_hyper_bar: {
1398         // don't set branch bits to 0; use linear
1399         KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1400         __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
1401                                    reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1402         break;
1403       }
1404       case bp_hierarchical_bar: {
1405         __kmp_hierarchical_barrier_gather(
1406             bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1407         break;
1408       }
1409       case bp_tree_bar: {
1410         // don't set branch bits to 0; use linear
1411         KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1412         __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
1413                                   reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1414         break;
1415       }
1416       default: {
1417         __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
1418                                     reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1419       }
1420       }
1421     }
1422 
1423     KMP_MB();
1424 
1425     if (KMP_MASTER_TID(tid)) {
1426       status = 0;
1427       if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1428         __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1429       }
1430 #if USE_DEBUGGER
      // Let the debugger know: all threads have arrived and are starting to
      // leave the barrier.
1433       team->t.t_bar[bt].b_team_arrived += 1;
1434 #endif
1435 
1436       if (__kmp_omp_cancellation) {
1437         kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
1438         // Reset cancellation flag for worksharing constructs
1439         if (cancel_request == cancel_loop ||
1440             cancel_request == cancel_sections) {
1441           KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
1442         }
1443       }
1444 #if USE_ITT_BUILD
1445       /* TODO: In case of split reduction barrier, master thread may send
1446          acquired event early, before the final summation into the shared
1447          variable is done (final summation can be a long operation for array
1448          reductions).  */
1449       if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1450         __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1451 #endif /* USE_ITT_BUILD */
1452 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1453       // Barrier - report frame end (only if active_level == 1)
1454       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1455           __kmp_forkjoin_frames_mode &&
1456           (this_thr->th.th_teams_microtask == NULL || // either not in teams
1457            this_thr->th.th_teams_size.nteams == 1) && // or inside single team
1458           team->t.t_active_level == 1) {
1459         ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1460         kmp_uint64 cur_time = __itt_get_timestamp();
1461         kmp_info_t **other_threads = team->t.t_threads;
1462         int nproc = this_thr->th.th_team_nproc;
1463         int i;
1464         switch (__kmp_forkjoin_frames_mode) {
1465         case 1:
1466           __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1467                                  loc, nproc);
1468           this_thr->th.th_frame_time = cur_time;
1469           break;
1470         case 2: // AC 2015-01-19: currently does not work for hierarchical (to
1471           // be fixed)
1472           __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
1473                                  1, loc, nproc);
1474           break;
1475         case 3:
1476           if (__itt_metadata_add_ptr) {
1477             // Initialize with master's wait time
1478             kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1479             // Set arrive time to zero to be able to check it in
1480             // __kmp_invoke_task(); the same is done inside the loop below
1481             this_thr->th.th_bar_arrive_time = 0;
1482             for (i = 1; i < nproc; ++i) {
1483               delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1484               other_threads[i]->th.th_bar_arrive_time = 0;
1485             }
1486             __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1487                                          cur_time, delta,
1488                                          (kmp_uint64)(reduce != NULL));
1489           }
1490           __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1491                                  loc, nproc);
1492           this_thr->th.th_frame_time = cur_time;
1493           break;
1494         }
1495       }
1496 #endif /* USE_ITT_BUILD */
1497     } else {
1498       status = 1;
1499 #if USE_ITT_BUILD
1500       if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1501         __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1502 #endif /* USE_ITT_BUILD */
1503     }
1504     if ((status == 1 || !is_split) && !cancelled) {
1505       if (cancellable) {
1506         cancelled = __kmp_linear_barrier_release_cancellable(
1507             bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1508       } else {
1509         switch (__kmp_barrier_release_pattern[bt]) {
1510         case bp_hyper_bar: {
1511           KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1512           __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1513                                       FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1514           break;
1515         }
1516         case bp_hierarchical_bar: {
1517           __kmp_hierarchical_barrier_release(
1518               bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1519           break;
1520         }
1521         case bp_tree_bar: {
1522           KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1523           __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1524                                      FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1525           break;
1526         }
1527         default: {
1528           __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1529                                        FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1530         }
1531         }
1532       }
1533       if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1534         __kmp_task_team_sync(this_thr, team);
1535       }
1536     }
1537 
1538 #if USE_ITT_BUILD
1539     /* GEH: TODO: Move this under if-condition above and also include in
1540        __kmp_end_split_barrier(). This will more accurately represent the actual
1541        release time of the threads for split barriers.  */
1542     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1543       __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1544 #endif /* USE_ITT_BUILD */
1545   } else { // Team is serialized.
1546     status = 0;
1547     if (__kmp_tasking_mode != tskm_immediate_exec) {
1548       if (this_thr->th.th_task_team != NULL) {
1549 #if USE_ITT_NOTIFY
1550         void *itt_sync_obj = NULL;
1551         if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1552           itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1553           __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1554         }
1555 #endif
1556 
1557         KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks ==
1558                          TRUE);
1559         __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1560         __kmp_task_team_setup(this_thr, team, 0);
1561 
1562 #if USE_ITT_BUILD
1563         if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1564           __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1565 #endif /* USE_ITT_BUILD */
1566       }
1567     }
1568   }
1569   KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1570                 gtid, __kmp_team_from_gtid(gtid)->t.t_id,
1571                 __kmp_tid_from_gtid(gtid), status));
1572 
1573 #if OMPT_SUPPORT
1574   if (ompt_enabled.enabled) {
1575 #if OMPT_OPTIONAL
1576     if (ompt_enabled.ompt_callback_sync_region_wait) {
1577       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1578           barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
1579           return_address);
1580     }
1581     if (ompt_enabled.ompt_callback_sync_region) {
1582       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1583           barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
1584           return_address);
1585     }
1586 #endif
1587     this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1588   }
1589 #endif
1590   ANNOTATE_BARRIER_END(&team->t.t_bar);
1591 
1592   if (cancellable)
1593     return (int)cancelled;
1594   return status;
1595 }
1596 
1597 // Returns 0 if master thread, 1 if worker thread.
1598 int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
1599                   size_t reduce_size, void *reduce_data,
1600                   void (*reduce)(void *, void *)) {
1601   return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
1602                                   reduce);
1603 }
1604 
1605 #if defined(KMP_GOMP_COMPAT)
1606 // Returns 1 if cancelled, 0 otherwise
1607 int __kmp_barrier_gomp_cancel(int gtid) {
1608   if (__kmp_omp_cancellation) {
1609     int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE,
1610                                                  0, NULL, NULL);
1611     if (cancelled) {
1612       int tid = __kmp_tid_from_gtid(gtid);
1613       kmp_info_t *this_thr = __kmp_threads[gtid];
1614       if (KMP_MASTER_TID(tid)) {
1615         // Master does not need to revert anything
1616       } else {
1617         // Workers need to revert their private b_arrived flag
1618         this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -=
1619             KMP_BARRIER_STATE_BUMP;
1620       }
1621     }
1622     return cancelled;
1623   }
1624   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
1625   return FALSE;
1626 }
1627 #endif
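
// Note on the revert in __kmp_barrier_gomp_cancel above (descriptive): a
// cancelled worker has already bumped its private b_arrived in the gather
// phase, but the abandoned barrier round has no matching release, so the bump
// is rolled back to keep the worker's arrival count in step with the team's
// b_arrived when the next barrier begins.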
1628 
1629 void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
1630   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
1631   KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1632   int tid = __kmp_tid_from_gtid(gtid);
1633   kmp_info_t *this_thr = __kmp_threads[gtid];
1634   kmp_team_t *team = this_thr->th.th_team;
1635 
1636   ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1637   if (!team->t.t_serialized) {
1638     if (KMP_MASTER_GTID(gtid)) {
1639       switch (__kmp_barrier_release_pattern[bt]) {
1640       case bp_hyper_bar: {
1641         KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1642         __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1643                                     FALSE USE_ITT_BUILD_ARG(NULL));
1644         break;
1645       }
1646       case bp_hierarchical_bar: {
1647         __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
1648                                            FALSE USE_ITT_BUILD_ARG(NULL));
1649         break;
1650       }
1651       case bp_tree_bar: {
1652         KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1653         __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1654                                    FALSE USE_ITT_BUILD_ARG(NULL));
1655         break;
1656       }
1657       default: {
1658         __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1659                                      FALSE USE_ITT_BUILD_ARG(NULL));
1660       }
1661       }
1662       if (__kmp_tasking_mode != tskm_immediate_exec) {
1663         __kmp_task_team_sync(this_thr, team);
1664       } // if
1665     }
1666   }
1667   ANNOTATE_BARRIER_END(&team->t.t_bar);
1668 }
1669 
1670 void __kmp_join_barrier(int gtid) {
1671   KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
1672   KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1673   kmp_info_t *this_thr = __kmp_threads[gtid];
1674   kmp_team_t *team;
1675   kmp_uint nproc;
1676   kmp_info_t *master_thread;
1677   int tid;
1678 #ifdef KMP_DEBUG
1679   int team_id;
1680 #endif /* KMP_DEBUG */
1681 #if USE_ITT_BUILD
1682   void *itt_sync_obj = NULL;
1683 #if USE_ITT_NOTIFY
1684   if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
1685     // Get object created at fork_barrier
1686     itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1687 #endif
1688 #endif /* USE_ITT_BUILD */
1689   KMP_MB();
1690 
1691   // Get current info
1692   team = this_thr->th.th_team;
1693   nproc = this_thr->th.th_team_nproc;
1694   KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
1695   tid = __kmp_tid_from_gtid(gtid);
1696 #ifdef KMP_DEBUG
1697   team_id = team->t.t_id;
1698 #endif /* KMP_DEBUG */
1699   master_thread = this_thr->th.th_team_master;
1700 #ifdef KMP_DEBUG
1701   if (master_thread != team->t.t_threads[0]) {
1702     __kmp_print_structure();
1703   }
1704 #endif /* KMP_DEBUG */
1705   KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1706   KMP_MB();
1707 
1708   // Verify state
1709   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1710   KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1711   KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1712   KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1713   KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
1714                 gtid, team_id, tid));
1715 
1716   ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1717 #if OMPT_SUPPORT
1718   if (ompt_enabled.enabled) {
1719 #if OMPT_OPTIONAL
1720     ompt_data_t *my_task_data;
1721     ompt_data_t *my_parallel_data;
1722     void *codeptr = NULL;
1723     int ds_tid = this_thr->th.th_info.ds.ds_tid;
1724     if (KMP_MASTER_TID(ds_tid) &&
1725         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
1726          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
1727       codeptr = team->t.ompt_team_info.master_return_address;
1728     my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1729     my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1730     if (ompt_enabled.ompt_callback_sync_region) {
1731       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1732           ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
1733           my_task_data, codeptr);
1734     }
1735     if (ompt_enabled.ompt_callback_sync_region_wait) {
1736       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1737           ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
1738           my_task_data, codeptr);
1739     }
1740     if (!KMP_MASTER_TID(ds_tid))
1741       this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
1742 #endif
1743     this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier_implicit;
1744   }
1745 #endif
1746 
1747   if (__kmp_tasking_mode == tskm_extra_barrier) {
1748     __kmp_tasking_barrier(team, this_thr, gtid);
1749     KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past taking barrier\n", gtid,
1750                   team_id, tid));
1751   }
1752 #ifdef KMP_DEBUG
1753   if (__kmp_tasking_mode != tskm_immediate_exec) {
1754     KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
1755                   "%p, th_task_team = %p\n",
1756                   __kmp_gtid_from_thread(this_thr), team_id,
1757                   team->t.t_task_team[this_thr->th.th_task_state],
1758                   this_thr->th.th_task_team));
1759     KMP_DEBUG_ASSERT(this_thr->th.th_task_team ==
1760                      team->t.t_task_team[this_thr->th.th_task_state]);
1761   }
1762 #endif /* KMP_DEBUG */
1763 
  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
     access it when the team struct is not guaranteed to exist. Doing these
     loads causes a cache miss that slows down EPCC parallel by 2x. As a
     workaround, we do not perform the copy if blocktime=infinite, since the
     values are not used by __kmp_wait_template() in that case. */
1769   if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1770 #if KMP_USE_MONITOR
1771     this_thr->th.th_team_bt_intervals =
1772         team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1773     this_thr->th.th_team_bt_set =
1774         team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1775 #else
1776     this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1777 #endif
1778   }
1779 
1780 #if USE_ITT_BUILD
1781   if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1782     __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1783 #endif /* USE_ITT_BUILD */
1784 
1785   switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1786   case bp_hyper_bar: {
1787     KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1788     __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1789                                NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1790     break;
1791   }
1792   case bp_hierarchical_bar: {
1793     __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1794                                       NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1795     break;
1796   }
1797   case bp_tree_bar: {
1798     KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1799     __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1800                               NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1801     break;
1802   }
1803   default: {
1804     __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1805                                 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1806   }
1807   }
1808 
1809   /* From this point on, the team data structure may be deallocated at any time
1810      by the master thread - it is unsafe to reference it in any of the worker
1811      threads. Any per-team data items that need to be referenced before the
1812      end of the barrier should be moved to the kmp_task_team_t structs.  */
1813   if (KMP_MASTER_TID(tid)) {
1814     if (__kmp_tasking_mode != tskm_immediate_exec) {
1815       __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1816     }
1817     if (__kmp_display_affinity) {
1818       KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
1819     }
1820 #if KMP_STATS_ENABLED
    // Have the master thread flag the workers to indicate they are now waiting
    // for the next parallel region. Also wake them up so they switch their
    // timers to idle.
1824     for (int i = 0; i < team->t.t_nproc; ++i) {
1825       kmp_info_t *team_thread = team->t.t_threads[i];
1826       if (team_thread == this_thr)
1827         continue;
1828       team_thread->th.th_stats->setIdleFlag();
1829       if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
1830           team_thread->th.th_sleep_loc != NULL)
1831         __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread),
1832                                   team_thread->th.th_sleep_loc);
1833     }
1834 #endif
1835 #if USE_ITT_BUILD
1836     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1837       __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1838 #endif /* USE_ITT_BUILD */
1839 
1840 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1841     // Join barrier - report frame end
1842     if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1843         __kmp_forkjoin_frames_mode &&
1844         (this_thr->th.th_teams_microtask == NULL || // either not in teams
1845          this_thr->th.th_teams_size.nteams == 1) && // or inside single team
1846         team->t.t_active_level == 1) {
1847       kmp_uint64 cur_time = __itt_get_timestamp();
1848       ident_t *loc = team->t.t_ident;
1849       kmp_info_t **other_threads = team->t.t_threads;
1850       int nproc = this_thr->th.th_team_nproc;
1851       int i;
1852       switch (__kmp_forkjoin_frames_mode) {
1853       case 1:
1854         __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1855                                loc, nproc);
1856         break;
1857       case 2:
1858         __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
1859                                loc, nproc);
1860         break;
1861       case 3:
1862         if (__itt_metadata_add_ptr) {
1863           // Initialize with master's wait time
1864           kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1865           // Set arrive time to zero to be able to check it in
1866           // __kmp_invoke_task(); the same is done inside the loop below
1867           this_thr->th.th_bar_arrive_time = 0;
1868           for (i = 1; i < nproc; ++i) {
1869             delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1870             other_threads[i]->th.th_bar_arrive_time = 0;
1871           }
1872           __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1873                                        cur_time, delta, 0);
1874         }
1875         __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1876                                loc, nproc);
1877         this_thr->th.th_frame_time = cur_time;
1878         break;
1879       }
1880     }
1881 #endif /* USE_ITT_BUILD */
1882   }
1883 #if USE_ITT_BUILD
1884   else {
1885     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1886       __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1887   }
1888 #endif /* USE_ITT_BUILD */
1889 
1890 #if KMP_DEBUG
1891   if (KMP_MASTER_TID(tid)) {
1892     KA_TRACE(
1893         15,
1894         ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1895          gtid, team_id, tid, nproc));
1896   }
1897 #endif /* KMP_DEBUG */
1898 
  // TODO: now, mark worker threads as done so they may be disbanded
1900   KMP_MB(); // Flush all pending memory write invalidates.
1901   KA_TRACE(10,
1902            ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
1903 
1904   ANNOTATE_BARRIER_END(&team->t.t_bar);
1905 }
1906 
1907 // TODO release worker threads' fork barriers as we are ready instead of all at
1908 // once
1909 void __kmp_fork_barrier(int gtid, int tid) {
1910   KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
1911   KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1912   kmp_info_t *this_thr = __kmp_threads[gtid];
1913   kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
1914 #if USE_ITT_BUILD
1915   void *itt_sync_obj = NULL;
1916 #endif /* USE_ITT_BUILD */
1917   if (team)
1918     ANNOTATE_BARRIER_END(&team->t.t_bar);
1919 
1920   KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
1921                 (team != NULL) ? team->t.t_id : -1, tid));
1922 
1923   // th_team pointer only valid for master thread here
1924   if (KMP_MASTER_TID(tid)) {
1925 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1926     if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1927       // Create itt barrier object
1928       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
1929       __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
1930     }
1931 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1932 
1933 #ifdef KMP_DEBUG
1934     kmp_info_t **other_threads = team->t.t_threads;
1935     int i;
1936 
1937     // Verify state
1938     KMP_MB();
1939 
1940     for (i = 1; i < team->t.t_nproc; ++i) {
1941       KA_TRACE(500,
1942                ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
1943                 "== %u.\n",
1944                 gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
1945                 team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
1946                 other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
1947       KMP_DEBUG_ASSERT(
1948           (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
1949            ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
1950       KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
1951     }
1952 #endif
1953 
1954     if (__kmp_tasking_mode != tskm_immediate_exec) {
      // use 0 to only set up the current task team if nthreads > 1
1956       __kmp_task_team_setup(this_thr, team, 0);
1957     }
1958 
1959     /* The master thread may have changed its blocktime between the join barrier
1960        and the fork barrier. Copy the blocktime info to the thread, where
1961        __kmp_wait_template() can access it when the team struct is not
1962        guaranteed to exist. */
1963     // See note about the corresponding code in __kmp_join_barrier() being
1964     // performance-critical
1965     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1966 #if KMP_USE_MONITOR
1967       this_thr->th.th_team_bt_intervals =
1968           team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1969       this_thr->th.th_team_bt_set =
1970           team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1971 #else
1972       this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1973 #endif
1974     }
1975   } // master
1976 
1977   switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
1978   case bp_hyper_bar: {
1979     KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1980     __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1981                                 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1982     break;
1983   }
1984   case bp_hierarchical_bar: {
1985     __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1986                                        TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1987     break;
1988   }
1989   case bp_tree_bar: {
1990     KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1991     __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1992                                TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1993     break;
1994   }
1995   default: {
1996     __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1997                                  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1998   }
1999   }
2000 
2001 #if OMPT_SUPPORT
2002   if (ompt_enabled.enabled &&
2003       this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
2004     int ds_tid = this_thr->th.th_info.ds.ds_tid;
2005     ompt_data_t *task_data = (team)
2006                                  ? OMPT_CUR_TASK_DATA(this_thr)
2007                                  : &(this_thr->th.ompt_thread_info.task_data);
2008     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
2009 #if OMPT_OPTIONAL
2010     void *codeptr = NULL;
2011     if (KMP_MASTER_TID(ds_tid) &&
2012         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
2013          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
2014       codeptr = team->t.ompt_team_info.master_return_address;
2015     if (ompt_enabled.ompt_callback_sync_region_wait) {
2016       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2017           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
2018           codeptr);
2019     }
2020     if (ompt_enabled.ompt_callback_sync_region) {
2021       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2022           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
2023           codeptr);
2024     }
2025 #endif
2026     if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
2027       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2028           ompt_scope_end, NULL, task_data, 0, ds_tid,
2029           ompt_task_implicit); // TODO: Can this be ompt_task_initial?
2030     }
2031   }
2032 #endif
2033 
2034   // Early exit for reaping threads releasing forkjoin barrier
2035   if (TCR_4(__kmp_global.g.g_done)) {
2036     this_thr->th.th_task_team = NULL;
2037 
2038 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2039     if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2040       if (!KMP_MASTER_TID(tid)) {
2041         itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2042         if (itt_sync_obj)
2043           __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2044       }
2045     }
2046 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2047     KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
2048     return;
2049   }
2050 
2051   /* We can now assume that a valid team structure has been allocated by the
2052      master and propagated to all worker threads. The current thread, however,
2053      may not be part of the team, so we can't blindly assume that the team
2054      pointer is non-null.  */
2055   team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
2056   KMP_DEBUG_ASSERT(team != NULL);
2057   tid = __kmp_tid_from_gtid(gtid);
2058 
2059 #if KMP_BARRIER_ICV_PULL
2060   /* Master thread's copy of the ICVs was set up on the implicit taskdata in
2061      __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
2062      implicit task has this data before this function is called. We cannot
2063      modify __kmp_fork_call() to look at the fixed ICVs in the master's thread
2064      struct, because it is not always the case that the threads arrays have
2065      been allocated when __kmp_fork_call() is executed. */
2066   {
2067     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
2068     if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
2069       // Copy the initial ICVs from the master's thread struct to the implicit
2070       // task for this tid.
2071       KA_TRACE(10,
2072                ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
2073       __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
2074                                tid, FALSE);
2075       copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
2076                 &team->t.t_threads[0]
2077                      ->th.th_bar[bs_forkjoin_barrier]
2078                      .bb.th_fixed_icvs);
2079     }
2080   }
2081 #endif // KMP_BARRIER_ICV_PULL
2082 
2083   if (__kmp_tasking_mode != tskm_immediate_exec) {
2084     __kmp_task_team_sync(this_thr, team);
2085   }
2086 
2087 #if KMP_AFFINITY_SUPPORTED
2088   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
2089   if (proc_bind == proc_bind_intel) {
2090     // Call dynamic affinity settings
2091     if (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
2092       __kmp_balanced_affinity(this_thr, team->t.t_nproc);
2093     }
2094   } else if (proc_bind != proc_bind_false) {
2095     if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
2096       KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
2097                      __kmp_gtid_from_thread(this_thr),
2098                      this_thr->th.th_current_place));
2099     } else {
2100       __kmp_affinity_set_place(gtid);
2101     }
2102   }
2103 #endif // KMP_AFFINITY_SUPPORTED
2104   // Perform the display affinity functionality
2105   if (__kmp_display_affinity) {
2106     if (team->t.t_display_affinity
2107 #if KMP_AFFINITY_SUPPORTED
2108         || (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed)
2109 #endif
2110     ) {
2111       // NULL means use the affinity-format-var ICV
2112       __kmp_aux_display_affinity(gtid, NULL);
2113       this_thr->th.th_prev_num_threads = team->t.t_nproc;
2114       this_thr->th.th_prev_level = team->t.t_level;
2115     }
2116   }
2117   if (!KMP_MASTER_TID(tid))
2118     KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);
2119 
2120 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2121   if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2122     if (!KMP_MASTER_TID(tid)) {
2123       // Get correct barrier object
2124       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2125       __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
2126     } // (prepare called inside barrier_release)
2127   }
2128 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2129   ANNOTATE_BARRIER_END(&team->t.t_bar);
2130   KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
2131                 team->t.t_id, tid));
2132 }
2133 
2134 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
2135                           kmp_internal_control_t *new_icvs, ident_t *loc) {
2136   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);
2137 
2138   KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
2139   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
2140 
2141 /* Master thread's copy of the ICVs was set up on the implicit taskdata in
2142    __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
2143    implicit task has this data before this function is called. */
2144 #if KMP_BARRIER_ICV_PULL
  /* Copy the ICVs into th_fixed_icvs in the master thread's structure (which
     remains untouched), where all of the worker threads can access them and
     make their own copies after the barrier. */
2148   KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2149   // allocated at this point
2150   copy_icvs(
2151       &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
2152       new_icvs);
2153   KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
2154                 team->t.t_threads[0], team));
2155 #elif KMP_BARRIER_ICV_PUSH
2156   // The ICVs will be propagated in the fork barrier, so nothing needs to be
2157   // done here.
2158   KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
2159                 team->t.t_threads[0], team));
2160 #else
2161   // Copy the ICVs to each of the non-master threads.  This takes O(nthreads)
2162   // time.
2163   ngo_load(new_icvs);
2164   KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2165   // allocated at this point
2166   for (int f = 1; f < new_nproc; ++f) { // Skip the master thread
2167     // TODO: GEH - pass in better source location info since usually NULL here
2168     KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2169                   f, team->t.t_threads[f], team));
2170     __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
2171     ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
2172     KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2173                   f, team->t.t_threads[f], team));
2174   }
2175   ngo_sync();
2176 #endif // KMP_BARRIER_ICV_PULL
2177 }
2178