1 /*
2  * kmp_barrier.cpp
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp_wait_release.h"
14 #include "kmp_barrier.h"
15 #include "kmp_itt.h"
16 #include "kmp_os.h"
17 #include "kmp_stats.h"
18 #include "ompt-specific.h"
19 // for distributed barrier
20 #include "kmp_affinity.h"
21 
22 #if KMP_MIC
23 #include <immintrin.h>
24 #define USE_NGO_STORES 1
25 #endif // KMP_MIC
26 
27 #include "tsan_annotations.h"
28 
// ICV / go-line copy helpers used by the barrier release code.
// With KMP_MIC and USE_NGO_STORES both enabled, ngo_load() declares a local
// __m512d named Vt loaded from src, and both ngo_store_* macros write that Vt
// to dst with _mm512_storenrngo_pd(); their src argument is ignored, so
// ngo_load() has to appear earlier in the same scope. ngo_sync() issues a
// locked add on the stack with a "memory" clobber.
// In every other configuration the stores fall back to copy_icvs() and to
// KMP_MEMCPY() of CACHE_LINE bytes, and ngo_load()/ngo_sync() expand to no-ops.
29 #if KMP_MIC && USE_NGO_STORES
30 // ICV copying
31 #define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
32 #define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
33 #define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
34 #define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
35 #else
36 #define ngo_load(src) ((void)0)
37 #define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
38 #define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
39 #define ngo_sync() ((void)0)
40 #endif /* KMP_MIC && USE_NGO_STORES */
41 
42 void __kmp_print_structure(void); // Forward declaration
43 
44 // ---------------------------- Barrier Algorithms ----------------------------
45 // Distributed barrier
46 
47 // Compute how many threads to have polling each cache-line.
48 // We want to limit the number of writes to IDEAL_GO_RESOLUTION.
49 void distributedBarrier::computeVarsForN(size_t n) {
50   int nsockets = 1;
51   if (__kmp_topology) {
52     int socket_level = __kmp_topology->get_level(KMP_HW_SOCKET);
53     int core_level = __kmp_topology->get_level(KMP_HW_CORE);
54     int ncores_per_socket =
55         __kmp_topology->calculate_ratio(core_level, socket_level);
56     nsockets = __kmp_topology->get_count(socket_level);
57 
58     if (nsockets <= 0)
59       nsockets = 1;
60     if (ncores_per_socket <= 0)
61       ncores_per_socket = 1;
62 
63     threads_per_go = ncores_per_socket >> 1;
64     if (!fix_threads_per_go) {
65       // Minimize num_gos
66       if (threads_per_go > 4) {
67         if (KMP_OPTIMIZE_FOR_REDUCTIONS) {
68           threads_per_go = threads_per_go >> 1;
69         }
70         if (threads_per_go > 4 && nsockets == 1)
71           threads_per_go = threads_per_go >> 1;
72       }
73     }
74     if (threads_per_go == 0)
75       threads_per_go = 1;
76     fix_threads_per_go = true;
77     num_gos = n / threads_per_go;
78     if (n % threads_per_go)
79       num_gos++;
80     if (nsockets == 1 || num_gos == 1)
81       num_groups = 1;
82     else {
83       num_groups = num_gos / nsockets;
84       if (num_gos % nsockets)
85         num_groups++;
86     }
87     if (num_groups <= 0)
88       num_groups = 1;
89     gos_per_group = num_gos / num_groups;
90     if (num_gos % num_groups)
91       gos_per_group++;
92     threads_per_group = threads_per_go * gos_per_group;
93   } else {
94     num_gos = n / threads_per_go;
95     if (n % threads_per_go)
96       num_gos++;
97     if (num_gos == 1)
98       num_groups = 1;
99     else {
100       num_groups = num_gos / 2;
101       if (num_gos % 2)
102         num_groups++;
103     }
104     gos_per_group = num_gos / num_groups;
105     if (num_gos % num_groups)
106       gos_per_group++;
107     threads_per_group = threads_per_go * gos_per_group;
108   }
109 }
110 
111 void distributedBarrier::computeGo(size_t n) {
112   // Minimize num_gos
113   for (num_gos = 1;; num_gos++)
114     if (IDEAL_CONTENTION * num_gos >= n)
115       break;
116   threads_per_go = n / num_gos;
117   if (n % num_gos)
118     threads_per_go++;
119   while (num_gos > MAX_GOS) {
120     threads_per_go++;
121     num_gos = n / threads_per_go;
122     if (n % threads_per_go)
123       num_gos++;
124   }
125   computeVarsForN(n);
126 }
127 
128 // This function is to resize the barrier arrays when the new number of threads
129 // exceeds max_threads, which is the current size of all the arrays
130 void distributedBarrier::resize(size_t nthr) {
131   KMP_DEBUG_ASSERT(nthr > max_threads);
132 
133   // expand to requested size * 2
134   max_threads = nthr * 2;
135 
136   // allocate arrays to new max threads
137   for (int i = 0; i < MAX_ITERS; ++i) {
138     if (flags[i])
139       flags[i] = (flags_s *)KMP_INTERNAL_REALLOC(flags[i],
140                                                  max_threads * sizeof(flags_s));
141     else
142       flags[i] = (flags_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(flags_s));
143   }
144 
145   if (go)
146     go = (go_s *)KMP_INTERNAL_REALLOC(go, max_threads * sizeof(go_s));
147   else
148     go = (go_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(go_s));
149 
150   if (iter)
151     iter = (iter_s *)KMP_INTERNAL_REALLOC(iter, max_threads * sizeof(iter_s));
152   else
153     iter = (iter_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(iter_s));
154 
155   if (sleep)
156     sleep =
157         (sleep_s *)KMP_INTERNAL_REALLOC(sleep, max_threads * sizeof(sleep_s));
158   else
159     sleep = (sleep_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(sleep_s));
160 }
161 
162 // This function is to set all the go flags that threads might be waiting
163 // on, and when blocktime is not infinite, it should be followed by a wake-up
164 // call to each thread
165 kmp_uint64 distributedBarrier::go_release() {
166   kmp_uint64 next_go = iter[0].iter + distributedBarrier::MAX_ITERS;
167   for (size_t j = 0; j < num_gos; j++) {
168     go[j].go.store(next_go);
169   }
170   return next_go;
171 }
172 
173 void distributedBarrier::go_reset() {
174   for (size_t j = 0; j < max_threads; ++j) {
175     for (size_t i = 0; i < distributedBarrier::MAX_ITERS; ++i) {
176       flags[i][j].stillNeed = 1;
177     }
178     go[j].go.store(0);
179     iter[j].iter = 0;
180   }
181 }
182 
183 // This function inits/re-inits the distributed barrier for a particular number
184 // of threads. If a resize of arrays is needed, it calls the resize function.
185 void distributedBarrier::init(size_t nthr) {
186   size_t old_max = max_threads;
187   if (nthr > max_threads) { // need more space in arrays
188     resize(nthr);
189   }
190 
191   for (size_t i = 0; i < max_threads; i++) {
192     for (size_t j = 0; j < distributedBarrier::MAX_ITERS; j++) {
193       flags[j][i].stillNeed = 1;
194     }
195     go[i].go.store(0);
196     iter[i].iter = 0;
197     if (i >= old_max)
198       sleep[i].sleep = false;
199   }
200 
201   // Recalculate num_gos, etc. based on new nthr
202   computeVarsForN(nthr);
203 
204   num_threads = nthr;
205 
206   if (team_icvs == NULL)
207     team_icvs = __kmp_allocate(sizeof(kmp_internal_control_t));
208 }
209 
210 // This function is used only when KMP_BLOCKTIME is not infinite.
211 // static
212 void __kmp_dist_barrier_wakeup(enum barrier_type bt, kmp_team_t *team,
213                                size_t start, size_t stop, size_t inc,
214                                size_t tid) {
215   KMP_DEBUG_ASSERT(__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME);
216   if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
217     return;
218 
219   kmp_info_t **other_threads = team->t.t_threads;
220   for (size_t thr = start; thr < stop; thr += inc) {
221     KMP_DEBUG_ASSERT(other_threads[thr]);
222     int gtid = other_threads[thr]->th.th_info.ds.ds_gtid;
223     // Wake up worker regardless of if it appears to be sleeping or not
224     __kmp_atomic_resume_64(gtid, (kmp_atomic_flag_64<> *)NULL);
225   }
226 }
227 
// Gather (arrival) phase of the distributed barrier for thread `tid`.
//
// A thread that is not a group leader only arms its flag in the next
// iteration slot and clears its stillNeed flag in the current slot. A group
// leader (tid is a multiple of b->threads_per_group) first spins until every
// other thread of its group has cleared its flag, optionally reduces their
// reduce_data into its own, publishes its own arrival, and then spins until
// all group leaders have arrived. The thread for which KMP_MASTER_TID(tid)
// holds finally reduces over the other group leaders.
//
// While spinning, a leader executes tasks when tasking is enabled, and it
// leaves the spin loop early once __kmp_global.g.g_done is set.
//
// bt           - barrier type; only used for tracing in this function
// this_thr     - descriptor of the calling thread
// gtid         - global thread id of the caller
// tid          - id of the caller within its team
// reduce       - optional reduction callback (may be NULL), invoked as
//                reduce(own reduce_data, other thread's reduce_data)
// itt_sync_obj - forwarded to __kmp_atomic_execute_tasks_64 (the parameter is
//                wrapped in USE_ITT_BUILD_ARG)
228 static void __kmp_dist_barrier_gather(
229     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
230     void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
231   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_gather);
232   kmp_team_t *team;
233   distributedBarrier *b;
234   kmp_info_t **other_threads;
235   kmp_uint64 my_current_iter, my_next_iter;
236   kmp_uint32 nproc;
237   bool group_leader;
238 
239   team = this_thr->th.th_team;
240   nproc = this_thr->th.th_team_nproc;
241   other_threads = team->t.t_threads;
242   b = team->t.b;
      // Iteration slots cycle through 0 .. MAX_ITERS-1
243   my_current_iter = b->iter[tid].iter;
244   my_next_iter = (my_current_iter + 1) % distributedBarrier::MAX_ITERS;
      // The first thread of every threads_per_group-sized block leads the group
245   group_leader = ((tid % b->threads_per_group) == 0);
246 
247   KA_TRACE(20,
248            ("__kmp_dist_barrier_gather: T#%d(%d:%d) enter; barrier type %d\n",
249             gtid, team->t.t_id, tid, bt));
250 
251 #if USE_ITT_BUILD && USE_ITT_NOTIFY
252   // Barrier imbalance - save arrive time to the thread
253   if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
254     this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
255         __itt_get_timestamp();
256   }
257 #endif
258 
259   if (group_leader) {
260     // Start from the thread after the group leader
261     size_t group_start = tid + 1;
262     size_t group_end = tid + b->threads_per_group;
263     size_t threads_pending = 0;
264 
        // The last group may be shorter than threads_per_group
265     if (group_end > nproc)
266       group_end = nproc;
267     do { // wait for threads in my group
268       threads_pending = 0;
269       // Check all the flags every time to avoid branch mispredict
270       for (size_t thr = group_start; thr < group_end; thr++) {
271         // Each thread uses a different cache line
272         threads_pending += b->flags[my_current_iter][thr].stillNeed;
273       }
274       // Execute tasks here
275       if (__kmp_tasking_mode != tskm_immediate_exec) {
276         kmp_task_team_t *task_team = this_thr->th.th_task_team;
277         if (task_team != NULL) {
278           if (TCR_SYNC_4(task_team->tt.tt_active)) {
279             if (KMP_TASKING_ENABLED(task_team)) {
280               int tasks_completed = FALSE;
281               __kmp_atomic_execute_tasks_64(
282                   this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE,
283                   &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
284             } else
285               this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
286           }
287         } else {
288           this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
289         } // if
290       }
291       if (TCR_4(__kmp_global.g.g_done)) {
292         if (__kmp_global.g.g_abort)
293           __kmp_abort_thread();
            // g_done is set: stop waiting even if threads are still pending
294         break;
295       } else if (__kmp_tasking_mode != tskm_immediate_exec &&
296                  this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) {
297         this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
298       }
299     } while (threads_pending > 0);
300 
301     if (reduce) { // Perform reduction if needed
302       OMPT_REDUCTION_DECL(this_thr, gtid);
303       OMPT_REDUCTION_BEGIN;
304       // Group leader reduces all threads in group
305       for (size_t thr = group_start; thr < group_end; thr++) {
306         (*reduce)(this_thr->th.th_local.reduce_data,
307                   other_threads[thr]->th.th_local.reduce_data);
308       }
309       OMPT_REDUCTION_END;
310     }
311 
312     // Set flag for next iteration
313     b->flags[my_next_iter][tid].stillNeed = 1;
314     // Each thread uses a different cache line; resets stillNeed to 0 to
315     // indicate it has reached the barrier
316     b->flags[my_current_iter][tid].stillNeed = 0;
317 
318     do { // wait for all group leaders
319       threads_pending = 0;
          // Group leaders sit at every threads_per_group-th tid, starting at 0
320       for (size_t thr = 0; thr < nproc; thr += b->threads_per_group) {
321         threads_pending += b->flags[my_current_iter][thr].stillNeed;
322       }
323       // Execute tasks here
324       if (__kmp_tasking_mode != tskm_immediate_exec) {
325         kmp_task_team_t *task_team = this_thr->th.th_task_team;
326         if (task_team != NULL) {
327           if (TCR_SYNC_4(task_team->tt.tt_active)) {
328             if (KMP_TASKING_ENABLED(task_team)) {
329               int tasks_completed = FALSE;
330               __kmp_atomic_execute_tasks_64(
331                   this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE,
332                   &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
333             } else
334               this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
335           }
336         } else {
337           this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
338         } // if
339       }
340       if (TCR_4(__kmp_global.g.g_done)) {
341         if (__kmp_global.g.g_abort)
342           __kmp_abort_thread();
            // g_done is set: stop waiting even if leaders are still pending
343         break;
344       } else if (__kmp_tasking_mode != tskm_immediate_exec &&
345                  this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) {
346         this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
347       }
348     } while (threads_pending > 0);
349 
350     if (reduce) { // Perform reduction if needed
351       if (KMP_MASTER_TID(tid)) { // Master reduces over group leaders
352         OMPT_REDUCTION_DECL(this_thr, gtid);
353         OMPT_REDUCTION_BEGIN;
            // Starts at the second group leader; the first one is this thread
354         for (size_t thr = b->threads_per_group; thr < nproc;
355              thr += b->threads_per_group) {
356           (*reduce)(this_thr->th.th_local.reduce_data,
357                     other_threads[thr]->th.th_local.reduce_data);
358         }
359         OMPT_REDUCTION_END;
360       }
361     }
362   } else {
363     // Set flag for next iteration
364     b->flags[my_next_iter][tid].stillNeed = 1;
365     // Each thread uses a different cache line; resets stillNeed to 0 to
366     // indicate it has reached the barrier
367     b->flags[my_current_iter][tid].stillNeed = 0;
368   }
369 
      // Fence after the flag updates, on both the leader and non-leader paths
370   KMP_MFENCE();
371 
372   KA_TRACE(20,
373            ("__kmp_dist_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
374             gtid, team->t.t_id, tid, bt));
375 }
376 
// Release phase of the distributed barrier for thread `tid`.
//
// Worker path (KMP_MASTER_TID(tid) is false): loops until th_used_in_team is
// 1 after its go flag has been observed. While th_used_in_team is neither 1
// nor 3 the thread waits on th_used_in_team for the value 3. Once it is in
// use it re-reads tid and team, moves th_used_in_team from 3 to 1 and waits
// until b->go[tid / threads_per_go] equals next_go. A group leader then
// stores next_go into the remaining go flags of its group and, when blocktime
// is not KMP_MAX_BLOCKTIME, wakes the other threads of its group.
//
// Primary path: stores next_go into the go flag of every group leader, wakes
// the leaders if needed, then releases and wakes the rest of its own group.
//
// With KMP_BARRIER_ICV_PUSH and propagate_icvs set, a worker copies the ICVs
// from b->team_icvs into its implicit task and th_fixed_icvs; the primary
// copies its implicit-task ICVs into th_fixed_icvs.
//
// Every thread that reaches the end advances b->iter[tid].iter modulo
// MAX_ITERS. A worker returns early, without advancing, when bt is
// bs_forkjoin_barrier and __kmp_global.g.g_done is set.
//
// bt             - barrier type
// this_thr       - descriptor of the calling thread
// gtid           - global thread id of the caller
// tid            - id of the caller within its team (refreshed on the worker
//                  path)
// propagate_icvs - nonzero to copy ICVs as described above
// itt_sync_obj   - ITT sync object handed to the flag waits (the parameter is
//                  wrapped in USE_ITT_BUILD_ARG)
377 static void __kmp_dist_barrier_release(
378     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
379     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
380   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_release);
381   kmp_team_t *team;
382   distributedBarrier *b;
383   kmp_bstate_t *thr_bar;
384   kmp_uint64 my_current_iter, next_go;
385   size_t my_go_index;
386   bool group_leader;
387 
388   KA_TRACE(20, ("__kmp_dist_barrier_release: T#%d(%d) enter; barrier type %d\n",
389                 gtid, tid, bt));
390 
391   thr_bar = &this_thr->th.th_bar[bt].bb;
392 
393   if (!KMP_MASTER_TID(tid)) {
394     // workers and non-master group leaders need to check their presence in team
395     do {
396       if (this_thr->th.th_used_in_team.load() != 1 &&
397           this_thr->th.th_used_in_team.load() != 3) {
398         // Thread is not in use in a team. Wait on location in tid's thread
399         // struct. The 0 value tells anyone looking that this thread is spinning
400         // or sleeping until this location becomes 3 again; 3 is the transition
401         // state to get to 1 which is waiting on go and being in the team
402         kmp_flag_32<false, false> my_flag(&(this_thr->th.th_used_in_team), 3);
            // Wait only if the state was moved 2 -> 0 here or is already 0
403         if (KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 2,
404                                         0) ||
405             this_thr->th.th_used_in_team.load() == 0) {
406           my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj));
407         }
408 #if USE_ITT_BUILD && USE_ITT_NOTIFY
409         if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
410           // In fork barrier where we could not get the object reliably
411           itt_sync_obj =
412               __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
413           // Cancel wait on previous parallel region...
414           __kmp_itt_task_starting(itt_sync_obj);
415 
416           if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
417             return;
418 
419           itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
420           if (itt_sync_obj != NULL)
421             // Call prepare as early as possible for "new" barrier
422             __kmp_itt_task_finished(itt_sync_obj);
423         } else
424 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
425             if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
426           return;
427       }
428       if (this_thr->th.th_used_in_team.load() != 1 &&
429           this_thr->th.th_used_in_team.load() != 3) // spurious wake-up?
430         continue;
431       if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
432         return;
433 
434       // At this point, the thread thinks it is in use in a team, or in
435       // transition to be used in a team, but it might have reached this barrier
436       // before it was marked unused by the team. Unused threads are awoken and
437       // shifted to wait on local thread struct elsewhere. It also might reach
438       // this point by being picked up for use by a different team. Either way,
439       // we need to update the tid.
440       tid = __kmp_tid_from_gtid(gtid);
441       team = this_thr->th.th_team;
442       KMP_DEBUG_ASSERT(tid >= 0);
443       KMP_DEBUG_ASSERT(team);
444       b = team->t.b;
445       my_current_iter = b->iter[tid].iter;
          // Value the go flag must hold before this thread may proceed
446       next_go = my_current_iter + distributedBarrier::MAX_ITERS;
          // threads_per_go consecutive tids share one go flag
447       my_go_index = tid / b->threads_per_go;
448       if (this_thr->th.th_used_in_team.load() == 3) {
449         KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 3, 1);
450       }
451       // Check if go flag is set
452       if (b->go[my_go_index].go.load() != next_go) {
453         // Wait on go flag on team
454         kmp_atomic_flag_64<false, true> my_flag(
455             &(b->go[my_go_index].go), next_go, &(b->sleep[tid].sleep));
456         my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj));
457         KMP_DEBUG_ASSERT(my_current_iter == b->iter[tid].iter ||
458                          b->iter[tid].iter == 0);
459         KMP_DEBUG_ASSERT(b->sleep[tid].sleep == false);
460       }
461 
462       if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
463         return;
464       // At this point, the thread's go location was set. This means the primary
465       // thread is safely in the barrier, and so this thread's data is
466       // up-to-date, but we should check again that this thread is really in
467       // use in the team, as it could have been woken up for the purpose of
468       // changing team size, or reaping threads at shutdown.
469       if (this_thr->th.th_used_in_team.load() == 1)
470         break;
471     } while (1);
472 
473     if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
474       return;
475 
476     group_leader = ((tid % b->threads_per_group) == 0);
477     if (group_leader) {
478       // Tell all the threads in my group they can go!
          // The leader's own go flag (my_go_index) is already set, so skip it
479       for (size_t go_idx = my_go_index + 1;
480            go_idx < my_go_index + b->gos_per_group; go_idx++) {
481         b->go[go_idx].go.store(next_go);
482       }
483       // Fence added so that workers can see changes to go. sfence inadequate.
484       KMP_MFENCE();
485     }
486 
487 #if KMP_BARRIER_ICV_PUSH
488     if (propagate_icvs) { // copy ICVs to final dest
489       __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
490                                tid, FALSE);
491       copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
492                 (kmp_internal_control_t *)team->t.b->team_icvs);
493       copy_icvs(&thr_bar->th_fixed_icvs,
494                 &team->t.t_implicit_task_taskdata[tid].td_icvs);
495     }
496 #endif
497     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && group_leader) {
498       // This thread is now awake and participating in the barrier;
499       // wake up the other threads in the group
500       size_t nproc = this_thr->th.th_team_nproc;
501       size_t group_end = tid + b->threads_per_group;
502       if (nproc < group_end)
503         group_end = nproc;
504       __kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid);
505     }
506   } else { //  Primary thread
507     team = this_thr->th.th_team;
508     b = team->t.b;
509     my_current_iter = b->iter[tid].iter;
510     next_go = my_current_iter + distributedBarrier::MAX_ITERS;
511 #if KMP_BARRIER_ICV_PUSH
512     if (propagate_icvs) {
513       // primary thread has ICVs in final destination; copy
514       copy_icvs(&thr_bar->th_fixed_icvs,
515                 &team->t.t_implicit_task_taskdata[tid].td_icvs);
516     }
517 #endif
518     // Tell all the group leaders they can go!
519     for (size_t go_idx = 0; go_idx < b->num_gos; go_idx += b->gos_per_group) {
520       b->go[go_idx].go.store(next_go);
521     }
522 
523     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
524       // Wake-up the group leaders
525       size_t nproc = this_thr->th.th_team_nproc;
526       __kmp_dist_barrier_wakeup(bt, team, tid + b->threads_per_group, nproc,
527                                 b->threads_per_group, tid);
528     }
529 
530     // Tell all the threads in my group they can go!
531     for (size_t go_idx = 1; go_idx < b->gos_per_group; go_idx++) {
532       b->go[go_idx].go.store(next_go);
533     }
534 
535     // Fence added so that workers can see changes to go. sfence inadequate.
536     KMP_MFENCE();
537 
538     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
539       // Wake-up the other threads in my group
540       size_t nproc = this_thr->th.th_team_nproc;
541       size_t group_end = tid + b->threads_per_group;
542       if (nproc < group_end)
543         group_end = nproc;
544       __kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid);
545     }
546   }
547   // Update to next iteration
548   KMP_ASSERT(my_current_iter == b->iter[tid].iter);
549   b->iter[tid].iter = (b->iter[tid].iter + 1) % distributedBarrier::MAX_ITERS;
550 
551   KA_TRACE(
552       20, ("__kmp_dist_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
553            gtid, team->t.t_id, tid, bt));
554 }
555 
556 // Linear Barrier
// Gather phase of the linear barrier.
//
// A worker (KMP_MASTER_TID(tid) is false) releases a kmp_flag_64 built on its
// own b_arrived and other_threads[0]. The primary thread waits, one worker at
// a time, for that worker's b_arrived to reach
// team_bar->b_arrived + KMP_BARRIER_STATE_BUMP, optionally reduces the
// worker's reduce_data into its own, and finally stores the new state in
// team_bar->b_arrived.
//
// cancellable  - when true the primary waits with kmp_flag_64<true, false> and
//                the function returns true as soon as such a wait returns true
//                (presumably signalling cancellation - confirm against the
//                flag implementation)
// bt           - barrier type selecting the th_bar / t_bar entry
// this_thr     - descriptor of the calling thread
// gtid         - global thread id of the caller
// tid          - id of the caller within its team
// reduce       - optional reduction callback (may be NULL)
// itt_sync_obj - ITT sync object handed to the flag waits (the parameter is
//                wrapped in USE_ITT_BUILD_ARG)
//
// Returns false in every other case.
557 template <bool cancellable = false>
558 static bool __kmp_linear_barrier_gather_template(
559     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
560     void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
561   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
562   kmp_team_t *team = this_thr->th.th_team;
563   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
564   kmp_info_t **other_threads = team->t.t_threads;
565 
566   KA_TRACE(
567       20,
568       ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
569        gtid, team->t.t_id, tid, bt));
570   KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
571 
572 #if USE_ITT_BUILD && USE_ITT_NOTIFY
573   // Barrier imbalance - save arrive time to the thread
574   if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
575     this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
576         __itt_get_timestamp();
577   }
578 #endif
579   // We now perform a linear reduction to signal that all of the threads have
580   // arrived.
581   if (!KMP_MASTER_TID(tid)) {
582     KA_TRACE(20,
583              ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
584               "arrived(%p): %llu => %llu\n",
585               gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
586               team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
587               thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
588     // Mark arrival to primary thread
589     /* After performing this write, a worker thread may not assume that the team
590        is valid any more - it could be deallocated by the primary thread at any
591        time. */
592     ANNOTATE_BARRIER_BEGIN(this_thr);
593     kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[0]);
594     flag.release();
595   } else {
596     kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
597     int nproc = this_thr->th.th_team_nproc;
598     int i;
599     // Don't have to worry about sleep bit here or atomic since team setting
600     kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
601 
602     // Collect all the worker team member threads.
603     for (i = 1; i < nproc; ++i) {
604 #if KMP_CACHE_MANAGE
605       // Prefetch next thread's arrived count
606       if (i + 1 < nproc)
607         KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
608 #endif /* KMP_CACHE_MANAGE */
609       KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
610                     "arrived(%p) == %llu\n",
611                     gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
612                     team->t.t_id, i,
613                     &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
614 
615       // Wait for worker thread to arrive
616       if (cancellable) {
617         kmp_flag_64<true, false> flag(
618             &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
            // A true result ends the gather at once; team_bar->b_arrived is
            // left unchanged in that case
619         if (flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)))
620           return true;
621       } else {
622         kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
623                            new_state);
624         flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
625       }
626       ANNOTATE_BARRIER_END(other_threads[i]);
627 #if USE_ITT_BUILD && USE_ITT_NOTIFY
628       // Barrier imbalance - write min of the thread time and the other thread
629       // time to the thread.
630       if (__kmp_forkjoin_frames_mode == 2) {
631         this_thr->th.th_bar_min_time = KMP_MIN(
632             this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
633       }
634 #endif
635       if (reduce) {
636         KA_TRACE(100,
637                  ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
638                   gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
639                   team->t.t_id, i));
640         ANNOTATE_REDUCE_AFTER(reduce);
641         OMPT_REDUCTION_DECL(this_thr, gtid);
642         OMPT_REDUCTION_BEGIN;
643         (*reduce)(this_thr->th.th_local.reduce_data,
644                   other_threads[i]->th.th_local.reduce_data);
645         OMPT_REDUCTION_END;
646         ANNOTATE_REDUCE_BEFORE(reduce);
647         ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
648       }
649     }
650     // Don't have to worry about sleep bit here or atomic since team setting
651     team_bar->b_arrived = new_state;
652     KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
653                   "arrived(%p) = %llu\n",
654                   gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
655                   new_state));
656   }
657   KA_TRACE(
658       20,
659       ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
660        gtid, team->t.t_id, tid, bt));
661   return false;
662 }
663 
// Release phase of the linear barrier.
//
// Primary thread (KMP_MASTER_TID(tid) is true): when the team has more than
// one thread it optionally pushes the ICVs of implicit task 0 to every worker
// (only with KMP_BARRIER_ICV_PUSH and propagate_icvs set) and then releases
// each worker's b_go flag in tid order.
//
// Worker: waits until its own b_go reaches KMP_BARRIER_STATE_BUMP, performs
// the ITT bookkeeping, returns early for a fork/join barrier once
// __kmp_global.g.g_done is set, and otherwise resets b_go to
// KMP_INIT_BARRIER_STATE.
//
// cancellable    - when true the worker waits with kmp_flag_64<true, false>
//                  and the function returns true if that wait returns true
//                  (presumably signalling cancellation - confirm against the
//                  flag implementation)
// bt             - barrier type selecting the th_bar entry
// this_thr       - descriptor of the calling thread
// gtid           - global thread id of the caller
// tid            - id of the caller within its team
// propagate_icvs - nonzero to push ICVs from the primary to the workers
// itt_sync_obj   - ITT sync object handed to the flag wait (the parameter is
//                  wrapped in USE_ITT_BUILD_ARG)
//
// Returns false in every other case.
664 template <bool cancellable = false>
665 static bool __kmp_linear_barrier_release_template(
666     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
667     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
668   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
669   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
670   kmp_team_t *team;
671 
672   if (KMP_MASTER_TID(tid)) {
673     unsigned int i;
674     kmp_uint32 nproc = this_thr->th.th_team_nproc;
675     kmp_info_t **other_threads;
676 
677     team = __kmp_threads[gtid]->th.th_team;
678     KMP_DEBUG_ASSERT(team != NULL);
679     other_threads = team->t.t_threads;
680 
681     KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) primary enter for "
682                   "barrier type %d\n",
683                   gtid, team->t.t_id, tid, bt));
684 
685     if (nproc > 1) {
686 #if KMP_BARRIER_ICV_PUSH
687       {
688         KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
689         if (propagate_icvs) {
              // Source ICVs are those of implicit task 0; the ngo_* macros
              // choose the copy mechanism
690           ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
691           for (i = 1; i < nproc; ++i) {
692             __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
693                                      team, i, FALSE);
694             ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
695                            &team->t.t_implicit_task_taskdata[0].td_icvs);
696           }
697           ngo_sync();
698         }
699       }
700 #endif // KMP_BARRIER_ICV_PUSH
701 
702       // Now, release all of the worker threads
703       for (i = 1; i < nproc; ++i) {
704 #if KMP_CACHE_MANAGE
705         // Prefetch next thread's go flag
706         if (i + 1 < nproc)
707           KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
708 #endif /* KMP_CACHE_MANAGE */
709         KA_TRACE(
710             20,
711             ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
712              "go(%p): %u => %u\n",
713              gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
714              team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
715              other_threads[i]->th.th_bar[bt].bb.b_go,
716              other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
717         ANNOTATE_BARRIER_BEGIN(other_threads[i]);
718         kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
719                            other_threads[i]);
720         flag.release();
721       }
722     }
723   } else { // Wait for the PRIMARY thread to release us
724     KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
725                   gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
726     if (cancellable) {
727       kmp_flag_64<true, false> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
          // A true result returns before b_go is reset below
728       if (flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)))
729         return true;
730     } else {
731       kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
732       flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
733     }
734     ANNOTATE_BARRIER_END(this_thr);
735 #if USE_ITT_BUILD && USE_ITT_NOTIFY
736     if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
737       // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
738       // disabled)
739       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
740       // Cancel wait on previous parallel region...
741       __kmp_itt_task_starting(itt_sync_obj);
742 
743       if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
744         return false;
745 
746       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
747       if (itt_sync_obj != NULL)
748         // Call prepare as early as possible for "new" barrier
749         __kmp_itt_task_finished(itt_sync_obj);
750     } else
751 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
752         // Early exit for reaping threads releasing forkjoin barrier
753         if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
754       return false;
755 // The worker thread may now assume that the team is valid.
        // NOTE(review): on this path team is assigned only under KMP_DEBUG; its
        // later uses are inside KMP_DEBUG_ASSERT / KA_TRACE, which are
        // presumably compiled out in non-debug builds - confirm.
756 #ifdef KMP_DEBUG
757     tid = __kmp_tid_from_gtid(gtid);
758     team = __kmp_threads[gtid]->th.th_team;
759 #endif
760     KMP_DEBUG_ASSERT(team != NULL);
761     TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
762     KA_TRACE(20,
763              ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
764               gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
765     KMP_MB(); // Flush all pending memory write invalidates.
766   }
767   KA_TRACE(
768       20,
769       ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
770        gtid, team->t.t_id, tid, bt));
771   return false;
772 }
773 
774 static void __kmp_linear_barrier_gather(
775     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
776     void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
777   __kmp_linear_barrier_gather_template<false>(
778       bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
779 }
780 
781 static bool __kmp_linear_barrier_gather_cancellable(
782     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
783     void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
784   return __kmp_linear_barrier_gather_template<true>(
785       bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
786 }
787 
788 static void __kmp_linear_barrier_release(
789     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
790     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
791   __kmp_linear_barrier_release_template<false>(
792       bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
793 }
794 
795 static bool __kmp_linear_barrier_release_cancellable(
796     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
797     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
798   return __kmp_linear_barrier_release_template<true>(
799       bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
800 }
801 
802 // Tree barrier
803 static void __kmp_tree_barrier_gather(
804     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
805     void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
806   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
807   kmp_team_t *team = this_thr->th.th_team;
808   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
809   kmp_info_t **other_threads = team->t.t_threads;
810   kmp_uint32 nproc = this_thr->th.th_team_nproc;
811   kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
812   kmp_uint32 branch_factor = 1 << branch_bits;
813   kmp_uint32 child;
814   kmp_uint32 child_tid;
815   kmp_uint64 new_state = 0;
816 
817   KA_TRACE(
818       20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
819            gtid, team->t.t_id, tid, bt));
820   KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
821 
822 #if USE_ITT_BUILD && USE_ITT_NOTIFY
823   // Barrier imbalance - save arrive time to the thread
824   if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
825     this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
826         __itt_get_timestamp();
827   }
828 #endif
829   // Perform tree gather to wait until all threads have arrived; reduce any
830   // required data as we go
831   child_tid = (tid << branch_bits) + 1;
832   if (child_tid < nproc) {
833     // Parent threads wait for all their children to arrive
834     new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
835     child = 1;
836     do {
837       kmp_info_t *child_thr = other_threads[child_tid];
838       kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
839 #if KMP_CACHE_MANAGE
840       // Prefetch next thread's arrived count
841       if (child + 1 <= branch_factor && child_tid + 1 < nproc)
842         KMP_CACHE_PREFETCH(
843             &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
844 #endif /* KMP_CACHE_MANAGE */
845       KA_TRACE(20,
846                ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
847                 "arrived(%p) == %llu\n",
848                 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
849                 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
850       // Wait for child to arrive
851       kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
852       flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
853       ANNOTATE_BARRIER_END(child_thr);
854 #if USE_ITT_BUILD && USE_ITT_NOTIFY
855       // Barrier imbalance - write min of the thread time and a child time to
856       // the thread.
857       if (__kmp_forkjoin_frames_mode == 2) {
858         this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
859                                                child_thr->th.th_bar_min_time);
860       }
861 #endif
862       if (reduce) {
863         KA_TRACE(100,
864                  ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
865                   gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
866                   team->t.t_id, child_tid));
867         ANNOTATE_REDUCE_AFTER(reduce);
868         OMPT_REDUCTION_DECL(this_thr, gtid);
869         OMPT_REDUCTION_BEGIN;
870         (*reduce)(this_thr->th.th_local.reduce_data,
871                   child_thr->th.th_local.reduce_data);
872         OMPT_REDUCTION_END;
873         ANNOTATE_REDUCE_BEFORE(reduce);
874         ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
875       }
876       child++;
877       child_tid++;
878     } while (child <= branch_factor && child_tid < nproc);
879   }
880 
881   if (!KMP_MASTER_TID(tid)) { // Worker threads
882     kmp_int32 parent_tid = (tid - 1) >> branch_bits;
883 
884     KA_TRACE(20,
885              ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
886               "arrived(%p): %llu => %llu\n",
887               gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
888               team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
889               thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
890 
891     // Mark arrival to parent thread
892     /* After performing this write, a worker thread may not assume that the team
893        is valid any more - it could be deallocated by the primary thread at any
894        time.  */
895     ANNOTATE_BARRIER_BEGIN(this_thr);
896     kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[parent_tid]);
897     flag.release();
898   } else {
899     // Need to update the team arrived pointer if we are the primary thread
900     if (nproc > 1) // New value was already computed above
901       team->t.t_bar[bt].b_arrived = new_state;
902     else
903       team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
904     KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
905                   "arrived(%p) = %llu\n",
906                   gtid, team->t.t_id, tid, team->t.t_id,
907                   &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
908   }
909   KA_TRACE(20,
910            ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
911             gtid, team->t.t_id, tid, bt));
912 }
913 
914 static void __kmp_tree_barrier_release(
915     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
916     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
917   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
918   kmp_team_t *team;
919   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
920   kmp_uint32 nproc;
921   kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
922   kmp_uint32 branch_factor = 1 << branch_bits;
923   kmp_uint32 child;
924   kmp_uint32 child_tid;
925 
926   // Perform a tree release for all of the threads that have been gathered
927   if (!KMP_MASTER_TID(
928           tid)) { // Handle fork barrier workers who aren't part of a team yet
929     KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
930                   &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
931     // Wait for parent thread to release us
932     kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
933     flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
934     ANNOTATE_BARRIER_END(this_thr);
935 #if USE_ITT_BUILD && USE_ITT_NOTIFY
936     if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
937       // In fork barrier where we could not get the object reliably (or
938       // ITTNOTIFY is disabled)
939       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
940       // Cancel wait on previous parallel region...
941       __kmp_itt_task_starting(itt_sync_obj);
942 
943       if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
944         return;
945 
946       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
947       if (itt_sync_obj != NULL)
948         // Call prepare as early as possible for "new" barrier
949         __kmp_itt_task_finished(itt_sync_obj);
950     } else
951 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
952         // Early exit for reaping threads releasing forkjoin barrier
953         if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
954       return;
955 
956     // The worker thread may now assume that the team is valid.
957     team = __kmp_threads[gtid]->th.th_team;
958     KMP_DEBUG_ASSERT(team != NULL);
959     tid = __kmp_tid_from_gtid(gtid);
960 
961     TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
962     KA_TRACE(20,
963              ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
964               team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
965     KMP_MB(); // Flush all pending memory write invalidates.
966   } else {
967     team = __kmp_threads[gtid]->th.th_team;
968     KMP_DEBUG_ASSERT(team != NULL);
969     KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) primary enter for "
970                   "barrier type %d\n",
971                   gtid, team->t.t_id, tid, bt));
972   }
973   nproc = this_thr->th.th_team_nproc;
974   child_tid = (tid << branch_bits) + 1;
975 
976   if (child_tid < nproc) {
977     kmp_info_t **other_threads = team->t.t_threads;
978     child = 1;
979     // Parent threads release all their children
980     do {
981       kmp_info_t *child_thr = other_threads[child_tid];
982       kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
983 #if KMP_CACHE_MANAGE
984       // Prefetch next thread's go count
985       if (child + 1 <= branch_factor && child_tid + 1 < nproc)
986         KMP_CACHE_PREFETCH(
987             &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
988 #endif /* KMP_CACHE_MANAGE */
989 
990 #if KMP_BARRIER_ICV_PUSH
991       {
992         KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
993         if (propagate_icvs) {
994           __kmp_init_implicit_task(team->t.t_ident,
995                                    team->t.t_threads[child_tid], team,
996                                    child_tid, FALSE);
997           copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
998                     &team->t.t_implicit_task_taskdata[0].td_icvs);
999         }
1000       }
1001 #endif // KMP_BARRIER_ICV_PUSH
1002       KA_TRACE(20,
1003                ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
1004                 "go(%p): %u => %u\n",
1005                 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1006                 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1007                 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1008       // Release child from barrier
1009       ANNOTATE_BARRIER_BEGIN(child_thr);
1010       kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1011       flag.release();
1012       child++;
1013       child_tid++;
1014     } while (child <= branch_factor && child_tid < nproc);
1015   }
1016   KA_TRACE(
1017       20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1018            gtid, team->t.t_id, tid, bt));
1019 }
1020 
1021 // Hyper Barrier
1022 static void __kmp_hyper_barrier_gather(
1023     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1024     void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1025   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
1026   kmp_team_t *team = this_thr->th.th_team;
1027   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1028   kmp_info_t **other_threads = team->t.t_threads;
1029   kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
1030   kmp_uint32 num_threads = this_thr->th.th_team_nproc;
1031   kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
1032   kmp_uint32 branch_factor = 1 << branch_bits;
1033   kmp_uint32 offset;
1034   kmp_uint32 level;
1035 
1036   KA_TRACE(
1037       20,
1038       ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
1039        gtid, team->t.t_id, tid, bt));
1040   KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
1041 
1042 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1043   // Barrier imbalance - save arrive time to the thread
1044   if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
1045     this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
1046         __itt_get_timestamp();
1047   }
1048 #endif
1049   /* Perform a hypercube-embedded tree gather to wait until all of the threads
1050      have arrived, and reduce any required data as we go.  */
1051   kmp_flag_64<> p_flag(&thr_bar->b_arrived);
1052   for (level = 0, offset = 1; offset < num_threads;
1053        level += branch_bits, offset <<= branch_bits) {
1054     kmp_uint32 child;
1055     kmp_uint32 child_tid;
1056 
1057     if (((tid >> level) & (branch_factor - 1)) != 0) {
1058       kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);
1059 
1060       KMP_MB(); // Synchronize parent and child threads.
1061       KA_TRACE(20,
1062                ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
1063                 "arrived(%p): %llu => %llu\n",
1064                 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
1065                 team->t.t_id, parent_tid, &thr_bar->b_arrived,
1066                 thr_bar->b_arrived,
1067                 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
1068       // Mark arrival to parent thread
1069       /* After performing this write (in the last iteration of the enclosing for
1070          loop), a worker thread may not assume that the team is valid any more
1071          - it could be deallocated by the primary thread at any time.  */
1072       ANNOTATE_BARRIER_BEGIN(this_thr);
1073       p_flag.set_waiter(other_threads[parent_tid]);
1074       p_flag.release();
1075       break;
1076     }
1077 
1078     // Parent threads wait for children to arrive
1079     if (new_state == KMP_BARRIER_UNUSED_STATE)
1080       new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1081     for (child = 1, child_tid = tid + (1 << level);
1082          child < branch_factor && child_tid < num_threads;
1083          child++, child_tid += (1 << level)) {
1084       kmp_info_t *child_thr = other_threads[child_tid];
1085       kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1086 #if KMP_CACHE_MANAGE
1087       kmp_uint32 next_child_tid = child_tid + (1 << level);
1088       // Prefetch next thread's arrived count
1089       if (child + 1 < branch_factor && next_child_tid < num_threads)
1090         KMP_CACHE_PREFETCH(
1091             &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
1092 #endif /* KMP_CACHE_MANAGE */
1093       KA_TRACE(20,
1094                ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
1095                 "arrived(%p) == %llu\n",
1096                 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1097                 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
1098       // Wait for child to arrive
1099       kmp_flag_64<> c_flag(&child_bar->b_arrived, new_state);
1100       c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1101       ANNOTATE_BARRIER_END(child_thr);
1102       KMP_MB(); // Synchronize parent and child threads.
1103 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1104       // Barrier imbalance - write min of the thread time and a child time to
1105       // the thread.
1106       if (__kmp_forkjoin_frames_mode == 2) {
1107         this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
1108                                                child_thr->th.th_bar_min_time);
1109       }
1110 #endif
1111       if (reduce) {
1112         KA_TRACE(100,
1113                  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
1114                   gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1115                   team->t.t_id, child_tid));
1116         ANNOTATE_REDUCE_AFTER(reduce);
1117         OMPT_REDUCTION_DECL(this_thr, gtid);
1118         OMPT_REDUCTION_BEGIN;
1119         (*reduce)(this_thr->th.th_local.reduce_data,
1120                   child_thr->th.th_local.reduce_data);
1121         OMPT_REDUCTION_END;
1122         ANNOTATE_REDUCE_BEFORE(reduce);
1123         ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
1124       }
1125     }
1126   }
1127 
1128   if (KMP_MASTER_TID(tid)) {
1129     // Need to update the team arrived pointer if we are the primary thread
1130     if (new_state == KMP_BARRIER_UNUSED_STATE)
1131       team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
1132     else
1133       team->t.t_bar[bt].b_arrived = new_state;
1134     KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
1135                   "arrived(%p) = %llu\n",
1136                   gtid, team->t.t_id, tid, team->t.t_id,
1137                   &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1138   }
1139   KA_TRACE(
1140       20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
1141            gtid, team->t.t_id, tid, bt));
1142 }
1143 
1144 // The reverse versions seem to beat the forward versions overall
1145 #define KMP_REVERSE_HYPER_BAR
1146 static void __kmp_hyper_barrier_release(
1147     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1148     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1149   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
1150   kmp_team_t *team;
1151   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1152   kmp_info_t **other_threads;
1153   kmp_uint32 num_threads;
1154   kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
1155   kmp_uint32 branch_factor = 1 << branch_bits;
1156   kmp_uint32 child;
1157   kmp_uint32 child_tid;
1158   kmp_uint32 offset;
1159   kmp_uint32 level;
1160 
1161   /* Perform a hypercube-embedded tree release for all of the threads that have
1162      been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
1163      are released in the reverse order of the corresponding gather, otherwise
1164      threads are released in the same order. */
1165   if (KMP_MASTER_TID(tid)) { // primary thread
1166     team = __kmp_threads[gtid]->th.th_team;
1167     KMP_DEBUG_ASSERT(team != NULL);
1168     KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) primary enter for "
1169                   "barrier type %d\n",
1170                   gtid, team->t.t_id, tid, bt));
1171 #if KMP_BARRIER_ICV_PUSH
1172     if (propagate_icvs) { // primary already has ICVs in final destination; copy
1173       copy_icvs(&thr_bar->th_fixed_icvs,
1174                 &team->t.t_implicit_task_taskdata[tid].td_icvs);
1175     }
1176 #endif
1177   } else { // Handle fork barrier workers who aren't part of a team yet
1178     KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
1179                   &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
1180     // Wait for parent thread to release us
1181     kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1182     flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1183     ANNOTATE_BARRIER_END(this_thr);
1184 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1185     if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
1186       // In fork barrier where we could not get the object reliably
1187       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
1188       // Cancel wait on previous parallel region...
1189       __kmp_itt_task_starting(itt_sync_obj);
1190 
1191       if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1192         return;
1193 
1194       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1195       if (itt_sync_obj != NULL)
1196         // Call prepare as early as possible for "new" barrier
1197         __kmp_itt_task_finished(itt_sync_obj);
1198     } else
1199 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1200         // Early exit for reaping threads releasing forkjoin barrier
1201         if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1202       return;
1203 
1204     // The worker thread may now assume that the team is valid.
1205     team = __kmp_threads[gtid]->th.th_team;
1206     KMP_DEBUG_ASSERT(team != NULL);
1207     tid = __kmp_tid_from_gtid(gtid);
1208 
1209     TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
1210     KA_TRACE(20,
1211              ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1212               gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1213     KMP_MB(); // Flush all pending memory write invalidates.
1214   }
1215   num_threads = this_thr->th.th_team_nproc;
1216   other_threads = team->t.t_threads;
1217 
1218 #ifdef KMP_REVERSE_HYPER_BAR
1219   // Count up to correct level for parent
1220   for (level = 0, offset = 1;
1221        offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
1222        level += branch_bits, offset <<= branch_bits)
1223     ;
1224 
1225   // Now go down from there
1226   for (level -= branch_bits, offset >>= branch_bits; offset != 0;
1227        level -= branch_bits, offset >>= branch_bits)
1228 #else
1229   // Go down the tree, level by level
1230   for (level = 0, offset = 1; offset < num_threads;
1231        level += branch_bits, offset <<= branch_bits)
1232 #endif // KMP_REVERSE_HYPER_BAR
1233   {
1234 #ifdef KMP_REVERSE_HYPER_BAR
1235     /* Now go in reverse order through the children, highest to lowest.
1236        Initial setting of child is conservative here. */
1237     child = num_threads >> ((level == 0) ? level : level - 1);
1238     for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
1239         child_tid = tid + (child << level);
1240          child >= 1; child--, child_tid -= (1 << level))
1241 #else
1242     if (((tid >> level) & (branch_factor - 1)) != 0)
1243       // No need to go lower than this, since this is the level parent would be
1244       // notified
1245       break;
1246     // Iterate through children on this level of the tree
1247     for (child = 1, child_tid = tid + (1 << level);
1248          child < branch_factor && child_tid < num_threads;
1249          child++, child_tid += (1 << level))
1250 #endif // KMP_REVERSE_HYPER_BAR
1251     {
1252       if (child_tid >= num_threads)
1253         continue; // Child doesn't exist so keep going
1254       else {
1255         kmp_info_t *child_thr = other_threads[child_tid];
1256         kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1257 #if KMP_CACHE_MANAGE
1258         kmp_uint32 next_child_tid = child_tid - (1 << level);
1259 // Prefetch next thread's go count
1260 #ifdef KMP_REVERSE_HYPER_BAR
1261         if (child - 1 >= 1 && next_child_tid < num_threads)
1262 #else
1263         if (child + 1 < branch_factor && next_child_tid < num_threads)
1264 #endif // KMP_REVERSE_HYPER_BAR
1265           KMP_CACHE_PREFETCH(
1266               &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
1267 #endif /* KMP_CACHE_MANAGE */
1268 
1269 #if KMP_BARRIER_ICV_PUSH
1270         if (propagate_icvs) // push my fixed ICVs to my child
1271           copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1272 #endif // KMP_BARRIER_ICV_PUSH
1273 
1274         KA_TRACE(
1275             20,
1276             ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
1277              "go(%p): %u => %u\n",
1278              gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1279              team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1280              child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1281         // Release child from barrier
1282         ANNOTATE_BARRIER_BEGIN(child_thr);
1283         kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1284         flag.release();
1285       }
1286     }
1287   }
1288 #if KMP_BARRIER_ICV_PUSH
1289   if (propagate_icvs &&
1290       !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
1291     __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1292                              FALSE);
1293     copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1294               &thr_bar->th_fixed_icvs);
1295   }
1296 #endif
1297   KA_TRACE(
1298       20,
1299       ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1300        gtid, team->t.t_id, tid, bt));
1301 }
1302 
1303 // Hierarchical Barrier
1304 
1305 // Initialize thread barrier data
1306 /* Initializes/re-initializes the hierarchical barrier data stored on a thread.
1307    Performs the minimum amount of initialization required based on how the team
1308    has changed. Returns true if leaf children will require both on-core and
1309    traditional wake-up mechanisms. For example, if the team size increases,
1310    threads already in the team will respond to on-core wakeup on their parent
1311    thread, but threads newly added to the team will only be listening on the
1312    their local b_go. */
1313 static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
1314                                                    kmp_bstate_t *thr_bar,
1315                                                    kmp_uint32 nproc, int gtid,
1316                                                    int tid, kmp_team_t *team) {
1317   // Checks to determine if (re-)initialization is needed
1318   bool uninitialized = thr_bar->team == NULL;
1319   bool team_changed = team != thr_bar->team;
1320   bool team_sz_changed = nproc != thr_bar->nproc;
1321   bool tid_changed = tid != thr_bar->old_tid;
1322   bool retval = false;
1323 
1324   if (uninitialized || team_sz_changed) {
1325     __kmp_get_hierarchy(nproc, thr_bar);
1326   }
1327 
1328   if (uninitialized || team_sz_changed || tid_changed) {
1329     thr_bar->my_level = thr_bar->depth - 1; // default for primary thread
1330     thr_bar->parent_tid = -1; // default for primary thread
1331     if (!KMP_MASTER_TID(tid)) {
1332       // if not primary thread, find parent thread in hierarchy
1333       kmp_uint32 d = 0;
1334       while (d < thr_bar->depth) { // find parent based on level of thread in
1335         // hierarchy, and note level
1336         kmp_uint32 rem;
1337         if (d == thr_bar->depth - 2) { // reached level right below the primary
1338           thr_bar->parent_tid = 0;
1339           thr_bar->my_level = d;
1340           break;
1341         } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) != 0) {
1342           // TODO: can we make the above op faster?
1343           // thread is not a subtree root at next level, so this is max
1344           thr_bar->parent_tid = tid - rem;
1345           thr_bar->my_level = d;
1346           break;
1347         }
1348         ++d;
1349       }
1350     }
1351     __kmp_type_convert(7 - ((tid - thr_bar->parent_tid) /
1352                             (thr_bar->skip_per_level[thr_bar->my_level])),
1353                        &(thr_bar->offset));
1354     thr_bar->old_tid = tid;
1355     thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1356     thr_bar->team = team;
1357     thr_bar->parent_bar =
1358         &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
1359   }
1360   if (uninitialized || team_changed || tid_changed) {
1361     thr_bar->team = team;
1362     thr_bar->parent_bar =
1363         &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
1364     retval = true;
1365   }
1366   if (uninitialized || team_sz_changed || tid_changed) {
1367     thr_bar->nproc = nproc;
1368     thr_bar->leaf_kids = thr_bar->base_leaf_kids;
1369     if (thr_bar->my_level == 0)
1370       thr_bar->leaf_kids = 0;
1371     if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
1372       __kmp_type_convert(nproc - tid - 1, &(thr_bar->leaf_kids));
1373     thr_bar->leaf_state = 0;
1374     for (int i = 0; i < thr_bar->leaf_kids; ++i)
1375       ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
1376   }
1377   return retval;
1378 }
1379 
1380 static void __kmp_hierarchical_barrier_gather(
1381     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1382     void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1383   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
1384   kmp_team_t *team = this_thr->th.th_team;
1385   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1386   kmp_uint32 nproc = this_thr->th.th_team_nproc;
1387   kmp_info_t **other_threads = team->t.t_threads;
1388   kmp_uint64 new_state = 0;
1389 
1390   int level = team->t.t_level;
1391   if (other_threads[0]
1392           ->th.th_teams_microtask) // are we inside the teams construct?
1393     if (this_thr->th.th_teams_size.nteams > 1)
1394       ++level; // level was not increased in teams construct for team_of_masters
1395   if (level == 1)
1396     thr_bar->use_oncore_barrier = 1;
1397   else
1398     thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
1399 
1400   KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
1401                 "barrier type %d\n",
1402                 gtid, team->t.t_id, tid, bt));
1403   KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
1404 
1405 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1406   // Barrier imbalance - save arrive time to the thread
1407   if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
1408     this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
1409   }
1410 #endif
1411 
1412   (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
1413                                                team);
1414 
1415   if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
1416     kmp_int32 child_tid;
1417     new_state =
1418         (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1419     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1420         thr_bar->use_oncore_barrier) {
1421       if (thr_bar->leaf_kids) {
1422         // First, wait for leaf children to check-in on my b_arrived flag
1423         kmp_uint64 leaf_state =
1424             KMP_MASTER_TID(tid)
1425                 ? thr_bar->b_arrived | thr_bar->leaf_state
1426                 : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
1427         KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
1428                       "for leaf kids\n",
1429                       gtid, team->t.t_id, tid));
1430         kmp_flag_64<> flag(&thr_bar->b_arrived, leaf_state);
1431         flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1432         if (reduce) {
1433           ANNOTATE_REDUCE_AFTER(reduce);
1434           OMPT_REDUCTION_DECL(this_thr, gtid);
1435           OMPT_REDUCTION_BEGIN;
1436           for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
1437                ++child_tid) {
1438             KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
1439                            "T#%d(%d:%d)\n",
1440                            gtid, team->t.t_id, tid,
1441                            __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1442                            child_tid));
1443             ANNOTATE_BARRIER_END(other_threads[child_tid]);
1444             (*reduce)(this_thr->th.th_local.reduce_data,
1445                       other_threads[child_tid]->th.th_local.reduce_data);
1446           }
1447           OMPT_REDUCTION_END;
1448           ANNOTATE_REDUCE_BEFORE(reduce);
1449           ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
1450         }
1451         // clear leaf_state bits
1452         KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
1453       }
1454       // Next, wait for higher level children on each child's b_arrived flag
1455       for (kmp_uint32 d = 1; d < thr_bar->my_level;
1456            ++d) { // gather lowest level threads first, but skip 0
1457         kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
1458                    skip = thr_bar->skip_per_level[d];
1459         if (last > nproc)
1460           last = nproc;
1461         for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1462           kmp_info_t *child_thr = other_threads[child_tid];
1463           kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1464           KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
1465                         "T#%d(%d:%d) "
1466                         "arrived(%p) == %llu\n",
1467                         gtid, team->t.t_id, tid,
1468                         __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1469                         child_tid, &child_bar->b_arrived, new_state));
1470           kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
1471           flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1472           ANNOTATE_BARRIER_END(child_thr);
1473           if (reduce) {
1474             KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
1475                            "T#%d(%d:%d)\n",
1476                            gtid, team->t.t_id, tid,
1477                            __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1478                            child_tid));
1479             ANNOTATE_REDUCE_AFTER(reduce);
1480             (*reduce)(this_thr->th.th_local.reduce_data,
1481                       child_thr->th.th_local.reduce_data);
1482             ANNOTATE_REDUCE_BEFORE(reduce);
1483             ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
1484           }
1485         }
1486       }
1487     } else { // Blocktime is not infinite
1488       for (kmp_uint32 d = 0; d < thr_bar->my_level;
1489            ++d) { // Gather lowest level threads first
1490         kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
1491                    skip = thr_bar->skip_per_level[d];
1492         if (last > nproc)
1493           last = nproc;
1494         for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1495           kmp_info_t *child_thr = other_threads[child_tid];
1496           kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1497           KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
1498                         "T#%d(%d:%d) "
1499                         "arrived(%p) == %llu\n",
1500                         gtid, team->t.t_id, tid,
1501                         __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1502                         child_tid, &child_bar->b_arrived, new_state));
1503           kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
1504           flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1505           ANNOTATE_BARRIER_END(child_thr);
1506           if (reduce) {
1507             KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
1508                            "T#%d(%d:%d)\n",
1509                            gtid, team->t.t_id, tid,
1510                            __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1511                            child_tid));
1512             ANNOTATE_REDUCE_AFTER(reduce);
1513             (*reduce)(this_thr->th.th_local.reduce_data,
1514                       child_thr->th.th_local.reduce_data);
1515             ANNOTATE_REDUCE_BEFORE(reduce);
1516             ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
1517           }
1518         }
1519       }
1520     }
1521   }
1522   // All subordinates are gathered; now release parent if not primary thread
1523 
1524   if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
1525     KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
1526                   " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
1527                   gtid, team->t.t_id, tid,
1528                   __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
1529                   thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
1530                   thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
1531     /* Mark arrival to parent: After performing this write, a worker thread may
1532        not assume that the team is valid any more - it could be deallocated by
1533        the primary thread at any time. */
1534     if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
1535         !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
1536       // flag; release it
1537       ANNOTATE_BARRIER_BEGIN(this_thr);
1538       kmp_flag_64<> flag(&thr_bar->b_arrived,
1539                          other_threads[thr_bar->parent_tid]);
1540       flag.release();
1541     } else {
1542       // Leaf does special release on "offset" bits of parent's b_arrived flag
1543       thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1544       kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived,
1545                            thr_bar->offset + 1);
1546       flag.set_waiter(other_threads[thr_bar->parent_tid]);
1547       flag.release();
1548     }
1549   } else { // Primary thread needs to update the team's b_arrived value
1550     team->t.t_bar[bt].b_arrived = new_state;
1551     KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
1552                   "arrived(%p) = %llu\n",
1553                   gtid, team->t.t_id, tid, team->t.t_id,
1554                   &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1555   }
1556   // Is the team access below unsafe or just technically invalid?
1557   KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
1558                 "barrier type %d\n",
1559                 gtid, team->t.t_id, tid, bt));
1560 }
1561 
// Release phase of the hierarchical barrier for one thread of a team.
//
// A worker first blocks until its parent releases it: either on its own b_go
// flag, or (leaf thread, on-core barrier in use, infinite blocktime, barrier
// data already initialized) on its byte of the parent's b_go flag. Every
// thread then refreshes its hierarchy data through
// __kmp_init_hierarchical_barrier_thread(), optionally propagates ICVs when
// KMP_BARRIER_ICV_PUSH is enabled, and finally releases its own children.
//
//   bt             - barrier type; selects the per-thread and per-team state
//   this_thr       - descriptor of the calling thread
//   gtid           - global thread id of the calling thread
//   tid            - id of the calling thread within its team; for workers it
//                    is recomputed from gtid once they have been released
//   propagate_icvs - non-zero to copy ICVs down the hierarchy (only honored
//                    when KMP_BARRIER_ICV_PUSH is enabled)
//   itt_sync_obj   - ITT sync object, passed through USE_ITT_BUILD_ARG
//
// A worker returns immediately after being released when bt is
// bs_forkjoin_barrier and __kmp_global.g.g_done is set, without touching the
// team.
static void __kmp_hierarchical_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
  kmp_team_t *team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_uint32 nproc;
  bool team_change = false; // indicates on-core barrier shouldn't be used

  if (KMP_MASTER_TID(tid)) {
    // The primary thread does not wait for anybody; it only looks up its team.
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) primary "
                  "entered barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
  } else { // Worker threads
    // Wait for parent thread to release me
    if (!thr_bar->use_oncore_barrier ||
        __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
        thr_bar->team == NULL) {
      // Use traditional method of waiting on my own b_go flag
      thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
      kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
      flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
      ANNOTATE_BARRIER_END(this_thr);
      TCW_8(thr_bar->b_go,
            KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
    } else { // Thread barrier data is initialized, this is a leaf, blocktime is
      // infinite, not nested
      // Wait on my "offset" bits on parent's b_go flag
      thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
      kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
                           thr_bar->offset + 1, bt,
                           this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
      flag.wait(this_thr, TRUE);
      // wait_flag may have been changed while waiting; it tells which flag
      // must be reset now.
      if (thr_bar->wait_flag ==
          KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
        TCW_8(thr_bar->b_go,
              KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
      } else { // Reset my bits on parent's b_go flag
        // Clears only this thread's byte (index offset + 1) of the parent's
        // b_go flag.
        (RCAST(volatile char *,
               &(thr_bar->parent_bar->b_go)))[thr_bar->offset + 1] = 0;
      }
    }
    thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
    // Early exit for reaping threads releasing forkjoin barrier
    if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
      return;
    // The worker thread may now assume that the team is valid.
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    // Re-derive tid from gtid instead of trusting the value passed in.
    tid = __kmp_tid_from_gtid(gtid);

    KA_TRACE(
        20,
        ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
         gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
  }

  nproc = this_thr->th.th_team_nproc;
  // Compute the effective nesting level; the teams construct does not bump
  // t_level itself, so it is adjusted here.
  int level = team->t.t_level;
  if (team->t.t_threads[0]
          ->th.th_teams_microtask) { // are we inside the teams construct?
    if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
        this_thr->th.th_teams_level == level)
      ++level; // level was not increased in teams construct for team_of_workers
    if (this_thr->th.th_teams_size.nteams > 1)
      ++level; // level was not increased in teams construct for team_of_masters
  }
  if (level == 1)
    thr_bar->use_oncore_barrier = 1;
  else
    thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested

  // If the team size has increased, we still communicate with old leaves via
  // oncore barrier.
  // Snapshot the leaf data before it is re-initialized below.
  unsigned short int old_leaf_kids = thr_bar->leaf_kids;
  kmp_uint64 old_leaf_state = thr_bar->leaf_state;
  team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
                                                       tid, team);
  // But if the entire team changes, we won't use oncore barrier at all
  if (team_change)
    old_leaf_kids = 0;

#if KMP_BARRIER_ICV_PUSH
  if (propagate_icvs) {
    __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
                             FALSE);
    if (KMP_MASTER_TID(
            tid)) { // primary already has copy in final destination; copy
      // Publish the primary's ICVs in its fixed ICV slot for the children.
      copy_icvs(&thr_bar->th_fixed_icvs,
                &team->t.t_implicit_task_taskdata[tid].td_icvs);
    } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
               thr_bar->use_oncore_barrier) { // optimization for inf blocktime
      if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
        // leaves (on-core children) pull parent's fixed ICVs directly to local
        // ICV store
        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                  &thr_bar->parent_bar->th_fixed_icvs);
      // non-leaves will get ICVs piggybacked with b_go via NGO store
    } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
      if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can
        // access
        copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
      else // leaves copy parent's fixed ICVs directly to local ICV store
        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                  &thr_bar->parent_bar->th_fixed_icvs);
    }
  }
#endif // KMP_BARRIER_ICV_PUSH

  // Now, release my children
  if (thr_bar->my_level) { // not a leaf
    kmp_int32 child_tid;
    kmp_uint32 last;
    if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
        thr_bar->use_oncore_barrier) {
      if (KMP_MASTER_TID(tid)) { // do a flat release
        // Set local b_go to bump children via NGO store of the cache line
        // containing ICVs and b_go.
        thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
        // Use ngo stores if available; b_go piggybacks in the last 8 bytes of
        // the cache line
        ngo_load(&thr_bar->th_fixed_icvs);
        // This loops over all the threads skipping only the leaf nodes in the
        // hierarchy
        for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
             child_tid += thr_bar->skip_per_level[1]) {
          kmp_bstate_t *child_bar =
              &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
          KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
                        "releasing T#%d(%d:%d)"
                        " go(%p): %u => %u\n",
                        gtid, team->t.t_id, tid,
                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                        child_tid, &child_bar->b_go, child_bar->b_go,
                        child_bar->b_go + KMP_BARRIER_STATE_BUMP));
          // Use ngo store (if available) to both store ICVs and release child
          // via child's b_go
          ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
        }
        ngo_sync();
      }
      TCW_8(thr_bar->b_go,
            KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
      // Now, release leaf children
      if (thr_bar->leaf_kids) { // if there are any
        // We test team_change on the off-chance that the level 1 team changed.
        if (team_change ||
            old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
          if (old_leaf_kids) { // release old leaf kids
            // Old leaves are still waiting on their bytes of my b_go flag.
            thr_bar->b_go |= old_leaf_state;
          }
          // Release new leaf kids
          last = tid + thr_bar->skip_per_level[1];
          if (last > nproc)
            last = nproc;
          for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
               ++child_tid) { // skip_per_level[0]=1
            kmp_info_t *child_thr = team->t.t_threads[child_tid];
            kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
            KA_TRACE(
                20,
                ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
                 " T#%d(%d:%d) go(%p): %u => %u\n",
                 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
            // Release child using child's b_go flag
            ANNOTATE_BARRIER_BEGIN(child_thr);
            kmp_flag_64<> flag(&child_bar->b_go, child_thr);
            flag.release();
          }
        } else { // Release all children at once with leaf_state bits on my own
          // b_go flag
          thr_bar->b_go |= thr_bar->leaf_state;
        }
      }
    } else { // Blocktime is not infinite; do a simple hierarchical release
      for (int d = thr_bar->my_level - 1; d >= 0;
           --d) { // Release highest level threads first
        // Children at level d are spaced skip_per_level[d] apart and lie
        // below tid + skip_per_level[d + 1], clamped to the team size.
        last = tid + thr_bar->skip_per_level[d + 1];
        kmp_uint32 skip = thr_bar->skip_per_level[d];
        if (last > nproc)
          last = nproc;
        for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
          kmp_info_t *child_thr = team->t.t_threads[child_tid];
          kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
          KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
                        "releasing T#%d(%d:%d) go(%p): %u => %u\n",
                        gtid, team->t.t_id, tid,
                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                        child_tid, &child_bar->b_go, child_bar->b_go,
                        child_bar->b_go + KMP_BARRIER_STATE_BUMP));
          // Release child using child's b_go flag
          ANNOTATE_BARRIER_BEGIN(child_thr);
          kmp_flag_64<> flag(&child_bar->b_go, child_thr);
          flag.release();
        }
      }
    }
#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs && !KMP_MASTER_TID(tid))
      // non-leaves copy ICVs from fixed ICVs to local dest
      copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                &thr_bar->th_fixed_icvs);
#endif // KMP_BARRIER_ICV_PUSH
  }
  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
                "barrier type %d\n",
                gtid, team->t.t_id, tid, bt));
}
1775 
1776 // End of Barrier Algorithms
1777 
// Type trait holding the "was this barrier cancelled?" flag.
//  - is_cancellable<true> wraps an ordinary runtime boolean.
//  - is_cancellable<false> stores nothing: it always converts to false, and
//    that conversion is usable as a compile time constant.
template <bool cancellable> struct is_cancellable {};

// Non-cancellable flavor: assignments are accepted so that both flavors can
// be used by the same code, but the assigned value is discarded.
template <> struct is_cancellable<false> {
  is_cancellable &operator=(bool) { return *this; }
  constexpr operator bool() const { return false; }
};

// Cancellable flavor: behaves like a plain bool that starts out false.
template <> struct is_cancellable<true> {
  bool value;
  is_cancellable() : is_cancellable(false) {}
  is_cancellable(bool initial) : value(initial) {}
  is_cancellable &operator=(bool updated) {
    value = updated;
    return *this;
  }
  operator bool() const { return value; }
};
1796 
1797 // Internal function to do a barrier.
1798 /* If is_split is true, do a split barrier, otherwise, do a plain barrier
1799    If reduce is non-NULL, do a split reduction barrier, otherwise, do a split
1800    barrier
1801    When cancellable = false,
1802      Returns 0 if primary thread, 1 if worker thread.
1803    When cancellable = true
1804      Returns 0 if not cancelled, 1 if cancelled.  */
1805 template <bool cancellable = false>
1806 static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
1807                                   size_t reduce_size, void *reduce_data,
1808                                   void (*reduce)(void *, void *)) {
1809   KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
1810   KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1811   int tid = __kmp_tid_from_gtid(gtid);
1812   kmp_info_t *this_thr = __kmp_threads[gtid];
1813   kmp_team_t *team = this_thr->th.th_team;
1814   int status = 0;
1815   is_cancellable<cancellable> cancelled;
1816 #if OMPT_SUPPORT && OMPT_OPTIONAL
1817   ompt_data_t *my_task_data;
1818   ompt_data_t *my_parallel_data;
1819   void *return_address;
1820   ompt_sync_region_t barrier_kind;
1821 #endif
1822 
1823   KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
1824                 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1825 
1826   ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1827 #if OMPT_SUPPORT
1828   if (ompt_enabled.enabled) {
1829 #if OMPT_OPTIONAL
1830     my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1831     my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1832     return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1833     barrier_kind = __ompt_get_barrier_kind(bt, this_thr);
1834     if (ompt_enabled.ompt_callback_sync_region) {
1835       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1836           barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1837           return_address);
1838     }
1839     if (ompt_enabled.ompt_callback_sync_region_wait) {
1840       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1841           barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1842           return_address);
1843     }
1844 #endif
1845     // It is OK to report the barrier state after the barrier begin callback.
1846     // According to the OMPT specification, a compliant implementation may
1847     // even delay reporting this state until the barrier begins to wait.
1848     this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1849   }
1850 #endif
1851 
1852   if (!team->t.t_serialized) {
1853 #if USE_ITT_BUILD
1854     // This value will be used in itt notify events below.
1855     void *itt_sync_obj = NULL;
1856 #if USE_ITT_NOTIFY
1857     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1858       itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1859 #endif
1860 #endif /* USE_ITT_BUILD */
1861     if (__kmp_tasking_mode == tskm_extra_barrier) {
1862       __kmp_tasking_barrier(team, this_thr, gtid);
1863       KA_TRACE(15,
1864                ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1865                 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1866     }
1867 
1868     /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1869        access it when the team struct is not guaranteed to exist. */
1870     // See note about the corresponding code in __kmp_join_barrier() being
1871     // performance-critical.
1872     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1873 #if KMP_USE_MONITOR
1874       this_thr->th.th_team_bt_intervals =
1875           team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1876       this_thr->th.th_team_bt_set =
1877           team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1878 #else
1879       this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1880 #endif
1881     }
1882 
1883 #if USE_ITT_BUILD
1884     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1885       __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1886 #endif /* USE_ITT_BUILD */
1887 #if USE_DEBUGGER
1888     // Let the debugger know: the thread arrived to the barrier and waiting.
1889     if (KMP_MASTER_TID(tid)) { // Primary thread counter stored in team struct
1890       team->t.t_bar[bt].b_master_arrived += 1;
1891     } else {
1892       this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1893     } // if
1894 #endif /* USE_DEBUGGER */
1895     if (reduce != NULL) {
1896       // KMP_DEBUG_ASSERT( is_split == TRUE );  // #C69956
1897       this_thr->th.th_local.reduce_data = reduce_data;
1898     }
1899 
1900     if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
1901       // use 0 to only setup the current team if nthreads > 1
1902       __kmp_task_team_setup(this_thr, team, 0);
1903 
1904     if (cancellable) {
1905       cancelled = __kmp_linear_barrier_gather_cancellable(
1906           bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1907     } else {
1908       switch (__kmp_barrier_gather_pattern[bt]) {
1909       case bp_dist_bar: {
1910         __kmp_dist_barrier_gather(bt, this_thr, gtid, tid,
1911                                   reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1912         break;
1913       }
1914       case bp_hyper_bar: {
1915         // don't set branch bits to 0; use linear
1916         KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1917         __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
1918                                    reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1919         break;
1920       }
1921       case bp_hierarchical_bar: {
1922         __kmp_hierarchical_barrier_gather(
1923             bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1924         break;
1925       }
1926       case bp_tree_bar: {
1927         // don't set branch bits to 0; use linear
1928         KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1929         __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
1930                                   reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1931         break;
1932       }
1933       default: {
1934         __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
1935                                     reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1936       }
1937       }
1938     }
1939 
1940     KMP_MB();
1941 
1942     if (KMP_MASTER_TID(tid)) {
1943       status = 0;
1944       if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1945         __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1946       }
1947 #if USE_DEBUGGER
1948       // Let the debugger know: All threads are arrived and starting leaving the
1949       // barrier.
1950       team->t.t_bar[bt].b_team_arrived += 1;
1951 #endif
1952 
1953       if (__kmp_omp_cancellation) {
1954         kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
1955         // Reset cancellation flag for worksharing constructs
1956         if (cancel_request == cancel_loop ||
1957             cancel_request == cancel_sections) {
1958           KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
1959         }
1960       }
1961 #if USE_ITT_BUILD
1962       /* TODO: In case of split reduction barrier, primary thread may send
1963          acquired event early, before the final summation into the shared
1964          variable is done (final summation can be a long operation for array
1965          reductions).  */
1966       if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1967         __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1968 #endif /* USE_ITT_BUILD */
1969 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1970       // Barrier - report frame end (only if active_level == 1)
1971       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1972           __kmp_forkjoin_frames_mode &&
1973           (this_thr->th.th_teams_microtask == NULL || // either not in teams
1974            this_thr->th.th_teams_size.nteams == 1) && // or inside single team
1975           team->t.t_active_level == 1) {
1976         ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1977         kmp_uint64 cur_time = __itt_get_timestamp();
1978         kmp_info_t **other_threads = team->t.t_threads;
1979         int nproc = this_thr->th.th_team_nproc;
1980         int i;
1981         switch (__kmp_forkjoin_frames_mode) {
1982         case 1:
1983           __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1984                                  loc, nproc);
1985           this_thr->th.th_frame_time = cur_time;
1986           break;
1987         case 2: // AC 2015-01-19: currently does not work for hierarchical (to
1988           // be fixed)
1989           __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
1990                                  1, loc, nproc);
1991           break;
1992         case 3:
1993           if (__itt_metadata_add_ptr) {
1994             // Initialize with primary thread's wait time
1995             kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1996             // Set arrive time to zero to be able to check it in
1997             // __kmp_invoke_task(); the same is done inside the loop below
1998             this_thr->th.th_bar_arrive_time = 0;
1999             for (i = 1; i < nproc; ++i) {
2000               delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
2001               other_threads[i]->th.th_bar_arrive_time = 0;
2002             }
2003             __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
2004                                          cur_time, delta,
2005                                          (kmp_uint64)(reduce != NULL));
2006           }
2007           __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
2008                                  loc, nproc);
2009           this_thr->th.th_frame_time = cur_time;
2010           break;
2011         }
2012       }
2013 #endif /* USE_ITT_BUILD */
2014     } else {
2015       status = 1;
2016 #if USE_ITT_BUILD
2017       if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2018         __kmp_itt_barrier_middle(gtid, itt_sync_obj);
2019 #endif /* USE_ITT_BUILD */
2020     }
2021     if ((status == 1 || !is_split) && !cancelled) {
2022       if (cancellable) {
2023         cancelled = __kmp_linear_barrier_release_cancellable(
2024             bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2025       } else {
2026         switch (__kmp_barrier_release_pattern[bt]) {
2027         case bp_dist_bar: {
2028           KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2029           __kmp_dist_barrier_release(bt, this_thr, gtid, tid,
2030                                      FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2031           break;
2032         }
2033         case bp_hyper_bar: {
2034           KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2035           __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
2036                                       FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2037           break;
2038         }
2039         case bp_hierarchical_bar: {
2040           __kmp_hierarchical_barrier_release(
2041               bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2042           break;
2043         }
2044         case bp_tree_bar: {
2045           KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2046           __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
2047                                      FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2048           break;
2049         }
2050         default: {
2051           __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
2052                                        FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2053         }
2054         }
2055       }
2056       if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
2057         __kmp_task_team_sync(this_thr, team);
2058       }
2059     }
2060 
2061 #if USE_ITT_BUILD
2062     /* GEH: TODO: Move this under if-condition above and also include in
2063        __kmp_end_split_barrier(). This will more accurately represent the actual
2064        release time of the threads for split barriers.  */
2065     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2066       __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2067 #endif /* USE_ITT_BUILD */
2068   } else { // Team is serialized.
2069     status = 0;
2070     if (__kmp_tasking_mode != tskm_immediate_exec) {
2071       if (this_thr->th.th_task_team != NULL) {
2072 #if USE_ITT_NOTIFY
2073         void *itt_sync_obj = NULL;
2074         if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2075           itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
2076           __kmp_itt_barrier_starting(gtid, itt_sync_obj);
2077         }
2078 #endif
2079 
2080         KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks ==
2081                          TRUE);
2082         __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
2083         __kmp_task_team_setup(this_thr, team, 0);
2084 
2085 #if USE_ITT_BUILD
2086         if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2087           __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2088 #endif /* USE_ITT_BUILD */
2089       }
2090     }
2091   }
2092   KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
2093                 gtid, __kmp_team_from_gtid(gtid)->t.t_id,
2094                 __kmp_tid_from_gtid(gtid), status));
2095 
2096 #if OMPT_SUPPORT
2097   if (ompt_enabled.enabled) {
2098 #if OMPT_OPTIONAL
2099     if (ompt_enabled.ompt_callback_sync_region_wait) {
2100       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2101           barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
2102           return_address);
2103     }
2104     if (ompt_enabled.ompt_callback_sync_region) {
2105       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2106           barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
2107           return_address);
2108     }
2109 #endif
2110     this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
2111   }
2112 #endif
2113   ANNOTATE_BARRIER_END(&team->t.t_bar);
2114 
2115   if (cancellable)
2116     return (int)cancelled;
2117   return status;
2118 }
2119 
2120 // Returns 0 if primary thread, 1 if worker thread.
2121 int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
2122                   size_t reduce_size, void *reduce_data,
2123                   void (*reduce)(void *, void *)) {
2124   return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
2125                                   reduce);
2126 }
2127 
2128 #if defined(KMP_GOMP_COMPAT)
2129 // Returns 1 if cancelled, 0 otherwise
2130 int __kmp_barrier_gomp_cancel(int gtid) {
2131   if (__kmp_omp_cancellation) {
2132     int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE,
2133                                                  0, NULL, NULL);
2134     if (cancelled) {
2135       int tid = __kmp_tid_from_gtid(gtid);
2136       kmp_info_t *this_thr = __kmp_threads[gtid];
2137       if (KMP_MASTER_TID(tid)) {
2138         // Primary thread does not need to revert anything
2139       } else {
2140         // Workers need to revert their private b_arrived flag
2141         this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -=
2142             KMP_BARRIER_STATE_BUMP;
2143       }
2144     }
2145     return cancelled;
2146   }
2147   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2148   return FALSE;
2149 }
2150 #endif
2151 
2152 void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
2153   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
2154   KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
2155   KMP_DEBUG_ASSERT(bt < bs_last_barrier);
2156   int tid = __kmp_tid_from_gtid(gtid);
2157   kmp_info_t *this_thr = __kmp_threads[gtid];
2158   kmp_team_t *team = this_thr->th.th_team;
2159 
2160   ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
2161   if (!team->t.t_serialized) {
2162     if (KMP_MASTER_GTID(gtid)) {
2163       switch (__kmp_barrier_release_pattern[bt]) {
2164       case bp_dist_bar: {
2165         __kmp_dist_barrier_release(bt, this_thr, gtid, tid,
2166                                    FALSE USE_ITT_BUILD_ARG(NULL));
2167         break;
2168       }
2169       case bp_hyper_bar: {
2170         KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2171         __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
2172                                     FALSE USE_ITT_BUILD_ARG(NULL));
2173         break;
2174       }
2175       case bp_hierarchical_bar: {
2176         __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
2177                                            FALSE USE_ITT_BUILD_ARG(NULL));
2178         break;
2179       }
2180       case bp_tree_bar: {
2181         KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2182         __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
2183                                    FALSE USE_ITT_BUILD_ARG(NULL));
2184         break;
2185       }
2186       default: {
2187         __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
2188                                      FALSE USE_ITT_BUILD_ARG(NULL));
2189       }
2190       }
2191       if (__kmp_tasking_mode != tskm_immediate_exec) {
2192         __kmp_task_team_sync(this_thr, team);
2193       } // if
2194     }
2195   }
2196   ANNOTATE_BARRIER_END(&team->t.t_bar);
2197 }
2198 
// Join half of the fork/join barrier, entered by the thread with global id
// gtid. Each thread reports OMPT/ITT "barrier begin" events, optionally runs
// the extra tasking barrier, caches its blocktime settings, and then takes
// part in the gather routine selected by
// __kmp_barrier_gather_pattern[bs_forkjoin_barrier]. After the gather, only
// the primary thread (tid 0) does further team-level work: waiting on the task
// team, clearing t_display_affinity, flagging workers idle (stats builds) and
// submitting ITT frame data.
void __kmp_join_barrier(int gtid) {
  KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);

  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);

  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team;
  kmp_uint nproc;
  kmp_info_t *master_thread;
  int tid;
#ifdef KMP_DEBUG
  int team_id; // Only needed for trace output
#endif /* KMP_DEBUG */
#if USE_ITT_BUILD
  void *itt_sync_obj = NULL;
#if USE_ITT_NOTIFY
  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
    // Get object created at fork_barrier
    itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
#endif
#endif /* USE_ITT_BUILD */
  KMP_MB();

  // Get current info
  team = this_thr->th.th_team;
  nproc = this_thr->th.th_team_nproc;
  KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
  tid = __kmp_tid_from_gtid(gtid);
#ifdef KMP_DEBUG
  team_id = team->t.t_id;
#endif /* KMP_DEBUG */
  master_thread = this_thr->th.th_team_master;
#ifdef KMP_DEBUG
  // Dump the runtime structures before the assertion below fires, so the
  // inconsistent state is visible in the debug output.
  if (master_thread != team->t.t_threads[0]) {
    __kmp_print_structure();
  }
#endif /* KMP_DEBUG */
  KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
  KMP_MB();

  // Verify state
  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
  KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
                gtid, team_id, tid));

  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
#if OMPT_SUPPORT
  if (ompt_enabled.enabled) {
#if OMPT_OPTIONAL
    ompt_data_t *my_task_data;
    ompt_data_t *my_parallel_data;
    void *codeptr = NULL;
    int ds_tid = this_thr->th.th_info.ds.ds_tid;
    // Only the primary thread reports a return address, and only when one of
    // the sync-region callbacks is registered; everyone else passes NULL.
    if (KMP_MASTER_TID(ds_tid) &&
        (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
         ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
      codeptr = team->t.ompt_team_info.master_return_address;
    my_task_data = OMPT_CUR_TASK_DATA(this_thr);
    my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
          my_task_data, codeptr);
    }
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
          my_task_data, codeptr);
    }
    // Workers keep a private copy of their task data in the thread struct;
    // __kmp_fork_barrier() reads it from there when it has no team pointer.
    if (!KMP_MASTER_TID(ds_tid))
      this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
#endif
    this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier_implicit;
  }
#endif

  if (__kmp_tasking_mode == tskm_extra_barrier) {
    __kmp_tasking_barrier(team, this_thr, gtid);
    KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n",
                  gtid, team_id, tid));
  }
#ifdef KMP_DEBUG
  if (__kmp_tasking_mode != tskm_immediate_exec) {
    KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
                  "%p, th_task_team = %p\n",
                  __kmp_gtid_from_thread(this_thr), team_id,
                  team->t.t_task_team[this_thr->th.th_task_state],
                  this_thr->th.th_task_team));
    // If the thread has a task team, it must be the team's current one.
    if (this_thr->th.th_task_team)
      KMP_DEBUG_ASSERT(this_thr->th.th_task_team ==
                       team->t.t_task_team[this_thr->th.th_task_state]);
  }
#endif /* KMP_DEBUG */

  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
     access it when the team struct is not guaranteed to exist. Doing these
     loads causes a cache miss that slows down EPCC parallel by 2x. As a
     workaround, we do not perform the copy if blocktime=infinite, since the
     values are not used by __kmp_wait_template() in that case. */
  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
#if KMP_USE_MONITOR
    this_thr->th.th_team_bt_intervals =
        team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
    this_thr->th.th_team_bt_set =
        team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
#else
    this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
#endif
  }

#if USE_ITT_BUILD
  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
    __kmp_itt_barrier_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

  // Gather phase: dispatch on the configured fork/join gather pattern. No
  // reduction callback is passed (NULL); linear is the fallback algorithm.
  switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
  case bp_dist_bar: {
    __kmp_dist_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                              NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_hyper_bar: {
    KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
    __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                               NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_hierarchical_bar: {
    __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                                      NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_tree_bar: {
    KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
    __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                              NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  default: {
    __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                                NULL USE_ITT_BUILD_ARG(itt_sync_obj));
  }
  }

  /* From this point on, the team data structure may be deallocated at any time
     by the primary thread - it is unsafe to reference it in any of the worker
     threads. Any per-team data items that need to be referenced before the
     end of the barrier should be moved to the kmp_task_team_t structs.  */
  if (KMP_MASTER_TID(tid)) {
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
    }
    // Reset the team's display-affinity flag (written only if it differs).
    if (__kmp_display_affinity) {
      KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
    }
#if KMP_STATS_ENABLED
    // Have primary thread flag the workers to indicate they are now waiting for
    // next parallel region. Also wake them up so they switch their timers to
    // idle.
    for (int i = 0; i < team->t.t_nproc; ++i) {
      kmp_info_t *team_thread = team->t.t_threads[i];
      if (team_thread == this_thr)
        continue;
      team_thread->th.th_stats->setIdleFlag();
      if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
          team_thread->th.th_sleep_loc != NULL)
        __kmp_null_resume_wrapper(team_thread);
    }
#endif
#if USE_ITT_BUILD
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Join barrier - report frame end
    if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
        __kmp_forkjoin_frames_mode &&
        (this_thr->th.th_teams_microtask == NULL || // either not in teams
         this_thr->th.th_teams_size.nteams == 1) && // or inside single team
        team->t.t_active_level == 1) {
      kmp_uint64 cur_time = __itt_get_timestamp();
      ident_t *loc = team->t.t_ident;
      kmp_info_t **other_threads = team->t.t_threads;
      // NOTE: this int nproc shadows the function-level kmp_uint nproc.
      int nproc = this_thr->th.th_team_nproc;
      int i;
      switch (__kmp_forkjoin_frames_mode) {
      case 1:
        __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                               loc, nproc);
        break;
      case 2:
        __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
                               loc, nproc);
        break;
      case 3:
        if (__itt_metadata_add_ptr) {
          // Initialize with primary thread's wait time
          kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
          // Set arrive time to zero to be able to check it in
          // __kmp_invoke_task(); the same is done inside the loop below
          this_thr->th.th_bar_arrive_time = 0;
          // Accumulate every worker's wait time (now minus its arrival time).
          for (i = 1; i < nproc; ++i) {
            delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
            other_threads[i]->th.th_bar_arrive_time = 0;
          }
          __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
                                       cur_time, delta, 0);
        }
        __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                               loc, nproc);
        this_thr->th.th_frame_time = cur_time;
        break;
      }
    }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
  }
#if USE_ITT_BUILD
  else {
    // Workers only report the ITT "middle" event for the barrier.
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      __kmp_itt_barrier_middle(gtid, itt_sync_obj);
  }
#endif /* USE_ITT_BUILD */

#if KMP_DEBUG
  if (KMP_MASTER_TID(tid)) {
    KA_TRACE(
        15,
        ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
         gtid, team_id, tid, nproc));
  }
#endif /* KMP_DEBUG */

  // TODO now, mark worker threads as done so they may be disbanded
  KMP_MB(); // Flush all pending memory write invalidates.
  KA_TRACE(10,
           ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));

  // Only the address of t_bar is formed here; team itself is not dereferenced.
  ANNOTATE_BARRIER_END(&team->t.t_bar);
}
2442 
// Fork half of the fork/join barrier for thread gtid with team-local id tid.
// The primary thread (tid 0) sets up the task team and refreshes its cached
// blocktime; every thread then runs the release routine selected by
// __kmp_barrier_release_pattern[bs_forkjoin_barrier]. After the release, if
// __kmp_global.g.g_done is set the thread leaves early; otherwise it reloads
// its team pointer, pulls ICVs when KMP_BARRIER_ICV_PULL is enabled, syncs
// with the task team, and applies affinity, display-affinity and
// default-allocator settings.
// TODO release worker threads' fork barriers as we are ready instead of all at
// once
void __kmp_fork_barrier(int gtid, int tid) {
  KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
  kmp_info_t *this_thr = __kmp_threads[gtid];
  // Only tid 0 reads th_team before the release; other threads keep team NULL
  // until the pointer is reloaded further down.
  kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
#if USE_ITT_BUILD
  void *itt_sync_obj = NULL;
#endif /* USE_ITT_BUILD */
  if (team)
    ANNOTATE_BARRIER_END(&team->t.t_bar);

  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
                (team != NULL) ? team->t.t_id : -1, tid));

  // th_team pointer only valid for primary thread here
  if (KMP_MASTER_TID(tid)) {
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
      // Create itt barrier object
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
      __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
    }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */

#ifdef KMP_DEBUG
    KMP_DEBUG_ASSERT(team);
    kmp_info_t **other_threads = team->t.t_threads;
    int i;

    // Verify state
    KMP_MB();

    // Every worker's fork b_go flag (ignoring the sleep bit) must still be in
    // its initial state, and every worker must already point at this team.
    for (i = 1; i < team->t.t_nproc; ++i) {
      KA_TRACE(500,
               ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
                "== %u.\n",
                gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
                team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
                other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
      KMP_DEBUG_ASSERT(
          (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
           ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
      KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
    }
#endif

    if (__kmp_tasking_mode != tskm_immediate_exec) {
      // 0 indicates setup current task team if nthreads > 1
      __kmp_task_team_setup(this_thr, team, 0);
    }

    /* The primary thread may have changed its blocktime between join barrier
       and fork barrier. Copy the blocktime info to the thread, where
       __kmp_wait_template() can access it when the team struct is not
       guaranteed to exist. */
    // See note about the corresponding code in __kmp_join_barrier() being
    // performance-critical
    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
#if KMP_USE_MONITOR
      this_thr->th.th_team_bt_intervals =
          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
      this_thr->th.th_team_bt_set =
          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
#else
      this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
#endif
    }
  } // primary thread

  // Release phase: dispatch on the configured fork/join release pattern, with
  // ICV propagation requested (TRUE). Linear is the fallback algorithm.
  // Note that the dist-barrier case passes NULL rather than itt_sync_obj.
  switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
  case bp_dist_bar: {
    __kmp_dist_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                               TRUE USE_ITT_BUILD_ARG(NULL));
    break;
  }
  case bp_hyper_bar: {
    KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
    __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                                TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_hierarchical_bar: {
    __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                                       TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_tree_bar: {
    KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
    __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                               TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  default: {
    __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                                 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
  }
  }

#if OMPT_SUPPORT
  // Close the implicit-barrier region that __kmp_join_barrier() opened (it
  // left the thread in ompt_state_wait_barrier_implicit).
  if (ompt_enabled.enabled &&
      this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
    int ds_tid = this_thr->th.th_info.ds.ds_tid;
    // Threads without a team pointer (team is still NULL for tid != 0) use
    // the task data saved in their own thread struct.
    ompt_data_t *task_data = (team)
                                 ? OMPT_CUR_TASK_DATA(this_thr)
                                 : &(this_thr->th.ompt_thread_info.task_data);
    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
#if OMPT_OPTIONAL
    void *codeptr = NULL;
    if (KMP_MASTER_TID(ds_tid) &&
        (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
         ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
      codeptr = team ? team->t.ompt_team_info.master_return_address : NULL;
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
          codeptr);
    }
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
          codeptr);
    }
#endif
    // Non-primary threads also report the end of their implicit task.
    if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_end, NULL, task_data, 0, ds_tid,
          ompt_task_implicit); // TODO: Can this be ompt_task_initial?
    }
  }
#endif

  // Early exit for reaping threads releasing forkjoin barrier
  if (TCR_4(__kmp_global.g.g_done)) {
    this_thr->th.th_task_team = NULL;

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
      if (!KMP_MASTER_TID(tid)) {
        itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
        if (itt_sync_obj)
          __kmp_itt_barrier_finished(gtid, itt_sync_obj);
      }
    }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
    KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
    return;
  }

  /* We can now assume that a valid team structure has been allocated by the
     primary thread and propagated to all worker threads. The current thread,
     however, may not be part of the team, so we can't blindly assume that the
     team pointer is non-null.  */
  team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
  KMP_DEBUG_ASSERT(team != NULL);
  // Re-derive tid from gtid now that the thread's team pointer was reloaded.
  tid = __kmp_tid_from_gtid(gtid);

#if KMP_BARRIER_ICV_PULL
  /* Primary thread's copy of the ICVs was set up on the implicit taskdata in
     __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's
     implicit task has this data before this function is called. We cannot
     modify __kmp_fork_call() to look at the fixed ICVs in the primary thread's
     thread struct, because it is not always the case that the threads arrays
     have been allocated when __kmp_fork_call() is executed. */
  {
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
    if (!KMP_MASTER_TID(tid)) { // primary thread already has ICVs
      // Copy the initial ICVs from the primary thread's thread struct to the
      // implicit task for this tid.
      KA_TRACE(10,
               ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
      __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
                               tid, FALSE);
      copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                &team->t.t_threads[0]
                     ->th.th_bar[bs_forkjoin_barrier]
                     .bb.th_fixed_icvs);
    }
  }
#endif // KMP_BARRIER_ICV_PULL

  if (__kmp_tasking_mode != tskm_immediate_exec) {
    __kmp_task_team_sync(this_thr, team);
  }

#if KMP_AFFINITY_SUPPORTED
  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
  if (proc_bind == proc_bind_intel) {
    // Call dynamic affinity settings
    if (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
      __kmp_balanced_affinity(this_thr, team->t.t_nproc);
    }
  } else if (proc_bind != proc_bind_false) {
    // Move the thread only if its target place differs from the current one.
    if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
      KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
                     __kmp_gtid_from_thread(this_thr),
                     this_thr->th.th_current_place));
    } else {
      __kmp_affinity_set_place(gtid);
    }
  }
#endif // KMP_AFFINITY_SUPPORTED
  // Perform the display affinity functionality
  if (__kmp_display_affinity) {
    if (team->t.t_display_affinity
#if KMP_AFFINITY_SUPPORTED
        || (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed)
#endif
    ) {
      // NULL means use the affinity-format-var ICV
      __kmp_aux_display_affinity(gtid, NULL);
      // Remember the team size and nesting level that were just displayed.
      this_thr->th.th_prev_num_threads = team->t.t_nproc;
      this_thr->th.th_prev_level = team->t.t_level;
    }
  }
  // Workers pick up the team's default allocator (written only if changed).
  if (!KMP_MASTER_TID(tid))
    KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
    if (!KMP_MASTER_TID(tid)) {
      // Get correct barrier object
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
    } // (prepare called inside barrier_release)
  }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
  ANNOTATE_BARRIER_END(&team->t.t_bar);
  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
                team->t.t_id, tid));
}
2675 
2676 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
2677                           kmp_internal_control_t *new_icvs, ident_t *loc) {
2678   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);
2679 
2680   KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
2681   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
2682 
2683 /* Primary thread's copy of the ICVs was set up on the implicit taskdata in
2684    __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's
2685    implicit task has this data before this function is called. */
2686 #if KMP_BARRIER_ICV_PULL
2687   /* Copy ICVs to primary thread's thread structure into th_fixed_icvs (which
2688      remains untouched), where all of the worker threads can access them and
2689      make their own copies after the barrier. */
2690   KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2691   // allocated at this point
2692   copy_icvs(
2693       &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
2694       new_icvs);
2695   KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
2696                 team->t.t_threads[0], team));
2697 #elif KMP_BARRIER_ICV_PUSH
2698   // The ICVs will be propagated in the fork barrier, so nothing needs to be
2699   // done here.
2700   KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
2701                 team->t.t_threads[0], team));
2702 #else
2703   // Copy the ICVs to each of the non-primary threads.  This takes O(nthreads)
2704   // time.
2705   ngo_load(new_icvs);
2706   KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2707   // allocated at this point
2708   for (int f = 1; f < new_nproc; ++f) { // Skip the primary thread
2709     // TODO: GEH - pass in better source location info since usually NULL here
2710     KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2711                   f, team->t.t_threads[f], team));
2712     __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
2713     ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
2714     KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2715                   f, team->t.t_threads[f], team));
2716   }
2717   ngo_sync();
2718 #endif // KMP_BARRIER_ICV_PULL
2719 }
2720