1 /*
2  * kmp_barrier.cpp
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 //                     The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "kmp.h"
17 #include "kmp_wait_release.h"
18 #include "kmp_stats.h"
19 #include "kmp_itt.h"
20 #include "kmp_os.h"
21 
22 
23 #if KMP_MIC
24 #include <immintrin.h>
25 #define USE_NGO_STORES 1
26 #endif // KMP_MIC
27 
28 #if KMP_MIC && USE_NGO_STORES
29 // ICV copying
30 #define ngo_load(src)            __m512d Vt = _mm512_load_pd((void *)(src))
31 #define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
32 #define ngo_store_go(dst, src)   _mm512_storenrngo_pd((void *)(dst), Vt)
33 #define ngo_sync()               __asm__ volatile ("lock; addl $0,0(%%rsp)" ::: "memory")
34 #else
35 #define ngo_load(src)            ((void)0)
36 #define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
37 #define ngo_store_go(dst, src)   KMP_MEMCPY((dst), (src), CACHE_LINE)
38 #define ngo_sync()               ((void)0)
39 #endif /* KMP_MIC && USE_NGO_STORES */
40 
41 void __kmp_print_structure(void); // Forward declaration
42 
43 // ---------------------------- Barrier Algorithms ----------------------------
44 
45 // Linear Barrier
46 static void
47 __kmp_linear_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
48                             void (*reduce)(void *, void *)
49                             USE_ITT_BUILD_ARG(void * itt_sync_obj) )
50 {
51     KMP_TIME_DEVELOPER_BLOCK(KMP_linear_gather);
52     register kmp_team_t *team = this_thr->th.th_team;
53     register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
54     register kmp_info_t **other_threads = team->t.t_threads;
55 
56     KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
57                   gtid, team->t.t_id, tid, bt));
58     KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
59 
60 #if USE_ITT_BUILD && USE_ITT_NOTIFY
61     // Barrier imbalance - save arrive time to the thread
62     if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
63         this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
64     }
65 #endif
66     // We now perform a linear reduction to signal that all of the threads have arrived.
67     if (!KMP_MASTER_TID(tid)) {
68         KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
69                       "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
70                       __kmp_gtid_from_tid(0, team), team->t.t_id, 0, &thr_bar->b_arrived,
71                       thr_bar->b_arrived, thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
72         // Mark arrival to master thread
73         /* After performing this write, a worker thread may not assume that the team is valid
74            any more - it could be deallocated by the master thread at any time. */
75         kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
76         flag.release();
77     } else {
78         register kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
79         register int nproc = this_thr->th.th_team_nproc;
80         register int i;
81         // Don't have to worry about sleep bit here or atomic since team setting
82         register kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
83 
84         // Collect all the worker team member threads.
85         for (i=1; i<nproc; ++i) {
86 #if KMP_CACHE_MANAGE
87             // Prefetch next thread's arrived count
88             if (i+1 < nproc)
89                 KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_arrived);
90 #endif /* KMP_CACHE_MANAGE */
91             KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
92                           "arrived(%p) == %llu\n", gtid, team->t.t_id, tid,
93                             __kmp_gtid_from_tid(i, team), team->t.t_id, i,
94                             &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
95 
96             // Wait for worker thread to arrive
97             kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
98             flag.wait(this_thr, FALSE
99                       USE_ITT_BUILD_ARG(itt_sync_obj) );
100 #if USE_ITT_BUILD && USE_ITT_NOTIFY
101             // Barrier imbalance - write min of the thread time and the other thread time to the thread.
102             if (__kmp_forkjoin_frames_mode == 2) {
103                 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
104                                                           other_threads[i]->th.th_bar_min_time);
105             }
106 #endif
107             if (reduce) {
108                 KA_TRACE(100, ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", gtid,
109                                team->t.t_id, tid, __kmp_gtid_from_tid(i, team), team->t.t_id, i));
110                 (*reduce)(this_thr->th.th_local.reduce_data,
111                           other_threads[i]->th.th_local.reduce_data);
112             }
113         }
114         // Don't have to worry about sleep bit here or atomic since team setting
115         team_bar->b_arrived = new_state;
116         KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
117                       gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived, new_state));
118     }
119     KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
120                   gtid, team->t.t_id, tid, bt));
121 }
122 
123 static void
124 __kmp_linear_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
125                              int propagate_icvs
126                              USE_ITT_BUILD_ARG(void *itt_sync_obj) )
127 {
128     KMP_TIME_DEVELOPER_BLOCK(KMP_linear_release);
129     register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
130     register kmp_team_t *team;
131 
132     if (KMP_MASTER_TID(tid)) {
133         register unsigned int i;
134         register kmp_uint32 nproc = this_thr->th.th_team_nproc;
135         register kmp_info_t **other_threads;
136 
137         team = __kmp_threads[gtid]->th.th_team;
138         KMP_DEBUG_ASSERT(team != NULL);
139         other_threads = team->t.t_threads;
140 
141         KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
142                       gtid, team->t.t_id, tid, bt));
143 
144         if (nproc > 1) {
145 #if KMP_BARRIER_ICV_PUSH
146             {
147                 KMP_TIME_DEVELOPER_BLOCK(USER_icv_copy);
148                 if (propagate_icvs) {
149                     ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
150                     for (i=1; i<nproc; ++i) {
151                         __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i], team, i, FALSE);
152                         ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
153                                        &team->t.t_implicit_task_taskdata[0].td_icvs);
154                     }
155                     ngo_sync();
156                 }
157             }
158 #endif // KMP_BARRIER_ICV_PUSH
159 
160             // Now, release all of the worker threads
161             for (i=1; i<nproc; ++i) {
162 #if KMP_CACHE_MANAGE
163                 // Prefetch next thread's go flag
164                 if (i+1 < nproc)
165                     KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_go);
166 #endif /* KMP_CACHE_MANAGE */
167                 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
168                               "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
169                               other_threads[i]->th.th_info.ds.ds_gtid, team->t.t_id, i,
170                               &other_threads[i]->th.th_bar[bt].bb.b_go,
171                               other_threads[i]->th.th_bar[bt].bb.b_go,
172                               other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
173                 kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go, other_threads[i]);
174                 flag.release();
175             }
176         }
177     } else { // Wait for the MASTER thread to release us
178         KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
179                       gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
180         kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
181         flag.wait(this_thr, TRUE
182                   USE_ITT_BUILD_ARG(itt_sync_obj) );
183 #if USE_ITT_BUILD && USE_ITT_NOTIFY
184         if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
185             // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is disabled)
186             itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
187             // Cancel wait on previous parallel region...
188             __kmp_itt_task_starting(itt_sync_obj);
189 
190             if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
191                 return;
192 
193             itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
194             if (itt_sync_obj != NULL)
195                 // Call prepare as early as possible for "new" barrier
196                 __kmp_itt_task_finished(itt_sync_obj);
197         } else
198 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
199         // Early exit for reaping threads releasing forkjoin barrier
200         if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) )
201             return;
202         // The worker thread may now assume that the team is valid.
203 #ifdef KMP_DEBUG
204         tid = __kmp_tid_from_gtid(gtid);
205         team = __kmp_threads[gtid]->th.th_team;
206 #endif
207         KMP_DEBUG_ASSERT(team != NULL);
208         TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
209         KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
210                       gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
211         KMP_MB();  // Flush all pending memory write invalidates.
212     }
213     KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
214                   gtid, team->t.t_id, tid, bt));
215 }
216 
217 // Tree barrier
218 static void
219 __kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
220                           void (*reduce)(void *, void *)
221                           USE_ITT_BUILD_ARG(void *itt_sync_obj) )
222 {
223     KMP_TIME_DEVELOPER_BLOCK(KMP_tree_gather);
224     register kmp_team_t *team = this_thr->th.th_team;
225     register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
226     register kmp_info_t **other_threads = team->t.t_threads;
227     register kmp_uint32 nproc = this_thr->th.th_team_nproc;
228     register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
229     register kmp_uint32 branch_factor = 1 << branch_bits;
230     register kmp_uint32 child;
231     register kmp_uint32 child_tid;
232     register kmp_uint64 new_state;
233 
234     KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
235                   gtid, team->t.t_id, tid, bt));
236     KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
237 
238 #if USE_ITT_BUILD && USE_ITT_NOTIFY
239     // Barrier imbalance - save arrive time to the thread
240     if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
241         this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
242     }
243 #endif
244     // Perform tree gather to wait until all threads have arrived; reduce any required data as we go
245     child_tid = (tid << branch_bits) + 1;
246     if (child_tid < nproc) {
247         // Parent threads wait for all their children to arrive
248         new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
249         child = 1;
250         do {
251             register kmp_info_t *child_thr = other_threads[child_tid];
252             register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
253 #if KMP_CACHE_MANAGE
254             // Prefetch next thread's arrived count
255             if (child+1 <= branch_factor && child_tid+1 < nproc)
256                 KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_arrived);
257 #endif /* KMP_CACHE_MANAGE */
258             KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
259                           "arrived(%p) == %llu\n", gtid, team->t.t_id, tid,
260                             __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
261                             &child_bar->b_arrived, new_state));
262             // Wait for child to arrive
263             kmp_flag_64 flag(&child_bar->b_arrived, new_state);
264             flag.wait(this_thr, FALSE
265                       USE_ITT_BUILD_ARG(itt_sync_obj) );
266 #if USE_ITT_BUILD && USE_ITT_NOTIFY
267             // Barrier imbalance - write min of the thread time and a child time to the thread.
268             if (__kmp_forkjoin_frames_mode == 2) {
269                 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
270                                                           child_thr->th.th_bar_min_time);
271             }
272 #endif
273             if (reduce) {
274                 KA_TRACE(100, ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
275                                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
276                                team->t.t_id, child_tid));
277                 (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
278             }
279             child++;
280             child_tid++;
281         }
282         while (child <= branch_factor && child_tid < nproc);
283     }
284 
285     if (!KMP_MASTER_TID(tid)) { // Worker threads
286         register kmp_int32 parent_tid = (tid - 1) >> branch_bits;
287 
288         KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
289                       "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
290                       __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
291                       &thr_bar->b_arrived, thr_bar->b_arrived,
292                       thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
293 
294         // Mark arrival to parent thread
295         /* After performing this write, a worker thread may not assume that the team is valid
296            any more - it could be deallocated by the master thread at any time.  */
297         kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
298         flag.release();
299     } else {
300         // Need to update the team arrived pointer if we are the master thread
301         if (nproc > 1) // New value was already computed above
302             team->t.t_bar[bt].b_arrived = new_state;
303         else
304             team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
305         KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
306                       gtid, team->t.t_id, tid, team->t.t_id,
307                       &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
308     }
309     KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
310                   gtid, team->t.t_id, tid, bt));
311 }
312 
313 static void
314 __kmp_tree_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
315                            int propagate_icvs
316                            USE_ITT_BUILD_ARG(void *itt_sync_obj) )
317 {
318     KMP_TIME_DEVELOPER_BLOCK(KMP_tree_release);
319     register kmp_team_t *team;
320     register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
321     register kmp_uint32 nproc;
322     register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
323     register kmp_uint32 branch_factor = 1 << branch_bits;
324     register kmp_uint32 child;
325     register kmp_uint32 child_tid;
326 
327     // Perform a tree release for all of the threads that have been gathered
328     if (!KMP_MASTER_TID(tid)) { // Handle fork barrier workers who aren't part of a team yet
329         KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n",
330                       gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
331         // Wait for parent thread to release us
332         kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
333         flag.wait(this_thr, TRUE
334                   USE_ITT_BUILD_ARG(itt_sync_obj) );
335 #if USE_ITT_BUILD && USE_ITT_NOTIFY
336         if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
337             // In fork barrier where we could not get the object reliably (or ITTNOTIFY is disabled)
338             itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
339             // Cancel wait on previous parallel region...
340             __kmp_itt_task_starting(itt_sync_obj);
341 
342             if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
343                 return;
344 
345             itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
346             if (itt_sync_obj != NULL)
347                 // Call prepare as early as possible for "new" barrier
348                 __kmp_itt_task_finished(itt_sync_obj);
349         } else
350 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
351         // Early exit for reaping threads releasing forkjoin barrier
352         if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
353             return;
354 
355         // The worker thread may now assume that the team is valid.
356         team = __kmp_threads[gtid]->th.th_team;
357         KMP_DEBUG_ASSERT(team != NULL);
358         tid = __kmp_tid_from_gtid(gtid);
359 
360         TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
361         KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
362                       gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
363         KMP_MB();  // Flush all pending memory write invalidates.
364     } else {
365         team = __kmp_threads[gtid]->th.th_team;
366         KMP_DEBUG_ASSERT(team != NULL);
367         KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
368                       gtid, team->t.t_id, tid, bt));
369     }
370     nproc = this_thr->th.th_team_nproc;
371     child_tid = (tid << branch_bits) + 1;
372 
373     if (child_tid < nproc) {
374         register kmp_info_t **other_threads = team->t.t_threads;
375         child = 1;
376         // Parent threads release all their children
377         do {
378             register kmp_info_t *child_thr = other_threads[child_tid];
379             register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
380 #if KMP_CACHE_MANAGE
381             // Prefetch next thread's go count
382             if (child+1 <= branch_factor && child_tid+1 < nproc)
383                 KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_go);
384 #endif /* KMP_CACHE_MANAGE */
385 
386 #if KMP_BARRIER_ICV_PUSH
387             {
388                 KMP_TIME_DEVELOPER_BLOCK(USER_icv_copy);
389                 if (propagate_icvs) {
390                     __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[child_tid],
391                                              team, child_tid, FALSE);
392                     copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
393                               &team->t.t_implicit_task_taskdata[0].td_icvs);
394                 }
395             }
396 #endif // KMP_BARRIER_ICV_PUSH
397             KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
398                           "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
399                           __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
400                           child_tid, &child_bar->b_go, child_bar->b_go,
401                           child_bar->b_go + KMP_BARRIER_STATE_BUMP));
402             // Release child from barrier
403             kmp_flag_64 flag(&child_bar->b_go, child_thr);
404             flag.release();
405             child++;
406             child_tid++;
407         }
408         while (child <= branch_factor && child_tid < nproc);
409     }
410     KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
411                   gtid, team->t.t_id, tid, bt));
412 }
413 
414 
415 // Hyper Barrier
416 static void
417 __kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
418                            void (*reduce)(void *, void *)
419                            USE_ITT_BUILD_ARG(void *itt_sync_obj) )
420 {
421     KMP_TIME_DEVELOPER_BLOCK(KMP_hyper_gather);
422     register kmp_team_t *team = this_thr->th.th_team;
423     register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
424     register kmp_info_t **other_threads = team->t.t_threads;
425     register kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
426     register kmp_uint32 num_threads = this_thr->th.th_team_nproc;
427     register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
428     register kmp_uint32 branch_factor = 1 << branch_bits;
429     register kmp_uint32 offset;
430     register kmp_uint32 level;
431 
432     KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
433                   gtid, team->t.t_id, tid, bt));
434 
435     KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
436 
437 #if USE_ITT_BUILD && USE_ITT_NOTIFY
438     // Barrier imbalance - save arrive time to the thread
439     if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
440         this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
441     }
442 #endif
443     /* Perform a hypercube-embedded tree gather to wait until all of the threads have
444        arrived, and reduce any required data as we go.  */
445     kmp_flag_64 p_flag(&thr_bar->b_arrived);
446     for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
447     {
448         register kmp_uint32 child;
449         register kmp_uint32 child_tid;
450 
451         if (((tid >> level) & (branch_factor - 1)) != 0) {
452             register kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) -1);
453 
454             KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
455                           "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
456                           __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
457                           &thr_bar->b_arrived, thr_bar->b_arrived,
458                           thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
459             // Mark arrival to parent thread
460             /* After performing this write (in the last iteration of the enclosing for loop),
461                a worker thread may not assume that the team is valid any more - it could be
462                deallocated by the master thread at any time.  */
463             p_flag.set_waiter(other_threads[parent_tid]);
464             p_flag.release();
465             break;
466         }
467 
468         // Parent threads wait for children to arrive
469         if (new_state == KMP_BARRIER_UNUSED_STATE)
470             new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
471         for (child=1, child_tid=tid+(1 << level); child<branch_factor && child_tid<num_threads;
472              child++, child_tid+=(1 << level))
473         {
474             register kmp_info_t *child_thr = other_threads[child_tid];
475             register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
476 #if KMP_CACHE_MANAGE
477             register kmp_uint32 next_child_tid = child_tid + (1 << level);
478             // Prefetch next thread's arrived count
479             if (child+1 < branch_factor && next_child_tid < num_threads)
480                 KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
481 #endif /* KMP_CACHE_MANAGE */
482             KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
483                           "arrived(%p) == %llu\n", gtid, team->t.t_id, tid,
484                           __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
485                           &child_bar->b_arrived, new_state));
486             // Wait for child to arrive
487             kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
488             c_flag.wait(this_thr, FALSE
489                         USE_ITT_BUILD_ARG(itt_sync_obj) );
490 #if USE_ITT_BUILD && USE_ITT_NOTIFY
491             // Barrier imbalance - write min of the thread time and a child time to the thread.
492             if (__kmp_forkjoin_frames_mode == 2) {
493                 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
494                                                           child_thr->th.th_bar_min_time);
495             }
496 #endif
497             if (reduce) {
498                 KA_TRACE(100, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
499                                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
500                                team->t.t_id, child_tid));
501                 (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
502             }
503         }
504     }
505 
506     if (KMP_MASTER_TID(tid)) {
507         // Need to update the team arrived pointer if we are the master thread
508         if (new_state == KMP_BARRIER_UNUSED_STATE)
509             team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
510         else
511             team->t.t_bar[bt].b_arrived = new_state;
512         KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
513                       gtid, team->t.t_id, tid, team->t.t_id,
514                       &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
515     }
516     KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
517                   gtid, team->t.t_id, tid, bt));
518 }
519 
520 // The reverse versions seem to beat the forward versions overall
521 #define KMP_REVERSE_HYPER_BAR
522 static void
523 __kmp_hyper_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
524                             int propagate_icvs
525                             USE_ITT_BUILD_ARG(void *itt_sync_obj) )
526 {
527     KMP_TIME_DEVELOPER_BLOCK(KMP_hyper_release);
528     register kmp_team_t    *team;
529     register kmp_bstate_t  *thr_bar       = & this_thr -> th.th_bar[ bt ].bb;
530     register kmp_info_t   **other_threads;
531     register kmp_uint32     num_threads;
532     register kmp_uint32     branch_bits   = __kmp_barrier_release_branch_bits[ bt ];
533     register kmp_uint32     branch_factor = 1 << branch_bits;
534     register kmp_uint32     child;
535     register kmp_uint32     child_tid;
536     register kmp_uint32     offset;
537     register kmp_uint32     level;
538 
539     /* Perform a hypercube-embedded tree release for all of the threads that have been gathered.
540        If KMP_REVERSE_HYPER_BAR is defined (default) the threads are released in the reverse
541        order of the corresponding gather, otherwise threads are released in the same order. */
542     if (KMP_MASTER_TID(tid)) { // master
543         team = __kmp_threads[gtid]->th.th_team;
544         KMP_DEBUG_ASSERT(team != NULL);
545         KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
546                       gtid, team->t.t_id, tid, bt));
547 #if KMP_BARRIER_ICV_PUSH
548         if (propagate_icvs) { // master already has ICVs in final destination; copy
549             copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
550         }
551 #endif
552     }
553     else  { // Handle fork barrier workers who aren't part of a team yet
554         KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n",
555                       gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
556         // Wait for parent thread to release us
557         kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
558         flag.wait(this_thr, TRUE
559                   USE_ITT_BUILD_ARG(itt_sync_obj) );
560 #if USE_ITT_BUILD && USE_ITT_NOTIFY
561         if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
562             // In fork barrier where we could not get the object reliably
563             itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
564             // Cancel wait on previous parallel region...
565             __kmp_itt_task_starting(itt_sync_obj);
566 
567             if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
568                 return;
569 
570             itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
571             if (itt_sync_obj != NULL)
572                 // Call prepare as early as possible for "new" barrier
573                 __kmp_itt_task_finished(itt_sync_obj);
574         } else
575 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
576         // Early exit for reaping threads releasing forkjoin barrier
577         if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
578             return;
579 
580         // The worker thread may now assume that the team is valid.
581         team = __kmp_threads[gtid]->th.th_team;
582         KMP_DEBUG_ASSERT(team != NULL);
583         tid = __kmp_tid_from_gtid(gtid);
584 
585         TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
586         KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
587                       gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
588         KMP_MB();  // Flush all pending memory write invalidates.
589     }
590     num_threads = this_thr->th.th_team_nproc;
591     other_threads = team->t.t_threads;
592 
593 #ifdef KMP_REVERSE_HYPER_BAR
594     // Count up to correct level for parent
595     for (level=0, offset=1; offset<num_threads && (((tid>>level) & (branch_factor-1)) == 0);
596          level+=branch_bits, offset<<=branch_bits);
597 
598     // Now go down from there
599     for (level-=branch_bits, offset>>=branch_bits; offset != 0;
600          level-=branch_bits, offset>>=branch_bits)
601 #else
602     // Go down the tree, level by level
603     for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
604 #endif // KMP_REVERSE_HYPER_BAR
605     {
606 #ifdef KMP_REVERSE_HYPER_BAR
607         /* Now go in reverse order through the children, highest to lowest.
608            Initial setting of child is conservative here. */
609         child = num_threads >> ((level==0)?level:level-1);
610         for (child=(child<branch_factor-1) ? child : branch_factor-1, child_tid=tid+(child<<level);
611              child>=1; child--, child_tid-=(1<<level))
612 #else
613         if (((tid >> level) & (branch_factor - 1)) != 0)
614             // No need to go lower than this, since this is the level parent would be notified
615             break;
616         // Iterate through children on this level of the tree
617         for (child=1, child_tid=tid+(1<<level); child<branch_factor && child_tid<num_threads;
618              child++, child_tid+=(1<<level))
619 #endif // KMP_REVERSE_HYPER_BAR
620         {
621             if (child_tid >= num_threads) continue;  // Child doesn't exist so keep going
622             else {
623                 register kmp_info_t *child_thr = other_threads[child_tid];
624                 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
625 #if KMP_CACHE_MANAGE
626                 register kmp_uint32 next_child_tid = child_tid - (1 << level);
627                 // Prefetch next thread's go count
628 # ifdef KMP_REVERSE_HYPER_BAR
629                 if (child-1 >= 1 && next_child_tid < num_threads)
630 # else
631                 if (child+1 < branch_factor && next_child_tid < num_threads)
632 # endif // KMP_REVERSE_HYPER_BAR
633                     KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
634 #endif /* KMP_CACHE_MANAGE */
635 
636 #if KMP_BARRIER_ICV_PUSH
637                 if (propagate_icvs) // push my fixed ICVs to my child
638                     copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
639 #endif // KMP_BARRIER_ICV_PUSH
640 
641                 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
642                               "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
643                               __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
644                               child_tid, &child_bar->b_go, child_bar->b_go,
645                               child_bar->b_go + KMP_BARRIER_STATE_BUMP));
646                 // Release child from barrier
647                 kmp_flag_64 flag(&child_bar->b_go, child_thr);
648                 flag.release();
649             }
650         }
651     }
652 #if KMP_BARRIER_ICV_PUSH
653     if (propagate_icvs && !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
654         __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
655         copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
656     }
657 #endif
658     KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
659                   gtid, team->t.t_id, tid, bt));
660 }
661 
662 // Hierarchical Barrier
663 
664 // Initialize thread barrier data
665 /* Initializes/re-initializes the hierarchical barrier data stored on a thread.  Performs the
666    minimum amount of initialization required based on how the team has changed.  Returns true if
667    leaf children will require both on-core and traditional wake-up mechanisms.  For example, if the
668    team size increases, threads already in the team will respond to on-core wakeup on their parent
669    thread, but threads newly added to the team will only be listening on the their local b_go. */
670 static bool
671 __kmp_init_hierarchical_barrier_thread(enum barrier_type bt, kmp_bstate_t *thr_bar, kmp_uint32 nproc,
672                                        int gtid, int tid, kmp_team_t *team)
673 {
674     // Checks to determine if (re-)initialization is needed
675     bool uninitialized = thr_bar->team == NULL;
676     bool team_changed = team != thr_bar->team;
677     bool team_sz_changed = nproc != thr_bar->nproc;
678     bool tid_changed = tid != thr_bar->old_tid;
679     bool retval = false;
680 
681     if (uninitialized || team_sz_changed) {
682         __kmp_get_hierarchy(nproc, thr_bar);
683     }
684 
685     if (uninitialized || team_sz_changed || tid_changed) {
686         thr_bar->my_level = thr_bar->depth-1; // default for master
687         thr_bar->parent_tid = -1; // default for master
688         if (!KMP_MASTER_TID(tid)) { // if not master, find parent thread in hierarchy
689             kmp_uint32 d=0;
690             while (d<thr_bar->depth) { // find parent based on level of thread in hierarchy, and note level
691                 kmp_uint32 rem;
692                 if (d == thr_bar->depth-2) { // reached level right below the master
693                     thr_bar->parent_tid = 0;
694                     thr_bar->my_level = d;
695                     break;
696                 }
697                 else if ((rem = tid%thr_bar->skip_per_level[d+1]) != 0) { // TODO: can we make this op faster?
698                     // thread is not a subtree root at next level, so this is max
699                     thr_bar->parent_tid = tid - rem;
700                     thr_bar->my_level = d;
701                     break;
702                 }
703                 ++d;
704             }
705         }
706         thr_bar->offset = 7-(tid-thr_bar->parent_tid-1);
707         thr_bar->old_tid = tid;
708         thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
709         thr_bar->team = team;
710         thr_bar->parent_bar = &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
711     }
712     if (uninitialized || team_changed || tid_changed) {
713         thr_bar->team = team;
714         thr_bar->parent_bar = &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
715         retval = true;
716     }
717     if (uninitialized || team_sz_changed || tid_changed) {
718         thr_bar->nproc = nproc;
719         thr_bar->leaf_kids = thr_bar->base_leaf_kids;
720         if (thr_bar->my_level == 0) thr_bar->leaf_kids=0;
721         if (thr_bar->leaf_kids && (kmp_uint32)tid+thr_bar->leaf_kids+1 > nproc)
722             thr_bar->leaf_kids = nproc - tid - 1;
723         thr_bar->leaf_state = 0;
724         for (int i=0; i<thr_bar->leaf_kids; ++i) ((char *)&(thr_bar->leaf_state))[7-i] = 1;
725     }
726     return retval;
727 }
728 
729 static void
730 __kmp_hierarchical_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr,
731                                   int gtid, int tid, void (*reduce) (void *, void *)
732                                   USE_ITT_BUILD_ARG(void * itt_sync_obj) )
733 {
734     KMP_TIME_DEVELOPER_BLOCK(KMP_hier_gather);
735     register kmp_team_t *team = this_thr->th.th_team;
736     register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
737     register kmp_uint32 nproc = this_thr->th.th_team_nproc;
738     register kmp_info_t **other_threads = team->t.t_threads;
739     register kmp_uint64 new_state;
740 
741     int level = team->t.t_level;
742 #if OMP_40_ENABLED
743     if (other_threads[0]->th.th_teams_microtask)    // are we inside the teams construct?
744         if (this_thr->th.th_teams_size.nteams > 1)
745             ++level; // level was not increased in teams construct for team_of_masters
746 #endif
747     if (level == 1) thr_bar->use_oncore_barrier = 1;
748     else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
749 
750     KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
751                   gtid, team->t.t_id, tid, bt));
752     KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
753 
754 #if USE_ITT_BUILD && USE_ITT_NOTIFY
755     // Barrier imbalance - save arrive time to the thread
756     if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
757         this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
758     }
759 #endif
760 
761     (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);
762 
763     if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
764         register kmp_int32 child_tid;
765         new_state = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
766         if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
767             if (thr_bar->leaf_kids) { // First, wait for leaf children to check-in on my b_arrived flag
768                 kmp_uint64 leaf_state = KMP_MASTER_TID(tid) ? thr_bar->b_arrived | thr_bar->leaf_state : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
769                 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting for leaf kids\n",
770                               gtid, team->t.t_id, tid));
771                 kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
772                 flag.wait(this_thr, FALSE
773                           USE_ITT_BUILD_ARG(itt_sync_obj) );
774                 if (reduce) {
775                     for (child_tid=tid+1; child_tid<=tid+thr_bar->leaf_kids; ++child_tid) {
776                         KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
777                                        gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
778                                        team->t.t_id, child_tid));
779                         (*reduce)(this_thr->th.th_local.reduce_data, other_threads[child_tid]->th.th_local.reduce_data);
780                     }
781                 }
782                 (void) KMP_TEST_THEN_AND64((volatile kmp_int64 *)&thr_bar->b_arrived, ~(thr_bar->leaf_state)); // clear leaf_state bits
783             }
784             // Next, wait for higher level children on each child's b_arrived flag
785             for (kmp_uint32 d=1; d<thr_bar->my_level; ++d) { // gather lowest level threads first, but skip 0
786                 kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
787                 if (last > nproc) last = nproc;
788                 for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
789                     register kmp_info_t *child_thr = other_threads[child_tid];
790                     register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
791                     KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
792                                   "arrived(%p) == %llu\n",
793                                   gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
794                                   team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
795                     kmp_flag_64 flag(&child_bar->b_arrived, new_state);
796                     flag.wait(this_thr, FALSE
797                               USE_ITT_BUILD_ARG(itt_sync_obj) );
798                     if (reduce) {
799                         KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
800                                        gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
801                                        team->t.t_id, child_tid));
802                         (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
803                     }
804                 }
805             }
806         }
807         else { // Blocktime is not infinite
808             for (kmp_uint32 d=0; d<thr_bar->my_level; ++d) { // Gather lowest level threads first
809                 kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
810                 if (last > nproc) last = nproc;
811                 for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
812                     register kmp_info_t *child_thr = other_threads[child_tid];
813                     register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
814                     KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
815                                   "arrived(%p) == %llu\n",
816                                   gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
817                                   team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
818                     kmp_flag_64 flag(&child_bar->b_arrived, new_state);
819                     flag.wait(this_thr, FALSE
820                               USE_ITT_BUILD_ARG(itt_sync_obj) );
821                     if (reduce) {
822                         KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
823                                        gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
824                                        team->t.t_id, child_tid));
825                         (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
826                     }
827                 }
828             }
829         }
830     }
831     // All subordinates are gathered; now release parent if not master thread
832 
833     if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
834         KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
835                       "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
836                       __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id, thr_bar->parent_tid,
837                       &thr_bar->b_arrived, thr_bar->b_arrived, thr_bar->b_arrived+KMP_BARRIER_STATE_BUMP));
838         /* Mark arrival to parent: After performing this write, a worker thread may not assume that
839            the team is valid any more - it could be deallocated by the master thread at any time. */
840         if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
841             || !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived flag; release it
842             kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
843             flag.release();
844         }
845         else { // Leaf does special release on the "offset" bits of parent's b_arrived flag
846             thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
847             kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
848             flag.set_waiter(other_threads[thr_bar->parent_tid]);
849             flag.release();
850         }
851     } else { // Master thread needs to update the team's b_arrived value
852         team->t.t_bar[bt].b_arrived = new_state;
853         KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
854                       gtid, team->t.t_id, tid, team->t.t_id, &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
855     }
856     // Is the team access below unsafe or just technically invalid?
857     KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
858                   gtid, team->t.t_id, tid, bt));
859 }
860 
861 static void
862 __kmp_hierarchical_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
863                                    int propagate_icvs
864                                    USE_ITT_BUILD_ARG(void * itt_sync_obj) )
865 {
866     KMP_TIME_DEVELOPER_BLOCK(KMP_hier_release);
867     register kmp_team_t *team;
868     register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
869     register kmp_uint32 nproc;
870     bool team_change = false; // indicates on-core barrier shouldn't be used
871 
872     if (KMP_MASTER_TID(tid)) {
873         team = __kmp_threads[gtid]->th.th_team;
874         KMP_DEBUG_ASSERT(team != NULL);
875         KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master entered barrier type %d\n",
876                       gtid, team->t.t_id, tid, bt));
877     }
878     else { // Worker threads
879         // Wait for parent thread to release me
880         if (!thr_bar->use_oncore_barrier || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
881             || thr_bar->my_level != 0 || thr_bar->team == NULL) {
882             // Use traditional method of waiting on my own b_go flag
883             thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
884             kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
885             flag.wait(this_thr, TRUE
886                       USE_ITT_BUILD_ARG(itt_sync_obj) );
887             TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
888         }
889         else { // Thread barrier data is initialized, this is a leaf, blocktime is infinite, not nested
890             // Wait on my "offset" bits on parent's b_go flag
891             thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
892             kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP, thr_bar->offset,
893                                  bt, this_thr
894                                  USE_ITT_BUILD_ARG(itt_sync_obj) );
895             flag.wait(this_thr, TRUE);
896             if (thr_bar->wait_flag == KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
897                 TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
898             }
899             else { // Reset my bits on parent's b_go flag
900                 ((char*)&(thr_bar->parent_bar->b_go))[thr_bar->offset] = 0;
901             }
902         }
903         thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
904         // Early exit for reaping threads releasing forkjoin barrier
905         if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
906             return;
907         // The worker thread may now assume that the team is valid.
908         team = __kmp_threads[gtid]->th.th_team;
909         KMP_DEBUG_ASSERT(team != NULL);
910         tid = __kmp_tid_from_gtid(gtid);
911 
912         KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
913                       gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
914         KMP_MB();  // Flush all pending memory write invalidates.
915     }
916 
917     nproc = this_thr->th.th_team_nproc;
918     int level = team->t.t_level;
919 #if OMP_40_ENABLED
920     if (team->t.t_threads[0]->th.th_teams_microtask ) {    // are we inside the teams construct?
921         if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && this_thr->th.th_teams_level == level)
922             ++level; // level was not increased in teams construct for team_of_workers
923         if( this_thr->th.th_teams_size.nteams > 1 )
924             ++level; // level was not increased in teams construct for team_of_masters
925     }
926 #endif
927     if (level == 1) thr_bar->use_oncore_barrier = 1;
928     else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
929 
930     // If the team size has increased, we still communicate with old leaves via oncore barrier.
931     unsigned short int old_leaf_kids = thr_bar->leaf_kids;
932     kmp_uint64 old_leaf_state = thr_bar->leaf_state;
933     team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);
934     // But if the entire team changes, we won't use oncore barrier at all
935     if (team_change) old_leaf_kids = 0;
936 
937 #if KMP_BARRIER_ICV_PUSH
938     if (propagate_icvs) {
939         __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
940         if (KMP_MASTER_TID(tid)) { // master already has copy in final destination; copy
941             copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
942         }
943         else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) { // optimization for inf blocktime
944             if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
945                 // leaves (on-core children) pull parent's fixed ICVs directly to local ICV store
946                 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
947                           &thr_bar->parent_bar->th_fixed_icvs);
948             // non-leaves will get ICVs piggybacked with b_go via NGO store
949         }
950         else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
951             if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can access
952                 copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
953             else // leaves copy parent's fixed ICVs directly to local ICV store
954                 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
955                           &thr_bar->parent_bar->th_fixed_icvs);
956         }
957     }
958 #endif // KMP_BARRIER_ICV_PUSH
959 
960     // Now, release my children
961     if (thr_bar->my_level) { // not a leaf
962         register kmp_int32 child_tid;
963         kmp_uint32 last;
964         if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
965             if (KMP_MASTER_TID(tid)) { // do a flat release
966                 // Set local b_go to bump children via NGO store of the cache line containing IVCs and b_go.
967                 thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
968                 // Use ngo stores if available; b_go piggybacks in the last 8 bytes of the cache line
969                 ngo_load(&thr_bar->th_fixed_icvs);
970                 // This loops over all the threads skipping only the leaf nodes in the hierarchy
971                 for (child_tid=thr_bar->skip_per_level[1]; child_tid<(int)nproc; child_tid+=thr_bar->skip_per_level[1]) {
972                     register kmp_bstate_t *child_bar = &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
973                     KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
974                                   " go(%p): %u => %u\n",
975                                   gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
976                                   team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
977                                   child_bar->b_go + KMP_BARRIER_STATE_BUMP));
978                     // Use ngo store (if available) to both store ICVs and release child via child's b_go
979                     ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
980                 }
981                 ngo_sync();
982             }
983             TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
984             // Now, release leaf children
985             if (thr_bar->leaf_kids) { // if there are any
986                 // We test team_change on the off-chance that the level 1 team changed.
987                 if (team_change || old_leaf_kids < thr_bar->leaf_kids) { // some old leaf_kids, some new
988                     if (old_leaf_kids) { // release old leaf kids
989                         thr_bar->b_go |= old_leaf_state;
990                     }
991                     // Release new leaf kids
992                     last = tid+thr_bar->skip_per_level[1];
993                     if (last > nproc) last = nproc;
994                     for (child_tid=tid+1+old_leaf_kids; child_tid<(int)last; ++child_tid) { // skip_per_level[0]=1
995                         register kmp_info_t   *child_thr = team->t.t_threads[child_tid];
996                         register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
997                         KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
998                                       " T#%d(%d:%d) go(%p): %u => %u\n",
999                                       gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1000                                       team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1001                                       child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1002                         // Release child using child's b_go flag
1003                         kmp_flag_64 flag(&child_bar->b_go, child_thr);
1004                         flag.release();
1005                     }
1006                 }
1007                 else { // Release all children at once with leaf_state bits on my own b_go flag
1008                     thr_bar->b_go |= thr_bar->leaf_state;
1009                 }
1010             }
1011         }
1012         else { // Blocktime is not infinite; do a simple hierarchical release
1013             for (int d=thr_bar->my_level-1; d>=0; --d) { // Release highest level threads first
1014                 last = tid+thr_bar->skip_per_level[d+1];
1015                 kmp_uint32 skip = thr_bar->skip_per_level[d];
1016                 if (last > nproc) last = nproc;
1017                 for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
1018                     register kmp_info_t   *child_thr = team->t.t_threads[child_tid];
1019                     register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1020                     KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
1021                                   " go(%p): %u => %u\n",
1022                                   gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1023                                   team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1024                                   child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1025                     // Release child using child's b_go flag
1026                     kmp_flag_64 flag(&child_bar->b_go, child_thr);
1027                     flag.release();
1028                 }
1029             }
1030         }
1031 #if KMP_BARRIER_ICV_PUSH
1032         if (propagate_icvs && !KMP_MASTER_TID(tid)) // non-leaves copy ICVs from fixed ICVs to local dest
1033             copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
1034 #endif // KMP_BARRIER_ICV_PUSH
1035     }
1036     KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1037                   gtid, team->t.t_id, tid, bt));
1038 }
1039 
1040 // ---------------------------- End of Barrier Algorithms ----------------------------
1041 
1042 // Internal function to do a barrier.
1043 /* If is_split is true, do a split barrier, otherwise, do a plain barrier
1044    If reduce is non-NULL, do a split reduction barrier, otherwise, do a split barrier
1045    Returns 0 if master thread, 1 if worker thread.  */
1046 int
1047 __kmp_barrier(enum barrier_type bt, int gtid, int is_split, size_t reduce_size,
1048               void *reduce_data, void (*reduce)(void *, void *))
1049 {
1050     KMP_TIME_DEVELOPER_BLOCK(KMP_barrier);
1051     KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1052     KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
1053     register int tid = __kmp_tid_from_gtid(gtid);
1054     register kmp_info_t *this_thr = __kmp_threads[gtid];
1055     register kmp_team_t *team = this_thr->th.th_team;
1056     register int status = 0;
1057     ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1058 #if OMPT_SUPPORT
1059     ompt_task_id_t my_task_id;
1060     ompt_parallel_id_t my_parallel_id;
1061 #endif
1062 
1063     KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n",
1064                   gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1065 
1066 #if OMPT_SUPPORT
1067     if (ompt_enabled) {
1068 #if OMPT_BLAME
1069         my_task_id = team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id;
1070         my_parallel_id = team->t.ompt_team_info.parallel_id;
1071 
1072 #if OMPT_TRACE
1073         if (this_thr->th.ompt_thread_info.state == ompt_state_wait_single) {
1074             if (ompt_callbacks.ompt_callback(ompt_event_single_others_end)) {
1075                 ompt_callbacks.ompt_callback(ompt_event_single_others_end)(
1076                     my_parallel_id, my_task_id);
1077             }
1078         }
1079 #endif
1080         if (ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
1081             ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
1082                 my_parallel_id, my_task_id);
1083         }
1084 #endif
1085         // It is OK to report the barrier state after the barrier begin callback.
1086         // According to the OMPT specification, a compliant implementation may
1087         // even delay reporting this state until the barrier begins to wait.
1088         this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1089     }
1090 #endif
1091 
1092     if (! team->t.t_serialized) {
1093 #if USE_ITT_BUILD
1094         // This value will be used in itt notify events below.
1095         void *itt_sync_obj = NULL;
1096 # if USE_ITT_NOTIFY
1097         if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1098             itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1099 # endif
1100 #endif /* USE_ITT_BUILD */
1101         if (__kmp_tasking_mode == tskm_extra_barrier) {
1102             __kmp_tasking_barrier(team, this_thr, gtid);
1103             KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n",
1104                           gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1105         }
1106 
1107         /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when
1108            the team struct is not guaranteed to exist. */
1109         // See note about the corresponding code in __kmp_join_barrier() being performance-critical.
1110         if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1111 #if KMP_USE_MONITOR
1112             this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1113 #endif
1114             this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1115         }
1116 
1117 #if USE_ITT_BUILD
1118         if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1119             __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1120 #endif /* USE_ITT_BUILD */
1121 #if USE_DEBUGGER
1122         // Let the debugger know: the thread arrived to the barrier and waiting.
1123         if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
1124             team->t.t_bar[bt].b_master_arrived += 1;
1125         } else {
1126             this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1127         } // if
1128 #endif /* USE_DEBUGGER */
1129         if (reduce != NULL) {
1130             //KMP_DEBUG_ASSERT( is_split == TRUE );  // #C69956
1131             this_thr->th.th_local.reduce_data = reduce_data;
1132         }
1133 
1134         if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
1135             __kmp_task_team_setup(this_thr, team, 0); // use 0 to only setup the current team if nthreads > 1
1136 
1137         switch (__kmp_barrier_gather_pattern[bt]) {
1138         case bp_hyper_bar: {
1139             KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
1140             __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid, reduce
1141                                        USE_ITT_BUILD_ARG(itt_sync_obj) );
1142             break;
1143         }
1144         case bp_hierarchical_bar: {
1145             __kmp_hierarchical_barrier_gather(bt, this_thr, gtid, tid, reduce
1146                                               USE_ITT_BUILD_ARG(itt_sync_obj));
1147             break;
1148         }
1149         case bp_tree_bar: {
1150             KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
1151             __kmp_tree_barrier_gather(bt, this_thr, gtid, tid, reduce
1152                                       USE_ITT_BUILD_ARG(itt_sync_obj) );
1153             break;
1154         }
1155         default: {
1156             __kmp_linear_barrier_gather(bt, this_thr, gtid, tid, reduce
1157                                         USE_ITT_BUILD_ARG(itt_sync_obj) );
1158         }
1159         }
1160 
1161         KMP_MB();
1162 
1163         if (KMP_MASTER_TID(tid)) {
1164             status = 0;
1165             if (__kmp_tasking_mode != tskm_immediate_exec) {
1166                 __kmp_task_team_wait(this_thr, team
1167                                      USE_ITT_BUILD_ARG(itt_sync_obj) );
1168             }
1169 #if USE_DEBUGGER
1170             // Let the debugger know: All threads are arrived and starting leaving the barrier.
1171             team->t.t_bar[bt].b_team_arrived += 1;
1172 #endif
1173 
1174 #if USE_ITT_BUILD
1175             /* TODO: In case of split reduction barrier, master thread may send acquired event early,
1176                before the final summation into the shared variable is done (final summation can be a
1177                long operation for array reductions).  */
1178             if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1179                 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1180 #endif /* USE_ITT_BUILD */
1181 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1182             // Barrier - report frame end (only if active_level == 1)
1183             if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode &&
1184 #if OMP_40_ENABLED
1185                 this_thr->th.th_teams_microtask == NULL &&
1186 #endif
1187                 team->t.t_active_level == 1)
1188             {
1189                 kmp_uint64 cur_time = __itt_get_timestamp();
1190                 kmp_info_t **other_threads = team->t.t_threads;
1191                 int nproc = this_thr->th.th_team_nproc;
1192                 int i;
1193                 switch(__kmp_forkjoin_frames_mode) {
1194                 case 1:
1195                     __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1196                     this_thr->th.th_frame_time = cur_time;
1197                     break;
1198                 case 2: // AC 2015-01-19: currently does not work for hierarchical (to be fixed)
1199                     __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
1200                     break;
1201                 case 3:
1202                     if( __itt_metadata_add_ptr ) {
1203                         // Initialize with master's wait time
1204                         kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1205                         // Set arrive time to zero to be able to check it in __kmp_invoke_task(); the same is done inside the loop below
1206                         this_thr->th.th_bar_arrive_time = 0;
1207                         for (i=1; i<nproc; ++i) {
1208                             delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
1209                             other_threads[i]->th.th_bar_arrive_time = 0;
1210                         }
1211                         __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, (kmp_uint64)( reduce != NULL));
1212                     }
1213                     __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1214                     this_thr->th.th_frame_time = cur_time;
1215                     break;
1216                 }
1217             }
1218 #endif /* USE_ITT_BUILD */
1219         } else {
1220             status = 1;
1221 #if USE_ITT_BUILD
1222             if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1223                 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1224 #endif /* USE_ITT_BUILD */
1225         }
1226         if (status == 1 || ! is_split) {
1227             switch (__kmp_barrier_release_pattern[bt]) {
1228             case bp_hyper_bar: {
1229                 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1230                 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
1231                                             USE_ITT_BUILD_ARG(itt_sync_obj) );
1232                 break;
1233             }
1234             case bp_hierarchical_bar: {
1235                 __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
1236                                                    USE_ITT_BUILD_ARG(itt_sync_obj) );
1237                 break;
1238             }
1239             case bp_tree_bar: {
1240                 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1241                 __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
1242                                            USE_ITT_BUILD_ARG(itt_sync_obj) );
1243                 break;
1244             }
1245             default: {
1246                 __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
1247                                              USE_ITT_BUILD_ARG(itt_sync_obj) );
1248             }
1249             }
1250             if (__kmp_tasking_mode != tskm_immediate_exec) {
1251                 __kmp_task_team_sync(this_thr, team);
1252             }
1253         }
1254 
1255 #if USE_ITT_BUILD
1256         /* GEH: TODO: Move this under if-condition above and also include in
1257            __kmp_end_split_barrier(). This will more accurately represent the actual release time
1258            of the threads for split barriers.  */
1259         if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1260             __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1261 #endif /* USE_ITT_BUILD */
1262     } else { // Team is serialized.
1263         status = 0;
1264         if (__kmp_tasking_mode != tskm_immediate_exec) {
1265 #if OMP_45_ENABLED
1266             if ( this_thr->th.th_task_team != NULL ) {
1267                 void *itt_sync_obj = NULL;
1268 #if USE_ITT_NOTIFY
1269                 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1270                     itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1271                     __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1272                 }
1273 #endif
1274 
1275                 KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks == TRUE);
1276                 __kmp_task_team_wait(this_thr, team
1277                                                USE_ITT_BUILD_ARG(itt_sync_obj));
1278                 __kmp_task_team_setup(this_thr, team, 0);
1279 
1280 #if USE_ITT_BUILD
1281                 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1282                     __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1283 #endif /* USE_ITT_BUILD */
1284             }
1285 #else
1286             // The task team should be NULL for serialized code (tasks will be executed immediately)
1287             KMP_DEBUG_ASSERT(team->t.t_task_team[this_thr->th.th_task_state] == NULL);
1288             KMP_DEBUG_ASSERT(this_thr->th.th_task_team == NULL);
1289 #endif
1290         }
1291     }
1292     KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1293                   gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid), status));
1294 
1295 #if OMPT_SUPPORT
1296     if (ompt_enabled) {
1297 #if OMPT_BLAME
1298         if (ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
1299             ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
1300                 my_parallel_id, my_task_id);
1301         }
1302 #endif
1303         this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1304     }
1305 #endif
1306 
1307     return status;
1308 }
1309 
1310 
1311 void
1312 __kmp_end_split_barrier(enum barrier_type bt, int gtid)
1313 {
1314     KMP_TIME_DEVELOPER_BLOCK(KMP_end_split_barrier);
1315     int tid = __kmp_tid_from_gtid(gtid);
1316     kmp_info_t *this_thr = __kmp_threads[gtid];
1317     kmp_team_t *team = this_thr->th.th_team;
1318 
1319     if (!team->t.t_serialized) {
1320         if (KMP_MASTER_GTID(gtid)) {
1321             switch (__kmp_barrier_release_pattern[bt]) {
1322             case bp_hyper_bar: {
1323                 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1324                 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
1325                                             USE_ITT_BUILD_ARG(NULL) );
1326                 break;
1327             }
1328             case bp_hierarchical_bar: {
1329                 __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
1330                                                    USE_ITT_BUILD_ARG(NULL));
1331                 break;
1332             }
1333             case bp_tree_bar: {
1334                 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1335                 __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
1336                                            USE_ITT_BUILD_ARG(NULL) );
1337                 break;
1338             }
1339             default: {
1340                 __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
1341                                              USE_ITT_BUILD_ARG(NULL) );
1342             }
1343             }
1344             if (__kmp_tasking_mode != tskm_immediate_exec) {
1345                 __kmp_task_team_sync(this_thr, team);
1346             } // if
1347         }
1348     }
1349 }
1350 
1351 
1352 void
1353 __kmp_join_barrier(int gtid)
1354 {
1355     KMP_TIME_PARTITIONED_BLOCK(OMP_fork_join_barrier);
1356     KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1357     KMP_TIME_DEVELOPER_BLOCK(KMP_join_barrier);
1358     register kmp_info_t *this_thr = __kmp_threads[gtid];
1359     register kmp_team_t *team;
1360     register kmp_uint nproc;
1361     kmp_info_t *master_thread;
1362     int tid;
1363 #ifdef KMP_DEBUG
1364     int team_id;
1365 #endif /* KMP_DEBUG */
1366 #if USE_ITT_BUILD
1367     void *itt_sync_obj = NULL;
1368 # if USE_ITT_NOTIFY
1369     if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
1370         // Get object created at fork_barrier
1371         itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1372 # endif
1373 #endif /* USE_ITT_BUILD */
1374     KMP_MB();
1375 
1376     // Get current info
1377     team = this_thr->th.th_team;
1378     nproc = this_thr->th.th_team_nproc;
1379     KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
1380     tid = __kmp_tid_from_gtid(gtid);
1381 #ifdef KMP_DEBUG
1382     team_id = team->t.t_id;
1383 #endif /* KMP_DEBUG */
1384     master_thread = this_thr->th.th_team_master;
1385 #ifdef KMP_DEBUG
1386     if (master_thread != team->t.t_threads[0]) {
1387         __kmp_print_structure();
1388     }
1389 #endif /* KMP_DEBUG */
1390     KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1391     KMP_MB();
1392 
1393     // Verify state
1394     KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1395     KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1396     KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1397     KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1398     KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n", gtid, team_id, tid));
1399 
1400 #if OMPT_SUPPORT
1401 #if OMPT_TRACE
1402     if (ompt_enabled &&
1403         ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
1404         ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
1405             team->t.ompt_team_info.parallel_id,
1406             team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1407     }
1408 #endif
1409     this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1410 #endif
1411 
1412     if (__kmp_tasking_mode == tskm_extra_barrier) {
1413         __kmp_tasking_barrier(team, this_thr, gtid);
1414         KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past taking barrier\n", gtid, team_id, tid));
1415     }
1416 # ifdef KMP_DEBUG
1417     if (__kmp_tasking_mode != tskm_immediate_exec) {
1418         KA_TRACE(20, ( "__kmp_join_barrier: T#%d, old team = %d, old task_team = %p, th_task_team = %p\n",
1419                        __kmp_gtid_from_thread(this_thr), team_id, team->t.t_task_team[this_thr->th.th_task_state],
1420                        this_thr->th.th_task_team));
1421         KMP_DEBUG_ASSERT(this_thr->th.th_task_team == team->t.t_task_team[this_thr->th.th_task_state]);
1422     }
1423 # endif /* KMP_DEBUG */
1424 
1425     /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when the
1426        team struct is not guaranteed to exist. Doing these loads causes a cache miss slows
1427        down EPCC parallel by 2x. As a workaround, we do not perform the copy if blocktime=infinite,
1428        since the values are not used by __kmp_wait_template() in that case. */
1429     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1430 #if KMP_USE_MONITOR
1431         this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1432 #endif
1433         this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1434     }
1435 
1436 #if USE_ITT_BUILD
1437     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1438         __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1439 #endif /* USE_ITT_BUILD */
1440 
1441     switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1442     case bp_hyper_bar: {
1443         KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1444         __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1445                                    USE_ITT_BUILD_ARG(itt_sync_obj) );
1446         break;
1447     }
1448     case bp_hierarchical_bar: {
1449         __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1450                                           USE_ITT_BUILD_ARG(itt_sync_obj) );
1451         break;
1452     }
1453     case bp_tree_bar: {
1454         KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1455         __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1456                                   USE_ITT_BUILD_ARG(itt_sync_obj) );
1457         break;
1458     }
1459     default: {
1460         __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1461                                     USE_ITT_BUILD_ARG(itt_sync_obj) );
1462     }
1463     }
1464 
1465     /* From this point on, the team data structure may be deallocated at any time by the
1466        master thread - it is unsafe to reference it in any of the worker threads. Any per-team
1467        data items that need to be referenced before the end of the barrier should be moved to
1468        the kmp_task_team_t structs.  */
1469     if (KMP_MASTER_TID(tid)) {
1470         if (__kmp_tasking_mode != tskm_immediate_exec) {
1471             __kmp_task_team_wait(this_thr, team
1472                                  USE_ITT_BUILD_ARG(itt_sync_obj) );
1473         }
1474 #if KMP_STATS_ENABLED
1475         // Have master thread flag the workers to indicate they are now waiting for
1476         // next parallel region, Also wake them up so they switch their timers to idle.
1477         for (int i=0; i<team->t.t_nproc; ++i) {
1478             kmp_info_t* team_thread = team->t.t_threads[i];
1479             if (team_thread == this_thr)
1480                 continue;
1481             team_thread->th.th_stats->setIdleFlag();
1482             if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && team_thread->th.th_sleep_loc != NULL)
1483                 __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread), team_thread->th.th_sleep_loc);
1484         }
1485 #endif
1486 #if USE_ITT_BUILD
1487         if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1488             __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1489 #endif /* USE_ITT_BUILD */
1490 
1491 # if USE_ITT_BUILD && USE_ITT_NOTIFY
1492         // Join barrier - report frame end
1493         if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode &&
1494 #if OMP_40_ENABLED
1495             this_thr->th.th_teams_microtask == NULL &&
1496 #endif
1497             team->t.t_active_level == 1)
1498         {
1499             kmp_uint64 cur_time = __itt_get_timestamp();
1500             ident_t * loc = team->t.t_ident;
1501             kmp_info_t **other_threads = team->t.t_threads;
1502             int nproc = this_thr->th.th_team_nproc;
1503             int i;
1504             switch(__kmp_forkjoin_frames_mode) {
1505             case 1:
1506                 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1507                 break;
1508             case 2:
1509                 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
1510                 break;
1511             case 3:
1512                 if( __itt_metadata_add_ptr ) {
1513                     // Initialize with master's wait time
1514                     kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1515                     // Set arrive time to zero to be able to check it in __kmp_invoke_task(); the same is done inside the loop below
1516                     this_thr->th.th_bar_arrive_time = 0;
1517                     for (i=1; i<nproc; ++i) {
1518                         delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
1519                         other_threads[i]->th.th_bar_arrive_time = 0;
1520                     }
1521                     __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, 0);
1522                 }
1523                 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1524                 this_thr->th.th_frame_time = cur_time;
1525                 break;
1526             }
1527         }
1528 # endif /* USE_ITT_BUILD */
1529     }
1530 #if USE_ITT_BUILD
1531     else {
1532         if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1533             __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1534     }
1535 #endif /* USE_ITT_BUILD */
1536 
1537 #if KMP_DEBUG
1538     if (KMP_MASTER_TID(tid)) {
1539         KA_TRACE(15, ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1540                       gtid, team_id, tid, nproc));
1541     }
1542 #endif /* KMP_DEBUG */
1543 
1544     // TODO now, mark worker threads as done so they may be disbanded
1545     KMP_MB(); // Flush all pending memory write invalidates.
1546     KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
1547 
1548 #if OMPT_SUPPORT
1549     if (ompt_enabled) {
1550 #if OMPT_BLAME
1551         if (ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
1552             ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
1553                 team->t.ompt_team_info.parallel_id,
1554                 team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1555         }
1556 #endif
1557 
1558         // return to default state
1559         this_thr->th.ompt_thread_info.state = ompt_state_overhead;
1560     }
1561 #endif
1562 }
1563 
1564 
1565 // TODO release worker threads' fork barriers as we are ready instead of all at once
1566 void
1567 __kmp_fork_barrier(int gtid, int tid)
1568 {
1569     KMP_TIME_PARTITIONED_BLOCK(OMP_fork_join_barrier);
1570     KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1571     KMP_TIME_DEVELOPER_BLOCK(KMP_fork_barrier);
1572     kmp_info_t *this_thr = __kmp_threads[gtid];
1573     kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
1574 #if USE_ITT_BUILD
1575     void * itt_sync_obj = NULL;
1576 #endif /* USE_ITT_BUILD */
1577 
1578     KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n",
1579                   gtid, (team != NULL) ? team->t.t_id : -1, tid));
1580 
1581     // th_team pointer only valid for master thread here
1582     if (KMP_MASTER_TID(tid)) {
1583 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1584         if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1585             // Create itt barrier object
1586             itt_sync_obj  = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
1587             __kmp_itt_barrier_middle(gtid, itt_sync_obj);  // Call acquired/releasing
1588         }
1589 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1590 
1591 #ifdef KMP_DEBUG
1592         register kmp_info_t **other_threads = team->t.t_threads;
1593         register int i;
1594 
1595         // Verify state
1596         KMP_MB();
1597 
1598         for(i=1; i<team->t.t_nproc; ++i) {
1599             KA_TRACE(500, ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go == %u.\n",
1600                            gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
1601                            team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
1602                            other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
1603             KMP_DEBUG_ASSERT((TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go)
1604                               & ~(KMP_BARRIER_SLEEP_STATE))
1605                              == KMP_INIT_BARRIER_STATE);
1606             KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
1607         }
1608 #endif
1609 
1610         if (__kmp_tasking_mode != tskm_immediate_exec) {
1611             __kmp_task_team_setup(this_thr, team, 0);  // 0 indicates setup current task team if nthreads > 1
1612         }
1613 
1614         /* The master thread may have changed its blocktime between the join barrier and the
1615            fork barrier. Copy the blocktime info to the thread, where __kmp_wait_template() can
1616            access it when the team struct is not guaranteed to exist. */
1617         // See note about the corresponding code in __kmp_join_barrier() being performance-critical
1618         if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1619 #if KMP_USE_MONITOR
1620             this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1621 #endif
1622             this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1623         }
1624     } // master
1625 
1626     switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
1627     case bp_hyper_bar: {
1628         KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1629         __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1630                                     USE_ITT_BUILD_ARG(itt_sync_obj) );
1631         break;
1632     }
1633     case bp_hierarchical_bar: {
1634         __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1635                                            USE_ITT_BUILD_ARG(itt_sync_obj) );
1636         break;
1637     }
1638     case bp_tree_bar: {
1639         KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1640         __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1641                                    USE_ITT_BUILD_ARG(itt_sync_obj) );
1642         break;
1643     }
1644     default: {
1645         __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1646                                      USE_ITT_BUILD_ARG(itt_sync_obj) );
1647     }
1648     }
1649 
1650     // Early exit for reaping threads releasing forkjoin barrier
1651     if (TCR_4(__kmp_global.g.g_done)) {
1652         this_thr->th.th_task_team = NULL;
1653 
1654 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1655         if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1656             if (!KMP_MASTER_TID(tid)) {
1657                 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1658                 if (itt_sync_obj)
1659                     __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1660             }
1661         }
1662 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1663         KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
1664         return;
1665     }
1666 
1667     /* We can now assume that a valid team structure has been allocated by the master and
1668        propagated to all worker threads. The current thread, however, may not be part of the
1669        team, so we can't blindly assume that the team pointer is non-null.  */
1670     team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
1671     KMP_DEBUG_ASSERT(team != NULL);
1672     tid = __kmp_tid_from_gtid(gtid);
1673 
1674 
1675 #if KMP_BARRIER_ICV_PULL
1676     /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1677        __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
1678        this data before this function is called. We cannot modify __kmp_fork_call() to look at
1679        the fixed ICVs in the master's thread struct, because it is not always the case that the
1680        threads arrays have been allocated when __kmp_fork_call() is executed. */
1681     {
1682         KMP_TIME_DEVELOPER_BLOCK(USER_icv_copy);
1683         if (!KMP_MASTER_TID(tid)) {  // master thread already has ICVs
1684             // Copy the initial ICVs from the master's thread struct to the implicit task for this tid.
1685             KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
1686             __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
1687             copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1688                       &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs);
1689         }
1690     }
1691 #endif // KMP_BARRIER_ICV_PULL
1692 
1693     if (__kmp_tasking_mode != tskm_immediate_exec) {
1694         __kmp_task_team_sync(this_thr, team);
1695     }
1696 
1697 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1698     kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
1699     if (proc_bind == proc_bind_intel) {
1700 #endif
1701 #if KMP_AFFINITY_SUPPORTED
1702         // Call dynamic affinity settings
1703         if(__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
1704             __kmp_balanced_affinity(tid, team->t.t_nproc);
1705         }
1706 #endif // KMP_AFFINITY_SUPPORTED
1707 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1708     }
1709     else if (proc_bind != proc_bind_false) {
1710         if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
1711             KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
1712                            __kmp_gtid_from_thread(this_thr), this_thr->th.th_current_place));
1713         }
1714         else {
1715             __kmp_affinity_set_place(gtid);
1716         }
1717     }
1718 #endif
1719 
1720 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1721     if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1722         if (!KMP_MASTER_TID(tid)) {
1723             // Get correct barrier object
1724             itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1725             __kmp_itt_barrier_finished(gtid, itt_sync_obj);  // Workers call acquired
1726         } // (prepare called inside barrier_release)
1727     }
1728 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1729     KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid, team->t.t_id, tid));
1730 }
1731 
1732 
1733 void
1734 __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, kmp_internal_control_t *new_icvs, ident_t *loc )
1735 {
1736     KMP_TIME_DEVELOPER_BLOCK(KMP_setup_icv_copy);
1737 
1738     KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
1739     KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
1740 
1741     /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1742        __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
1743        this data before this function is called. */
1744 #if KMP_BARRIER_ICV_PULL
1745     /* Copy ICVs to master's thread structure into th_fixed_icvs (which remains untouched), where
1746        all of the worker threads can access them and make their own copies after the barrier. */
1747     KMP_DEBUG_ASSERT(team->t.t_threads[0]);  // The threads arrays should be allocated at this point
1748     copy_icvs(&team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs, new_icvs);
1749     KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n",
1750                   0, team->t.t_threads[0], team));
1751 #elif KMP_BARRIER_ICV_PUSH
1752     // The ICVs will be propagated in the fork barrier, so nothing needs to be done here.
1753     KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n",
1754                   0, team->t.t_threads[0], team));
1755 #else
1756     // Copy the ICVs to each of the non-master threads.  This takes O(nthreads) time.
1757     ngo_load(new_icvs);
1758     KMP_DEBUG_ASSERT(team->t.t_threads[0]);  // The threads arrays should be allocated at this point
1759     for (int f=1; f<new_nproc; ++f) { // Skip the master thread
1760         // TODO: GEH - pass in better source location info since usually NULL here
1761         KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
1762                       f, team->t.t_threads[f], team));
1763         __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
1764         ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
1765         KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
1766                       f, team->t.t_threads[f], team));
1767     }
1768     ngo_sync();
1769 #endif // KMP_BARRIER_ICV_PULL
1770 }
1771