1 /*
2  * kmp_barrier.cpp
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 //                     The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "kmp.h"
17 #include "kmp_wait_release.h"
18 #include "kmp_stats.h"
19 #include "kmp_itt.h"
20 #include "kmp_os.h"
21 
22 
23 #if KMP_MIC
24 #include <immintrin.h>
25 #define USE_NGO_STORES 1
26 #endif // KMP_MIC
27 
28 #if KMP_MIC && USE_NGO_STORES
29 // ICV copying
30 #define ngo_load(src)            __m512d Vt = _mm512_load_pd((void *)(src))
31 #define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
32 #define ngo_store_go(dst, src)   _mm512_storenrngo_pd((void *)(dst), Vt)
33 #define ngo_sync()               __asm__ volatile ("lock; addl $0,0(%%rsp)" ::: "memory")
34 #else
35 #define ngo_load(src)            ((void)0)
36 #define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
37 #define ngo_store_go(dst, src)   KMP_MEMCPY((dst), (src), CACHE_LINE)
38 #define ngo_sync()               ((void)0)
39 #endif /* KMP_MIC && USE_NGO_STORES */
40 
41 void __kmp_print_structure(void); // Forward declaration
42 
43 // ---------------------------- Barrier Algorithms ----------------------------
44 
45 // Linear Barrier
46 static void
47 __kmp_linear_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
48                             void (*reduce)(void *, void *)
49                             USE_ITT_BUILD_ARG(void * itt_sync_obj) )
50 {
51     KMP_TIME_DEVELOPER_BLOCK(KMP_linear_gather);
52     register kmp_team_t *team = this_thr->th.th_team;
53     register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
54     register kmp_info_t **other_threads = team->t.t_threads;
55 
56     KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
57                   gtid, team->t.t_id, tid, bt));
58     KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
59 
60 #if USE_ITT_BUILD && USE_ITT_NOTIFY
61     // Barrier imbalance - save arrive time to the thread
62     if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
63         this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
64     }
65 #endif
66     // We now perform a linear reduction to signal that all of the threads have arrived.
67     if (!KMP_MASTER_TID(tid)) {
68         KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
69                       "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
70                       __kmp_gtid_from_tid(0, team), team->t.t_id, 0, &thr_bar->b_arrived,
71                       thr_bar->b_arrived, thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
72         // Mark arrival to master thread
73         /* After performing this write, a worker thread may not assume that the team is valid
74            any more - it could be deallocated by the master thread at any time. */
75         kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
76         flag.release();
77     } else {
78         register kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
79         register int nproc = this_thr->th.th_team_nproc;
80         register int i;
81         // Don't have to worry about sleep bit here or atomic since team setting
82         register kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
83 
84         // Collect all the worker team member threads.
85         for (i=1; i<nproc; ++i) {
86 #if KMP_CACHE_MANAGE
87             // Prefetch next thread's arrived count
88             if (i+1 < nproc)
89                 KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_arrived);
90 #endif /* KMP_CACHE_MANAGE */
91             KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
92                           "arrived(%p) == %llu\n", gtid, team->t.t_id, tid,
93                             __kmp_gtid_from_tid(i, team), team->t.t_id, i,
94                             &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
95 
96             // Wait for worker thread to arrive
97             kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
98             flag.wait(this_thr, FALSE
99                       USE_ITT_BUILD_ARG(itt_sync_obj) );
100 #if USE_ITT_BUILD && USE_ITT_NOTIFY
101             // Barrier imbalance - write min of the thread time and the other thread time to the thread.
102             if (__kmp_forkjoin_frames_mode == 2) {
103                 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
104                                                           other_threads[i]->th.th_bar_min_time);
105             }
106 #endif
107             if (reduce) {
108                 KA_TRACE(100, ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", gtid,
109                                team->t.t_id, tid, __kmp_gtid_from_tid(i, team), team->t.t_id, i));
110                 (*reduce)(this_thr->th.th_local.reduce_data,
111                           other_threads[i]->th.th_local.reduce_data);
112             }
113         }
114         // Don't have to worry about sleep bit here or atomic since team setting
115         team_bar->b_arrived = new_state;
116         KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
117                       gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived, new_state));
118     }
119     KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
120                   gtid, team->t.t_id, tid, bt));
121 }
122 
// Release phase of the linear barrier.  The master (optionally after pushing
// ICVs to each worker's implicit task) releases every worker's b_go flag in
// turn; workers spin/sleep on their own b_go flag and reset it once released.
static void
__kmp_linear_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                             int propagate_icvs
                             USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_linear_release);
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_team_t *team;

    if (KMP_MASTER_TID(tid)) {
        register unsigned int i;
        register kmp_uint32 nproc = this_thr->th.th_team_nproc;
        register kmp_info_t **other_threads;

        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        other_threads = team->t.t_threads;

        KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
                      gtid, team->t.t_id, tid, bt));

        if (nproc > 1) {
#if KMP_BARRIER_ICV_PUSH
            {
                KMP_TIME_DEVELOPER_BLOCK(USER_icv_copy);
                if (propagate_icvs) {
                    // Copy the master's ICVs into each worker's implicit task
                    // before releasing anyone (NGO stores on KMP_MIC).
                    ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
                    for (i=1; i<nproc; ++i) {
                        __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i], team, i, FALSE);
                        ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
                                       &team->t.t_implicit_task_taskdata[0].td_icvs);
                    }
                    // Ensure the ICV stores are globally visible before the go flags below.
                    ngo_sync();
                }
            }
#endif // KMP_BARRIER_ICV_PUSH

            // Now, release all of the worker threads
            for (i=1; i<nproc; ++i) {
#if KMP_CACHE_MANAGE
                // Prefetch next thread's go flag
                if (i+1 < nproc)
                    KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */
                KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
                              "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
                              other_threads[i]->th.th_info.ds.ds_gtid, team->t.t_id, i,
                              &other_threads[i]->th.th_bar[bt].bb.b_go,
                              other_threads[i]->th.th_bar[bt].bb.b_go,
                              other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
                // Bump worker i's go flag (and wake it if sleeping).
                kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go, other_threads[i]);
                flag.release();
            }
        }
    } else { // Wait for the MASTER thread to release us
        KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
                      gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
        kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
        flag.wait(this_thr, TRUE
                  USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
            // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is disabled)
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
            // Cancel wait on previous parallel region...
            __kmp_itt_task_starting(itt_sync_obj);

            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
            if (itt_sync_obj != NULL)
                // Call prepare as early as possible for "new" barrier
                __kmp_itt_task_finished(itt_sync_obj);
        } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        // Early exit for reaping threads releasing forkjoin barrier
        if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) )
            return;
        // The worker thread may now assume that the team is valid.
#ifdef KMP_DEBUG
        // tid/team are only needed by the debug assert and traces below,
        // so they are refreshed only in debug builds.
        tid = __kmp_tid_from_gtid(gtid);
        team = __kmp_threads[gtid]->th.th_team;
#endif
        KMP_DEBUG_ASSERT(team != NULL);
        // Reset our own go flag for the next barrier.
        TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
        KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
        KMP_MB();  // Flush all pending memory write invalidates.
    }
    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}
216 
217 // Tree barrier
// Gather phase of the tree barrier.  Thread 'tid' waits for its children
// (tids (tid<<branch_bits)+1 .. +branch_factor, bounded by nproc), reducing
// their data as they arrive, then signals its own parent; the master finally
// bumps the team arrived counter.
static void
__kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                          void (*reduce)(void *, void *)
                          USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_tree_gather);
    register kmp_team_t *team = this_thr->th.th_team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_info_t **other_threads = team->t.t_threads;
    register kmp_uint32 nproc = this_thr->th.th_team_nproc;
    register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
    register kmp_uint32 branch_factor = 1 << branch_bits;
    register kmp_uint32 child;
    register kmp_uint32 child_tid;
    // Only assigned when this thread has children; read by the master below
    // under the same nproc>1 condition.
    register kmp_uint64 new_state;

    KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Barrier imbalance - save arrive time to the thread
    if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
    }
#endif
    // Perform tree gather to wait until all threads have arrived; reduce any required data as we go
    child_tid = (tid << branch_bits) + 1;
    if (child_tid < nproc) {
        // Parent threads wait for all their children to arrive
        new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
        child = 1;
        do {
            register kmp_info_t *child_thr = other_threads[child_tid];
            register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
            // Prefetch next thread's arrived count
            if (child+1 <= branch_factor && child_tid+1 < nproc)
                KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
            KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                          "arrived(%p) == %llu\n", gtid, team->t.t_id, tid,
                            __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
                            &child_bar->b_arrived, new_state));
            // Wait for child to arrive
            kmp_flag_64 flag(&child_bar->b_arrived, new_state);
            flag.wait(this_thr, FALSE
                      USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
            // Barrier imbalance - write min of the thread time and a child time to the thread.
            if (__kmp_forkjoin_frames_mode == 2) {
                this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                                          child_thr->th.th_bar_min_time);
            }
#endif
            if (reduce) {
                KA_TRACE(100, ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                               gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                               team->t.t_id, child_tid));
                // Fold the child's contribution into this thread's reduce_data.
                (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
            }
            child++;
            child_tid++;
        }
        while (child <= branch_factor && child_tid < nproc);
    }

    if (!KMP_MASTER_TID(tid)) { // Worker threads
        register kmp_int32 parent_tid = (tid - 1) >> branch_bits;

        KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                      "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
                      __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
                      &thr_bar->b_arrived, thr_bar->b_arrived,
                      thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));

        // Mark arrival to parent thread
        /* After performing this write, a worker thread may not assume that the team is valid
           any more - it could be deallocated by the master thread at any time.  */
        kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
        flag.release();
    } else {
        // Need to update the team arrived pointer if we are the master thread
        if (nproc > 1) // New value was already computed above
            team->t.t_bar[bt].b_arrived = new_state;
        else
            team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
        KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
                      gtid, team->t.t_id, tid, team->t.t_id,
                      &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
    }
    KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}
312 
313 static void
314 __kmp_tree_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
315                            int propagate_icvs
316                            USE_ITT_BUILD_ARG(void *itt_sync_obj) )
317 {
318     KMP_TIME_DEVELOPER_BLOCK(KMP_tree_release);
319     register kmp_team_t *team;
320     register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
321     register kmp_uint32 nproc;
322     register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
323     register kmp_uint32 branch_factor = 1 << branch_bits;
324     register kmp_uint32 child;
325     register kmp_uint32 child_tid;
326 
327     // Perform a tree release for all of the threads that have been gathered
328     if (!KMP_MASTER_TID(tid)) { // Handle fork barrier workers who aren't part of a team yet
329         KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n",
330                       gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
331         // Wait for parent thread to release us
332         kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
333         flag.wait(this_thr, TRUE
334                   USE_ITT_BUILD_ARG(itt_sync_obj) );
335 #if USE_ITT_BUILD && USE_ITT_NOTIFY
336         if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
337             // In fork barrier where we could not get the object reliably (or ITTNOTIFY is disabled)
338             itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
339             // Cancel wait on previous parallel region...
340             __kmp_itt_task_starting(itt_sync_obj);
341 
342             if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
343                 return;
344 
345             itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
346             if (itt_sync_obj != NULL)
347                 // Call prepare as early as possible for "new" barrier
348                 __kmp_itt_task_finished(itt_sync_obj);
349         } else
350 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
351         // Early exit for reaping threads releasing forkjoin barrier
352         if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
353             return;
354 
355         // The worker thread may now assume that the team is valid.
356         team = __kmp_threads[gtid]->th.th_team;
357         KMP_DEBUG_ASSERT(team != NULL);
358         tid = __kmp_tid_from_gtid(gtid);
359 
360         TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
361         KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
362                       gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
363         KMP_MB();  // Flush all pending memory write invalidates.
364     } else {
365         team = __kmp_threads[gtid]->th.th_team;
366         KMP_DEBUG_ASSERT(team != NULL);
367         KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
368                       gtid, team->t.t_id, tid, bt));
369     }
370     nproc = this_thr->th.th_team_nproc;
371     child_tid = (tid << branch_bits) + 1;
372 
373     if (child_tid < nproc) {
374         register kmp_info_t **other_threads = team->t.t_threads;
375         child = 1;
376         // Parent threads release all their children
377         do {
378             register kmp_info_t *child_thr = other_threads[child_tid];
379             register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
380 #if KMP_CACHE_MANAGE
381             // Prefetch next thread's go count
382             if (child+1 <= branch_factor && child_tid+1 < nproc)
383                 KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_go);
384 #endif /* KMP_CACHE_MANAGE */
385 
386 #if KMP_BARRIER_ICV_PUSH
387             {
388                 KMP_TIME_DEVELOPER_BLOCK(USER_icv_copy);
389                 if (propagate_icvs) {
390                     __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[child_tid],
391                                              team, child_tid, FALSE);
392                     copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
393                               &team->t.t_implicit_task_taskdata[0].td_icvs);
394                 }
395             }
396 #endif // KMP_BARRIER_ICV_PUSH
397             KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
398                           "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
399                           __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
400                           child_tid, &child_bar->b_go, child_bar->b_go,
401                           child_bar->b_go + KMP_BARRIER_STATE_BUMP));
402             // Release child from barrier
403             kmp_flag_64 flag(&child_bar->b_go, child_thr);
404             flag.release();
405             child++;
406             child_tid++;
407         }
408         while (child <= branch_factor && child_tid < nproc);
409     }
410     KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
411                   gtid, team->t.t_id, tid, bt));
412 }
413 
414 
415 // Hyper Barrier
// Gather phase of the hypercube-embedded tree barrier.  At each level a
// thread either signals its parent (if its level-local tid bits are nonzero)
// and stops, or waits for its children at stride (1 << level), reducing their
// data as they arrive; the master finally updates the team arrived counter.
static void
__kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                           void (*reduce)(void *, void *)
                           USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_hyper_gather);
    register kmp_team_t *team = this_thr->th.th_team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_info_t **other_threads = team->t.t_threads;
    // Sentinel; replaced with the real target value the first time this
    // thread needs to wait for a child.
    register kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
    register kmp_uint32 num_threads = this_thr->th.th_team_nproc;
    register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
    register kmp_uint32 branch_factor = 1 << branch_bits;
    register kmp_uint32 offset;
    register kmp_uint32 level;

    KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));

    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Barrier imbalance - save arrive time to the thread
    if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
    }
#endif
    /* Perform a hypercube-embedded tree gather to wait until all of the threads have
       arrived, and reduce any required data as we go.  */
    kmp_flag_64 p_flag(&thr_bar->b_arrived);
    for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
    {
        register kmp_uint32 child;
        register kmp_uint32 child_tid;

        if (((tid >> level) & (branch_factor - 1)) != 0) {
            // This thread is a child at this level: compute the parent tid by
            // clearing the low (level + branch_bits) bits.
            register kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) -1);

            KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                          "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
                          &thr_bar->b_arrived, thr_bar->b_arrived,
                          thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
            // Mark arrival to parent thread
            /* After performing this write (in the last iteration of the enclosing for loop),
               a worker thread may not assume that the team is valid any more - it could be
               deallocated by the master thread at any time.  */
            p_flag.set_waiter(other_threads[parent_tid]);
            p_flag.release();
            break;
        }

        // Parent threads wait for children to arrive
        if (new_state == KMP_BARRIER_UNUSED_STATE)
            new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
        for (child=1, child_tid=tid+(1 << level); child<branch_factor && child_tid<num_threads;
             child++, child_tid+=(1 << level))
        {
            register kmp_info_t *child_thr = other_threads[child_tid];
            register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
            register kmp_uint32 next_child_tid = child_tid + (1 << level);
            // Prefetch next thread's arrived count
            if (child+1 < branch_factor && next_child_tid < num_threads)
                KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
            KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                          "arrived(%p) == %llu\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
                          &child_bar->b_arrived, new_state));
            // Wait for child to arrive
            kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
            c_flag.wait(this_thr, FALSE
                        USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
            // Barrier imbalance - write min of the thread time and a child time to the thread.
            if (__kmp_forkjoin_frames_mode == 2) {
                this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                                          child_thr->th.th_bar_min_time);
            }
#endif
            if (reduce) {
                KA_TRACE(100, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                               gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                               team->t.t_id, child_tid));
                // Fold the child's contribution into this thread's reduce_data.
                (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
            }
        }
    }

    if (KMP_MASTER_TID(tid)) {
        // Need to update the team arrived pointer if we are the master thread
        if (new_state == KMP_BARRIER_UNUSED_STATE)
            team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
        else
            team->t.t_bar[bt].b_arrived = new_state;
        KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
                      gtid, team->t.t_id, tid, team->t.t_id,
                      &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
    }
    KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}
519 
520 // The reverse versions seem to beat the forward versions overall
521 #define KMP_REVERSE_HYPER_BAR
522 static void
523 __kmp_hyper_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
524                             int propagate_icvs
525                             USE_ITT_BUILD_ARG(void *itt_sync_obj) )
526 {
527     KMP_TIME_DEVELOPER_BLOCK(KMP_hyper_release);
528     register kmp_team_t    *team;
529     register kmp_bstate_t  *thr_bar       = & this_thr -> th.th_bar[ bt ].bb;
530     register kmp_info_t   **other_threads;
531     register kmp_uint32     num_threads;
532     register kmp_uint32     branch_bits   = __kmp_barrier_release_branch_bits[ bt ];
533     register kmp_uint32     branch_factor = 1 << branch_bits;
534     register kmp_uint32     child;
535     register kmp_uint32     child_tid;
536     register kmp_uint32     offset;
537     register kmp_uint32     level;
538 
539     /* Perform a hypercube-embedded tree release for all of the threads that have been gathered.
540        If KMP_REVERSE_HYPER_BAR is defined (default) the threads are released in the reverse
541        order of the corresponding gather, otherwise threads are released in the same order. */
542     if (KMP_MASTER_TID(tid)) { // master
543         team = __kmp_threads[gtid]->th.th_team;
544         KMP_DEBUG_ASSERT(team != NULL);
545         KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
546                       gtid, team->t.t_id, tid, bt));
547 #if KMP_BARRIER_ICV_PUSH
548         if (propagate_icvs) { // master already has ICVs in final destination; copy
549             copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
550         }
551 #endif
552     }
553     else  { // Handle fork barrier workers who aren't part of a team yet
554         KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n",
555                       gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
556         // Wait for parent thread to release us
557         kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
558         flag.wait(this_thr, TRUE
559                   USE_ITT_BUILD_ARG(itt_sync_obj) );
560 #if USE_ITT_BUILD && USE_ITT_NOTIFY
561         if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
562             // In fork barrier where we could not get the object reliably
563             itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
564             // Cancel wait on previous parallel region...
565             __kmp_itt_task_starting(itt_sync_obj);
566 
567             if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
568                 return;
569 
570             itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
571             if (itt_sync_obj != NULL)
572                 // Call prepare as early as possible for "new" barrier
573                 __kmp_itt_task_finished(itt_sync_obj);
574         } else
575 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
576         // Early exit for reaping threads releasing forkjoin barrier
577         if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
578             return;
579 
580         // The worker thread may now assume that the team is valid.
581         team = __kmp_threads[gtid]->th.th_team;
582         KMP_DEBUG_ASSERT(team != NULL);
583         tid = __kmp_tid_from_gtid(gtid);
584 
585         TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
586         KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
587                       gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
588         KMP_MB();  // Flush all pending memory write invalidates.
589     }
590     num_threads = this_thr->th.th_team_nproc;
591     other_threads = team->t.t_threads;
592 
593 #ifdef KMP_REVERSE_HYPER_BAR
594     // Count up to correct level for parent
595     for (level=0, offset=1; offset<num_threads && (((tid>>level) & (branch_factor-1)) == 0);
596          level+=branch_bits, offset<<=branch_bits);
597 
598     // Now go down from there
599     for (level-=branch_bits, offset>>=branch_bits; offset != 0;
600          level-=branch_bits, offset>>=branch_bits)
601 #else
602     // Go down the tree, level by level
603     for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
604 #endif // KMP_REVERSE_HYPER_BAR
605     {
606 #ifdef KMP_REVERSE_HYPER_BAR
607         /* Now go in reverse order through the children, highest to lowest.
608            Initial setting of child is conservative here. */
609         child = num_threads >> ((level==0)?level:level-1);
610         for (child=(child<branch_factor-1) ? child : branch_factor-1, child_tid=tid+(child<<level);
611              child>=1; child--, child_tid-=(1<<level))
612 #else
613         if (((tid >> level) & (branch_factor - 1)) != 0)
614             // No need to go lower than this, since this is the level parent would be notified
615             break;
616         // Iterate through children on this level of the tree
617         for (child=1, child_tid=tid+(1<<level); child<branch_factor && child_tid<num_threads;
618              child++, child_tid+=(1<<level))
619 #endif // KMP_REVERSE_HYPER_BAR
620         {
621             if (child_tid >= num_threads) continue;  // Child doesn't exist so keep going
622             else {
623                 register kmp_info_t *child_thr = other_threads[child_tid];
624                 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
625 #if KMP_CACHE_MANAGE
626                 register kmp_uint32 next_child_tid = child_tid - (1 << level);
627                 // Prefetch next thread's go count
628 # ifdef KMP_REVERSE_HYPER_BAR
629                 if (child-1 >= 1 && next_child_tid < num_threads)
630 # else
631                 if (child+1 < branch_factor && next_child_tid < num_threads)
632 # endif // KMP_REVERSE_HYPER_BAR
633                     KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
634 #endif /* KMP_CACHE_MANAGE */
635 
636 #if KMP_BARRIER_ICV_PUSH
637                 if (propagate_icvs) // push my fixed ICVs to my child
638                     copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
639 #endif // KMP_BARRIER_ICV_PUSH
640 
641                 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
642                               "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
643                               __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
644                               child_tid, &child_bar->b_go, child_bar->b_go,
645                               child_bar->b_go + KMP_BARRIER_STATE_BUMP));
646                 // Release child from barrier
647                 kmp_flag_64 flag(&child_bar->b_go, child_thr);
648                 flag.release();
649             }
650         }
651     }
652 #if KMP_BARRIER_ICV_PUSH
653     if (propagate_icvs && !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
654         __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
655         copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
656     }
657 #endif
658     KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
659                   gtid, team->t.t_id, tid, bt));
660 }
661 
662 // Hierarchical Barrier
663 
664 // Initialize thread barrier data
665 /* Initializes/re-initializes the hierarchical barrier data stored on a thread.  Performs the
666    minimum amount of initialization required based on how the team has changed.  Returns true if
667    leaf children will require both on-core and traditional wake-up mechanisms.  For example, if the
668    team size increases, threads already in the team will respond to on-core wakeup on their parent
   thread, but threads newly added to the team will only be listening on their local b_go. */
/* (Re-)initialize this thread's hierarchical-barrier bookkeeping in thr_bar.
   Work is minimized by comparing the cached team/nproc/tid against the new ones.
   Returns true iff the team pointer or tid changed, i.e. leaf children may need
   both the on-core and the traditional (b_go) wake-up mechanisms. */
static bool
__kmp_init_hierarchical_barrier_thread(enum barrier_type bt, kmp_bstate_t *thr_bar, kmp_uint32 nproc,
                                       int gtid, int tid, kmp_team_t *team)
{
    // Checks to determine if (re-)initialization is needed
    bool uninitialized = thr_bar->team == NULL;
    bool team_changed = team != thr_bar->team;
    bool team_sz_changed = nproc != thr_bar->nproc;
    bool tid_changed = tid != thr_bar->old_tid;
    bool retval = false;

    if (uninitialized || team_sz_changed) {
        // Recompute depth/branching info (skip_per_level, base_leaf_kids, ...) for new size
        __kmp_get_hierarchy(nproc, thr_bar);
    }

    if (uninitialized || team_sz_changed || tid_changed) {
        thr_bar->my_level = thr_bar->depth-1; // default for master
        thr_bar->parent_tid = -1; // default for master
        if (!KMP_MASTER_TID(tid)) { // if not master, find parent thread in hierarchy
            kmp_uint32 d=0;
            while (d<thr_bar->depth) { // find parent based on level of thread in hierarchy, and note level
                kmp_uint32 rem;
                if (d == thr_bar->depth-2) { // reached level right below the master
                    thr_bar->parent_tid = 0;
                    thr_bar->my_level = d;
                    break;
                }
                else if ((rem = tid%thr_bar->skip_per_level[d+1]) != 0) { // TODO: can we make this op faster?
                    // thread is not a subtree root at next level, so this is max
                    thr_bar->parent_tid = tid - rem;
                    thr_bar->my_level = d;
                    break;
                }
                ++d;
            }
        }
        // Byte index of this thread within the parent's 64-bit flag word (see the
        // leaf_state byte writes below); bytes are filled from the high end (7) down.
        thr_bar->offset = 7-(tid-thr_bar->parent_tid-1);
        thr_bar->old_tid = tid;
        thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
    }
    if (uninitialized || team_changed || tid_changed) {
        thr_bar->team = team;
        thr_bar->parent_bar = &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
        retval = true;
    }
    if (uninitialized || team_sz_changed || tid_changed) {
        thr_bar->nproc = nproc;
        thr_bar->leaf_kids = thr_bar->base_leaf_kids;
        if (thr_bar->my_level == 0) thr_bar->leaf_kids=0; // a leaf has no leaf children
        // Clamp leaf_kids so children don't run past the end of the team
        if (thr_bar->leaf_kids && (kmp_uint32)tid+thr_bar->leaf_kids+1 > nproc)
            thr_bar->leaf_kids = nproc - tid - 1;
        // Build the bit mask of per-leaf bytes inside the 64-bit leaf_state word,
        // one byte per leaf child, filled from the high byte (index 7) downward.
        thr_bar->leaf_state = 0;
        for (int i=0; i<thr_bar->leaf_kids; ++i) ((char *)&(thr_bar->leaf_state))[7-i] = 1;
    }
    return retval;
}
726 
727 static void
728 __kmp_hierarchical_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr,
729                                   int gtid, int tid, void (*reduce) (void *, void *)
730                                   USE_ITT_BUILD_ARG(void * itt_sync_obj) )
731 {
732     KMP_TIME_DEVELOPER_BLOCK(KMP_hier_gather);
733     register kmp_team_t *team = this_thr->th.th_team;
734     register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
735     register kmp_uint32 nproc = this_thr->th.th_team_nproc;
736     register kmp_info_t **other_threads = team->t.t_threads;
737     register kmp_uint64 new_state;
738 
739     int level = team->t.t_level;
740 #if OMP_40_ENABLED
741     if (other_threads[0]->th.th_teams_microtask)    // are we inside the teams construct?
742         if (this_thr->th.th_teams_size.nteams > 1)
743             ++level; // level was not increased in teams construct for team_of_masters
744 #endif
745     if (level == 1) thr_bar->use_oncore_barrier = 1;
746     else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
747 
748     KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
749                   gtid, team->t.t_id, tid, bt));
750     KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
751 
752 #if USE_ITT_BUILD && USE_ITT_NOTIFY
753     // Barrier imbalance - save arrive time to the thread
754     if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
755         this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
756     }
757 #endif
758 
759     (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);
760 
761     if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
762         register kmp_int32 child_tid;
763         new_state = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
764         if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
765             if (thr_bar->leaf_kids) { // First, wait for leaf children to check-in on my b_arrived flag
766                 kmp_uint64 leaf_state = KMP_MASTER_TID(tid) ? thr_bar->b_arrived | thr_bar->leaf_state : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
767                 kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
768                 flag.wait(this_thr, FALSE
769                           USE_ITT_BUILD_ARG(itt_sync_obj) );
770                 if (reduce) {
771                     for (child_tid=tid+1; child_tid<=tid+thr_bar->leaf_kids; ++child_tid) {
772                         KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
773                                        gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
774                                        team->t.t_id, child_tid));
775                         (*reduce)(this_thr->th.th_local.reduce_data, other_threads[child_tid]->th.th_local.reduce_data);
776                     }
777                 }
778                 (void) KMP_TEST_THEN_AND64((volatile kmp_int64 *)&thr_bar->b_arrived, ~(thr_bar->leaf_state)); // clear leaf_state bits
779             }
780             // Next, wait for higher level children on each child's b_arrived flag
781             for (kmp_uint32 d=1; d<thr_bar->my_level; ++d) { // gather lowest level threads first, but skip 0
782                 kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
783                 if (last > nproc) last = nproc;
784                 for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
785                     register kmp_info_t *child_thr = other_threads[child_tid];
786                     register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
787                     KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
788                                   "arrived(%p) == %llu\n",
789                                   gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
790                                   team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
791                     kmp_flag_64 flag(&child_bar->b_arrived, new_state);
792                     flag.wait(this_thr, FALSE
793                               USE_ITT_BUILD_ARG(itt_sync_obj) );
794                     if (reduce) {
795                         KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
796                                        gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
797                                        team->t.t_id, child_tid));
798                         (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
799                     }
800                 }
801             }
802         }
803         else { // Blocktime is not infinite
804             for (kmp_uint32 d=0; d<thr_bar->my_level; ++d) { // Gather lowest level threads first
805                 kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
806                 if (last > nproc) last = nproc;
807                 for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
808                     register kmp_info_t *child_thr = other_threads[child_tid];
809                     register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
810                     KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
811                                   "arrived(%p) == %llu\n",
812                                   gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
813                                   team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
814                     kmp_flag_64 flag(&child_bar->b_arrived, new_state);
815                     flag.wait(this_thr, FALSE
816                               USE_ITT_BUILD_ARG(itt_sync_obj) );
817                     if (reduce) {
818                         KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
819                                        gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
820                                        team->t.t_id, child_tid));
821                         (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
822                     }
823                 }
824             }
825         }
826     }
827     // All subordinates are gathered; now release parent if not master thread
828 
829     if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
830         KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
831                       "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
832                       __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id, thr_bar->parent_tid,
833                       &thr_bar->b_arrived, thr_bar->b_arrived, thr_bar->b_arrived+KMP_BARRIER_STATE_BUMP));
834         /* Mark arrival to parent: After performing this write, a worker thread may not assume that
835            the team is valid any more - it could be deallocated by the master thread at any time. */
836         if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
837             || !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived flag; release it
838             kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
839             flag.release();
840         }
841         else { // Leaf does special release on the "offset" bits of parent's b_arrived flag
842             thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
843             kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
844             flag.set_waiter(other_threads[thr_bar->parent_tid]);
845             flag.release();
846         }
847     } else { // Master thread needs to update the team's b_arrived value
848         team->t.t_bar[bt].b_arrived = new_state;
849         KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
850                       gtid, team->t.t_id, tid, team->t.t_id, &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
851     }
852     // Is the team access below unsafe or just technically invalid?
853     KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
854                   gtid, team->t.t_id, tid, bt));
855 }
856 
/* Release phase of the hierarchical barrier.
   The master enters directly; workers first wait to be released -- either on
   their own b_go flag, or (leaf, infinite blocktime, not nested) on their
   "offset" byte of the parent's b_go flag.  After barrier data is
   (re-)initialized for any team changes, each non-leaf thread releases its
   own subtree, pushing ICVs down alongside when KMP_BARRIER_ICV_PUSH is on. */
static void
__kmp_hierarchical_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                                   int propagate_icvs
                                   USE_ITT_BUILD_ARG(void * itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_hier_release);
    register kmp_team_t *team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_uint32 nproc;
    bool team_change = false; // indicates on-core barrier shouldn't be used

    if (KMP_MASTER_TID(tid)) {
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master entered barrier type %d\n",
                      gtid, team->t.t_id, tid, bt));
    }
    else { // Worker threads
        // Wait for parent thread to release me
        if (!thr_bar->use_oncore_barrier || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
            || thr_bar->my_level != 0 || thr_bar->team == NULL) {
            // Use traditional method of waiting on my own b_go flag
            thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
            kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
            flag.wait(this_thr, TRUE
                      USE_ITT_BUILD_ARG(itt_sync_obj) );
            TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
        }
        else { // Thread barrier data is initialized, this is a leaf, blocktime is infinite, not nested
            // Wait on my "offset" bits on parent's b_go flag
            thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
            kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP, thr_bar->offset,
                                 bt, this_thr
                                 USE_ITT_BUILD_ARG(itt_sync_obj) );
            flag.wait(this_thr, TRUE);
            if (thr_bar->wait_flag == KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
                TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
            }
            else { // Reset my bits on parent's b_go flag
                ((char*)&(thr_bar->parent_bar->b_go))[thr_bar->offset] = 0;
            }
        }
        thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
        // Early exit for reaping threads releasing forkjoin barrier
        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
            return;
        // The worker thread may now assume that the team is valid.
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        tid = __kmp_tid_from_gtid(gtid);

        KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
        KMP_MB();  // Flush all pending memory write invalidates.
    }

    nproc = this_thr->th.th_team_nproc;
    int level = team->t.t_level;
#if OMP_40_ENABLED
    if (team->t.t_threads[0]->th.th_teams_microtask ) {    // are we inside the teams construct?
        if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && this_thr->th.th_teams_level == level)
            ++level; // level was not increased in teams construct for team_of_workers
        if( this_thr->th.th_teams_size.nteams > 1 )
            ++level; // level was not increased in teams construct for team_of_masters
    }
#endif
    // On-core barrier is only usable at the outermost parallel level
    if (level == 1) thr_bar->use_oncore_barrier = 1;
    else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested

    // If the team size has increased, we still communicate with old leaves via oncore barrier.
    unsigned short int old_leaf_kids = thr_bar->leaf_kids;
    kmp_uint64 old_leaf_state = thr_bar->leaf_state;
    team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);
    // But if the entire team changes, we won't use oncore barrier at all
    if (team_change) old_leaf_kids = 0;

#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs) {
        __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
        if (KMP_MASTER_TID(tid)) { // master already has copy in final destination; copy
            copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
        }
        else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) { // optimization for inf blocktime
            if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
                // leaves (on-core children) pull parent's fixed ICVs directly to local ICV store
                copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                          &thr_bar->parent_bar->th_fixed_icvs);
            // non-leaves will get ICVs piggybacked with b_go via NGO store
        }
        else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
            if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can access
                copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
            else // leaves copy parent's fixed ICVs directly to local ICV store
                copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                          &thr_bar->parent_bar->th_fixed_icvs);
        }
    }
#endif // KMP_BARRIER_ICV_PUSH

    // Now, release my children
    if (thr_bar->my_level) { // not a leaf
        register kmp_int32 child_tid;
        kmp_uint32 last;
        if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
            if (KMP_MASTER_TID(tid)) { // do a flat release
                // Set local b_go to bump children via NGO store of the cache line containing IVCs and b_go.
                thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
                // Use ngo stores if available; b_go piggybacks in the last 8 bytes of the cache line
                ngo_load(&thr_bar->th_fixed_icvs);
                // This loops over all the threads skipping only the leaf nodes in the hierarchy
                for (child_tid=thr_bar->skip_per_level[1]; child_tid<(int)nproc; child_tid+=thr_bar->skip_per_level[1]) {
                    register kmp_bstate_t *child_bar = &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
                    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
                                  " go(%p): %u => %u\n",
                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                                  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
                    // Use ngo store (if available) to both store ICVs and release child via child's b_go
                    ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
                }
                ngo_sync();
            }
            TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
            // Now, release leaf children
            if (thr_bar->leaf_kids) { // if there are any
                // We test team_change on the off-chance that the level 1 team changed.
                if (team_change || old_leaf_kids < thr_bar->leaf_kids) { // some old leaf_kids, some new
                    if (old_leaf_kids) { // release old leaf kids
                        thr_bar->b_go |= old_leaf_state;
                    }
                    // Release new leaf kids
                    last = tid+thr_bar->skip_per_level[1];
                    if (last > nproc) last = nproc;
                    for (child_tid=tid+1+old_leaf_kids; child_tid<(int)last; ++child_tid) { // skip_per_level[0]=1
                        register kmp_info_t   *child_thr = team->t.t_threads[child_tid];
                        register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
                        KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
                                      " T#%d(%d:%d) go(%p): %u => %u\n",
                                      gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                      team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                                      child_bar->b_go + KMP_BARRIER_STATE_BUMP));
                        // Release child using child's b_go flag
                        kmp_flag_64 flag(&child_bar->b_go, child_thr);
                        flag.release();
                    }
                }
                else { // Release all children at once with leaf_state bits on my own b_go flag
                    thr_bar->b_go |= thr_bar->leaf_state;
                }
            }
        }
        else { // Blocktime is not infinite; do a simple hierarchical release
            for (int d=thr_bar->my_level-1; d>=0; --d) { // Release highest level threads first
                last = tid+thr_bar->skip_per_level[d+1];
                kmp_uint32 skip = thr_bar->skip_per_level[d];
                if (last > nproc) last = nproc;
                for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
                    register kmp_info_t   *child_thr = team->t.t_threads[child_tid];
                    register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
                    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
                                  " go(%p): %u => %u\n",
                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                                  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
                    // Release child using child's b_go flag
                    kmp_flag_64 flag(&child_bar->b_go, child_thr);
                    flag.release();
                }
            }
        }
#if KMP_BARRIER_ICV_PUSH
        if (propagate_icvs && !KMP_MASTER_TID(tid)) // non-leaves copy ICVs from fixed ICVs to local dest
            copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
#endif // KMP_BARRIER_ICV_PUSH
    }
    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}
1035 
1036 // ---------------------------- End of Barrier Algorithms ----------------------------
1037 
1038 // Internal function to do a barrier.
1039 /* If is_split is true, do a split barrier, otherwise, do a plain barrier
1040    If reduce is non-NULL, do a split reduction barrier, otherwise, do a split barrier
1041    Returns 0 if master thread, 1 if worker thread.  */
1042 int
1043 __kmp_barrier(enum barrier_type bt, int gtid, int is_split, size_t reduce_size,
1044               void *reduce_data, void (*reduce)(void *, void *))
1045 {
1046     KMP_TIME_DEVELOPER_BLOCK(KMP_barrier);
1047     register int tid = __kmp_tid_from_gtid(gtid);
1048     register kmp_info_t *this_thr = __kmp_threads[gtid];
1049     register kmp_team_t *team = this_thr->th.th_team;
1050     register int status = 0;
1051     ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1052 #if OMPT_SUPPORT
1053     ompt_task_id_t my_task_id;
1054     ompt_parallel_id_t my_parallel_id;
1055 #endif
1056 
1057     KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n",
1058                   gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1059 
1060 #if OMPT_SUPPORT
1061     if (ompt_enabled) {
1062 #if OMPT_BLAME
1063         my_task_id = team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id;
1064         my_parallel_id = team->t.ompt_team_info.parallel_id;
1065 
1066 #if OMPT_TRACE
1067         if (this_thr->th.ompt_thread_info.state == ompt_state_wait_single) {
1068             if (ompt_callbacks.ompt_callback(ompt_event_single_others_end)) {
1069                 ompt_callbacks.ompt_callback(ompt_event_single_others_end)(
1070                     my_parallel_id, my_task_id);
1071             }
1072         }
1073 #endif
1074         if (ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
1075             ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
1076                 my_parallel_id, my_task_id);
1077         }
1078 #endif
1079         // It is OK to report the barrier state after the barrier begin callback.
1080         // According to the OMPT specification, a compliant implementation may
1081         // even delay reporting this state until the barrier begins to wait.
1082         this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1083     }
1084 #endif
1085 
1086     if (! team->t.t_serialized) {
1087 #if USE_ITT_BUILD
1088         // This value will be used in itt notify events below.
1089         void *itt_sync_obj = NULL;
1090 # if USE_ITT_NOTIFY
1091         if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1092             itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1093 # endif
1094 #endif /* USE_ITT_BUILD */
1095         if (__kmp_tasking_mode == tskm_extra_barrier) {
1096             __kmp_tasking_barrier(team, this_thr, gtid);
1097             KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n",
1098                           gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1099         }
1100 
1101         /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when
1102            the team struct is not guaranteed to exist. */
1103         // See note about the corresponding code in __kmp_join_barrier() being performance-critical.
1104         if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1105             this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1106             this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1107         }
1108 
1109 #if USE_ITT_BUILD
1110         if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1111             __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1112 #endif /* USE_ITT_BUILD */
1113 #if USE_DEBUGGER
1114         // Let the debugger know: the thread arrived to the barrier and waiting.
1115         if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
1116             team->t.t_bar[bt].b_master_arrived += 1;
1117         } else {
1118             this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1119         } // if
1120 #endif /* USE_DEBUGGER */
1121         if (reduce != NULL) {
1122             //KMP_DEBUG_ASSERT( is_split == TRUE );  // #C69956
1123             this_thr->th.th_local.reduce_data = reduce_data;
1124         }
1125         switch (__kmp_barrier_gather_pattern[bt]) {
1126         case bp_hyper_bar: {
1127             KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
1128             __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid, reduce
1129                                        USE_ITT_BUILD_ARG(itt_sync_obj) );
1130             break;
1131         }
1132         case bp_hierarchical_bar: {
1133             __kmp_hierarchical_barrier_gather(bt, this_thr, gtid, tid, reduce
1134                                               USE_ITT_BUILD_ARG(itt_sync_obj));
1135             break;
1136         }
1137         case bp_tree_bar: {
1138             KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
1139             __kmp_tree_barrier_gather(bt, this_thr, gtid, tid, reduce
1140                                       USE_ITT_BUILD_ARG(itt_sync_obj) );
1141             break;
1142         }
1143         default: {
1144             __kmp_linear_barrier_gather(bt, this_thr, gtid, tid, reduce
1145                                         USE_ITT_BUILD_ARG(itt_sync_obj) );
1146         }
1147         }
1148 
1149         KMP_MB();
1150 
1151         if (KMP_MASTER_TID(tid)) {
1152             status = 0;
1153             if (__kmp_tasking_mode != tskm_immediate_exec) {
1154                 __kmp_task_team_wait(this_thr, team
1155                                      USE_ITT_BUILD_ARG(itt_sync_obj) );
1156                 __kmp_task_team_setup(this_thr, team, 0, 0); // use 0,0 to only setup the current team if nthreads > 1
1157             }
1158 #if USE_DEBUGGER
1159             // Let the debugger know: All threads are arrived and starting leaving the barrier.
1160             team->t.t_bar[bt].b_team_arrived += 1;
1161 #endif
1162 
1163 #if USE_ITT_BUILD
1164             /* TODO: In case of split reduction barrier, master thread may send acquired event early,
1165                before the final summation into the shared variable is done (final summation can be a
1166                long operation for array reductions).  */
1167             if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1168                 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1169 #endif /* USE_ITT_BUILD */
1170 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1171             // Barrier - report frame end (only if active_level == 1)
1172             if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode &&
1173 #if OMP_40_ENABLED
1174                 this_thr->th.th_teams_microtask == NULL &&
1175 #endif
1176                 team->t.t_active_level == 1)
1177             {
1178                 kmp_uint64 cur_time = __itt_get_timestamp();
1179                 kmp_info_t **other_threads = team->t.t_threads;
1180                 int nproc = this_thr->th.th_team_nproc;
1181                 int i;
1182                 switch(__kmp_forkjoin_frames_mode) {
1183                 case 1:
1184                     __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1185                     this_thr->th.th_frame_time = cur_time;
1186                     break;
1187                 case 2: // AC 2015-01-19: currently does not work for hierarchical (to be fixed)
1188                     __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
1189                     break;
1190                 case 3:
1191                     if( __itt_metadata_add_ptr ) {
1192                         // Initialize with master's wait time
1193                         kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1194                         for (i=1; i<nproc; ++i) {
1195                             delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
1196                         }
1197                         __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, (kmp_uint64)( reduce != NULL));
1198                     }
1199                     __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1200                     this_thr->th.th_frame_time = cur_time;
1201                     break;
1202                 }
1203             }
1204 #endif /* USE_ITT_BUILD */
1205         } else {
1206             status = 1;
1207 #if USE_ITT_BUILD
1208             if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1209                 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1210 #endif /* USE_ITT_BUILD */
1211         }
1212         if (status == 1 || ! is_split) {
1213             switch (__kmp_barrier_release_pattern[bt]) {
1214             case bp_hyper_bar: {
1215                 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1216                 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
1217                                             USE_ITT_BUILD_ARG(itt_sync_obj) );
1218                 break;
1219             }
1220             case bp_hierarchical_bar: {
1221                 __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
1222                                                    USE_ITT_BUILD_ARG(itt_sync_obj) );
1223                 break;
1224             }
1225             case bp_tree_bar: {
1226                 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1227                 __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
1228                                            USE_ITT_BUILD_ARG(itt_sync_obj) );
1229                 break;
1230             }
1231             default: {
1232                 __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
1233                                              USE_ITT_BUILD_ARG(itt_sync_obj) );
1234             }
1235             }
1236             if (__kmp_tasking_mode != tskm_immediate_exec) {
1237                 __kmp_task_team_sync(this_thr, team);
1238             }
1239         }
1240 
1241 #if USE_ITT_BUILD
1242         /* GEH: TODO: Move this under if-condition above and also include in
1243            __kmp_end_split_barrier(). This will more accurately represent the actual release time
1244            of the threads for split barriers.  */
1245         if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1246             __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1247 #endif /* USE_ITT_BUILD */
1248     } else { // Team is serialized.
1249         status = 0;
1250         if (__kmp_tasking_mode != tskm_immediate_exec) {
1251 #if OMP_41_ENABLED
1252             if ( this_thr->th.th_task_team != NULL ) {
1253                 void *itt_sync_obj = NULL;
1254 #if USE_ITT_NOTIFY
1255                 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1256                     itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1257                     __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1258                 }
1259 #endif
1260 
1261                 KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks == TRUE);
1262                 __kmp_task_team_wait(this_thr, team
1263                                                USE_ITT_BUILD_ARG(itt_sync_obj));
1264                 __kmp_task_team_setup(this_thr, team, 0, 0);
1265 
1266 #if USE_ITT_BUILD
1267                 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1268                     __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1269 #endif /* USE_ITT_BUILD */
1270             }
1271 #else
1272             // The task team should be NULL for serialized code (tasks will be executed immediately)
1273             KMP_DEBUG_ASSERT(team->t.t_task_team[this_thr->th.th_task_state] == NULL);
1274             KMP_DEBUG_ASSERT(this_thr->th.th_task_team == NULL);
1275 #endif
1276         }
1277     }
1278     KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1279                   gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid), status));
1280 
1281 #if OMPT_SUPPORT
1282     if (ompt_enabled) {
1283 #if OMPT_BLAME
1284         if (ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
1285             ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
1286                 my_parallel_id, my_task_id);
1287         }
1288 #endif
1289         this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1290     }
1291 #endif
1292 
1293     return status;
1294 }
1295 
1296 
1297 void
1298 __kmp_end_split_barrier(enum barrier_type bt, int gtid)
1299 {
1300     KMP_TIME_DEVELOPER_BLOCK(KMP_end_split_barrier);
1301     int tid = __kmp_tid_from_gtid(gtid);
1302     kmp_info_t *this_thr = __kmp_threads[gtid];
1303     kmp_team_t *team = this_thr->th.th_team;
1304 
1305     if (!team->t.t_serialized) {
1306         if (KMP_MASTER_GTID(gtid)) {
1307             switch (__kmp_barrier_release_pattern[bt]) {
1308             case bp_hyper_bar: {
1309                 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1310                 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
1311                                             USE_ITT_BUILD_ARG(NULL) );
1312                 break;
1313             }
1314             case bp_hierarchical_bar: {
1315                 __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
1316                                                    USE_ITT_BUILD_ARG(NULL));
1317                 break;
1318             }
1319             case bp_tree_bar: {
1320                 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1321                 __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
1322                                            USE_ITT_BUILD_ARG(NULL) );
1323                 break;
1324             }
1325             default: {
1326                 __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
1327                                              USE_ITT_BUILD_ARG(NULL) );
1328             }
1329             }
1330             if (__kmp_tasking_mode != tskm_immediate_exec) {
1331                 __kmp_task_team_sync(this_thr, team);
1332             } // if
1333         }
1334     }
1335 }
1336 
1337 
1338 void
1339 __kmp_join_barrier(int gtid)
1340 {
1341     KMP_TIME_DEVELOPER_BLOCK(KMP_join_barrier);
1342     register kmp_info_t *this_thr = __kmp_threads[gtid];
1343     register kmp_team_t *team;
1344     register kmp_uint nproc;
1345     kmp_info_t *master_thread;
1346     int tid;
1347 #ifdef KMP_DEBUG
1348     int team_id;
1349 #endif /* KMP_DEBUG */
1350 #if USE_ITT_BUILD
1351     void *itt_sync_obj = NULL;
1352 # if USE_ITT_NOTIFY
1353     if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
1354         // Get object created at fork_barrier
1355         itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1356 # endif
1357 #endif /* USE_ITT_BUILD */
1358     KMP_MB();
1359 
1360     // Get current info
1361     team = this_thr->th.th_team;
1362     nproc = this_thr->th.th_team_nproc;
1363     KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
1364     tid = __kmp_tid_from_gtid(gtid);
1365 #ifdef KMP_DEBUG
1366     team_id = team->t.t_id;
1367 #endif /* KMP_DEBUG */
1368     master_thread = this_thr->th.th_team_master;
1369 #ifdef KMP_DEBUG
1370     if (master_thread != team->t.t_threads[0]) {
1371         __kmp_print_structure();
1372     }
1373 #endif /* KMP_DEBUG */
1374     KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1375     KMP_MB();
1376 
1377     // Verify state
1378     KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1379     KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1380     KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1381     KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1382     KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n", gtid, team_id, tid));
1383 
1384 #if OMPT_SUPPORT
1385 #if OMPT_TRACE
1386     if (ompt_enabled &&
1387         ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
1388         ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
1389             team->t.ompt_team_info.parallel_id,
1390             team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1391     }
1392 #endif
1393     this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1394 #endif
1395 
1396     if (__kmp_tasking_mode == tskm_extra_barrier) {
1397         __kmp_tasking_barrier(team, this_thr, gtid);
1398         KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past taking barrier\n", gtid, team_id, tid));
1399     }
1400 # ifdef KMP_DEBUG
1401     if (__kmp_tasking_mode != tskm_immediate_exec) {
1402         KA_TRACE(20, ( "__kmp_join_barrier: T#%d, old team = %d, old task_team = %p, th_task_team = %p\n",
1403                        __kmp_gtid_from_thread(this_thr), team_id, team->t.t_task_team[this_thr->th.th_task_state],
1404                        this_thr->th.th_task_team));
1405         KMP_DEBUG_ASSERT(this_thr->th.th_task_team == team->t.t_task_team[this_thr->th.th_task_state]);
1406     }
1407 # endif /* KMP_DEBUG */
1408 
1409     /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when the
1410        team struct is not guaranteed to exist. Doing these loads causes a cache miss slows
1411        down EPCC parallel by 2x. As a workaround, we do not perform the copy if blocktime=infinite,
1412        since the values are not used by __kmp_wait_template() in that case. */
1413     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1414         this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1415         this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1416     }
1417 
1418 #if USE_ITT_BUILD
1419     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1420         __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1421 #endif /* USE_ITT_BUILD */
1422 
1423     switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1424     case bp_hyper_bar: {
1425         KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1426         __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1427                                    USE_ITT_BUILD_ARG(itt_sync_obj) );
1428         break;
1429     }
1430     case bp_hierarchical_bar: {
1431         __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1432                                           USE_ITT_BUILD_ARG(itt_sync_obj) );
1433         break;
1434     }
1435     case bp_tree_bar: {
1436         KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1437         __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1438                                   USE_ITT_BUILD_ARG(itt_sync_obj) );
1439         break;
1440     }
1441     default: {
1442         __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1443                                     USE_ITT_BUILD_ARG(itt_sync_obj) );
1444     }
1445     }
1446 
1447     /* From this point on, the team data structure may be deallocated at any time by the
1448        master thread - it is unsafe to reference it in any of the worker threads. Any per-team
1449        data items that need to be referenced before the end of the barrier should be moved to
1450        the kmp_task_team_t structs.  */
1451     if (KMP_MASTER_TID(tid)) {
1452         if (__kmp_tasking_mode != tskm_immediate_exec) {
1453             // Master shouldn't call decrease_load().         // TODO: enable master threads.
1454             // Master should have th_may_decrease_load == 0.  // TODO: enable master threads.
1455             __kmp_task_team_wait(this_thr, team
1456                                  USE_ITT_BUILD_ARG(itt_sync_obj) );
1457         }
1458 #if USE_ITT_BUILD
1459         if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1460             __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1461 #endif /* USE_ITT_BUILD */
1462 
1463 # if USE_ITT_BUILD && USE_ITT_NOTIFY
1464         // Join barrier - report frame end
1465         if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode &&
1466 #if OMP_40_ENABLED
1467             this_thr->th.th_teams_microtask == NULL &&
1468 #endif
1469             team->t.t_active_level == 1)
1470         {
1471             kmp_uint64 cur_time = __itt_get_timestamp();
1472             ident_t * loc = team->t.t_ident;
1473             kmp_info_t **other_threads = team->t.t_threads;
1474             int nproc = this_thr->th.th_team_nproc;
1475             int i;
1476             switch(__kmp_forkjoin_frames_mode) {
1477             case 1:
1478                 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1479                 break;
1480             case 2:
1481                 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
1482                 break;
1483             case 3:
1484                 if( __itt_metadata_add_ptr ) {
1485                     // Initialize with master's wait time
1486                     kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1487                     for (i=1; i<nproc; ++i) {
1488                         delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
1489                     }
1490                     __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, 0);
1491                 }
1492                 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1493                 this_thr->th.th_frame_time = cur_time;
1494                 break;
1495             }
1496         }
1497 # endif /* USE_ITT_BUILD */
1498     }
1499 #if USE_ITT_BUILD
1500     else {
1501         if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1502             __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1503     }
1504 #endif /* USE_ITT_BUILD */
1505 
1506 #if KMP_DEBUG
1507     if (KMP_MASTER_TID(tid)) {
1508         KA_TRACE(15, ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1509                       gtid, team_id, tid, nproc));
1510     }
1511 #endif /* KMP_DEBUG */
1512 
1513     // TODO now, mark worker threads as done so they may be disbanded
1514     KMP_MB(); // Flush all pending memory write invalidates.
1515     KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
1516 
1517 #if OMPT_SUPPORT
1518     if (ompt_enabled) {
1519 #if OMPT_BLAME
1520         if (ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
1521             ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
1522                 team->t.ompt_team_info.parallel_id,
1523                 team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1524         }
1525 #endif
1526 
1527         // return to default state
1528         this_thr->th.ompt_thread_info.state = ompt_state_overhead;
1529     }
1530 #endif
1531 }
1532 
1533 
// TODO release worker threads' fork barriers as we are ready instead of all at once
// Fork barrier: workers park here until the master releases them into a new
// parallel region. Before the release, the master sets up the task teams and
// (under KMP_BARRIER_ICV_PULL) publishes the ICVs; after the release, each
// worker re-reads its team pointer, picks up ICVs, and syncs task teams.
void
__kmp_fork_barrier(int gtid, int tid)
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_fork_barrier);
    kmp_info_t *this_thr = __kmp_threads[gtid];
    kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
#if USE_ITT_BUILD
    void * itt_sync_obj = NULL;
#endif /* USE_ITT_BUILD */

    KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n",
                  gtid, (team != NULL) ? team->t.t_id : -1, tid));

    // th_team pointer only valid for master thread here
    if (KMP_MASTER_TID(tid)) {
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
            // Create itt barrier object
            itt_sync_obj  = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
            __kmp_itt_barrier_middle(gtid, itt_sync_obj);  // Call acquired/releasing
        }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */

#ifdef KMP_DEBUG
        register kmp_info_t **other_threads = team->t.t_threads;
        register int i;

        // Verify state
        KMP_MB();

        for(i=1; i<team->t.t_nproc; ++i) {
            KA_TRACE(500, ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go == %u.\n",
                           gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
                           team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
                           other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
            // Each worker must still be parked at the fork barrier: its b_go
            // flag (ignoring the sleep bit) is in the initial state, and it is
            // still a member of this team.
            KMP_DEBUG_ASSERT((TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go)
                              & ~(KMP_BARRIER_SLEEP_STATE))
                             == KMP_INIT_BARRIER_STATE);
            KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
        }
#endif

        if (__kmp_tasking_mode != tskm_immediate_exec) {
            __kmp_task_team_setup(this_thr, team, 1, 0);  // 1,0 indicates setup both task teams if nthreads > 1
        }

        /* The master thread may have changed its blocktime between the join barrier and the
           fork barrier. Copy the blocktime info to the thread, where __kmp_wait_template() can
           access it when the team struct is not guaranteed to exist. */
        // See note about the corresponding code in __kmp_join_barrier() being performance-critical
        if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
            this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
            this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
        }
    } // master

    // Release/wait using the configured release pattern: the master wakes the
    // workers; workers block here until released.
    switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
    case bp_hyper_bar: {
        KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
        __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
                                    USE_ITT_BUILD_ARG(itt_sync_obj) );
        break;
    }
    case bp_hierarchical_bar: {
        __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
                                           USE_ITT_BUILD_ARG(itt_sync_obj) );
        break;
    }
    case bp_tree_bar: {
        KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
        __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
                                   USE_ITT_BUILD_ARG(itt_sync_obj) );
        break;
    }
    default: {
        __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
                                     USE_ITT_BUILD_ARG(itt_sync_obj) );
    }
    }

    // Early exit for reaping threads releasing forkjoin barrier
    if (TCR_4(__kmp_global.g.g_done)) {
        if (this_thr->th.th_task_team != NULL) {
            if (KMP_MASTER_TID(tid)) {
                // Master just clears its task team pointer on shutdown.
                TCW_PTR(this_thr->th.th_task_team, NULL);
            }
            else {
                // Workers drop their reference to the task team.
                __kmp_unref_task_team(this_thr->th.th_task_team, this_thr);
            }
        }

#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
            if (!KMP_MASTER_TID(tid)) {
                itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
                if (itt_sync_obj)
                    __kmp_itt_barrier_finished(gtid, itt_sync_obj);
            }
        }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
        return;
    }

    /* We can now assume that a valid team structure has been allocated by the master and
       propagated to all worker threads. The current thread, however, may not be part of the
       team, so we can't blindly assume that the team pointer is non-null.  */
    team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
    KMP_DEBUG_ASSERT(team != NULL);
    // Refresh tid: this thread's slot in the new team may differ.
    tid = __kmp_tid_from_gtid(gtid);


#if KMP_BARRIER_ICV_PULL
    /* Master thread's copy of the ICVs was set up on the implicit taskdata in
       __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
       this data before this function is called. We cannot modify __kmp_fork_call() to look at
       the fixed ICVs in the master's thread struct, because it is not always the case that the
       threads arrays have been allocated when __kmp_fork_call() is executed. */
    {
        KMP_TIME_DEVELOPER_BLOCK(USER_icv_copy);
        if (!KMP_MASTER_TID(tid)) {  // master thread already has ICVs
            // Copy the initial ICVs from the master's thread struct to the implicit task for this tid.
            KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
            __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
            copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                      &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs);
        }
    }
#endif // KMP_BARRIER_ICV_PULL

    if (__kmp_tasking_mode != tskm_immediate_exec) {
        __kmp_task_team_sync(this_thr, team);
    }

#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
    kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
    if (proc_bind == proc_bind_intel) {
#endif
#if KMP_AFFINITY_SUPPORTED
        // Call dynamic affinity settings
        if(__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
            __kmp_balanced_affinity(tid, team->t.t_nproc);
        }
#endif // KMP_AFFINITY_SUPPORTED
#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
    }
    else if (proc_bind != proc_bind_false) {
        if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
            KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
                           __kmp_gtid_from_thread(this_thr), this_thr->th.th_current_place));
        }
        else {
            __kmp_affinity_set_place(gtid);
        }
    }
#endif

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
        if (!KMP_MASTER_TID(tid)) {
            // Get correct barrier object
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
            __kmp_itt_barrier_finished(gtid, itt_sync_obj);  // Workers call acquired
        } // (prepare called inside barrier_release)
    }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
    KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid, team->t.t_id, tid));
}
1703 
1704 
1705 void
1706 __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, kmp_internal_control_t *new_icvs, ident_t *loc )
1707 {
1708     KMP_TIME_DEVELOPER_BLOCK(KMP_setup_icv_copy);
1709 
1710     KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
1711     KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
1712 
1713     /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1714        __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
1715        this data before this function is called. */
1716 #if KMP_BARRIER_ICV_PULL
1717     /* Copy ICVs to master's thread structure into th_fixed_icvs (which remains untouched), where
1718        all of the worker threads can access them and make their own copies after the barrier. */
1719     KMP_DEBUG_ASSERT(team->t.t_threads[0]);  // The threads arrays should be allocated at this point
1720     copy_icvs(&team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs, new_icvs);
1721     KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n",
1722                   0, team->t.t_threads[0], team));
1723 #elif KMP_BARRIER_ICV_PUSH
1724     // The ICVs will be propagated in the fork barrier, so nothing needs to be done here.
1725     KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n",
1726                   0, team->t.t_threads[0], team));
1727 #else
1728     // Copy the ICVs to each of the non-master threads.  This takes O(nthreads) time.
1729     ngo_load(new_icvs);
1730     KMP_DEBUG_ASSERT(team->t.t_threads[0]);  // The threads arrays should be allocated at this point
1731     for (int f=1; f<new_nproc; ++f) { // Skip the master thread
1732         // TODO: GEH - pass in better source location info since usually NULL here
1733         KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
1734                       f, team->t.t_threads[f], team));
1735         __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
1736         ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
1737         KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
1738                       f, team->t.t_threads[f], team));
1739     }
1740     ngo_sync();
1741 #endif // KMP_BARRIER_ICV_PULL
1742 }
1743