1 /*
2  * kmp_barrier.cpp
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 //                     The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "kmp.h"
17 #include "kmp_wait_release.h"
18 #include "kmp_stats.h"
19 #include "kmp_itt.h"
20 #include "kmp_os.h"
21 
22 
23 #if KMP_MIC
24 #include <immintrin.h>
25 #define USE_NGO_STORES 1
26 #endif // KMP_MIC
27 
28 #include "tsan_annotations.h"
29 
30 #if KMP_MIC && USE_NGO_STORES
31 // ICV copying
32 #define ngo_load(src)            __m512d Vt = _mm512_load_pd((void *)(src))
33 #define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
34 #define ngo_store_go(dst, src)   _mm512_storenrngo_pd((void *)(dst), Vt)
35 #define ngo_sync()               __asm__ volatile ("lock; addl $0,0(%%rsp)" ::: "memory")
36 #else
37 #define ngo_load(src)            ((void)0)
38 #define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
39 #define ngo_store_go(dst, src)   KMP_MEMCPY((dst), (src), CACHE_LINE)
40 #define ngo_sync()               ((void)0)
41 #endif /* KMP_MIC && USE_NGO_STORES */
42 
43 void __kmp_print_structure(void); // Forward declaration
44 
45 // ---------------------------- Barrier Algorithms ----------------------------
46 
47 // Linear Barrier
// Gather phase of the linear (centralized) barrier.
// Each worker signals its own b_arrived flag; the master thread spins on every
// worker's b_arrived flag in turn, optionally combining per-thread reduction
// data via the 'reduce' callback as each worker is collected.
//
// bt       - which barrier instance (e.g. plain vs. fork/join) in th_bar[] to use
// this_thr - descriptor of the calling thread
// gtid/tid - global and team-local ids of the calling thread
// reduce   - optional callback reduce(dst, src): accumulate src's data into dst;
//            NULL when the barrier carries no reduction
static void
__kmp_linear_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                            void (*reduce)(void *, void *)
                            USE_ITT_BUILD_ARG(void * itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
    register kmp_team_t *team = this_thr->th.th_team;
    register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
    register kmp_info_t **other_threads = team->t.t_threads;

    KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
    // Sanity check: this thread's team slot must point back at itself.
    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Barrier imbalance - save arrive time to the thread
    if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
    }
#endif
    // We now perform a linear reduction to signal that all of the threads have arrived.
    if (!KMP_MASTER_TID(tid)) {
        // Worker path: signal arrival to the master and return.
        KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
                      "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
                      __kmp_gtid_from_tid(0, team), team->t.t_id, 0, &thr_bar->b_arrived,
                      thr_bar->b_arrived, thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
        // Mark arrival to master thread
        /* After performing this write, a worker thread may not assume that the team is valid
           any more - it could be deallocated by the master thread at any time. */
        kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
        flag.release();
    } else {
        // Master path: wait for every worker, in tid order, to bump its arrived flag.
        register kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
        register int nproc = this_thr->th.th_team_nproc;
        register int i;
        // Don't have to worry about sleep bit here or atomic since team setting
        register kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;

        // Collect all the worker team member threads.
        for (i=1; i<nproc; ++i) {
#if KMP_CACHE_MANAGE
            // Prefetch next thread's arrived count
            if (i+1 < nproc)
                KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
            KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                          "arrived(%p) == %llu\n", gtid, team->t.t_id, tid,
                            __kmp_gtid_from_tid(i, team), team->t.t_id, i,
                            &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));

            // Wait for worker thread to arrive
            kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
            flag.wait(this_thr, FALSE
                      USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
            // Barrier imbalance - write min of the thread time and the other thread time to the thread.
            if (__kmp_forkjoin_frames_mode == 2) {
                this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                                          other_threads[i]->th.th_bar_min_time);
            }
#endif
            if (reduce) {
                // Fold the freshly-arrived worker's reduction data into the master's.
                KA_TRACE(100, ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", gtid,
                               team->t.t_id, tid, __kmp_gtid_from_tid(i, team), team->t.t_id, i));
                ANNOTATE_REDUCE_AFTER(reduce);
                (*reduce)(this_thr->th.th_local.reduce_data,
                          other_threads[i]->th.th_local.reduce_data);
                ANNOTATE_REDUCE_BEFORE(reduce);
                ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
            }
        }
        // Don't have to worry about sleep bit here or atomic since team setting
        team_bar->b_arrived = new_state;
        KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
                      gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived, new_state));
    }
    KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}
127 
// Release phase of the linear (centralized) barrier.
// The master thread (optionally after pushing ICVs to the workers' implicit
// tasks) bumps each worker's b_go flag; each worker spins on its own b_go,
// then resets it and re-reads its team/tid (it may have been migrated while
// sleeping across a fork barrier).
//
// propagate_icvs - nonzero when the master must copy its internal control
//                  variables to every worker's implicit task (fork barrier)
static void
__kmp_linear_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                             int propagate_icvs
                             USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_team_t *team;

    if (KMP_MASTER_TID(tid)) {
        register unsigned int i;
        register kmp_uint32 nproc = this_thr->th.th_team_nproc;
        register kmp_info_t **other_threads;

        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        other_threads = team->t.t_threads;

        KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
                      gtid, team->t.t_id, tid, bt));

        if (nproc > 1) {
#if KMP_BARRIER_ICV_PUSH
            {
                KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
                if (propagate_icvs) {
                    // Copy master's ICVs into every worker's implicit task before
                    // any worker is woken, so workers see consistent ICVs on release.
                    ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
                    for (i=1; i<nproc; ++i) {
                        __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i], team, i, FALSE);
                        ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
                                       &team->t.t_implicit_task_taskdata[0].td_icvs);
                    }
                    // Ensure the non-temporal ICV stores are globally visible before the go flags.
                    ngo_sync();
                }
            }
#endif // KMP_BARRIER_ICV_PUSH

            // Now, release all of the worker threads
            for (i=1; i<nproc; ++i) {
#if KMP_CACHE_MANAGE
                // Prefetch next thread's go flag
                if (i+1 < nproc)
                    KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */
                KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
                              "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
                              other_threads[i]->th.th_info.ds.ds_gtid, team->t.t_id, i,
                              &other_threads[i]->th.th_bar[bt].bb.b_go,
                              other_threads[i]->th.th_bar[bt].bb.b_go,
                              other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
                // Bump the worker's go flag (waking it if it sleeps on the flag).
                kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go, other_threads[i]);
                flag.release();
            }
        }
    } else { // Wait for the MASTER thread to release us
        KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
                      gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
        kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
        flag.wait(this_thr, TRUE
                  USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
            // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is disabled)
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
            // Cancel wait on previous parallel region...
            __kmp_itt_task_starting(itt_sync_obj);

            // Early exit if the runtime is shutting down.
            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
            if (itt_sync_obj != NULL)
                // Call prepare as early as possible for "new" barrier
                __kmp_itt_task_finished(itt_sync_obj);
        } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        // Early exit for reaping threads releasing forkjoin barrier
        // (NOTE: the 'if' below binds to the 'else' above when ITT is compiled in.)
        if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) )
            return;
        // The worker thread may now assume that the team is valid.
#ifdef KMP_DEBUG
        // Refresh tid/team for the trace/assert below; the thread may have been
        // reassigned while it waited.
        tid = __kmp_tid_from_gtid(gtid);
        team = __kmp_threads[gtid]->th.th_team;
#endif
        KMP_DEBUG_ASSERT(team != NULL);
        // Re-arm our own go flag for the next barrier episode.
        TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
        KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
        KMP_MB();  // Flush all pending memory write invalidates.
    }
    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}
221 
222 // Tree barrier
// Gather phase of the tree barrier.
// Threads form an implicit tree with fan-in 'branch_factor'
// (= 1 << __kmp_barrier_gather_branch_bits[bt]); children of tid are
// (tid << branch_bits) + 1 .. + branch_factor. Each parent waits for all of
// its children (reducing data as they arrive), then signals its own parent.
static void
__kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                          void (*reduce)(void *, void *)
                          USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
    register kmp_team_t *team = this_thr->th.th_team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_info_t **other_threads = team->t.t_threads;
    register kmp_uint32 nproc = this_thr->th.th_team_nproc;
    register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
    register kmp_uint32 branch_factor = 1 << branch_bits;
    register kmp_uint32 child;
    register kmp_uint32 child_tid;
    // Only written when this thread has children; read by master afterward.
    register kmp_uint64 new_state;

    KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Barrier imbalance - save arrive time to the thread
    if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
    }
#endif
    // Perform tree gather to wait until all threads have arrived; reduce any required data as we go
    child_tid = (tid << branch_bits) + 1;   // first child of this tid (if any)
    if (child_tid < nproc) {
        // Parent threads wait for all their children to arrive
        new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
        child = 1;
        do {
            register kmp_info_t *child_thr = other_threads[child_tid];
            register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
            // Prefetch next thread's arrived count
            if (child+1 <= branch_factor && child_tid+1 < nproc)
                KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
            KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                          "arrived(%p) == %llu\n", gtid, team->t.t_id, tid,
                            __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
                            &child_bar->b_arrived, new_state));
            // Wait for child to arrive
            kmp_flag_64 flag(&child_bar->b_arrived, new_state);
            flag.wait(this_thr, FALSE
                      USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
            // Barrier imbalance - write min of the thread time and a child time to the thread.
            if (__kmp_forkjoin_frames_mode == 2) {
                this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                                          child_thr->th.th_bar_min_time);
            }
#endif
            if (reduce) {
                // Fold this child's reduction data into ours before moving on.
                KA_TRACE(100, ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                               gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                               team->t.t_id, child_tid));
                ANNOTATE_REDUCE_AFTER(reduce);
                (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
                ANNOTATE_REDUCE_BEFORE(reduce);
                ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
            }
            child++;
            child_tid++;
        }
        while (child <= branch_factor && child_tid < nproc);
    }

    if (!KMP_MASTER_TID(tid)) { // Worker threads
        // Parent in the tree is obtained by clearing our low branch_bits (minus 1).
        register kmp_int32 parent_tid = (tid - 1) >> branch_bits;

        KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                      "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
                      __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
                      &thr_bar->b_arrived, thr_bar->b_arrived,
                      thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));

        // Mark arrival to parent thread
        /* After performing this write, a worker thread may not assume that the team is valid
           any more - it could be deallocated by the master thread at any time.  */
        kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
        flag.release();
    } else {
        // Need to update the team arrived pointer if we are the master thread
        if (nproc > 1) // New value was already computed above
            team->t.t_bar[bt].b_arrived = new_state;
        else
            team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
        KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
                      gtid, team->t.t_id, tid, team->t.t_id,
                      &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
    }
    KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}
320 
// Release phase of the tree barrier.
// Workers first wait on their own b_go flag; once released (or if master),
// each thread releases its children in the release tree, optionally copying
// ICVs to each child's implicit task just before waking it.
//
// propagate_icvs - nonzero when ICVs must be pushed down the tree (fork barrier)
static void
__kmp_tree_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                           int propagate_icvs
                           USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
    register kmp_team_t *team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_uint32 nproc;
    register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
    register kmp_uint32 branch_factor = 1 << branch_bits;
    register kmp_uint32 child;
    register kmp_uint32 child_tid;

    // Perform a tree release for all of the threads that have been gathered
    if (!KMP_MASTER_TID(tid)) { // Handle fork barrier workers who aren't part of a team yet
        KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n",
                      gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
        // Wait for parent thread to release us
        kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
        flag.wait(this_thr, TRUE
                  USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
            // In fork barrier where we could not get the object reliably (or ITTNOTIFY is disabled)
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
            // Cancel wait on previous parallel region...
            __kmp_itt_task_starting(itt_sync_obj);

            // Early exit if the runtime is shutting down.
            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
            if (itt_sync_obj != NULL)
                // Call prepare as early as possible for "new" barrier
                __kmp_itt_task_finished(itt_sync_obj);
        } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        // Early exit for reaping threads releasing forkjoin barrier
        // (NOTE: the 'if' below binds to the 'else' above when ITT is compiled in.)
        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
            return;

        // The worker thread may now assume that the team is valid.
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        // Refresh tid: the thread may have joined a different team while waiting.
        tid = __kmp_tid_from_gtid(gtid);

        // Re-arm our own go flag for the next barrier episode.
        TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
        KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
        KMP_MB();  // Flush all pending memory write invalidates.
    } else {
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
                      gtid, team->t.t_id, tid, bt));
    }
    nproc = this_thr->th.th_team_nproc;
    child_tid = (tid << branch_bits) + 1;   // first child of this tid (if any)

    if (child_tid < nproc) {
        register kmp_info_t **other_threads = team->t.t_threads;
        child = 1;
        // Parent threads release all their children
        do {
            register kmp_info_t *child_thr = other_threads[child_tid];
            register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
            // Prefetch next thread's go count
            if (child+1 <= branch_factor && child_tid+1 < nproc)
                KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */

#if KMP_BARRIER_ICV_PUSH
            {
                KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
                if (propagate_icvs) {
                    // Give the child a fresh implicit task and copy master's ICVs
                    // into it before the child is woken.
                    __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[child_tid],
                                             team, child_tid, FALSE);
                    copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
                              &team->t.t_implicit_task_taskdata[0].td_icvs);
                }
            }
#endif // KMP_BARRIER_ICV_PUSH
            KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
                          "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                          child_tid, &child_bar->b_go, child_bar->b_go,
                          child_bar->b_go + KMP_BARRIER_STATE_BUMP));
            // Release child from barrier
            kmp_flag_64 flag(&child_bar->b_go, child_thr);
            flag.release();
            child++;
            child_tid++;
        }
        while (child <= branch_factor && child_tid < nproc);
    }
    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}
421 
422 
423 // Hyper Barrier
// Gather phase of the hypercube-embedded tree (hyper) barrier.
// At each level, threads whose low bits (after shifting by 'level') are
// nonzero signal their parent and stop; the rest wait for up to
// branch_factor-1 children spaced (1 << level) tids apart, reducing data as
// children arrive, then move to the next level.
static void
__kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                           void (*reduce)(void *, void *)
                           USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
    register kmp_team_t *team = this_thr->th.th_team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_info_t **other_threads = team->t.t_threads;
    // Sentinel: replaced by the real bumped value the first time we act as a parent.
    register kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
    register kmp_uint32 num_threads = this_thr->th.th_team_nproc;
    register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
    register kmp_uint32 branch_factor = 1 << branch_bits;
    register kmp_uint32 offset;
    register kmp_uint32 level;

    KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));

    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Barrier imbalance - save arrive time to the thread
    if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
    }
#endif
    /* Perform a hypercube-embedded tree gather to wait until all of the threads have
       arrived, and reduce any required data as we go.  */
    kmp_flag_64 p_flag(&thr_bar->b_arrived);
    for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
    {
        register kmp_uint32 child;
        register kmp_uint32 child_tid;

        if (((tid >> level) & (branch_factor - 1)) != 0) {
            // We are a child at this level: signal our parent and stop climbing.
            register kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) -1);

            KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                          "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
                          &thr_bar->b_arrived, thr_bar->b_arrived,
                          thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
            // Mark arrival to parent thread
            /* After performing this write (in the last iteration of the enclosing for loop),
               a worker thread may not assume that the team is valid any more - it could be
               deallocated by the master thread at any time.  */
            p_flag.set_waiter(other_threads[parent_tid]);
            p_flag.release();
            break;
        }

        // Parent threads wait for children to arrive
        if (new_state == KMP_BARRIER_UNUSED_STATE)
            new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
        // Children at this level are tid + k*(1 << level) for k = 1..branch_factor-1.
        for (child=1, child_tid=tid+(1 << level); child<branch_factor && child_tid<num_threads;
             child++, child_tid+=(1 << level))
        {
            register kmp_info_t *child_thr = other_threads[child_tid];
            register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
            register kmp_uint32 next_child_tid = child_tid + (1 << level);
            // Prefetch next thread's arrived count
            if (child+1 < branch_factor && next_child_tid < num_threads)
                KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
            KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                          "arrived(%p) == %llu\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
                          &child_bar->b_arrived, new_state));
            // Wait for child to arrive
            kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
            c_flag.wait(this_thr, FALSE
                        USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
            // Barrier imbalance - write min of the thread time and a child time to the thread.
            if (__kmp_forkjoin_frames_mode == 2) {
                this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                                          child_thr->th.th_bar_min_time);
            }
#endif
            if (reduce) {
                // Fold this child's reduction data into ours.
                KA_TRACE(100, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                               gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                               team->t.t_id, child_tid));
                ANNOTATE_REDUCE_AFTER(reduce);
                (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
                ANNOTATE_REDUCE_BEFORE(reduce);
                ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
            }
        }
    }

    if (KMP_MASTER_TID(tid)) {
        // Need to update the team arrived pointer if we are the master thread
        if (new_state == KMP_BARRIER_UNUSED_STATE)
            // Single-thread team: no parent iteration ran, so bump directly.
            team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
        else
            team->t.t_bar[bt].b_arrived = new_state;
        KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
                      gtid, team->t.t_id, tid, team->t.t_id,
                      &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
    }
    KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}
530 
531 // The reverse versions seem to beat the forward versions overall
532 #define KMP_REVERSE_HYPER_BAR
533 static void
534 __kmp_hyper_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
535                             int propagate_icvs
536                             USE_ITT_BUILD_ARG(void *itt_sync_obj) )
537 {
538     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
539     register kmp_team_t    *team;
540     register kmp_bstate_t  *thr_bar       = & this_thr -> th.th_bar[ bt ].bb;
541     register kmp_info_t   **other_threads;
542     register kmp_uint32     num_threads;
543     register kmp_uint32     branch_bits   = __kmp_barrier_release_branch_bits[ bt ];
544     register kmp_uint32     branch_factor = 1 << branch_bits;
545     register kmp_uint32     child;
546     register kmp_uint32     child_tid;
547     register kmp_uint32     offset;
548     register kmp_uint32     level;
549 
550     /* Perform a hypercube-embedded tree release for all of the threads that have been gathered.
551        If KMP_REVERSE_HYPER_BAR is defined (default) the threads are released in the reverse
552        order of the corresponding gather, otherwise threads are released in the same order. */
553     if (KMP_MASTER_TID(tid)) { // master
554         team = __kmp_threads[gtid]->th.th_team;
555         KMP_DEBUG_ASSERT(team != NULL);
556         KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
557                       gtid, team->t.t_id, tid, bt));
558 #if KMP_BARRIER_ICV_PUSH
559         if (propagate_icvs) { // master already has ICVs in final destination; copy
560             copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
561         }
562 #endif
563     }
564     else  { // Handle fork barrier workers who aren't part of a team yet
565         KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n",
566                       gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
567         // Wait for parent thread to release us
568         kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
569         flag.wait(this_thr, TRUE
570                   USE_ITT_BUILD_ARG(itt_sync_obj) );
571 #if USE_ITT_BUILD && USE_ITT_NOTIFY
572         if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
573             // In fork barrier where we could not get the object reliably
574             itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
575             // Cancel wait on previous parallel region...
576             __kmp_itt_task_starting(itt_sync_obj);
577 
578             if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
579                 return;
580 
581             itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
582             if (itt_sync_obj != NULL)
583                 // Call prepare as early as possible for "new" barrier
584                 __kmp_itt_task_finished(itt_sync_obj);
585         } else
586 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
587         // Early exit for reaping threads releasing forkjoin barrier
588         if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
589             return;
590 
591         // The worker thread may now assume that the team is valid.
592         team = __kmp_threads[gtid]->th.th_team;
593         KMP_DEBUG_ASSERT(team != NULL);
594         tid = __kmp_tid_from_gtid(gtid);
595 
596         TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
597         KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
598                       gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
599         KMP_MB();  // Flush all pending memory write invalidates.
600     }
601     num_threads = this_thr->th.th_team_nproc;
602     other_threads = team->t.t_threads;
603 
604 #ifdef KMP_REVERSE_HYPER_BAR
605     // Count up to correct level for parent
606     for (level=0, offset=1; offset<num_threads && (((tid>>level) & (branch_factor-1)) == 0);
607          level+=branch_bits, offset<<=branch_bits);
608 
609     // Now go down from there
610     for (level-=branch_bits, offset>>=branch_bits; offset != 0;
611          level-=branch_bits, offset>>=branch_bits)
612 #else
613     // Go down the tree, level by level
614     for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
615 #endif // KMP_REVERSE_HYPER_BAR
616     {
617 #ifdef KMP_REVERSE_HYPER_BAR
618         /* Now go in reverse order through the children, highest to lowest.
619            Initial setting of child is conservative here. */
620         child = num_threads >> ((level==0)?level:level-1);
621         for (child=(child<branch_factor-1) ? child : branch_factor-1, child_tid=tid+(child<<level);
622              child>=1; child--, child_tid-=(1<<level))
623 #else
624         if (((tid >> level) & (branch_factor - 1)) != 0)
625             // No need to go lower than this, since this is the level parent would be notified
626             break;
627         // Iterate through children on this level of the tree
628         for (child=1, child_tid=tid+(1<<level); child<branch_factor && child_tid<num_threads;
629              child++, child_tid+=(1<<level))
630 #endif // KMP_REVERSE_HYPER_BAR
631         {
632             if (child_tid >= num_threads) continue;  // Child doesn't exist so keep going
633             else {
634                 register kmp_info_t *child_thr = other_threads[child_tid];
635                 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
636 #if KMP_CACHE_MANAGE
637                 register kmp_uint32 next_child_tid = child_tid - (1 << level);
638                 // Prefetch next thread's go count
639 # ifdef KMP_REVERSE_HYPER_BAR
640                 if (child-1 >= 1 && next_child_tid < num_threads)
641 # else
642                 if (child+1 < branch_factor && next_child_tid < num_threads)
643 # endif // KMP_REVERSE_HYPER_BAR
644                     KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
645 #endif /* KMP_CACHE_MANAGE */
646 
647 #if KMP_BARRIER_ICV_PUSH
648                 if (propagate_icvs) // push my fixed ICVs to my child
649                     copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
650 #endif // KMP_BARRIER_ICV_PUSH
651 
652                 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
653                               "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
654                               __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
655                               child_tid, &child_bar->b_go, child_bar->b_go,
656                               child_bar->b_go + KMP_BARRIER_STATE_BUMP));
657                 // Release child from barrier
658                 kmp_flag_64 flag(&child_bar->b_go, child_thr);
659                 flag.release();
660             }
661         }
662     }
663 #if KMP_BARRIER_ICV_PUSH
664     if (propagate_icvs && !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
665         __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
666         copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
667     }
668 #endif
669     KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
670                   gtid, team->t.t_id, tid, bt));
671 }
672 
673 // Hierarchical Barrier
674 
675 // Initialize thread barrier data
676 /* Initializes/re-initializes the hierarchical barrier data stored on a thread.  Performs the
677    minimum amount of initialization required based on how the team has changed.  Returns true if
678    leaf children will require both on-core and traditional wake-up mechanisms.  For example, if the
679    team size increases, threads already in the team will respond to on-core wakeup on their parent
   thread, but threads newly added to the team will only be listening on their local b_go. */
681 static bool
682 __kmp_init_hierarchical_barrier_thread(enum barrier_type bt, kmp_bstate_t *thr_bar, kmp_uint32 nproc,
683                                        int gtid, int tid, kmp_team_t *team)
684 {
685     // Checks to determine if (re-)initialization is needed
686     bool uninitialized = thr_bar->team == NULL;
687     bool team_changed = team != thr_bar->team;
688     bool team_sz_changed = nproc != thr_bar->nproc;
689     bool tid_changed = tid != thr_bar->old_tid;
690     bool retval = false;
691 
692     if (uninitialized || team_sz_changed) {
693         __kmp_get_hierarchy(nproc, thr_bar);
694     }
695 
696     if (uninitialized || team_sz_changed || tid_changed) {
697         thr_bar->my_level = thr_bar->depth-1; // default for master
698         thr_bar->parent_tid = -1; // default for master
699         if (!KMP_MASTER_TID(tid)) { // if not master, find parent thread in hierarchy
700             kmp_uint32 d=0;
701             while (d<thr_bar->depth) { // find parent based on level of thread in hierarchy, and note level
702                 kmp_uint32 rem;
703                 if (d == thr_bar->depth-2) { // reached level right below the master
704                     thr_bar->parent_tid = 0;
705                     thr_bar->my_level = d;
706                     break;
707                 }
708                 else if ((rem = tid%thr_bar->skip_per_level[d+1]) != 0) { // TODO: can we make this op faster?
709                     // thread is not a subtree root at next level, so this is max
710                     thr_bar->parent_tid = tid - rem;
711                     thr_bar->my_level = d;
712                     break;
713                 }
714                 ++d;
715             }
716         }
717         thr_bar->offset = 7-(tid-thr_bar->parent_tid-1);
718         thr_bar->old_tid = tid;
719         thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
720         thr_bar->team = team;
721         thr_bar->parent_bar = &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
722     }
723     if (uninitialized || team_changed || tid_changed) {
724         thr_bar->team = team;
725         thr_bar->parent_bar = &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
726         retval = true;
727     }
728     if (uninitialized || team_sz_changed || tid_changed) {
729         thr_bar->nproc = nproc;
730         thr_bar->leaf_kids = thr_bar->base_leaf_kids;
731         if (thr_bar->my_level == 0) thr_bar->leaf_kids=0;
732         if (thr_bar->leaf_kids && (kmp_uint32)tid+thr_bar->leaf_kids+1 > nproc)
733             thr_bar->leaf_kids = nproc - tid - 1;
734         thr_bar->leaf_state = 0;
735         for (int i=0; i<thr_bar->leaf_kids; ++i) ((char *)&(thr_bar->leaf_state))[7-i] = 1;
736     }
737     return retval;
738 }
739 
// Gather (arrival) phase of the hierarchical barrier: each non-leaf thread
// collects check-ins from the threads in its subtree, optionally combining
// their reduction data via *reduce*, and then signals its own parent in the
// hierarchy.  The master thread instead publishes the new arrived state to
// the team's barrier structure.  Two arrival mechanisms exist: with infinite
// blocktime in a non-nested region, leaf children check in by setting their
// assigned byte of this thread's b_arrived word (on-core path); otherwise
// each child is waited on via that child's own b_arrived flag.
static void
__kmp_hierarchical_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr,
                                  int gtid, int tid, void (*reduce) (void *, void *)
                                  USE_ITT_BUILD_ARG(void * itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
    register kmp_team_t *team = this_thr->th.th_team;
    register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
    register kmp_uint32 nproc = this_thr->th.th_team_nproc;
    register kmp_info_t **other_threads = team->t.t_threads;
    register kmp_uint64 new_state;  // b_arrived value that marks completion of this barrier

    int level = team->t.t_level;
#if OMP_40_ENABLED
    if (other_threads[0]->th.th_teams_microtask)    // are we inside the teams construct?
        if (this_thr->th.th_teams_size.nteams > 1)
            ++level; // level was not increased in teams construct for team_of_masters
#endif
    // The on-core (byte-flag) mechanism is only used in a non-nested region
    if (level == 1) thr_bar->use_oncore_barrier = 1;
    else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested

    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Barrier imbalance - save arrive time to the thread
    if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
    }
#endif

    // (Re-)initialize cached hierarchy info if the team, its size, or this
    // thread's tid changed; the return value only matters in the release phase.
    (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);

    if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
        register kmp_int32 child_tid;
        new_state = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
        if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
            if (thr_bar->leaf_kids) { // First, wait for leaf children to check-in on my b_arrived flag
                // Target pattern: current arrived value with every leaf child's byte set
                kmp_uint64 leaf_state = KMP_MASTER_TID(tid) ? thr_bar->b_arrived | thr_bar->leaf_state : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
                KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting for leaf kids\n",
                              gtid, team->t.t_id, tid));
                kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
                flag.wait(this_thr, FALSE
                          USE_ITT_BUILD_ARG(itt_sync_obj) );
                if (reduce) {
                    ANNOTATE_REDUCE_AFTER(reduce);
                    // Leaf children occupy the tids immediately following mine
                    for (child_tid=tid+1; child_tid<=tid+thr_bar->leaf_kids; ++child_tid) {
                        KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
                                       gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                       team->t.t_id, child_tid));
                        (*reduce)(this_thr->th.th_local.reduce_data, other_threads[child_tid]->th.th_local.reduce_data);
                    }
                    ANNOTATE_REDUCE_BEFORE(reduce);
                    ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
                }
                (void) KMP_TEST_THEN_AND64((volatile kmp_int64 *)&thr_bar->b_arrived, ~(thr_bar->leaf_state)); // clear leaf_state bits
            }
            // Next, wait for higher level children on each child's b_arrived flag
            for (kmp_uint32 d=1; d<thr_bar->my_level; ++d) { // gather lowest level threads first, but skip 0
                kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
                if (last > nproc) last = nproc;
                for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
                    register kmp_info_t *child_thr = other_threads[child_tid];
                    register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
                    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                                  "arrived(%p) == %llu\n",
                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
                    kmp_flag_64 flag(&child_bar->b_arrived, new_state);
                    flag.wait(this_thr, FALSE
                              USE_ITT_BUILD_ARG(itt_sync_obj) );
                    if (reduce) {
                        KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
                                       gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                       team->t.t_id, child_tid));
                        ANNOTATE_REDUCE_AFTER(reduce);
                        (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
                        ANNOTATE_REDUCE_BEFORE(reduce);
                        ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
                    }
                }
            }
        }
        else { // Blocktime is not infinite
            for (kmp_uint32 d=0; d<thr_bar->my_level; ++d) { // Gather lowest level threads first
                kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
                if (last > nproc) last = nproc;
                for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
                    register kmp_info_t *child_thr = other_threads[child_tid];
                    register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
                    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                                  "arrived(%p) == %llu\n",
                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
                    kmp_flag_64 flag(&child_bar->b_arrived, new_state);
                    flag.wait(this_thr, FALSE
                              USE_ITT_BUILD_ARG(itt_sync_obj) );
                    if (reduce) {
                        KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
                                       gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                       team->t.t_id, child_tid));
                        ANNOTATE_REDUCE_AFTER(reduce);
                        (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
                        ANNOTATE_REDUCE_BEFORE(reduce);
                        ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
                    }
                }
            }
        }
    }
    // All subordinates are gathered; now release parent if not master thread

    if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
        KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                      "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
                      __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id, thr_bar->parent_tid,
                      &thr_bar->b_arrived, thr_bar->b_arrived, thr_bar->b_arrived+KMP_BARRIER_STATE_BUMP));
        /* Mark arrival to parent: After performing this write, a worker thread may not assume that
           the team is valid any more - it could be deallocated by the master thread at any time. */
        if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
            || !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived flag; release it
            kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
            flag.release();
        }
        else { // Leaf does special release on the "offset" bits of parent's b_arrived flag
            // Nobody waits on my own b_arrived here; keep it in step with the team for future barriers
            thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
            kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
            flag.set_waiter(other_threads[thr_bar->parent_tid]);
            flag.release();
        }
    } else { // Master thread needs to update the team's b_arrived value
        team->t.t_bar[bt].b_arrived = new_state;
        KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
                      gtid, team->t.t_id, tid, team->t.t_id, &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
    }
    // NOTE(review): for workers, team may already be invalid at this point (see
    // comment above) -- the trace below reads it anyway; unclear if unsafe or
    // just technically invalid.  Confirm before relying on it.
    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}
880 
// Release phase of the hierarchical barrier.  The master wakes the threads in
// its subtree level by level; each woken non-leaf thread in turn releases its
// own subtree.  With infinite blocktime in a non-nested region the on-core
// mechanism is used: leaf children are woken in bulk by setting leaf_state
// bits in this thread's b_go word, and ICVs can be pushed down piggybacked on
// the cache line that carries b_go (NGO stores on MIC).  Workers first wait
// here to be released by their own parent.
static void
__kmp_hierarchical_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                                   int propagate_icvs
                                   USE_ITT_BUILD_ARG(void * itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
    register kmp_team_t *team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_uint32 nproc;
    bool team_change = false; // indicates on-core barrier shouldn't be used

    if (KMP_MASTER_TID(tid)) {
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master entered barrier type %d\n",
                      gtid, team->t.t_id, tid, bt));
    }
    else { // Worker threads
        // Wait for parent thread to release me
        if (!thr_bar->use_oncore_barrier || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
            || thr_bar->my_level != 0 || thr_bar->team == NULL) {
            // Use traditional method of waiting on my own b_go flag
            thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
            kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
            flag.wait(this_thr, TRUE
                      USE_ITT_BUILD_ARG(itt_sync_obj) );
            TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
        }
        else { // Thread barrier data is initialized, this is a leaf, blocktime is infinite, not nested
            // Wait on my "offset" bits on parent's b_go flag
            thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
            kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP, thr_bar->offset,
                                 bt, this_thr
                                 USE_ITT_BUILD_ARG(itt_sync_obj) );
            flag.wait(this_thr, TRUE);
            if (thr_bar->wait_flag == KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
                TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
            }
            else { // Reset my bits on parent's b_go flag
                ((char*)&(thr_bar->parent_bar->b_go))[thr_bar->offset] = 0;
            }
        }
        thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
        // Early exit for reaping threads releasing forkjoin barrier
        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
            return;
        // The worker thread may now assume that the team is valid.
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        tid = __kmp_tid_from_gtid(gtid);

        KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
        KMP_MB();  // Flush all pending memory write invalidates.
    }

    nproc = this_thr->th.th_team_nproc;
    int level = team->t.t_level;
#if OMP_40_ENABLED
    if (team->t.t_threads[0]->th.th_teams_microtask ) {    // are we inside the teams construct?
        if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && this_thr->th.th_teams_level == level)
            ++level; // level was not increased in teams construct for team_of_workers
        if( this_thr->th.th_teams_size.nteams > 1 )
            ++level; // level was not increased in teams construct for team_of_masters
    }
#endif
    // The on-core (byte-flag) mechanism is only used in a non-nested region
    if (level == 1) thr_bar->use_oncore_barrier = 1;
    else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested

    // If the team size has increased, we still communicate with old leaves via oncore barrier.
    unsigned short int old_leaf_kids = thr_bar->leaf_kids;
    kmp_uint64 old_leaf_state = thr_bar->leaf_state;
    team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);
    // But if the entire team changes, we won't use oncore barrier at all
    if (team_change) old_leaf_kids = 0;

#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs) {
        __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
        if (KMP_MASTER_TID(tid)) { // master already has copy in final destination; copy
            copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
        }
        else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) { // optimization for inf blocktime
            if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
                // leaves (on-core children) pull parent's fixed ICVs directly to local ICV store
                copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                          &thr_bar->parent_bar->th_fixed_icvs);
            // non-leaves will get ICVs piggybacked with b_go via NGO store
        }
        else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
            if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can access
                copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
            else // leaves copy parent's fixed ICVs directly to local ICV store
                copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                          &thr_bar->parent_bar->th_fixed_icvs);
        }
    }
#endif // KMP_BARRIER_ICV_PUSH

    // Now, release my children
    if (thr_bar->my_level) { // not a leaf
        register kmp_int32 child_tid;
        kmp_uint32 last;
        if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
            if (KMP_MASTER_TID(tid)) { // do a flat release
                // Set local b_go to bump children via NGO store of the cache line containing IVCs and b_go.
                thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
                // Use ngo stores if available; b_go piggybacks in the last 8 bytes of the cache line
                ngo_load(&thr_bar->th_fixed_icvs);
                // This loops over all the threads skipping only the leaf nodes in the hierarchy
                for (child_tid=thr_bar->skip_per_level[1]; child_tid<(int)nproc; child_tid+=thr_bar->skip_per_level[1]) {
                    register kmp_bstate_t *child_bar = &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
                    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
                                  " go(%p): %u => %u\n",
                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                                  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
                    // Use ngo store (if available) to both store ICVs and release child via child's b_go
                    ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
                }
                ngo_sync();
            }
            TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
            // Now, release leaf children
            if (thr_bar->leaf_kids) { // if there are any
                // We test team_change on the off-chance that the level 1 team changed.
                if (team_change || old_leaf_kids < thr_bar->leaf_kids) { // some old leaf_kids, some new
                    if (old_leaf_kids) { // release old leaf kids
                        thr_bar->b_go |= old_leaf_state;
                    }
                    // Release new leaf kids
                    last = tid+thr_bar->skip_per_level[1];
                    if (last > nproc) last = nproc;
                    for (child_tid=tid+1+old_leaf_kids; child_tid<(int)last; ++child_tid) { // skip_per_level[0]=1
                        register kmp_info_t   *child_thr = team->t.t_threads[child_tid];
                        register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
                        KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
                                      " T#%d(%d:%d) go(%p): %u => %u\n",
                                      gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                      team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                                      child_bar->b_go + KMP_BARRIER_STATE_BUMP));
                        // Release child using child's b_go flag
                        kmp_flag_64 flag(&child_bar->b_go, child_thr);
                        flag.release();
                    }
                }
                else { // Release all children at once with leaf_state bits on my own b_go flag
                    thr_bar->b_go |= thr_bar->leaf_state;
                }
            }
        }
        else { // Blocktime is not infinite; do a simple hierarchical release
            for (int d=thr_bar->my_level-1; d>=0; --d) { // Release highest level threads first
                last = tid+thr_bar->skip_per_level[d+1];
                kmp_uint32 skip = thr_bar->skip_per_level[d];
                if (last > nproc) last = nproc;
                for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
                    register kmp_info_t   *child_thr = team->t.t_threads[child_tid];
                    register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
                    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
                                  " go(%p): %u => %u\n",
                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                                  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
                    // Release child using child's b_go flag
                    kmp_flag_64 flag(&child_bar->b_go, child_thr);
                    flag.release();
                }
            }
        }
#if KMP_BARRIER_ICV_PUSH
        if (propagate_icvs && !KMP_MASTER_TID(tid)) // non-leaves copy ICVs from fixed ICVs to local dest
            copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
#endif // KMP_BARRIER_ICV_PUSH
    }
    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}
1059 
1060 // ---------------------------- End of Barrier Algorithms ----------------------------
1061 
1062 // Internal function to do a barrier.
1063 /* If is_split is true, do a split barrier, otherwise, do a plain barrier
   If reduce is non-NULL, do a split reduction barrier, otherwise, do a regular (non-reducing) barrier
1065    Returns 0 if master thread, 1 if worker thread.  */
1066 int
1067 __kmp_barrier(enum barrier_type bt, int gtid, int is_split, size_t reduce_size,
1068               void *reduce_data, void (*reduce)(void *, void *))
1069 {
1070     KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
1071     KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1072     register int tid = __kmp_tid_from_gtid(gtid);
1073     register kmp_info_t *this_thr = __kmp_threads[gtid];
1074     register kmp_team_t *team = this_thr->th.th_team;
1075     register int status = 0;
1076     ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1077 #if OMPT_SUPPORT
1078     ompt_task_id_t my_task_id;
1079     ompt_parallel_id_t my_parallel_id;
1080 #endif
1081 
1082     KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n",
1083                   gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1084 
1085     ANNOTATE_NEW_BARRIER_BEGIN(&team->t.t_bar);
1086 #if OMPT_SUPPORT
1087     if (ompt_enabled) {
1088 #if OMPT_BLAME
1089         my_task_id = team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id;
1090         my_parallel_id = team->t.ompt_team_info.parallel_id;
1091 
1092 #if OMPT_TRACE
1093         if (this_thr->th.ompt_thread_info.state == ompt_state_wait_single) {
1094             if (ompt_callbacks.ompt_callback(ompt_event_single_others_end)) {
1095                 ompt_callbacks.ompt_callback(ompt_event_single_others_end)(
1096                     my_parallel_id, my_task_id);
1097             }
1098         }
1099 #endif
1100         if (ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
1101             ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
1102                 my_parallel_id, my_task_id);
1103         }
1104 #endif
1105         // It is OK to report the barrier state after the barrier begin callback.
1106         // According to the OMPT specification, a compliant implementation may
1107         // even delay reporting this state until the barrier begins to wait.
1108         this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1109     }
1110 #endif
1111 
1112     if (! team->t.t_serialized) {
1113 #if USE_ITT_BUILD
1114         // This value will be used in itt notify events below.
1115         void *itt_sync_obj = NULL;
1116 # if USE_ITT_NOTIFY
1117         if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1118             itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1119 # endif
1120 #endif /* USE_ITT_BUILD */
1121         if (__kmp_tasking_mode == tskm_extra_barrier) {
1122             __kmp_tasking_barrier(team, this_thr, gtid);
1123             KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n",
1124                           gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1125         }
1126 
1127         /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when
1128            the team struct is not guaranteed to exist. */
1129         // See note about the corresponding code in __kmp_join_barrier() being performance-critical.
1130         if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1131 #if KMP_USE_MONITOR
1132             this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1133 #endif
1134             this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1135         }
1136 
1137 #if USE_ITT_BUILD
1138         if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1139             __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1140 #endif /* USE_ITT_BUILD */
1141 #if USE_DEBUGGER
1142         // Let the debugger know: the thread arrived to the barrier and waiting.
1143         if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
1144             team->t.t_bar[bt].b_master_arrived += 1;
1145         } else {
1146             this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1147         } // if
1148 #endif /* USE_DEBUGGER */
1149         if (reduce != NULL) {
1150             //KMP_DEBUG_ASSERT( is_split == TRUE );  // #C69956
1151             this_thr->th.th_local.reduce_data = reduce_data;
1152         }
1153 
1154         if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
1155             __kmp_task_team_setup(this_thr, team, 0); // use 0 to only setup the current team if nthreads > 1
1156 
1157         switch (__kmp_barrier_gather_pattern[bt]) {
1158         case bp_hyper_bar: {
1159             KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
1160             __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid, reduce
1161                                        USE_ITT_BUILD_ARG(itt_sync_obj) );
1162             break;
1163         }
1164         case bp_hierarchical_bar: {
1165             __kmp_hierarchical_barrier_gather(bt, this_thr, gtid, tid, reduce
1166                                               USE_ITT_BUILD_ARG(itt_sync_obj));
1167             break;
1168         }
1169         case bp_tree_bar: {
1170             KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
1171             __kmp_tree_barrier_gather(bt, this_thr, gtid, tid, reduce
1172                                       USE_ITT_BUILD_ARG(itt_sync_obj) );
1173             break;
1174         }
1175         default: {
1176             __kmp_linear_barrier_gather(bt, this_thr, gtid, tid, reduce
1177                                         USE_ITT_BUILD_ARG(itt_sync_obj) );
1178         }
1179         }
1180 
1181         KMP_MB();
1182 
1183         if (KMP_MASTER_TID(tid)) {
1184             status = 0;
1185             if (__kmp_tasking_mode != tskm_immediate_exec) {
1186                 __kmp_task_team_wait(this_thr, team
1187                                      USE_ITT_BUILD_ARG(itt_sync_obj) );
1188             }
1189 #if USE_DEBUGGER
1190             // Let the debugger know: All threads are arrived and starting leaving the barrier.
1191             team->t.t_bar[bt].b_team_arrived += 1;
1192 #endif
1193 
1194 #if USE_ITT_BUILD
1195             /* TODO: In case of split reduction barrier, master thread may send acquired event early,
1196                before the final summation into the shared variable is done (final summation can be a
1197                long operation for array reductions).  */
1198             if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1199                 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1200 #endif /* USE_ITT_BUILD */
1201 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1202             // Barrier - report frame end (only if active_level == 1)
1203             if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode &&
1204 #if OMP_40_ENABLED
1205                 this_thr->th.th_teams_microtask == NULL &&
1206 #endif
1207                 team->t.t_active_level == 1)
1208             {
1209                 kmp_uint64 cur_time = __itt_get_timestamp();
1210                 kmp_info_t **other_threads = team->t.t_threads;
1211                 int nproc = this_thr->th.th_team_nproc;
1212                 int i;
1213                 switch(__kmp_forkjoin_frames_mode) {
1214                 case 1:
1215                     __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1216                     this_thr->th.th_frame_time = cur_time;
1217                     break;
1218                 case 2: // AC 2015-01-19: currently does not work for hierarchical (to be fixed)
1219                     __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
1220                     break;
1221                 case 3:
1222                     if( __itt_metadata_add_ptr ) {
1223                         // Initialize with master's wait time
1224                         kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1225                         // Set arrive time to zero to be able to check it in __kmp_invoke_task(); the same is done inside the loop below
1226                         this_thr->th.th_bar_arrive_time = 0;
1227                         for (i=1; i<nproc; ++i) {
1228                             delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
1229                             other_threads[i]->th.th_bar_arrive_time = 0;
1230                         }
1231                         __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, (kmp_uint64)( reduce != NULL));
1232                     }
1233                     __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1234                     this_thr->th.th_frame_time = cur_time;
1235                     break;
1236                 }
1237             }
1238 #endif /* USE_ITT_BUILD */
1239         } else {
1240             status = 1;
1241 #if USE_ITT_BUILD
1242             if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1243                 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1244 #endif /* USE_ITT_BUILD */
1245         }
1246         if (status == 1 || ! is_split) {
1247             switch (__kmp_barrier_release_pattern[bt]) {
1248             case bp_hyper_bar: {
1249                 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1250                 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
1251                                             USE_ITT_BUILD_ARG(itt_sync_obj) );
1252                 break;
1253             }
1254             case bp_hierarchical_bar: {
1255                 __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
1256                                                    USE_ITT_BUILD_ARG(itt_sync_obj) );
1257                 break;
1258             }
1259             case bp_tree_bar: {
1260                 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1261                 __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
1262                                            USE_ITT_BUILD_ARG(itt_sync_obj) );
1263                 break;
1264             }
1265             default: {
1266                 __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
1267                                              USE_ITT_BUILD_ARG(itt_sync_obj) );
1268             }
1269             }
1270             if (__kmp_tasking_mode != tskm_immediate_exec) {
1271                 __kmp_task_team_sync(this_thr, team);
1272             }
1273         }
1274 
1275 #if USE_ITT_BUILD
1276         /* GEH: TODO: Move this under if-condition above and also include in
1277            __kmp_end_split_barrier(). This will more accurately represent the actual release time
1278            of the threads for split barriers.  */
1279         if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1280             __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1281 #endif /* USE_ITT_BUILD */
1282     } else { // Team is serialized.
1283         status = 0;
1284         if (__kmp_tasking_mode != tskm_immediate_exec) {
1285 #if OMP_45_ENABLED
1286             if ( this_thr->th.th_task_team != NULL ) {
1287                 void *itt_sync_obj = NULL;
1288 #if USE_ITT_NOTIFY
1289                 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1290                     itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1291                     __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1292                 }
1293 #endif
1294 
1295                 KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks == TRUE);
1296                 __kmp_task_team_wait(this_thr, team
1297                                                USE_ITT_BUILD_ARG(itt_sync_obj));
1298                 __kmp_task_team_setup(this_thr, team, 0);
1299 
1300 #if USE_ITT_BUILD
1301                 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1302                     __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1303 #endif /* USE_ITT_BUILD */
1304             }
1305 #else
1306             // The task team should be NULL for serialized code (tasks will be executed immediately)
1307             KMP_DEBUG_ASSERT(team->t.t_task_team[this_thr->th.th_task_state] == NULL);
1308             KMP_DEBUG_ASSERT(this_thr->th.th_task_team == NULL);
1309 #endif
1310         }
1311     }
1312     KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1313                   gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid), status));
1314 
1315 #if OMPT_SUPPORT
1316     if (ompt_enabled) {
1317 #if OMPT_BLAME
1318         if (ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
1319             ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
1320                 my_parallel_id, my_task_id);
1321         }
1322 #endif
1323         this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1324     }
1325 #endif
1326     ANNOTATE_NEW_BARRIER_END(&team->t.t_bar);
1327 
1328     return status;
1329 }
1330 
1331 
1332 void
1333 __kmp_end_split_barrier(enum barrier_type bt, int gtid)
1334 {
1335     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
1336     KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1337     int tid = __kmp_tid_from_gtid(gtid);
1338     kmp_info_t *this_thr = __kmp_threads[gtid];
1339     kmp_team_t *team = this_thr->th.th_team;
1340 
1341     ANNOTATE_NEW_BARRIER_BEGIN(&team->t.t_bar);
1342     if (!team->t.t_serialized) {
1343         if (KMP_MASTER_GTID(gtid)) {
1344             switch (__kmp_barrier_release_pattern[bt]) {
1345             case bp_hyper_bar: {
1346                 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1347                 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
1348                                             USE_ITT_BUILD_ARG(NULL) );
1349                 break;
1350             }
1351             case bp_hierarchical_bar: {
1352                 __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
1353                                                    USE_ITT_BUILD_ARG(NULL));
1354                 break;
1355             }
1356             case bp_tree_bar: {
1357                 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1358                 __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
1359                                            USE_ITT_BUILD_ARG(NULL) );
1360                 break;
1361             }
1362             default: {
1363                 __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
1364                                              USE_ITT_BUILD_ARG(NULL) );
1365             }
1366             }
1367             if (__kmp_tasking_mode != tskm_immediate_exec) {
1368                 __kmp_task_team_sync(this_thr, team);
1369             } // if
1370         }
1371     }
1372     ANNOTATE_NEW_BARRIER_END(&team->t.t_bar);
1373 }
1374 
1375 
1376 void
1377 __kmp_join_barrier(int gtid)
1378 {
1379     KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
1380     KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1381     register kmp_info_t *this_thr = __kmp_threads[gtid];
1382     register kmp_team_t *team;
1383     register kmp_uint nproc;
1384     kmp_info_t *master_thread;
1385     int tid;
1386 #ifdef KMP_DEBUG
1387     int team_id;
1388 #endif /* KMP_DEBUG */
1389 #if USE_ITT_BUILD
1390     void *itt_sync_obj = NULL;
1391 # if USE_ITT_NOTIFY
1392     if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
1393         // Get object created at fork_barrier
1394         itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1395 # endif
1396 #endif /* USE_ITT_BUILD */
1397     KMP_MB();
1398 
1399     // Get current info
1400     team = this_thr->th.th_team;
1401     nproc = this_thr->th.th_team_nproc;
1402     KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
1403     tid = __kmp_tid_from_gtid(gtid);
1404 #ifdef KMP_DEBUG
1405     team_id = team->t.t_id;
1406 #endif /* KMP_DEBUG */
1407     master_thread = this_thr->th.th_team_master;
1408 #ifdef KMP_DEBUG
1409     if (master_thread != team->t.t_threads[0]) {
1410         __kmp_print_structure();
1411     }
1412 #endif /* KMP_DEBUG */
1413     KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1414     KMP_MB();
1415 
1416     // Verify state
1417     KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1418     KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1419     KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1420     KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1421     KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n", gtid, team_id, tid));
1422 
1423     ANNOTATE_NEW_BARRIER_BEGIN(&team->t.t_bar);
1424 #if OMPT_SUPPORT
1425 #if OMPT_TRACE
1426     if (ompt_enabled &&
1427         ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
1428         ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
1429             team->t.ompt_team_info.parallel_id,
1430             team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1431     }
1432 #endif
1433     this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1434 #endif
1435 
1436     if (__kmp_tasking_mode == tskm_extra_barrier) {
1437         __kmp_tasking_barrier(team, this_thr, gtid);
1438         KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past taking barrier\n", gtid, team_id, tid));
1439     }
1440 # ifdef KMP_DEBUG
1441     if (__kmp_tasking_mode != tskm_immediate_exec) {
1442         KA_TRACE(20, ( "__kmp_join_barrier: T#%d, old team = %d, old task_team = %p, th_task_team = %p\n",
1443                        __kmp_gtid_from_thread(this_thr), team_id, team->t.t_task_team[this_thr->th.th_task_state],
1444                        this_thr->th.th_task_team));
1445         KMP_DEBUG_ASSERT(this_thr->th.th_task_team == team->t.t_task_team[this_thr->th.th_task_state]);
1446     }
1447 # endif /* KMP_DEBUG */
1448 
1449     /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when the
1450        team struct is not guaranteed to exist. Doing these loads causes a cache miss slows
1451        down EPCC parallel by 2x. As a workaround, we do not perform the copy if blocktime=infinite,
1452        since the values are not used by __kmp_wait_template() in that case. */
1453     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1454 #if KMP_USE_MONITOR
1455         this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1456 #endif
1457         this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1458     }
1459 
1460 #if USE_ITT_BUILD
1461     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1462         __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1463 #endif /* USE_ITT_BUILD */
1464 
1465     switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1466     case bp_hyper_bar: {
1467         KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1468         __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1469                                    USE_ITT_BUILD_ARG(itt_sync_obj) );
1470         break;
1471     }
1472     case bp_hierarchical_bar: {
1473         __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1474                                           USE_ITT_BUILD_ARG(itt_sync_obj) );
1475         break;
1476     }
1477     case bp_tree_bar: {
1478         KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1479         __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1480                                   USE_ITT_BUILD_ARG(itt_sync_obj) );
1481         break;
1482     }
1483     default: {
1484         __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1485                                     USE_ITT_BUILD_ARG(itt_sync_obj) );
1486     }
1487     }
1488 
1489     /* From this point on, the team data structure may be deallocated at any time by the
1490        master thread - it is unsafe to reference it in any of the worker threads. Any per-team
1491        data items that need to be referenced before the end of the barrier should be moved to
1492        the kmp_task_team_t structs.  */
1493     if (KMP_MASTER_TID(tid)) {
1494         if (__kmp_tasking_mode != tskm_immediate_exec) {
1495             __kmp_task_team_wait(this_thr, team
1496                                  USE_ITT_BUILD_ARG(itt_sync_obj) );
1497         }
1498 #if KMP_STATS_ENABLED
1499         // Have master thread flag the workers to indicate they are now waiting for
1500         // next parallel region, Also wake them up so they switch their timers to idle.
1501         for (int i=0; i<team->t.t_nproc; ++i) {
1502             kmp_info_t* team_thread = team->t.t_threads[i];
1503             if (team_thread == this_thr)
1504                 continue;
1505             team_thread->th.th_stats->setIdleFlag();
1506             if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && team_thread->th.th_sleep_loc != NULL)
1507                 __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread), team_thread->th.th_sleep_loc);
1508         }
1509 #endif
1510 #if USE_ITT_BUILD
1511         if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1512             __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1513 #endif /* USE_ITT_BUILD */
1514 
1515 # if USE_ITT_BUILD && USE_ITT_NOTIFY
1516         // Join barrier - report frame end
1517         if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode &&
1518 #if OMP_40_ENABLED
1519             this_thr->th.th_teams_microtask == NULL &&
1520 #endif
1521             team->t.t_active_level == 1)
1522         {
1523             kmp_uint64 cur_time = __itt_get_timestamp();
1524             ident_t * loc = team->t.t_ident;
1525             kmp_info_t **other_threads = team->t.t_threads;
1526             int nproc = this_thr->th.th_team_nproc;
1527             int i;
1528             switch(__kmp_forkjoin_frames_mode) {
1529             case 1:
1530                 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1531                 break;
1532             case 2:
1533                 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
1534                 break;
1535             case 3:
1536                 if( __itt_metadata_add_ptr ) {
1537                     // Initialize with master's wait time
1538                     kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1539                     // Set arrive time to zero to be able to check it in __kmp_invoke_task(); the same is done inside the loop below
1540                     this_thr->th.th_bar_arrive_time = 0;
1541                     for (i=1; i<nproc; ++i) {
1542                         delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
1543                         other_threads[i]->th.th_bar_arrive_time = 0;
1544                     }
1545                     __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, 0);
1546                 }
1547                 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1548                 this_thr->th.th_frame_time = cur_time;
1549                 break;
1550             }
1551         }
1552 # endif /* USE_ITT_BUILD */
1553     }
1554 #if USE_ITT_BUILD
1555     else {
1556         if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1557             __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1558     }
1559 #endif /* USE_ITT_BUILD */
1560 
1561 #if KMP_DEBUG
1562     if (KMP_MASTER_TID(tid)) {
1563         KA_TRACE(15, ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1564                       gtid, team_id, tid, nproc));
1565     }
1566 #endif /* KMP_DEBUG */
1567 
1568     // TODO now, mark worker threads as done so they may be disbanded
1569     KMP_MB(); // Flush all pending memory write invalidates.
1570     KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
1571 
1572 #if OMPT_SUPPORT
1573     if (ompt_enabled) {
1574 #if OMPT_BLAME
1575         if (ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
1576             ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
1577                 team->t.ompt_team_info.parallel_id,
1578                 team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1579         }
1580 #endif
1581 
1582         // return to default state
1583         this_thr->th.ompt_thread_info.state = ompt_state_overhead;
1584     }
1585 #endif
1586     ANNOTATE_NEW_BARRIER_END(&team->t.t_bar);
1587 }
1588 
1589 
1590 // TODO release worker threads' fork barriers as we are ready instead of all at once
/* Fork barrier: the master sets up the new parallel region (task team,
   blocktime ICVs) and then releases the team's workers.  Workers arrive
   here from the previous join barrier; on release they re-read th_team,
   sync task teams, and optionally copy ICVs / apply affinity. */
void
__kmp_fork_barrier(int gtid, int tid)
{
    KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
    KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
    kmp_info_t *this_thr = __kmp_threads[gtid];
    // Workers must not trust th_team yet; it is re-read after the release below.
    kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
#if USE_ITT_BUILD
    void * itt_sync_obj = NULL;
#endif /* USE_ITT_BUILD */
    if (team)
      ANNOTATE_NEW_BARRIER_END(&team->t.t_bar);

    KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n",
                  gtid, (team != NULL) ? team->t.t_id : -1, tid));

    // th_team pointer only valid for master thread here
    if (KMP_MASTER_TID(tid)) {
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
            // Create itt barrier object
            itt_sync_obj  = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
            __kmp_itt_barrier_middle(gtid, itt_sync_obj);  // Call acquired/releasing
        }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */

#ifdef KMP_DEBUG
        register kmp_info_t **other_threads = team->t.t_threads;
        register int i;

        // Verify state
        KMP_MB();

        // Check that every worker's fork/join b_go flag is back at the initial
        // state (ignoring the sleep bit) and that it still belongs to this team.
        for(i=1; i<team->t.t_nproc; ++i) {
            KA_TRACE(500, ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go == %u.\n",
                           gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
                           team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
                           other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
            KMP_DEBUG_ASSERT((TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go)
                              & ~(KMP_BARRIER_SLEEP_STATE))
                             == KMP_INIT_BARRIER_STATE);
            KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
        }
#endif

        if (__kmp_tasking_mode != tskm_immediate_exec) {
            __kmp_task_team_setup(this_thr, team, 0);  // 0 indicates setup current task team if nthreads > 1
        }

        /* The master thread may have changed its blocktime between the join barrier and the
           fork barrier. Copy the blocktime info to the thread, where __kmp_wait_template() can
           access it when the team struct is not guaranteed to exist. */
        // See note about the corresponding code in __kmp_join_barrier() being performance-critical
        if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
#if KMP_USE_MONITOR
            this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
#endif
            this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
        }
    } // master

    // Release workers using the configured fork/join pattern; linear is the fallback.
    switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
    case bp_hyper_bar: {
        KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
        __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
                                    USE_ITT_BUILD_ARG(itt_sync_obj) );
        break;
    }
    case bp_hierarchical_bar: {
        __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
                                           USE_ITT_BUILD_ARG(itt_sync_obj) );
        break;
    }
    case bp_tree_bar: {
        KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
        __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
                                   USE_ITT_BUILD_ARG(itt_sync_obj) );
        break;
    }
    default: {
        __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
                                     USE_ITT_BUILD_ARG(itt_sync_obj) );
    }
    }

    // Early exit for reaping threads releasing forkjoin barrier
    if (TCR_4(__kmp_global.g.g_done)) {
        this_thr->th.th_task_team = NULL;

#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
            if (!KMP_MASTER_TID(tid)) {
                itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
                if (itt_sync_obj)
                    __kmp_itt_barrier_finished(gtid, itt_sync_obj);
            }
        }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
        return;
    }

    /* We can now assume that a valid team structure has been allocated by the master and
       propagated to all worker threads. The current thread, however, may not be part of the
       team, so we can't blindly assume that the team pointer is non-null.  */
    team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
    KMP_DEBUG_ASSERT(team != NULL);
    tid = __kmp_tid_from_gtid(gtid);


#if KMP_BARRIER_ICV_PULL
    /* Master thread's copy of the ICVs was set up on the implicit taskdata in
       __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
       this data before this function is called. We cannot modify __kmp_fork_call() to look at
       the fixed ICVs in the master's thread struct, because it is not always the case that the
       threads arrays have been allocated when __kmp_fork_call() is executed. */
    {
        KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
        if (!KMP_MASTER_TID(tid)) {  // master thread already has ICVs
            // Copy the initial ICVs from the master's thread struct to the implicit task for this tid.
            KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
            __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
            copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                      &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs);
        }
    }
#endif // KMP_BARRIER_ICV_PULL

    if (__kmp_tasking_mode != tskm_immediate_exec) {
        __kmp_task_team_sync(this_thr, team);
    }

#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
    kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
    if (proc_bind == proc_bind_intel) {
#endif
#if KMP_AFFINITY_SUPPORTED
        // Call dynamic affinity settings
        if(__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
            __kmp_balanced_affinity(tid, team->t.t_nproc);
        }
#endif // KMP_AFFINITY_SUPPORTED
#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
    }
    else if (proc_bind != proc_bind_false) {
        if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
            KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
                           __kmp_gtid_from_thread(this_thr), this_thr->th.th_current_place));
        }
        else {
            __kmp_affinity_set_place(gtid);
        }
    }
#endif

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
        if (!KMP_MASTER_TID(tid)) {
            // Get correct barrier object
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
            __kmp_itt_barrier_finished(gtid, itt_sync_obj);  // Workers call acquired
        } // (prepare called inside barrier_release)
    }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
    ANNOTATE_NEW_BARRIER_END(&team->t.t_bar);
    KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid, team->t.t_id, tid));
}
1758 
1759 
/* Set up propagation of new ICVs (internal control variables) to the
   new_nproc threads of a team.  Strategy is chosen at build time:
   - KMP_BARRIER_ICV_PULL: stash the ICVs once in the master thread's
     fork/join barrier struct; workers copy them themselves after the fork
     barrier (see __kmp_fork_barrier).
   - KMP_BARRIER_ICV_PUSH: ICVs are propagated during the fork-barrier
     release, so nothing is done here.
   - otherwise: copy the ICVs into every non-master thread's implicit task
     now (O(nthreads) work on the calling thread). */
void
__kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, kmp_internal_control_t *new_icvs, ident_t *loc )
{
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);

    KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
    KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);

    /* Master thread's copy of the ICVs was set up on the implicit taskdata in
       __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
       this data before this function is called. */
#if KMP_BARRIER_ICV_PULL
    /* Copy ICVs to master's thread structure into th_fixed_icvs (which remains untouched), where
       all of the worker threads can access them and make their own copies after the barrier. */
    KMP_DEBUG_ASSERT(team->t.t_threads[0]);  // The threads arrays should be allocated at this point
    copy_icvs(&team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs, new_icvs);
    KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n",
                  0, team->t.t_threads[0], team));
#elif KMP_BARRIER_ICV_PUSH
    // The ICVs will be propagated in the fork barrier, so nothing needs to be done here.
    KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n",
                  0, team->t.t_threads[0], team));
#else
    // Copy the ICVs to each of the non-master threads.  This takes O(nthreads) time.
    // NOTE: on KMP_MIC builds ngo_load declares a hidden local (Vt) that
    // ngo_store_icvs below consumes, so these macros must stay in this order
    // and scope (see the ngo_* macro definitions at the top of this file).
    ngo_load(new_icvs);
    KMP_DEBUG_ASSERT(team->t.t_threads[0]);  // The threads arrays should be allocated at this point
    for (int f=1; f<new_nproc; ++f) { // Skip the master thread
        // TODO: GEH - pass in better source location info since usually NULL here
        KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                      f, team->t.t_threads[f], team));
        __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
        ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
        KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                      f, team->t.t_threads[f], team));
    }
    // Make the non-temporal stores visible before the barrier proceeds.
    ngo_sync();
#endif // KMP_BARRIER_ICV_PULL
}
1798