1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 //                     The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "kmp.h"
17 #include "kmp_affinity.h"
18 #include "kmp_atomic.h"
19 #include "kmp_environment.h"
20 #include "kmp_error.h"
21 #include "kmp_i18n.h"
22 #include "kmp_io.h"
23 #include "kmp_itt.h"
24 #include "kmp_settings.h"
25 #include "kmp_stats.h"
26 #include "kmp_str.h"
27 #include "kmp_wait_release.h"
28 #include "kmp_wrapper_getpid.h"
29 
30 #if OMPT_SUPPORT
31 #include "ompt-specific.h"
32 #endif
33 
34 /* these are temporary issues to be dealt with */
35 #define KMP_USE_PRCTL 0
36 
37 #if KMP_OS_WINDOWS
38 #include <process.h>
39 #endif
40 
41 #include "tsan_annotations.h"
42 
43 #if defined(KMP_GOMP_COMPAT)
44 char const __kmp_version_alt_comp[] =
45     KMP_VERSION_PREFIX "alternative compiler support: yes";
46 #endif /* defined(KMP_GOMP_COMPAT) */
47 
48 char const __kmp_version_omp_api[] = KMP_VERSION_PREFIX "API version: "
49 #if OMP_50_ENABLED
50                                                         "5.0 (201611)";
51 #elif OMP_45_ENABLED
52                                                         "4.5 (201511)";
53 #elif OMP_40_ENABLED
54                                                         "4.0 (201307)";
55 #else
56                                                         "3.1 (201107)";
57 #endif
58 
59 #ifdef KMP_DEBUG
60 char const __kmp_version_lock[] =
61     KMP_VERSION_PREFIX "lock type: run time selectable";
62 #endif /* KMP_DEBUG */
63 
64 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
65 
66 /* ------------------------------------------------------------------------ */
67 
68 kmp_info_t __kmp_monitor;
69 
70 /* Forward declarations */
71 
72 void __kmp_cleanup(void);
73 
74 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
75                                   int gtid);
76 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
77                                   kmp_internal_control_t *new_icvs,
78                                   ident_t *loc);
79 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
80 static void __kmp_partition_places(kmp_team_t *team,
81                                    int update_master_only = 0);
82 #endif
83 static void __kmp_do_serial_initialize(void);
84 void __kmp_fork_barrier(int gtid, int tid);
85 void __kmp_join_barrier(int gtid);
86 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
87                           kmp_internal_control_t *new_icvs, ident_t *loc);
88 
89 #ifdef USE_LOAD_BALANCE
90 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
91 #endif
92 
93 static int __kmp_expand_threads(int nWish, int nNeed);
94 #if KMP_OS_WINDOWS
95 static int __kmp_unregister_root_other_thread(int gtid);
96 #endif
97 static void __kmp_unregister_library(void); // called by __kmp_internal_end()
98 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
99 static kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
100 
101 /* Calculate the identifier of the current thread */
/* A fast (and somewhat portable) way to get the unique identifier of the
   executing thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
104 int __kmp_get_global_thread_id() {
105   int i;
106   kmp_info_t **other_threads;
107   size_t stack_data;
108   char *stack_addr;
109   size_t stack_size;
110   char *stack_base;
111 
112   KA_TRACE(
113       1000,
114       ("*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
115        __kmp_nth, __kmp_all_nth));
116 
  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
     a parallel region, this returns KMP_GTID_DNE to force the caller to run
     serial_initialize. KMP_GTID_DNE had to be handled at all call sites, or
     else __kmp_init_gtid had to be guaranteed, for this to work. */
121 
122   if (!TCR_4(__kmp_init_gtid))
123     return KMP_GTID_DNE;
124 
125 #ifdef KMP_TDATA_GTID
126   if (TCR_4(__kmp_gtid_mode) >= 3) {
127     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
128     return __kmp_gtid;
129   }
130 #endif
131   if (TCR_4(__kmp_gtid_mode) >= 2) {
132     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
133     return __kmp_gtid_get_specific();
134   }
135   KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
136 
137   stack_addr = (char *)&stack_data;
138   other_threads = __kmp_threads;
139 
140   /* ATT: The code below is a source of potential bugs due to unsynchronized
141      access to __kmp_threads array. For example:
142      1. Current thread loads other_threads[i] to thr and checks it, it is
143         non-NULL.
144      2. Current thread is suspended by OS.
145      3. Another thread unregisters and finishes (debug versions of free()
146         may fill memory with something like 0xEF).
147      4. Current thread is resumed.
148      5. Current thread reads junk from *thr.
149      TODO: Fix it.  --ln  */
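  /* Roughly, the search below asks, for each registered thread: does the
     address of a local variable in the current frame fall inside that thread's
     recorded stack window [stack_base - stack_size, stack_base]?  A minimal
     sketch of that predicate (illustrative only, assuming a downward-growing
     stack; the helper name is hypothetical):

       static bool __kmp_addr_in_stack_window(char *addr, char *base,
                                              size_t size) {
         return addr <= base && (size_t)(base - addr) <= size;
       }

     Only the thread actually running on that stack can satisfy the test, which
     is what makes the scan a valid gtid lookup. */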
150 
151   for (i = 0; i < __kmp_threads_capacity; i++) {
152 
153     kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
154     if (!thr)
155       continue;
156 
157     stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
158     stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
159 
160     /* stack grows down -- search through all of the active threads */
161 
162     if (stack_addr <= stack_base) {
163       size_t stack_diff = stack_base - stack_addr;
164 
165       if (stack_diff <= stack_size) {
166         /* The only way we can be closer than the allocated */
167         /* stack size is if we are running on this thread. */
168         KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
169         return i;
170       }
171     }
172   }
173 
  /* fall back to keyed TLS (get_specific) to try to determine our gtid */
175   KA_TRACE(1000,
176            ("*** __kmp_get_global_thread_id: internal alg. failed to find "
177             "thread, using TLS\n"));
178   i = __kmp_gtid_get_specific();
179 
180   /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
181 
  /* if we haven't been assigned a gtid, then return the error code */
183   if (i < 0)
184     return i;
185 
186   /* dynamically updated stack window for uber threads to avoid get_specific
187      call */
188   if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
189     KMP_FATAL(StackOverflow, i);
190   }
191 
192   stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
193   if (stack_addr > stack_base) {
194     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
195     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
196             other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
197                 stack_base);
198   } else {
199     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
200             stack_base - stack_addr);
201   }
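  /* Example of the refinement above (numbers are illustrative only): suppose
     the recorded window is base = 0x7000 with size = 0x1000, i.e. it covers
     [0x6000, 0x7000].  If the current address is 0x7200 (above the base), the
     base is raised to 0x7200 and the size grows by 0x200, giving
     [0x6000, 0x7200].  If the current address is 0x5800 (below the window), the
     size becomes base - addr = 0x1800, giving [0x5800, 0x7000].  Either way the
     window now covers the new address, so later lookups by this uber thread can
     succeed without the get_specific call. */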
202 
203   /* Reprint stack bounds for ubermaster since they have been refined */
204   if (__kmp_storage_map) {
205     char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
206     char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
207     __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
208                                  other_threads[i]->th.th_info.ds.ds_stacksize,
209                                  "th_%d stack (refinement)", i);
210   }
211   return i;
212 }
213 
214 int __kmp_get_global_thread_id_reg() {
215   int gtid;
216 
217   if (!__kmp_init_serial) {
218     gtid = KMP_GTID_DNE;
219   } else
220 #ifdef KMP_TDATA_GTID
221       if (TCR_4(__kmp_gtid_mode) >= 3) {
222     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
223     gtid = __kmp_gtid;
224   } else
225 #endif
226       if (TCR_4(__kmp_gtid_mode) >= 2) {
227     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
228     gtid = __kmp_gtid_get_specific();
229   } else {
230     KA_TRACE(1000,
231              ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
232     gtid = __kmp_get_global_thread_id();
233   }
234 
235   /* we must be a new uber master sibling thread */
236   if (gtid == KMP_GTID_DNE) {
237     KA_TRACE(10,
238              ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
239               "Registering a new gtid.\n"));
240     __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
241     if (!__kmp_init_serial) {
242       __kmp_do_serial_initialize();
243       gtid = __kmp_gtid_get_specific();
244     } else {
245       gtid = __kmp_register_root(FALSE);
246     }
247     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
248     /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
249   }
250 
251   KMP_DEBUG_ASSERT(gtid >= 0);
252 
253   return gtid;
254 }
255 
256 /* caller must hold forkjoin_lock */
257 void __kmp_check_stack_overlap(kmp_info_t *th) {
258   int f;
259   char *stack_beg = NULL;
260   char *stack_end = NULL;
261   int gtid;
262 
263   KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
264   if (__kmp_storage_map) {
265     stack_end = (char *)th->th.th_info.ds.ds_stackbase;
266     stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
267 
268     gtid = __kmp_gtid_from_thread(th);
269 
270     if (gtid == KMP_GTID_MONITOR) {
271       __kmp_print_storage_map_gtid(
272           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
273           "th_%s stack (%s)", "mon",
274           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
275     } else {
276       __kmp_print_storage_map_gtid(
277           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
278           "th_%d stack (%s)", gtid,
279           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
280     }
281   }
282 
283   /* No point in checking ubermaster threads since they use refinement and
284    * cannot overlap */
285   gtid = __kmp_gtid_from_thread(th);
286   if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
287     KA_TRACE(10,
288              ("__kmp_check_stack_overlap: performing extensive checking\n"));
289     if (stack_beg == NULL) {
290       stack_end = (char *)th->th.th_info.ds.ds_stackbase;
291       stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
292     }
293 
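    /* The loop below flags an overlap when either endpoint of this thread's
       stack falls strictly inside another thread's [beg, end) range.  A rough
       numeric illustration (addresses are made up): our stack [0x5000, 0x6000)
       vs. another stack [0x5800, 0x6800) -- our end 0x6000 lies inside the
       other range, so the runtime reports StackOverlap and aborts. */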
294     for (f = 0; f < __kmp_threads_capacity; f++) {
295       kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
296 
297       if (f_th && f_th != th) {
298         char *other_stack_end =
299             (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
300         char *other_stack_beg =
301             other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
302         if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
303             (stack_end > other_stack_beg && stack_end < other_stack_end)) {
304 
305           /* Print the other stack values before the abort */
306           if (__kmp_storage_map)
307             __kmp_print_storage_map_gtid(
308                 -1, other_stack_beg, other_stack_end,
309                 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
310                 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
311 
312           __kmp_msg(kmp_ms_fatal, KMP_MSG(StackOverlap),
313                     KMP_HNT(ChangeStackLimit), __kmp_msg_null);
314         }
315       }
316     }
317   }
318   KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
319 }
320 
321 /* ------------------------------------------------------------------------ */
322 
323 void __kmp_infinite_loop(void) {
324   static int done = FALSE;
325 
326   while (!done) {
327     KMP_YIELD(1);
328   }
329 }
330 
331 #define MAX_MESSAGE 512
332 
333 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
334                                   char const *format, ...) {
335   char buffer[MAX_MESSAGE];
336   va_list ap;
337 
338   va_start(ap, format);
339   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
340                p2, (unsigned long)size, format);
341   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
342   __kmp_vprintf(kmp_err, buffer, ap);
343 #if KMP_PRINT_DATA_PLACEMENT
344   int node;
345   if (gtid >= 0) {
346     if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
347       if (__kmp_storage_map_verbose) {
348         node = __kmp_get_host_node(p1);
349         if (node < 0) /* doesn't work, so don't try this next time */
350           __kmp_storage_map_verbose = FALSE;
351         else {
352           char *last;
353           int lastNode;
354           int localProc = __kmp_get_cpu_from_gtid(gtid);
355 
356           const int page_size = KMP_GET_PAGE_SIZE();
357 
358           p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
359           p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
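          /* The two statements above round p1 down to the start of its page and
             p2 down to the start of the page containing the last byte of the
             range.  For example (illustrative only), with a 4096-byte page,
             p1 = 0x1234 becomes 0x1000 and p2 = 0x3000 becomes 0x2000, because
             the last byte of the range is at 0x2FFF. */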
360           if (localProc >= 0)
361             __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
362                                  localProc >> 1);
363           else
364             __kmp_printf_no_lock("  GTID %d\n", gtid);
365 #if KMP_USE_PRCTL
366           /* The more elaborate format is disabled for now because of the prctl
367            * hanging bug. */
368           do {
369             last = p1;
370             lastNode = node;
371             /* This loop collates adjacent pages with the same host node. */
372             do {
              p1 = (char *)p1 + page_size;
374             } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
375             __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
376                                  lastNode);
377           } while (p1 <= p2);
378 #else
379           __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
380                                (char *)p1 + (page_size - 1),
381                                __kmp_get_host_node(p1));
382           if (p1 < p2) {
383             __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
384                                  (char *)p2 + (page_size - 1),
385                                  __kmp_get_host_node(p2));
386           }
387 #endif
388         }
389       }
390     } else
391       __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
392   }
393 #endif /* KMP_PRINT_DATA_PLACEMENT */
394   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
395 }
396 
397 void __kmp_warn(char const *format, ...) {
398   char buffer[MAX_MESSAGE];
399   va_list ap;
400 
401   if (__kmp_generate_warnings == kmp_warnings_off) {
402     return;
403   }
404 
405   va_start(ap, format);
406 
407   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
408   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
409   __kmp_vprintf(kmp_err, buffer, ap);
410   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
411 
412   va_end(ap);
413 }
414 
415 void __kmp_abort_process() {
416   // Later threads may stall here, but that's ok because abort() will kill them.
417   __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
418 
419   if (__kmp_debug_buf) {
420     __kmp_dump_debug_buffer();
421   }; // if
422 
423   if (KMP_OS_WINDOWS) {
424     // Let other threads know of abnormal termination and prevent deadlock
425     // if abort happened during library initialization or shutdown
426     __kmp_global.g.g_abort = SIGABRT;
427 
428     /* On Windows* OS by default abort() causes pop-up error box, which stalls
429        nightly testing. Unfortunately, we cannot reliably suppress pop-up error
       boxes. _set_abort_behavior() works well, but this function is not
       available in VS7 (this is not a problem for the DLL, but it is a problem
       for the static OpenMP RTL). SetErrorMode (and so, the timelimit utility)
       does not help, at least in some versions of the MS C RTL.

       It seems the following sequence is the only way to simulate abort() and
       avoid the pop-up error box. */
437     raise(SIGABRT);
438     _exit(3); // Just in case, if signal ignored, exit anyway.
439   } else {
440     abort();
441   }; // if
442 
443   __kmp_infinite_loop();
444   __kmp_release_bootstrap_lock(&__kmp_exit_lock);
445 
446 } // __kmp_abort_process
447 
448 void __kmp_abort_thread(void) {
449   // TODO: Eliminate g_abort global variable and this function.
450   // In case of abort just call abort(), it will kill all the threads.
451   __kmp_infinite_loop();
452 } // __kmp_abort_thread
453 
454 /* Print out the storage map for the major kmp_info_t thread data structures
455    that are allocated together. */
456 
457 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
458   __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
459                                gtid);
460 
461   __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
462                                sizeof(kmp_desc_t), "th_%d.th_info", gtid);
463 
464   __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
465                                sizeof(kmp_local_t), "th_%d.th_local", gtid);
466 
467   __kmp_print_storage_map_gtid(
468       gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
469       sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
470 
471   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
472                                &thr->th.th_bar[bs_plain_barrier + 1],
473                                sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
474                                gtid);
475 
476   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
477                                &thr->th.th_bar[bs_forkjoin_barrier + 1],
478                                sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
479                                gtid);
480 
481 #if KMP_FAST_REDUCTION_BARRIER
482   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
483                                &thr->th.th_bar[bs_reduction_barrier + 1],
484                                sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
485                                gtid);
486 #endif // KMP_FAST_REDUCTION_BARRIER
487 }
488 
489 /* Print out the storage map for the major kmp_team_t team data structures
490    that are allocated together. */
491 
492 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
493                                          int team_id, int num_thr) {
494   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
495   __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
496                                header, team_id);
497 
498   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
499                                &team->t.t_bar[bs_last_barrier],
500                                sizeof(kmp_balign_team_t) * bs_last_barrier,
501                                "%s_%d.t_bar", header, team_id);
502 
503   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
504                                &team->t.t_bar[bs_plain_barrier + 1],
505                                sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
506                                header, team_id);
507 
508   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
509                                &team->t.t_bar[bs_forkjoin_barrier + 1],
510                                sizeof(kmp_balign_team_t),
511                                "%s_%d.t_bar[forkjoin]", header, team_id);
512 
513 #if KMP_FAST_REDUCTION_BARRIER
514   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
515                                &team->t.t_bar[bs_reduction_barrier + 1],
516                                sizeof(kmp_balign_team_t),
517                                "%s_%d.t_bar[reduction]", header, team_id);
518 #endif // KMP_FAST_REDUCTION_BARRIER
519 
520   __kmp_print_storage_map_gtid(
521       -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
522       sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
523 
524   __kmp_print_storage_map_gtid(
525       -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
526       sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
527 
528   __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
529                                &team->t.t_disp_buffer[num_disp_buff],
530                                sizeof(dispatch_shared_info_t) * num_disp_buff,
531                                "%s_%d.t_disp_buffer", header, team_id);
532 
533   __kmp_print_storage_map_gtid(-1, &team->t.t_taskq, &team->t.t_copypriv_data,
534                                sizeof(kmp_taskq_t), "%s_%d.t_taskq", header,
535                                team_id);
536 }
537 
538 static void __kmp_init_allocator() {}
539 static void __kmp_fini_allocator() {}
540 
541 /* ------------------------------------------------------------------------ */
542 
543 #ifdef KMP_DYNAMIC_LIB
544 #if KMP_OS_WINDOWS
545 
546 static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
547   // TODO: Change to __kmp_break_bootstrap_lock().
548   __kmp_init_bootstrap_lock(lck); // make the lock released
549 }
550 
551 static void __kmp_reset_locks_on_process_detach(int gtid_req) {
552   int i;
553   int thread_count;
554 
  // PROCESS_DETACH is expected to be called by a thread that executes
  // ProcessExit() or FreeLibrary(). The OS terminates the other threads (except
  // the one calling ProcessExit or FreeLibrary), so it might seem safe to
  // access __kmp_threads[] without taking the forkjoin_lock. However, in
  // practice some threads may still be alive here, although they are about to
  // be terminated. The entries with ds_thread==0 are the most suspicious, so it
  // may not actually be safe to access __kmp_threads[].
562 
563   // TODO: does it make sense to check __kmp_roots[] ?
564 
  // Check that no other threads registered with the OpenMP library are still
  // alive.
567   while (1) {
568     thread_count = 0;
569     for (i = 0; i < __kmp_threads_capacity; ++i) {
570       if (!__kmp_threads)
571         continue;
572       kmp_info_t *th = __kmp_threads[i];
573       if (th == NULL)
574         continue;
575       int gtid = th->th.th_info.ds.ds_gtid;
576       if (gtid == gtid_req)
577         continue;
578       if (gtid < 0)
579         continue;
580       DWORD exit_val;
581       int alive = __kmp_is_thread_alive(th, &exit_val);
582       if (alive) {
583         ++thread_count;
584       }
585     }
586     if (thread_count == 0)
587       break; // success
588   }
589 
590   // Assume that I'm alone. Now it might be safe to check and reset locks.
591   // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
592   __kmp_reset_lock(&__kmp_forkjoin_lock);
593 #ifdef KMP_DEBUG
594   __kmp_reset_lock(&__kmp_stdio_lock);
595 #endif // KMP_DEBUG
596 }
597 
598 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
599   //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
600 
601   switch (fdwReason) {
602 
603   case DLL_PROCESS_ATTACH:
604     KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
605 
606     return TRUE;
607 
608   case DLL_PROCESS_DETACH:
609     KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
610 
611     if (lpReserved != NULL) {
612       // lpReserved is used for telling the difference:
613       //   lpReserved == NULL when FreeLibrary() was called,
614       //   lpReserved != NULL when the process terminates.
615       // When FreeLibrary() is called, worker threads remain alive. So they will
616       // release the forkjoin lock by themselves. When the process terminates,
617       // worker threads disappear triggering the problem of unreleased forkjoin
618       // lock as described below.
619 
      // A worker thread can take the forkjoin lock. The problem comes up if
      // that worker thread dies before it releases the forkjoin lock. The
      // forkjoin lock remains taken, while the thread executing
      // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try
      // to take the forkjoin lock and will always fail, so the application
      // will never finish [normally]. This scenario is possible if
      // __kmpc_end() has not been executed. This is not just a corner case;
      // it happens in common situations:
628       // - the main function was compiled by an alternative compiler;
629       // - the main function was compiled by icl but without /Qopenmp
630       //   (application with plugins);
631       // - application terminates by calling C exit(), Fortran CALL EXIT() or
632       //   Fortran STOP.
633       // - alive foreign thread prevented __kmpc_end from doing cleanup.
634       //
635       // This is a hack to work around the problem.
636       // TODO: !!! figure out something better.
637       __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
638     }
639 
640     __kmp_internal_end_library(__kmp_gtid_get_specific());
641 
642     return TRUE;
643 
644   case DLL_THREAD_ATTACH:
645     KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
646 
647     /* if we want to register new siblings all the time here call
648      * __kmp_get_gtid(); */
649     return TRUE;
650 
651   case DLL_THREAD_DETACH:
652     KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
653 
654     __kmp_internal_end_thread(__kmp_gtid_get_specific());
655     return TRUE;
656   }
657 
658   return TRUE;
659 }
660 
661 #endif /* KMP_OS_WINDOWS */
662 #endif /* KMP_DYNAMIC_LIB */
663 
664 /* Change the library type to "status" and return the old type */
665 /* called from within initialization routines where __kmp_initz_lock is held */
666 int __kmp_change_library(int status) {
667   int old_status;
668 
669   old_status = __kmp_yield_init &
670                1; // check whether KMP_LIBRARY=throughput (even init count)
671 
672   if (status) {
673     __kmp_yield_init |= 1; // throughput => turnaround (odd init count)
674   } else {
675     __kmp_yield_init &= ~1; // turnaround => throughput (even init count)
676   }
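  /* A worked illustration of the parity trick above (values are hypothetical):
     if __kmp_yield_init is 8 (even), the library is in throughput mode and
     old_status is 0; calling __kmp_change_library(1) sets the low bit, making
     the count 9 (odd), which the yield code interprets as turnaround mode. */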
677 
678   return old_status; // return previous setting of whether
679   // KMP_LIBRARY=throughput
680 }
681 
682 /* __kmp_parallel_deo -- Wait until it's our turn. */
683 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
684   int gtid = *gtid_ref;
685 #ifdef BUILD_PARALLEL_ORDERED
686   kmp_team_t *team = __kmp_team_from_gtid(gtid);
687 #endif /* BUILD_PARALLEL_ORDERED */
688 
689   if (__kmp_env_consistency_check) {
690     if (__kmp_threads[gtid]->th.th_root->r.r_active)
691 #if KMP_USE_DYNAMIC_LOCK
692       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
693 #else
694       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
695 #endif
696   }
697 #ifdef BUILD_PARALLEL_ORDERED
698   if (!team->t.t_serialized) {
699     KMP_MB();
700     KMP_WAIT_YIELD(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid),
701                    KMP_EQ, NULL);
702     KMP_MB();
703   }
704 #endif /* BUILD_PARALLEL_ORDERED */
705 }
706 
707 /* __kmp_parallel_dxo -- Signal the next task. */
708 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
709   int gtid = *gtid_ref;
710 #ifdef BUILD_PARALLEL_ORDERED
711   int tid = __kmp_tid_from_gtid(gtid);
712   kmp_team_t *team = __kmp_team_from_gtid(gtid);
713 #endif /* BUILD_PARALLEL_ORDERED */
714 
715   if (__kmp_env_consistency_check) {
716     if (__kmp_threads[gtid]->th.th_root->r.r_active)
717       __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
718   }
719 #ifdef BUILD_PARALLEL_ORDERED
720   if (!team->t.t_serialized) {
721     KMP_MB(); /* Flush all pending memory write invalidates.  */
722 
723     /* use the tid of the next thread in this team */
724     /* TODO replace with general release procedure */
725     team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
726 
727 #if OMPT_SUPPORT && OMPT_BLAME
728     if (ompt_enabled &&
729         ompt_callbacks.ompt_callback(ompt_event_release_ordered)) {
730       /* accept blame for "ordered" waiting */
731       kmp_info_t *this_thread = __kmp_threads[gtid];
732       ompt_callbacks.ompt_callback(ompt_event_release_ordered)(
733           this_thread->th.ompt_thread_info.wait_id);
734     }
735 #endif
736 
737     KMP_MB(); /* Flush all pending memory write invalidates.  */
738   }
739 #endif /* BUILD_PARALLEL_ORDERED */
740 }
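/* Taken together, deo/dxo implement a simple ticket-style protocol for
   "ordered": t_ordered.dt.t_value holds the tid whose turn it is; each thread
   spins in __kmp_parallel_deo until the value equals its own tid, and
   __kmp_parallel_dxo passes the turn on by storing (tid + 1) % nproc.  A
   minimal standalone sketch of the same idea (illustrative only; it uses
   std::atomic rather than the runtime's own wait/yield primitives):

     std::atomic<int> turn{0};
     void deo(int tid) {
       while (turn.load(std::memory_order_acquire) != tid) {
       }
     }
     void dxo(int tid, int nproc) {
       turn.store((tid + 1) % nproc, std::memory_order_release);
     }
*/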
741 
742 /* ------------------------------------------------------------------------ */
743 /* The BARRIER for a SINGLE process section is always explicit   */
744 
745 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
746   int status;
747   kmp_info_t *th;
748   kmp_team_t *team;
749 
750   if (!TCR_4(__kmp_init_parallel))
751     __kmp_parallel_initialize();
752 
753   th = __kmp_threads[gtid];
754   team = th->th.th_team;
755   status = 0;
756 
757   th->th.th_ident = id_ref;
758 
759   if (team->t.t_serialized) {
760     status = 1;
761   } else {
762     kmp_int32 old_this = th->th.th_local.this_construct;
763 
764     ++th->th.th_local.this_construct;
765     /* try to set team count to thread count--success means thread got the
766        single block */
767     /* TODO: Should this be acquire or release? */
768     if (team->t.t_construct == old_this) {
769       status = KMP_COMPARE_AND_STORE_ACQ32(&team->t.t_construct, old_this,
770                                            th->th.th_local.this_construct);
771     }
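    /* In other words, each thread bumps its private this_construct counter and
       races to CAS the shared t_construct from the old value to the new one;
       exactly one thread in the team wins the CAS for a given single construct
       and gets a nonzero status, so only that thread executes the single block.
       For example, if four threads arrive with this_construct == 5, all try to
       CAS t_construct from 5 to 6, and only the first to succeed runs the
       block. */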
772 #if USE_ITT_BUILD
773     if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
774         KMP_MASTER_GTID(gtid) &&
775 #if OMP_40_ENABLED
776         th->th.th_teams_microtask == NULL &&
777 #endif
778         team->t.t_active_level ==
779             1) { // Only report metadata by master of active team at level 1
780       __kmp_itt_metadata_single(id_ref);
781     }
782 #endif /* USE_ITT_BUILD */
783   }
784 
785   if (__kmp_env_consistency_check) {
786     if (status && push_ws) {
787       __kmp_push_workshare(gtid, ct_psingle, id_ref);
788     } else {
789       __kmp_check_workshare(gtid, ct_psingle, id_ref);
790     }
791   }
792 #if USE_ITT_BUILD
793   if (status) {
794     __kmp_itt_single_start(gtid);
795   }
796 #endif /* USE_ITT_BUILD */
797   return status;
798 }
799 
800 void __kmp_exit_single(int gtid) {
801 #if USE_ITT_BUILD
802   __kmp_itt_single_end(gtid);
803 #endif /* USE_ITT_BUILD */
804   if (__kmp_env_consistency_check)
805     __kmp_pop_workshare(gtid, ct_psingle, NULL);
806 }
807 
/* Determine whether we can go parallel or must use a serialized parallel
 * region, and how many threads we can use.
 * set_nthreads is the number of threads requested for the team.
 * Returns 1 if we should serialize or only use one thread,
 * otherwise the number of threads to use.
 * The forkjoin lock is held by the caller. */
814 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
815                                  int master_tid, int set_nthreads
816 #if OMP_40_ENABLED
817                                  ,
818                                  int enter_teams
819 #endif /* OMP_40_ENABLED */
820                                  ) {
821   int capacity;
822   int new_nthreads;
823   KMP_DEBUG_ASSERT(__kmp_init_serial);
824   KMP_DEBUG_ASSERT(root && parent_team);
825 
826   // If dyn-var is set, dynamically adjust the number of desired threads,
827   // according to the method specified by dynamic_mode.
828   new_nthreads = set_nthreads;
829   if (!get__dynamic_2(parent_team, master_tid)) {
830     ;
831   }
832 #ifdef USE_LOAD_BALANCE
833   else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
834     new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
835     if (new_nthreads == 1) {
836       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
837                     "reservation to 1 thread\n",
838                     master_tid));
839       return 1;
840     }
841     if (new_nthreads < set_nthreads) {
842       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
843                     "reservation to %d threads\n",
844                     master_tid, new_nthreads));
845     }
846   }
847 #endif /* USE_LOAD_BALANCE */
848   else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
849     new_nthreads = __kmp_avail_proc - __kmp_nth +
850                    (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
851     if (new_nthreads <= 1) {
852       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
853                     "reservation to 1 thread\n",
854                     master_tid));
855       return 1;
856     }
857     if (new_nthreads < set_nthreads) {
858       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
859                     "reservation to %d threads\n",
860                     master_tid, new_nthreads));
861     } else {
862       new_nthreads = set_nthreads;
863     }
864   } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
865     if (set_nthreads > 2) {
866       new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
867       new_nthreads = (new_nthreads % set_nthreads) + 1;
868       if (new_nthreads == 1) {
869         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
870                       "reservation to 1 thread\n",
871                       master_tid));
872         return 1;
873       }
874       if (new_nthreads < set_nthreads) {
875         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
876                       "reservation to %d threads\n",
877                       master_tid, new_nthreads));
878       }
879     }
880   } else {
881     KMP_ASSERT(0);
882   }
883 
884   // Respect KMP_ALL_THREADS, KMP_MAX_THREADS, OMP_THREAD_LIMIT.
885   if (__kmp_nth + new_nthreads -
886           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
887       __kmp_max_nth) {
888     int tl_nthreads = __kmp_max_nth - __kmp_nth +
889                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
890     if (tl_nthreads <= 0) {
891       tl_nthreads = 1;
892     }
893 
894     // If dyn-var is false, emit a 1-time warning.
895     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
896       __kmp_reserve_warn = 1;
897       __kmp_msg(kmp_ms_warning,
898                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
899                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
900     }
901     if (tl_nthreads == 1) {
902       KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_ALL_THREADS reduced "
903                     "reservation to 1 thread\n",
904                     master_tid));
905       return 1;
906     }
907     KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_ALL_THREADS reduced "
908                   "reservation to %d threads\n",
909                   master_tid, tl_nthreads));
910     new_nthreads = tl_nthreads;
911   }
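  // A worked example of the cap above (numbers are illustrative): with
  // __kmp_max_nth = 8, __kmp_nth = 5, an inactive root whose hot team has 4
  // threads, and a desired new_nthreads of 6, the projected total is
  // 5 + 6 - 4 = 7, which fits under the limit, so no reduction is needed; had
  // new_nthreads been 8, the projected total of 9 would exceed 8 and
  // tl_nthreads would become 8 - 5 + 4 = 7.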
912 
913   // Check if the threads array is large enough, or needs expanding.
914   // See comment in __kmp_register_root() about the adjustment if
915   // __kmp_threads[0] == NULL.
916   capacity = __kmp_threads_capacity;
917   if (TCR_PTR(__kmp_threads[0]) == NULL) {
918     --capacity;
919   }
920   if (__kmp_nth + new_nthreads -
921           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
922       capacity) {
923     // Expand the threads array.
924     int slotsRequired = __kmp_nth + new_nthreads -
925                         (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
926                         capacity;
927     int slotsAdded = __kmp_expand_threads(slotsRequired, slotsRequired);
928     if (slotsAdded < slotsRequired) {
929       // The threads array was not expanded enough.
930       new_nthreads -= (slotsRequired - slotsAdded);
931       KMP_ASSERT(new_nthreads >= 1);
932 
933       // If dyn-var is false, emit a 1-time warning.
934       if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
935         __kmp_reserve_warn = 1;
936         if (__kmp_tp_cached) {
937           __kmp_msg(kmp_ms_warning,
938                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
939                     KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
940                     KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
941         } else {
942           __kmp_msg(kmp_ms_warning,
943                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
944                     KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
945         }
946       }
947     }
948   }
949 
950 #ifdef KMP_DEBUG
951   if (new_nthreads == 1) {
952     KC_TRACE(10,
953              ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
954               "dead roots and rechecking; requested %d threads\n",
955               __kmp_get_gtid(), set_nthreads));
956   } else {
957     KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
958                   " %d threads\n",
959                   __kmp_get_gtid(), new_nthreads, set_nthreads));
960   }
961 #endif // KMP_DEBUG
962   return new_nthreads;
963 }
964 
/* Allocate threads from the thread pool and assign them to the new team. We are
   assured that there are enough threads available, because we checked on that
   earlier while holding the forkjoin lock. */
968 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
969                                     kmp_info_t *master_th, int master_gtid) {
970   int i;
971   int use_hot_team;
972 
973   KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
974   KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
975   KMP_MB();
976 
977   /* first, let's setup the master thread */
978   master_th->th.th_info.ds.ds_tid = 0;
979   master_th->th.th_team = team;
980   master_th->th.th_team_nproc = team->t.t_nproc;
981   master_th->th.th_team_master = master_th;
982   master_th->th.th_team_serialized = FALSE;
983   master_th->th.th_dispatch = &team->t.t_dispatch[0];
984 
985 /* make sure we are not the optimized hot team */
986 #if KMP_NESTED_HOT_TEAMS
987   use_hot_team = 0;
988   kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
989   if (hot_teams) { // hot teams array is not allocated if
990     // KMP_HOT_TEAMS_MAX_LEVEL=0
991     int level = team->t.t_active_level - 1; // index in array of hot teams
992     if (master_th->th.th_teams_microtask) { // are we inside the teams?
993       if (master_th->th.th_teams_size.nteams > 1) {
994         ++level; // level was not increased in teams construct for
995         // team_of_masters
996       }
997       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
998           master_th->th.th_teams_level == team->t.t_level) {
999         ++level; // level was not increased in teams construct for
1000         // team_of_workers before the parallel
1001       } // team->t.t_level will be increased inside parallel
1002     }
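    // To illustrate the indexing: hot_teams[] is indexed by nesting depth, so a
    // top-level parallel region (t_active_level == 1) uses hot_teams[0]. The
    // two increments above compensate for the fact that a teams construct does
    // not bump the level the way a parallel construct does, so a parallel
    // region nested inside teams still lands on its own hot_teams[] slot
    // (illustrative summary; see the conditions above for the exact rules).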
1003     if (level < __kmp_hot_teams_max_level) {
1004       if (hot_teams[level].hot_team) {
1005         // hot team has already been allocated for given level
1006         KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1007         use_hot_team = 1; // the team is ready to use
1008       } else {
1009         use_hot_team = 0; // AC: threads are not allocated yet
1010         hot_teams[level].hot_team = team; // remember new hot team
1011         hot_teams[level].hot_team_nth = team->t.t_nproc;
1012       }
1013     } else {
1014       use_hot_team = 0;
1015     }
1016   }
1017 #else
1018   use_hot_team = team == root->r.r_hot_team;
1019 #endif
1020   if (!use_hot_team) {
1021 
1022     /* install the master thread */
1023     team->t.t_threads[0] = master_th;
1024     __kmp_initialize_info(master_th, team, 0, master_gtid);
1025 
1026     /* now, install the worker threads */
1027     for (i = 1; i < team->t.t_nproc; i++) {
1028 
1029       /* fork or reallocate a new thread and install it in team */
1030       kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1031       team->t.t_threads[i] = thr;
1032       KMP_DEBUG_ASSERT(thr);
1033       KMP_DEBUG_ASSERT(thr->th.th_team == team);
1034       /* align team and thread arrived states */
1035       KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1036                     "T#%d(%d:%d) join =%llu, plain=%llu\n",
1037                     __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1038                     __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1039                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1040                     team->t.t_bar[bs_plain_barrier].b_arrived));
1041 #if OMP_40_ENABLED
1042       thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1043       thr->th.th_teams_level = master_th->th.th_teams_level;
1044       thr->th.th_teams_size = master_th->th.th_teams_size;
1045 #endif
1046       { // Initialize threads' barrier data.
1047         int b;
1048         kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1049         for (b = 0; b < bs_last_barrier; ++b) {
1050           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1051           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1052 #if USE_DEBUGGER
1053           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1054 #endif
1055         }; // for b
1056       }
1057     }
1058 
1059 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1060     __kmp_partition_places(team);
1061 #endif
1062   }
1063 
1064   KMP_MB();
1065 }
1066 
1067 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
// Propagate any changes to the floating point control registers out to the
// team. We try to avoid unnecessary writes to the relevant cache line in the
// team structure, so we don't make changes unless they are needed.
1071 inline static void propagateFPControl(kmp_team_t *team) {
1072   if (__kmp_inherit_fp_control) {
1073     kmp_int16 x87_fpu_control_word;
1074     kmp_uint32 mxcsr;
1075 
1076     // Get master values of FPU control flags (both X87 and vector)
1077     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1078     __kmp_store_mxcsr(&mxcsr);
1079     mxcsr &= KMP_X86_MXCSR_MASK;
1080 
1081 // There is no point looking at t_fp_control_saved here.
1082 // If it is TRUE, we still have to update the values if they are different from
1083 // those we now have.
1084 // If it is FALSE we didn't save anything yet, but our objective is the same. We
1085 // have to ensure that the values in the team are the same as those we have.
1086 // So, this code achieves what we need whether or not t_fp_control_saved is
1087 // true. By checking whether the value needs updating we avoid unnecessary
1088 // writes that would put the cache-line into a written state, causing all
1089 // threads in the team to have to read it again.
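// As a reading aid (this is a paraphrase, not the authoritative definition --
// see kmp.h): KMP_CHECK_UPDATE(dst, src) behaves roughly like
//   if ((dst) != (src)) (dst) = (src);
// i.e. it skips the store, and therefore the cache-line invalidation, when the
// team already holds the current value.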
1090     KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1091     KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1092     // Although we don't use this value, other code in the runtime wants to know
1093     // whether it should restore them. So we must ensure it is correct.
1094     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1095   } else {
1096     // Similarly here. Don't write to this cache-line in the team structure
1097     // unless we have to.
1098     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1099   }
1100 }
1101 
1102 // Do the opposite, setting the hardware registers to the updated values from
1103 // the team.
1104 inline static void updateHWFPControl(kmp_team_t *team) {
1105   if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
    // Only reset the fp control regs if they have been changed in the team by
    // the parallel region that we are exiting.
1108     kmp_int16 x87_fpu_control_word;
1109     kmp_uint32 mxcsr;
1110     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1111     __kmp_store_mxcsr(&mxcsr);
1112     mxcsr &= KMP_X86_MXCSR_MASK;
1113 
1114     if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1115       __kmp_clear_x87_fpu_status_word();
1116       __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1117     }
1118 
1119     if (team->t.t_mxcsr != mxcsr) {
1120       __kmp_load_mxcsr(&team->t.t_mxcsr);
1121     }
1122   }
1123 }
1124 #else
1125 #define propagateFPControl(x) ((void)0)
1126 #define updateHWFPControl(x) ((void)0)
1127 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1128 
1129 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1130                                      int realloc); // forward declaration
1131 
/* Run a parallel region that has been serialized, so it runs only in a team of
   the single master thread. */
1134 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1135   kmp_info_t *this_thr;
1136   kmp_team_t *serial_team;
1137 
1138   KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1139 
1140   /* Skip all this code for autopar serialized loops since it results in
1141      unacceptable overhead */
1142   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1143     return;
1144 
1145   if (!TCR_4(__kmp_init_parallel))
1146     __kmp_parallel_initialize();
1147 
1148   this_thr = __kmp_threads[global_tid];
1149   serial_team = this_thr->th.th_serial_team;
1150 
1151   /* utilize the serialized team held by this thread */
1152   KMP_DEBUG_ASSERT(serial_team);
1153   KMP_MB();
1154 
1155   if (__kmp_tasking_mode != tskm_immediate_exec) {
1156     KMP_DEBUG_ASSERT(
1157         this_thr->th.th_task_team ==
1158         this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1159     KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1160                      NULL);
1161     KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1162                   "team %p, new task_team = NULL\n",
1163                   global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1164     this_thr->th.th_task_team = NULL;
1165   }
1166 
1167 #if OMP_40_ENABLED
1168   kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1169   if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1170     proc_bind = proc_bind_false;
1171   } else if (proc_bind == proc_bind_default) {
1172     // No proc_bind clause was specified, so use the current value
1173     // of proc-bind-var for this parallel region.
1174     proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1175   }
1176   // Reset for next parallel region
1177   this_thr->th.th_set_proc_bind = proc_bind_default;
1178 #endif /* OMP_40_ENABLED */
1179 
1180   if (this_thr->th.th_team != serial_team) {
1181     // Nested level will be an index in the nested nthreads array
1182     int level = this_thr->th.th_team->t.t_level;
1183 
1184     if (serial_team->t.t_serialized) {
      /* this serial team was already used
         TODO increase performance by making these locks more specific */
1187       kmp_team_t *new_team;
1188 
1189       __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1190 
1191 #if OMPT_SUPPORT
1192       ompt_parallel_id_t ompt_parallel_id = __ompt_parallel_id_new(global_tid);
1193 #endif
1194 
1195       new_team = __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1196 #if OMPT_SUPPORT
1197                                      ompt_parallel_id,
1198 #endif
1199 #if OMP_40_ENABLED
1200                                      proc_bind,
1201 #endif
1202                                      &this_thr->th.th_current_task->td_icvs,
1203                                      0 USE_NESTED_HOT_ARG(NULL));
1204       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1205       KMP_ASSERT(new_team);
1206 
1207       /* setup new serialized team and install it */
1208       new_team->t.t_threads[0] = this_thr;
1209       new_team->t.t_parent = this_thr->th.th_team;
1210       serial_team = new_team;
1211       this_thr->th.th_serial_team = serial_team;
1212 
1213       KF_TRACE(
1214           10,
1215           ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1216            global_tid, serial_team));
1217 
1218       /* TODO the above breaks the requirement that if we run out of resources,
1219          then we can still guarantee that serialized teams are ok, since we may
1220          need to allocate a new one */
1221     } else {
1222       KF_TRACE(
1223           10,
1224           ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1225            global_tid, serial_team));
1226     }
1227 
1228     /* we have to initialize this serial team */
1229     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1230     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1231     KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1232     serial_team->t.t_ident = loc;
1233     serial_team->t.t_serialized = 1;
1234     serial_team->t.t_nproc = 1;
1235     serial_team->t.t_parent = this_thr->th.th_team;
1236     serial_team->t.t_sched = this_thr->th.th_team->t.t_sched;
1237     this_thr->th.th_team = serial_team;
1238     serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1239 
    KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1241                   this_thr->th.th_current_task));
1242     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1243     this_thr->th.th_current_task->td_flags.executing = 0;
1244 
1245     __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1246 
1247     /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1248        implicit task for each serialized task represented by
1249        team->t.t_serialized? */
1250     copy_icvs(&this_thr->th.th_current_task->td_icvs,
1251               &this_thr->th.th_current_task->td_parent->td_icvs);
1252 
1253     // Thread value exists in the nested nthreads array for the next nested
1254     // level
1255     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1256       this_thr->th.th_current_task->td_icvs.nproc =
1257           __kmp_nested_nth.nth[level + 1];
1258     }
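    // For example (hypothetical setting): with OMP_NUM_THREADS=4,2 the nested
    // nthreads array holds nth[0]=4 and nth[1]=2, so a serialized region
    // entered from level 0 sets the implicit task's nproc ICV to 2 for any
    // parallel region nested inside it.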
1259 
1260 #if OMP_40_ENABLED
1261     if (__kmp_nested_proc_bind.used &&
1262         (level + 1 < __kmp_nested_proc_bind.used)) {
1263       this_thr->th.th_current_task->td_icvs.proc_bind =
1264           __kmp_nested_proc_bind.bind_types[level + 1];
1265     }
1266 #endif /* OMP_40_ENABLED */
1267 
1268 #if USE_DEBUGGER
1269     serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1270 #endif
1271     this_thr->th.th_info.ds.ds_tid = 0;
1272 
1273     /* set thread cache values */
1274     this_thr->th.th_team_nproc = 1;
1275     this_thr->th.th_team_master = this_thr;
1276     this_thr->th.th_team_serialized = 1;
1277 
1278     serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1279     serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1280 
1281     propagateFPControl(serial_team);
1282 
1283     /* check if we need to allocate dispatch buffers stack */
1284     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1285     if (!serial_team->t.t_dispatch->th_disp_buffer) {
1286       serial_team->t.t_dispatch->th_disp_buffer =
1287           (dispatch_private_info_t *)__kmp_allocate(
1288               sizeof(dispatch_private_info_t));
1289     }
1290     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1291 
1292 #if OMPT_SUPPORT
1293     ompt_parallel_id_t ompt_parallel_id = __ompt_parallel_id_new(global_tid);
1294     __ompt_team_assign_id(serial_team, ompt_parallel_id);
1295 #endif
1296 
1297     KMP_MB();
1298 
1299   } else {
1300     /* this serialized team is already being used,
1301      * that's fine, just add another nested level */
1302     KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1303     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1304     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1305     ++serial_team->t.t_serialized;
1306     this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1307 
1308     // Nested level will be an index in the nested nthreads array
1309     int level = this_thr->th.th_team->t.t_level;
1310     // Thread value exists in the nested nthreads array for the next nested
1311     // level
1312     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1313       this_thr->th.th_current_task->td_icvs.nproc =
1314           __kmp_nested_nth.nth[level + 1];
1315     }
1316     serial_team->t.t_level++;
1317     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1318                   "of serial team %p to %d\n",
1319                   global_tid, serial_team, serial_team->t.t_level));
1320 
1321     /* allocate/push dispatch buffers stack */
1322     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1323     {
1324       dispatch_private_info_t *disp_buffer =
1325           (dispatch_private_info_t *)__kmp_allocate(
1326               sizeof(dispatch_private_info_t));
1327       disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1328       serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1329     }
1330     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1331 
1332     KMP_MB();
1333   }
1334 #if OMP_40_ENABLED
1335   KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1336 #endif
1337 
1338   if (__kmp_env_consistency_check)
1339     __kmp_push_parallel(global_tid, NULL);
1340 }
1341 
1342 /* most of the work for a fork */
1343 /* return true if we really went parallel, false if serialized */
1344 int __kmp_fork_call(ident_t *loc, int gtid,
1345                     enum fork_context_e call_context, // Intel, GNU, ...
1346                     kmp_int32 argc,
1347 #if OMPT_SUPPORT
1348                     void *unwrapped_task,
1349 #endif
1350                     microtask_t microtask, launch_t invoker,
1351 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1352 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1353                     va_list *ap
1354 #else
1355                     va_list ap
1356 #endif
1357                     ) {
1358   void **argv;
1359   int i;
1360   int master_tid;
1361   int master_this_cons;
1362   kmp_team_t *team;
1363   kmp_team_t *parent_team;
1364   kmp_info_t *master_th;
1365   kmp_root_t *root;
1366   int nthreads;
1367   int master_active;
1368   int master_set_numthreads;
1369   int level;
1370 #if OMP_40_ENABLED
1371   int active_level;
1372   int teams_level;
1373 #endif
1374 #if KMP_NESTED_HOT_TEAMS
1375   kmp_hot_team_ptr_t **p_hot_teams;
1376 #endif
1377   { // KMP_TIME_BLOCK
1378     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1379     KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1380 
1381     KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1382     if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1383       /* Some systems prefer the stack for the root thread(s) to start with */
1384       /* some gap from the parent stack to prevent false sharing. */
1385       void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1386       /* These 2 lines below are so this does not get optimized out */
1387       if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1388         __kmp_stkpadding += (short)((kmp_int64)dummy);
1389     }
1390 
1391     /* initialize if needed */
1392     KMP_DEBUG_ASSERT(
1393         __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1394     if (!TCR_4(__kmp_init_parallel))
1395       __kmp_parallel_initialize();
1396 
1397     /* setup current data */
1398     master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1399     // shutdown
1400     parent_team = master_th->th.th_team;
1401     master_tid = master_th->th.th_info.ds.ds_tid;
1402     master_this_cons = master_th->th.th_local.this_construct;
1403     root = master_th->th.th_root;
1404     master_active = root->r.r_active;
1405     master_set_numthreads = master_th->th.th_set_nproc;
1406 
1407 #if OMPT_SUPPORT
1408     ompt_parallel_id_t ompt_parallel_id;
1409     ompt_task_id_t ompt_task_id;
1410     ompt_frame_t *ompt_frame;
1411     ompt_task_id_t my_task_id;
1412     ompt_parallel_id_t my_parallel_id;
1413 
1414     if (ompt_enabled) {
1415       ompt_parallel_id = __ompt_parallel_id_new(gtid);
1416       ompt_task_id = __ompt_get_task_id_internal(0);
1417       ompt_frame = __ompt_get_task_frame_internal(0);
1418     }
1419 #endif
1420 
1421     // Nested level will be an index in the nested nthreads array
1422     level = parent_team->t.t_level;
1423     // used to launch non-serialized teams even if nesting is not allowed
1424     active_level = parent_team->t.t_active_level;
1425 #if OMP_40_ENABLED
1426     // needed to check nesting inside the teams
1427     teams_level = master_th->th.th_teams_level;
1428 #endif
1429 #if KMP_NESTED_HOT_TEAMS
1430     p_hot_teams = &master_th->th.th_hot_teams;
1431     if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1432       *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1433           sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1434       (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1435       // it is either the actual hot team or it is not needed (when active_level > 0)
1436       (*p_hot_teams)[0].hot_team_nth = 1;
1437     }
1438 #endif
1439 
1440 #if OMPT_SUPPORT
1441     if (ompt_enabled &&
1442         ompt_callbacks.ompt_callback(ompt_event_parallel_begin)) {
1443       int team_size = master_set_numthreads;
1444 
1445       ompt_callbacks.ompt_callback(ompt_event_parallel_begin)(
1446           ompt_task_id, ompt_frame, ompt_parallel_id, team_size, unwrapped_task,
1447           OMPT_INVOKER(call_context));
1448     }
1449 #endif
1450 
1451     master_th->th.th_ident = loc;
1452 
1453 #if OMP_40_ENABLED
1454     if (master_th->th.th_teams_microtask && ap &&
1455         microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1456       // AC: This is the start of a parallel region nested inside a teams construct.
1457       // The team is the actual (hot) team; all workers are ready at the fork barrier.
1458       // No lock is needed to do a bit of team initialization and then release the workers.
1459       parent_team->t.t_ident = loc;
1460       __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1461       parent_team->t.t_argc = argc;
1462       argv = (void **)parent_team->t.t_argv;
1463       for (i = argc - 1; i >= 0; --i)
1464 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1465 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1466         *argv++ = va_arg(*ap, void *);
1467 #else
1468         *argv++ = va_arg(ap, void *);
1469 #endif
1470       // Increment our nested depth level, but do not increase the serialization
1471       if (parent_team == master_th->th.th_serial_team) {
1472         // AC: we are in serialized parallel
1473         __kmpc_serialized_parallel(loc, gtid);
1474         KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1475         // AC: needed so that the enquiry functions work
1476         // correctly; this will be restored at join time
1477         parent_team->t.t_serialized--;
1478 #if OMPT_SUPPORT
1479         void *dummy;
1480         void **exit_runtime_p;
1481 
1482         ompt_lw_taskteam_t lw_taskteam;
1483 
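        // The region is executed by the master alone; link a lightweight task team
        // so OMPT tools still see implicit-task and parallel begin/end events.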
1484         if (ompt_enabled) {
1485           __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, unwrapped_task,
1486                                   ompt_parallel_id);
1487           lw_taskteam.ompt_task_info.task_id = __ompt_task_id_new(gtid);
1488           exit_runtime_p =
1489               &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);
1490 
1491           __ompt_lw_taskteam_link(&lw_taskteam, master_th);
1492 
1493 #if OMPT_TRACE
1494           /* OMPT implicit task begin */
1495           my_task_id = lw_taskteam.ompt_task_info.task_id;
1496           my_parallel_id = parent_team->t.ompt_team_info.parallel_id;
1497           if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
1498             ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
1499                 my_parallel_id, my_task_id);
1500           }
1501 #endif
1502 
1503           /* OMPT state */
1504           master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1505         } else {
1506           exit_runtime_p = &dummy;
1507         }
1508 #endif
1509 
1510         {
1511           KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1512           KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1513           __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1514 #if OMPT_SUPPORT
1515                                  ,
1516                                  exit_runtime_p
1517 #endif
1518                                  );
1519         }
1520 
1521 #if OMPT_SUPPORT
1522         *exit_runtime_p = NULL;
1523         if (ompt_enabled) {
1524 #if OMPT_TRACE
1525           lw_taskteam.ompt_task_info.frame.exit_runtime_frame = NULL;
1526 
1527           if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
1528             ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
1529                 ompt_parallel_id, ompt_task_id);
1530           }
1531 
1532           __ompt_lw_taskteam_unlink(master_th);
1533           // Reset (clear) the task id only after unlinking the task
1534           lw_taskteam.ompt_task_info.task_id = ompt_task_id_none;
1535 #endif
1536 
1537           if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
1538             ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
1539                 ompt_parallel_id, ompt_task_id, OMPT_INVOKER(call_context));
1540           }
1541           master_th->th.ompt_thread_info.state = ompt_state_overhead;
1542         }
1543 #endif
1544         return TRUE;
1545       }
1546 
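      // Non-serialized parallel nested directly inside a teams construct: reuse the
      // hot parent team, refresh its launch information, then fork/join it below.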
1547       parent_team->t.t_pkfn = microtask;
1548 #if OMPT_SUPPORT
1549       parent_team->t.ompt_team_info.microtask = unwrapped_task;
1550 #endif
1551       parent_team->t.t_invoke = invoker;
1552       KMP_TEST_THEN_INC32((kmp_int32 *)&root->r.r_in_parallel);
1553       parent_team->t.t_active_level++;
1554       parent_team->t.t_level++;
1555 
1556       /* Change number of threads in the team if requested */
1557       if (master_set_numthreads) { // The parallel has num_threads clause
1558         if (master_set_numthreads < master_th->th.th_teams_size.nth) {
1559           // AC: we can only reduce the number of threads dynamically; we cannot increase it
1560           kmp_info_t **other_threads = parent_team->t.t_threads;
1561           parent_team->t.t_nproc = master_set_numthreads;
1562           for (i = 0; i < master_set_numthreads; ++i) {
1563             other_threads[i]->th.th_team_nproc = master_set_numthreads;
1564           }
1565           // Keep extra threads hot in the team for possible next parallels
1566         }
1567         master_th->th.th_set_nproc = 0;
1568       }
1569 
1570 #if USE_DEBUGGER
1571       if (__kmp_debugging) { // Let debugger override number of threads.
1572         int nth = __kmp_omp_num_threads(loc);
1573         if (nth > 0) { // 0 means debugger doesn't want to change num threads
1574           master_set_numthreads = nth;
1575         } // if
1576       } // if
1577 #endif
1578 
1579       KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1580                     "master_th=%p, gtid=%d\n",
1581                     root, parent_team, master_th, gtid));
1582       __kmp_internal_fork(loc, gtid, parent_team);
1583       KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1584                     "master_th=%p, gtid=%d\n",
1585                     root, parent_team, master_th, gtid));
1586 
1587       /* Invoke microtask for MASTER thread */
1588       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1589                     parent_team->t.t_id, parent_team->t.t_pkfn));
1590 
1591       {
1592         KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1593         KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1594         if (!parent_team->t.t_invoke(gtid)) {
1595           KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
1596         }
1597       }
1598       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1599                     parent_team->t.t_id, parent_team->t.t_pkfn));
1600       KMP_MB(); /* Flush all pending memory write invalidates.  */
1601 
1602       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1603 
1604       return TRUE;
1605     } // Parallel closely nested in teams construct
1606 #endif /* OMP_40_ENABLED */
1607 
1608 #if KMP_DEBUG
1609     if (__kmp_tasking_mode != tskm_immediate_exec) {
1610       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1611                        parent_team->t.t_task_team[master_th->th.th_task_state]);
1612     }
1613 #endif
1614 
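    // If the maximum number of active parallel levels has already been reached,
    // this region gets a single thread and is serialized below.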
1615     if (parent_team->t.t_active_level >=
1616         master_th->th.th_current_task->td_icvs.max_active_levels) {
1617       nthreads = 1;
1618     } else {
1619 #if OMP_40_ENABLED
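      // enter_teams: this fork must create a teams-related team even when nesting
      // is disabled (see the comment at the __kmp_reserve_threads() call below).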
1620       int enter_teams = ((ap == NULL && active_level == 0) ||
1621                          (ap && teams_level > 0 && teams_level == level));
1622 #endif
1623       nthreads =
1624           master_set_numthreads
1625               ? master_set_numthreads
1626               : get__nproc_2(
1627                     parent_team,
1628                     master_tid); // TODO: get nproc directly from current task
1629 
1630       // Check whether we need to take the fork/join lock (there is no need for a
1631       // serialized parallel outside of a teams construct). This code was moved here
1632       // from __kmp_reserve_threads() to speed up nested serialized parallels.
1633       if (nthreads > 1) {
1634         if ((!get__nested(master_th) && (root->r.r_in_parallel
1635 #if OMP_40_ENABLED
1636                                          && !enter_teams
1637 #endif /* OMP_40_ENABLED */
1638                                          )) ||
1639             (__kmp_library == library_serial)) {
1640           KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1641                         " threads\n",
1642                         gtid, nthreads));
1643           nthreads = 1;
1644         }
1645       }
1646       if (nthreads > 1) {
1647         /* determine how many new threads we can use */
1648         __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1649         nthreads = __kmp_reserve_threads(
1650             root, parent_team, master_tid, nthreads
1651 #if OMP_40_ENABLED
1652             /* AC: If we execute teams from parallel region (on host), then
1653                teams should be created but each can only have 1 thread if
1654                nesting is disabled. If teams called from serial region, then
1655                teams and their threads should be created regardless of the
1656                nesting setting. */
1657             ,
1658             enter_teams
1659 #endif /* OMP_40_ENABLED */
1660             );
1661         if (nthreads == 1) {
1662           // Free lock for single thread execution here; for multi-thread
1663           // execution it will be freed later after team of threads created
1664           // and initialized
1665           __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1666         }
1667       }
1668     }
1669     KMP_DEBUG_ASSERT(nthreads > 0);
1670 
1671     // If we temporarily changed the set number of threads then restore it now
1672     master_th->th.th_set_nproc = 0;
1673 
1674     /* create a serialized parallel region? */
1675     if (nthreads == 1) {
1676 /* josh todo: hypothetical question: what do we do for OS X*? */
1677 #if KMP_OS_LINUX &&                                                            \
1678     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1679       void *args[argc];
1680 #else
1681       void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1682 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1683           KMP_ARCH_AARCH64) */
1684 
1685       KA_TRACE(20,
1686                ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1687 
1688       __kmpc_serialized_parallel(loc, gtid);
1689 
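      // For fork_context_intel the runtime invokes the serialized microtask itself
      // below; for fork_context_gnu we only record state and return FALSE so the
      // GNU caller runs the region.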
1690       if (call_context == fork_context_intel) {
1691         /* TODO this sucks, use the compiler itself to pass args! :) */
1692         master_th->th.th_serial_team->t.t_ident = loc;
1693 #if OMP_40_ENABLED
1694         if (!ap) {
1695           // revert change made in __kmpc_serialized_parallel()
1696           master_th->th.th_serial_team->t.t_level--;
1697 // Get args from parent team for teams construct
1698 
1699 #if OMPT_SUPPORT
1700           void *dummy;
1701           void **exit_runtime_p;
1702 
1703           ompt_lw_taskteam_t lw_taskteam;
1704 
1705           if (ompt_enabled) {
1706             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1707                                     unwrapped_task, ompt_parallel_id);
1708             lw_taskteam.ompt_task_info.task_id = __ompt_task_id_new(gtid);
1709             exit_runtime_p =
1710                 &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);
1711 
1712             __ompt_lw_taskteam_link(&lw_taskteam, master_th);
1713 
1714 #if OMPT_TRACE
1715             my_task_id = lw_taskteam.ompt_task_info.task_id;
1716             if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
1717               ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
1718                   ompt_parallel_id, my_task_id);
1719             }
1720 #endif
1721 
1722             /* OMPT state */
1723             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1724           } else {
1725             exit_runtime_p = &dummy;
1726           }
1727 #endif
1728 
1729           {
1730             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1731             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1732             __kmp_invoke_microtask(microtask, gtid, 0, argc,
1733                                    parent_team->t.t_argv
1734 #if OMPT_SUPPORT
1735                                    ,
1736                                    exit_runtime_p
1737 #endif
1738                                    );
1739           }
1740 
1741 #if OMPT_SUPPORT
1742           *exit_runtime_p = NULL;
1743           if (ompt_enabled) {
1744             lw_taskteam.ompt_task_info.frame.exit_runtime_frame = NULL;
1745 
1746 #if OMPT_TRACE
1747             if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
1748               ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
1749                   ompt_parallel_id, ompt_task_id);
1750             }
1751 #endif
1752 
1753             __ompt_lw_taskteam_unlink(master_th);
1754             // Reset (clear) the task id only after unlinking the task
1755             lw_taskteam.ompt_task_info.task_id = ompt_task_id_none;
1756 
1757             if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
1758               ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
1759                   ompt_parallel_id, ompt_task_id, OMPT_INVOKER(call_context));
1760             }
1761             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1762           }
1763 #endif
1764         } else if (microtask == (microtask_t)__kmp_teams_master) {
1765           KMP_DEBUG_ASSERT(master_th->th.th_team ==
1766                            master_th->th.th_serial_team);
1767           team = master_th->th.th_team;
1768           // team->t.t_pkfn = microtask;
1769           team->t.t_invoke = invoker;
1770           __kmp_alloc_argv_entries(argc, team, TRUE);
1771           team->t.t_argc = argc;
1772           argv = (void **)team->t.t_argv;
1773           if (ap) {
1774             for (i = argc - 1; i >= 0; --i)
1775 // TODO: revert workaround for Intel(R) 64 tracker #96
1776 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1777               *argv++ = va_arg(*ap, void *);
1778 #else
1779               *argv++ = va_arg(ap, void *);
1780 #endif
1781           } else {
1782             for (i = 0; i < argc; ++i)
1783               // Get args from parent team for teams construct
1784               argv[i] = parent_team->t.t_argv[i];
1785           }
1786           // AC: revert change made in __kmpc_serialized_parallel()
1787           //     because initial code in teams should have level=0
1788           team->t.t_level--;
1789           // AC: call special invoker for outer "parallel" of teams construct
1790           {
1791             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1792             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1793             invoker(gtid);
1794           }
1795         } else {
1796 #endif /* OMP_40_ENABLED */
1797           argv = args;
1798           for (i = argc - 1; i >= 0; --i)
1799 // TODO: revert workaround for Intel(R) 64 tracker #96
1800 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1801             *argv++ = va_arg(*ap, void *);
1802 #else
1803           *argv++ = va_arg(ap, void *);
1804 #endif
1805           KMP_MB();
1806 
1807 #if OMPT_SUPPORT
1808           void *dummy;
1809           void **exit_runtime_p;
1810 
1811           ompt_lw_taskteam_t lw_taskteam;
1812 
1813           if (ompt_enabled) {
1814             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1815                                     unwrapped_task, ompt_parallel_id);
1816             lw_taskteam.ompt_task_info.task_id = __ompt_task_id_new(gtid);
1817             exit_runtime_p =
1818                 &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);
1819 
1820             __ompt_lw_taskteam_link(&lw_taskteam, master_th);
1821 
1822 #if OMPT_TRACE
1823             /* OMPT implicit task begin */
1824             my_task_id = lw_taskteam.ompt_task_info.task_id;
1825             my_parallel_id = ompt_parallel_id;
1826             if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
1827               ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
1828                   my_parallel_id, my_task_id);
1829             }
1830 #endif
1831 
1832             /* OMPT state */
1833             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1834           } else {
1835             exit_runtime_p = &dummy;
1836           }
1837 #endif
1838 
1839           {
1840             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1841             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1842             __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1843 #if OMPT_SUPPORT
1844                                    ,
1845                                    exit_runtime_p
1846 #endif
1847                                    );
1848           }
1849 
1850 #if OMPT_SUPPORT
1851           *exit_runtime_p = NULL;
1852           if (ompt_enabled) {
1853 #if OMPT_TRACE
1854             lw_taskteam.ompt_task_info.frame.exit_runtime_frame = NULL;
1855 
1856             if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
1857               ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
1858                   my_parallel_id, my_task_id);
1859             }
1860 #endif
1861 
1862             __ompt_lw_taskteam_unlink(master_th);
1863             // Reset (clear) the task id only after unlinking the task
1864             lw_taskteam.ompt_task_info.task_id = ompt_task_id_none;
1865 
1866             if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
1867               ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
1868                   ompt_parallel_id, ompt_task_id, OMPT_INVOKER(call_context));
1869             }
1870             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1871           }
1872 #endif
1873 #if OMP_40_ENABLED
1874         }
1875 #endif /* OMP_40_ENABLED */
1876       } else if (call_context == fork_context_gnu) {
1877 #if OMPT_SUPPORT
1878         ompt_lw_taskteam_t *lwt =
1879             (ompt_lw_taskteam_t *)__kmp_allocate(sizeof(ompt_lw_taskteam_t));
1880         __ompt_lw_taskteam_init(lwt, master_th, gtid, unwrapped_task,
1881                                 ompt_parallel_id);
1882 
1883         lwt->ompt_task_info.task_id = __ompt_task_id_new(gtid);
1884         lwt->ompt_task_info.frame.exit_runtime_frame = NULL;
1885         __ompt_lw_taskteam_link(lwt, master_th);
1886 #endif
1887 
1888         // we were called from GNU native code
1889         KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1890         return FALSE;
1891       } else {
1893         KMP_ASSERT2(call_context < fork_context_last,
1894                     "__kmp_fork_call: unknown fork_context parameter");
1895       }
1896 
1897       KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1898       KMP_MB();
1899       return FALSE;
1900     }
1901 
1902     // GEH: only modify the executing flag in the case when not serialized;
1903     //      the serialized case is handled in __kmpc_serialized_parallel
1904     KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1905                   "curtask=%p, curtask_max_aclevel=%d\n",
1906                   parent_team->t.t_active_level, master_th,
1907                   master_th->th.th_current_task,
1908                   master_th->th.th_current_task->td_icvs.max_active_levels));
1909     // TODO: GEH - cannot do this assertion because root thread not set up as
1910     // executing
1911     // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1912     master_th->th.th_current_task->td_flags.executing = 0;
1913 
1914 #if OMP_40_ENABLED
1915     if (!master_th->th.th_teams_microtask || level > teams_level)
1916 #endif /* OMP_40_ENABLED */
1917     {
1918       /* Increment our nested depth level */
1919       KMP_TEST_THEN_INC32((kmp_int32 *)&root->r.r_in_parallel);
1920     }
1921 
1922     // See if we need to make a copy of the ICVs.
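    // A nested nthreads list (e.g. OMP_NUM_THREADS with a list value) may specify a
    // different value for the next level; nthreads_icv == 0 means "keep the
    // inherited ICV".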
1923     int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1924     if ((level + 1 < __kmp_nested_nth.used) &&
1925         (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1926       nthreads_icv = __kmp_nested_nth.nth[level + 1];
1927     } else {
1928       nthreads_icv = 0; // don't update
1929     }
1930 
1931 #if OMP_40_ENABLED
1932     // Figure out the proc_bind_policy for the new team.
1933     kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1934     kmp_proc_bind_t proc_bind_icv =
1935         proc_bind_default; // proc_bind_default means don't update
1936     if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1937       proc_bind = proc_bind_false;
1938     } else {
1939       if (proc_bind == proc_bind_default) {
1940         // No proc_bind clause specified; use current proc-bind-var for this
1941         // parallel region
1942         proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1943       }
1944       /* else: The proc_bind policy was specified explicitly on parallel clause.
1945          This overrides proc-bind-var for this parallel region, but does not
1946          change proc-bind-var. */
1947       // Figure out the value of proc-bind-var for the child threads.
1948       if ((level + 1 < __kmp_nested_proc_bind.used) &&
1949           (__kmp_nested_proc_bind.bind_types[level + 1] !=
1950            master_th->th.th_current_task->td_icvs.proc_bind)) {
1951         proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1952       }
1953     }
1954 
1955     // Reset for next parallel region
1956     master_th->th.th_set_proc_bind = proc_bind_default;
1957 #endif /* OMP_40_ENABLED */
1958 
1959     if ((nthreads_icv > 0)
1960 #if OMP_40_ENABLED
1961         || (proc_bind_icv != proc_bind_default)
1962 #endif /* OMP_40_ENABLED */
1963             ) {
1964       kmp_internal_control_t new_icvs;
1965       copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
1966       new_icvs.next = NULL;
1967       if (nthreads_icv > 0) {
1968         new_icvs.nproc = nthreads_icv;
1969       }
1970 
1971 #if OMP_40_ENABLED
1972       if (proc_bind_icv != proc_bind_default) {
1973         new_icvs.proc_bind = proc_bind_icv;
1974       }
1975 #endif /* OMP_40_ENABLED */
1976 
1977       /* allocate a new parallel team */
1978       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
1979       team = __kmp_allocate_team(root, nthreads, nthreads,
1980 #if OMPT_SUPPORT
1981                                  ompt_parallel_id,
1982 #endif
1983 #if OMP_40_ENABLED
1984                                  proc_bind,
1985 #endif
1986                                  &new_icvs, argc USE_NESTED_HOT_ARG(master_th));
1987     } else {
1988       /* allocate a new parallel team */
1989       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
1990       team = __kmp_allocate_team(root, nthreads, nthreads,
1991 #if OMPT_SUPPORT
1992                                  ompt_parallel_id,
1993 #endif
1994 #if OMP_40_ENABLED
1995                                  proc_bind,
1996 #endif
1997                                  &master_th->th.th_current_task->td_icvs,
1998                                  argc USE_NESTED_HOT_ARG(master_th));
1999     }
2000     KF_TRACE(
2001         10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2002 
2003     /* setup the new team */
2004     KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2005     KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2006     KMP_CHECK_UPDATE(team->t.t_ident, loc);
2007     KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2008     KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2009 #if OMPT_SUPPORT
2010     KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.microtask, unwrapped_task);
2011 #endif
2012     KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2013 // TODO: parent_team->t.t_level == INT_MAX ???
2014 #if OMP_40_ENABLED
2015     if (!master_th->th.th_teams_microtask || level > teams_level) {
2016 #endif /* OMP_40_ENABLED */
2017       int new_level = parent_team->t.t_level + 1;
2018       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2019       new_level = parent_team->t.t_active_level + 1;
2020       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2021 #if OMP_40_ENABLED
2022     } else {
2023       // AC: Do not increase parallel level at start of the teams construct
2024       int new_level = parent_team->t.t_level;
2025       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2026       new_level = parent_team->t.t_active_level;
2027       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2028     }
2029 #endif /* OMP_40_ENABLED */
2030     kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2031     if (team->t.t_sched.r_sched_type != new_sched.r_sched_type ||
2032         team->t.t_sched.chunk != new_sched.chunk)
2033       team->t.t_sched =
2034           new_sched; // set master's schedule as new run-time schedule
2035 
2036 #if OMP_40_ENABLED
2037     KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2038 #endif
2039 
2040     // Update the floating point rounding in the team if required.
2041     propagateFPControl(team);
2042 
2043     if (__kmp_tasking_mode != tskm_immediate_exec) {
2044       // Set the master's task team to the team's task team. Unless this is a hot
2045       // team, it should be NULL.
2046       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2047                        parent_team->t.t_task_team[master_th->th.th_task_state]);
2048       KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
2049                     "%p, new task_team %p / team %p\n",
2050                     __kmp_gtid_from_thread(master_th),
2051                     master_th->th.th_task_team, parent_team,
2052                     team->t.t_task_team[master_th->th.th_task_state], team));
2053 
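      // The master keeps a memo stack of task_state values so that each nested
      // (hot) team level tracks its own task team; push the current state here and
      // pop it again in __kmp_join_call().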
2054       if (active_level || master_th->th.th_task_team) {
2055         // Take a memo of master's task_state
2056         KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2057         if (master_th->th.th_task_state_top >=
2058             master_th->th.th_task_state_stack_sz) { // increase size
2059           kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2060           kmp_uint8 *old_stack, *new_stack;
2061           kmp_uint32 i;
2062           new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2063           for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2064             new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2065           }
2066           for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2067                ++i) { // zero-init rest of stack
2068             new_stack[i] = 0;
2069           }
2070           old_stack = master_th->th.th_task_state_memo_stack;
2071           master_th->th.th_task_state_memo_stack = new_stack;
2072           master_th->th.th_task_state_stack_sz = new_size;
2073           __kmp_free(old_stack);
2074         }
2075         // Store master's task_state on stack
2076         master_th->th
2077             .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2078             master_th->th.th_task_state;
2079         master_th->th.th_task_state_top++;
2080 #if KMP_NESTED_HOT_TEAMS
2081         if (team == master_th->th.th_hot_teams[active_level].hot_team) {
2082           // Restore master's nested state if nested hot team
2083           master_th->th.th_task_state =
2084               master_th->th
2085                   .th_task_state_memo_stack[master_th->th.th_task_state_top];
2086         } else {
2087 #endif
2088           master_th->th.th_task_state = 0;
2089 #if KMP_NESTED_HOT_TEAMS
2090         }
2091 #endif
2092       }
2093 #if !KMP_NESTED_HOT_TEAMS
2094       KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2095                        (team == root->r.r_hot_team));
2096 #endif
2097     }
2098 
2099     KA_TRACE(
2100         20,
2101         ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2102          gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2103          team->t.t_nproc));
2104     KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2105                      (team->t.t_master_tid == 0 &&
2106                       (team->t.t_parent == root->r.r_root_team ||
2107                        team->t.t_parent->t.t_serialized)));
2108     KMP_MB();
2109 
2110     /* now, setup the arguments */
2111     argv = (void **)team->t.t_argv;
2112 #if OMP_40_ENABLED
2113     if (ap) {
2114 #endif /* OMP_40_ENABLED */
2115       for (i = argc - 1; i >= 0; --i) {
2116 // TODO: revert workaround for Intel(R) 64 tracker #96
2117 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
2118         void *new_argv = va_arg(*ap, void *);
2119 #else
2120       void *new_argv = va_arg(ap, void *);
2121 #endif
2122         KMP_CHECK_UPDATE(*argv, new_argv);
2123         argv++;
2124       }
2125 #if OMP_40_ENABLED
2126     } else {
2127       for (i = 0; i < argc; ++i) {
2128         // Get args from parent team for teams construct
2129         KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2130       }
2131     }
2132 #endif /* OMP_40_ENABLED */
2133 
2134     /* now actually fork the threads */
2135     KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2136     if (!root->r.r_active) // Only assign when needed, to avoid cache ping-pong
2137       root->r.r_active = TRUE;
2138 
2139     __kmp_fork_team_threads(root, team, master_th, gtid);
2140     __kmp_setup_icv_copy(team, nthreads,
2141                          &master_th->th.th_current_task->td_icvs, loc);
2142 
2143 #if OMPT_SUPPORT
2144     master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2145 #endif
2146 
2147     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2148 
2149 #if USE_ITT_BUILD
2150     if (team->t.t_active_level == 1 // only report frames at level 1
2151 #if OMP_40_ENABLED
2152         && !master_th->th.th_teams_microtask // not in teams construct
2153 #endif /* OMP_40_ENABLED */
2154         ) {
2155 #if USE_ITT_NOTIFY
2156       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2157           (__kmp_forkjoin_frames_mode == 3 ||
2158            __kmp_forkjoin_frames_mode == 1)) {
2159         kmp_uint64 tmp_time = 0;
2160         if (__itt_get_timestamp_ptr)
2161           tmp_time = __itt_get_timestamp();
2162         // Internal fork - report frame begin
2163         master_th->th.th_frame_time = tmp_time;
2164         if (__kmp_forkjoin_frames_mode == 3)
2165           team->t.t_region_time = tmp_time;
2166       } else
2167 // only one notification scheme (either "submit" or "forking/joined", not both)
2168 #endif /* USE_ITT_NOTIFY */
2169           if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2170               __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2171         // Mark start of "parallel" region for VTune.
2172         __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2173       }
2174     }
2175 #endif /* USE_ITT_BUILD */
2176 
2177     /* now go on and do the work */
2178     KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2179     KMP_MB();
2180     KF_TRACE(10,
2181              ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2182               root, team, master_th, gtid));
2183 
2184 #if USE_ITT_BUILD
2185     if (__itt_stack_caller_create_ptr) {
2186       team->t.t_stack_id =
2187           __kmp_itt_stack_caller_create(); // create new stack stitching id
2188       // before entering fork barrier
2189     }
2190 #endif /* USE_ITT_BUILD */
2191 
2192 #if OMP_40_ENABLED
2193     // AC: skip __kmp_internal_fork for the teams construct; let only the master
2194     // threads execute
2195     if (ap)
2196 #endif /* OMP_40_ENABLED */
2197     {
2198       __kmp_internal_fork(loc, gtid, team);
2199       KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2200                     "master_th=%p, gtid=%d\n",
2201                     root, team, master_th, gtid));
2202     }
2203 
2204     if (call_context == fork_context_gnu) {
2205       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2206       return TRUE;
2207     }
2208 
2209     /* Invoke microtask for MASTER thread */
2210     KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2211                   team->t.t_id, team->t.t_pkfn));
2212   } // END of timer KMP_fork_call block
2213 
2214   {
2215     KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
2216     KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
2217     if (!team->t.t_invoke(gtid)) {
2218       KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
2219     }
2220   }
2221   KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2222                 team->t.t_id, team->t.t_pkfn));
2223   KMP_MB(); /* Flush all pending memory write invalidates.  */
2224 
2225   KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2226 
2227 #if OMPT_SUPPORT
2228   if (ompt_enabled) {
2229     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2230   }
2231 #endif
2232 
2233   return TRUE;
2234 }
2235 
2236 #if OMPT_SUPPORT
2237 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2238                                             kmp_team_t *team) {
2239   // restore state outside the region
2240   thread->th.ompt_thread_info.state =
2241       ((team->t.t_serialized) ? ompt_state_work_serial
2242                               : ompt_state_work_parallel);
2243 }
2244 
2245 static inline void __kmp_join_ompt(kmp_info_t *thread, kmp_team_t *team,
2246                                    ompt_parallel_id_t parallel_id,
2247                                    fork_context_e fork_context) {
2248   ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
2249   if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
2250     ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
2251         parallel_id, task_info->task_id, OMPT_INVOKER(fork_context));
2252   }
2253 
2254   task_info->frame.reenter_runtime_frame = NULL;
2255   __kmp_join_restore_state(thread, team);
2256 }
2257 #endif
2258 
2259 void __kmp_join_call(ident_t *loc, int gtid
2260 #if OMPT_SUPPORT
2261                      ,
2262                      enum fork_context_e fork_context
2263 #endif
2264 #if OMP_40_ENABLED
2265                      ,
2266                      int exit_teams
2267 #endif /* OMP_40_ENABLED */
2268                      ) {
2269   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2270   kmp_team_t *team;
2271   kmp_team_t *parent_team;
2272   kmp_info_t *master_th;
2273   kmp_root_t *root;
2274   int master_active;
2275   int i;
2276 
2277   KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2278 
2279   /* setup current data */
2280   master_th = __kmp_threads[gtid];
2281   root = master_th->th.th_root;
2282   team = master_th->th.th_team;
2283   parent_team = team->t.t_parent;
2284 
2285   master_th->th.th_ident = loc;
2286 
2287 #if OMPT_SUPPORT
2288   if (ompt_enabled) {
2289     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2290   }
2291 #endif
2292 
2293 #if KMP_DEBUG
2294   if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2295     KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2296                   "th_task_team = %p\n",
2297                   __kmp_gtid_from_thread(master_th), team,
2298                   team->t.t_task_team[master_th->th.th_task_state],
2299                   master_th->th.th_task_team));
2300     KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2301                      team->t.t_task_team[master_th->th.th_task_state]);
2302   }
2303 #endif
2304 
2305   if (team->t.t_serialized) {
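    // The region was serialized: fix up teams-related counters if needed, undo the
    // work of __kmpc_serialized_parallel(), and return (no join barrier is needed).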
2306 #if OMP_40_ENABLED
2307     if (master_th->th.th_teams_microtask) {
2308       // We are in teams construct
2309       int level = team->t.t_level;
2310       int tlevel = master_th->th.th_teams_level;
2311       if (level == tlevel) {
2312         // AC: we haven't incremented it earlier at start of teams construct,
2313         //     so do it here - at the end of teams construct
2314         team->t.t_level++;
2315       } else if (level == tlevel + 1) {
2316         // AC: we are exiting parallel inside teams, need to increment
2317         // serialization in order to restore it in the next call to
2318         // __kmpc_end_serialized_parallel
2319         team->t.t_serialized++;
2320       }
2321     }
2322 #endif /* OMP_40_ENABLED */
2323     __kmpc_end_serialized_parallel(loc, gtid);
2324 
2325 #if OMPT_SUPPORT
2326     if (ompt_enabled) {
2327       __kmp_join_restore_state(master_th, parent_team);
2328     }
2329 #endif
2330 
2331     return;
2332   }
2333 
2334   master_active = team->t.t_master_active;
2335 
2336 #if OMP_40_ENABLED
2337   if (!exit_teams)
2338 #endif /* OMP_40_ENABLED */
2339   {
2340     // AC: No barrier for internal teams at exit from the teams construct,
2341     //     but there is a barrier for the external team (the league).
2342     __kmp_internal_join(loc, gtid, team);
2343   }
2344 #if OMP_40_ENABLED
2345   else {
2346     master_th->th.th_task_state =
2347         0; // AC: no tasking in teams (out of any parallel)
2348   }
2349 #endif /* OMP_40_ENABLED */
2350 
2351   KMP_MB();
2352 
2353 #if OMPT_SUPPORT
2354   ompt_parallel_id_t parallel_id = team->t.ompt_team_info.parallel_id;
2355 #endif
2356 
2357 #if USE_ITT_BUILD
2358   if (__itt_stack_caller_create_ptr) {
2359     __kmp_itt_stack_caller_destroy(
2360         (__itt_caller)team->t
2361             .t_stack_id); // destroy the stack stitching id after join barrier
2362   }
2363 
2364   // Mark end of "parallel" region for VTune.
2365   if (team->t.t_active_level == 1
2366 #if OMP_40_ENABLED
2367       && !master_th->th.th_teams_microtask /* not in teams construct */
2368 #endif /* OMP_40_ENABLED */
2369       ) {
2370     master_th->th.th_ident = loc;
2371     // only one notification scheme (either "submit" or "forking/joined", not
2372     // both)
2373     if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2374         __kmp_forkjoin_frames_mode == 3)
2375       __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2376                              master_th->th.th_frame_time, 0, loc,
2377                              master_th->th.th_team_nproc, 1);
2378     else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2379              !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2380       __kmp_itt_region_joined(gtid);
2381   } // active_level == 1
2382 #endif /* USE_ITT_BUILD */
2383 
2384 #if OMP_40_ENABLED
2385   if (master_th->th.th_teams_microtask && !exit_teams &&
2386       team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2387       team->t.t_level == master_th->th.th_teams_level + 1) {
2388     // AC: We need to leave the team structure intact at the end of a parallel
2389     // region inside the teams construct, so that the same (hot) team is reused by
2390     // the next parallel region; only the nesting levels are adjusted.
2391 
2392     /* Decrement our nested depth level */
2393     team->t.t_level--;
2394     team->t.t_active_level--;
2395     KMP_TEST_THEN_DEC32((kmp_int32 *)&root->r.r_in_parallel);
2396 
2397     /* Restore number of threads in the team if needed */
2398     if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2399       int old_num = master_th->th.th_team_nproc;
2400       int new_num = master_th->th.th_teams_size.nth;
2401       kmp_info_t **other_threads = team->t.t_threads;
2402       team->t.t_nproc = new_num;
2403       for (i = 0; i < old_num; ++i) {
2404         other_threads[i]->th.th_team_nproc = new_num;
2405       }
2406       // Adjust the states of the team's threads that were not used in the inner parallel
2407       for (i = old_num; i < new_num; ++i) {
2408         // Re-initialize thread's barrier data.
2409         int b;
2410         kmp_balign_t *balign = other_threads[i]->th.th_bar;
2411         for (b = 0; b < bs_last_barrier; ++b) {
2412           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2413           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2414 #if USE_DEBUGGER
2415           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2416 #endif
2417         }
2418         if (__kmp_tasking_mode != tskm_immediate_exec) {
2419           // Synchronize thread's task state
2420           other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2421         }
2422       }
2423     }
2424 
2425 #if OMPT_SUPPORT
2426     if (ompt_enabled) {
2427       __kmp_join_ompt(master_th, parent_team, parallel_id, fork_context);
2428     }
2429 #endif
2430 
2431     return;
2432   }
2433 #endif /* OMP_40_ENABLED */
2434 
2435   /* do cleanup and restore the parent team */
2436   master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2437   master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2438 
2439   master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2440 
2441   /* jc: The following lock has instructions with REL and ACQ semantics,
2442      separating the parallel user code called in this parallel region
2443      from the serial user code called after this function returns. */
2444   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2445 
2446 #if OMP_40_ENABLED
2447   if (!master_th->th.th_teams_microtask ||
2448       team->t.t_level > master_th->th.th_teams_level)
2449 #endif /* OMP_40_ENABLED */
2450   {
2451     /* Decrement our nested depth level */
2452     KMP_TEST_THEN_DEC32((kmp_int32 *)&root->r.r_in_parallel);
2453   }
2454   KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2455 
2456 #if OMPT_SUPPORT && OMPT_TRACE
2457   if (ompt_enabled) {
2458     ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
2459     if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
2460       ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
2461           parallel_id, task_info->task_id);
2462     }
2463     task_info->frame.exit_runtime_frame = NULL;
2464     task_info->task_id = 0;
2465   }
2466 #endif
2467 
2468   KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2469                 master_th, team));
2470   __kmp_pop_current_task_from_thread(master_th);
2471 
2472 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
2473   // Restore master thread's partition.
2474   master_th->th.th_first_place = team->t.t_first_place;
2475   master_th->th.th_last_place = team->t.t_last_place;
2476 #endif /* OMP_40_ENABLED */
2477 
2478   updateHWFPControl(team);
2479 
2480   if (root->r.r_active != master_active)
2481     root->r.r_active = master_active;
2482 
2483   __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2484                             master_th)); // this will free worker threads
2485 
2486   /* This race was fun to find. Make sure the following is inside the critical
2487      region, otherwise assertions may fail occasionally since the old team may be
2488      reallocated and the hierarchy appears inconsistent. It is actually safe to
2489      run and won't cause bugs, but it will cause those assertion failures. It is
2490      only one dereference and assignment, so keep it in the critical region. */
2491   master_th->th.th_team = parent_team;
2492   master_th->th.th_team_nproc = parent_team->t.t_nproc;
2493   master_th->th.th_team_master = parent_team->t.t_threads[0];
2494   master_th->th.th_team_serialized = parent_team->t.t_serialized;
2495 
2496   /* restore serialized team, if need be */
2497   if (parent_team->t.t_serialized &&
2498       parent_team != master_th->th.th_serial_team &&
2499       parent_team != root->r.r_root_team) {
2500     __kmp_free_team(root,
2501                     master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2502     master_th->th.th_serial_team = parent_team;
2503   }
2504 
2505   if (__kmp_tasking_mode != tskm_immediate_exec) {
2506     if (master_th->th.th_task_state_top >
2507         0) { // Restore task state from memo stack
2508       KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2509       // Remember master's state if we re-use this nested hot team
2510       master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2511           master_th->th.th_task_state;
2512       --master_th->th.th_task_state_top; // pop
2513       // Now restore state at this level
2514       master_th->th.th_task_state =
2515           master_th->th
2516               .th_task_state_memo_stack[master_th->th.th_task_state_top];
2517     }
2518     // Copy the task team from the parent team to the master thread
2519     master_th->th.th_task_team =
2520         parent_team->t.t_task_team[master_th->th.th_task_state];
2521     KA_TRACE(20,
2522              ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2523               __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2524               parent_team));
2525   }
2526 
2527   // TODO: GEH - cannot do this assertion because root thread not set up as
2528   // executing
2529   // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2530   master_th->th.th_current_task->td_flags.executing = 1;
2531 
2532   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2533 
2534 #if OMPT_SUPPORT
2535   if (ompt_enabled) {
2536     __kmp_join_ompt(master_th, parent_team, parallel_id, fork_context);
2537   }
2538 #endif
2539 
2540   KMP_MB();
2541   KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2542 }
2543 
2544 /* Check whether we should push an internal control record onto the
2545    serial team stack.  If so, do it.  */
2546 void __kmp_save_internal_controls(kmp_info_t *thread) {
2547 
2548   if (thread->th.th_team != thread->th.th_serial_team) {
2549     return;
2550   }
2551   if (thread->th.th_team->t.t_serialized > 1) {
2552     int push = 0;
2553 
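    // Push at most one record per serialized nesting level: push when the stack is
    // empty or when its top entry was saved at a different nesting depth.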
2554     if (thread->th.th_team->t.t_control_stack_top == NULL) {
2555       push = 1;
2556     } else {
2557       if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2558           thread->th.th_team->t.t_serialized) {
2559         push = 1;
2560       }
2561     }
2562     if (push) { /* push a record on the serial team's stack */
2563       kmp_internal_control_t *control =
2564           (kmp_internal_control_t *)__kmp_allocate(
2565               sizeof(kmp_internal_control_t));
2566 
2567       copy_icvs(control, &thread->th.th_current_task->td_icvs);
2568 
2569       control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2570 
2571       control->next = thread->th.th_team->t.t_control_stack_top;
2572       thread->th.th_team->t.t_control_stack_top = control;
2573     }
2574   }
2575 }
2576 
2577 /* Changes set_nproc */
2578 void __kmp_set_num_threads(int new_nth, int gtid) {
2579   kmp_info_t *thread;
2580   kmp_root_t *root;
2581 
2582   KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2583   KMP_DEBUG_ASSERT(__kmp_init_serial);
2584 
2585   if (new_nth < 1)
2586     new_nth = 1;
2587   else if (new_nth > __kmp_max_nth)
2588     new_nth = __kmp_max_nth;
2589 
2590   KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2591   thread = __kmp_threads[gtid];
2592 
2593   __kmp_save_internal_controls(thread);
2594 
2595   set__nproc(thread, new_nth);
2596 
2597   // If this omp_set_num_threads() call will cause the hot team size to be
2598   // reduced (in the absence of a num_threads clause), then reduce it now,
2599   // rather than waiting for the next parallel region.
2600   root = thread->th.th_root;
2601   if (__kmp_init_parallel && (!root->r.r_active) &&
2602       (root->r.r_hot_team->t.t_nproc > new_nth)
2603 #if KMP_NESTED_HOT_TEAMS
2604       && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2605 #endif
2606       ) {
2607     kmp_team_t *hot_team = root->r.r_hot_team;
2608     int f;
2609 
2610     __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2611 
2612     // Release the extra threads we don't need any more.
2613     for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2614       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2615       if (__kmp_tasking_mode != tskm_immediate_exec) {
2616         // When decreasing the team size, threads that are no longer in the team
2617         // should unreference their task team.
2618         hot_team->t.t_threads[f]->th.th_task_team = NULL;
2619       }
2620       __kmp_free_thread(hot_team->t.t_threads[f]);
2621       hot_team->t.t_threads[f] = NULL;
2622     }
2623     hot_team->t.t_nproc = new_nth;
2624 #if KMP_NESTED_HOT_TEAMS
2625     if (thread->th.th_hot_teams) {
2626       KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2627       thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2628     }
2629 #endif
2630 
2631     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2632 
2633     // Update the t_nproc field in the threads that are still active.
2634     for (f = 0; f < new_nth; f++) {
2635       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2636       hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2637     }
2638     // Special flag to mark that the team size was changed by omp_set_num_threads()
2639     hot_team->t.t_size_changed = -1;
2640   }
2641 }
2642 
2643 /* Changes max_active_levels */
2644 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2645   kmp_info_t *thread;
2646 
2647   KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2648                 "%d = (%d)\n",
2649                 gtid, max_active_levels));
2650   KMP_DEBUG_ASSERT(__kmp_init_serial);
2651 
2652   // validate max_active_levels
2653   if (max_active_levels < 0) {
2654     KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2655     // We ignore this call if the user has specified a negative value.
2656     // The current setting won't be changed. The last valid setting will be
2657     // used. A warning will be issued (if warnings are allowed as controlled by
2658     // the KMP_WARNINGS env var).
2659     KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2660                   "max_active_levels for thread %d = (%d)\n",
2661                   gtid, max_active_levels));
2662     return;
2663   }
2664   if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2665     // it's OK, the max_active_levels is within the valid range: [ 0;
2666     // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2667     // We allow a zero value. (implementation defined behavior)
2668   } else {
2669     KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2670                 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2671     max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2672     // Current upper limit is MAX_INT. (implementation defined behavior)
2673     // If the input exceeds the upper limit, we correct the input to be the
2674     // upper limit. (implementation defined behavior)
2675     // Actually, the flow should never get here as long as the upper limit is MAX_INT.
2676   }
2677   KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2678                 "max_active_levels for thread %d = (%d)\n",
2679                 gtid, max_active_levels));
2680 
2681   thread = __kmp_threads[gtid];
2682 
2683   __kmp_save_internal_controls(thread);
2684 
2685   set__max_active_levels(thread, max_active_levels);
2686 }
2687 
2688 /* Gets max_active_levels */
2689 int __kmp_get_max_active_levels(int gtid) {
2690   kmp_info_t *thread;
2691 
2692   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2693   KMP_DEBUG_ASSERT(__kmp_init_serial);
2694 
2695   thread = __kmp_threads[gtid];
2696   KMP_DEBUG_ASSERT(thread->th.th_current_task);
2697   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2698                 "curtask_maxaclevel=%d\n",
2699                 gtid, thread->th.th_current_task,
2700                 thread->th.th_current_task->td_icvs.max_active_levels));
2701   return thread->th.th_current_task->td_icvs.max_active_levels;
2702 }
2703 
2704 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2705 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2706   kmp_info_t *thread;
2707   //    kmp_team_t *team;
2708 
2709   KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2710                 gtid, (int)kind, chunk));
2711   KMP_DEBUG_ASSERT(__kmp_init_serial);
2712 
2713   // Check if the kind parameter is valid, correct if needed.
2714   // Valid parameters should fit in one of two intervals - standard or extended:
2715   //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2716   // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
2717   if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2718       (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2719     // TODO: Hint needs attention in case we change the default schedule.
2720     __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2721               KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2722               __kmp_msg_null);
2723     kind = kmp_sched_default;
2724     chunk = 0; // ignore chunk value in case of bad kind
2725   }
2726 
2727   thread = __kmp_threads[gtid];
2728 
2729   __kmp_save_internal_controls(thread);
2730 
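  // Map the user-visible kmp_sched_t kind onto the internal sched_type via
  // __kmp_sch_map: standard kinds index the first part of the table, extended
  // kinds the entries that follow it (static keeps a separate unchunked form).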
2731   if (kind < kmp_sched_upper_std) {
2732     if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2733       // differentiate static chunked vs. unchunked: the chunk should be invalid
2734       // to indicate an unchunked schedule (which is the default)
2735       thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2736     } else {
2737       thread->th.th_current_task->td_icvs.sched.r_sched_type =
2738           __kmp_sch_map[kind - kmp_sched_lower - 1];
2739     }
2740   } else {
2741     //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2742     //    kmp_sched_lower - 2 ];
2743     thread->th.th_current_task->td_icvs.sched.r_sched_type =
2744         __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2745                       kmp_sched_lower - 2];
2746   }
2747   if (kind == kmp_sched_auto || chunk < 1) {
2748     // ignore parameter chunk for schedule auto
2749     thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2750   } else {
2751     thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2752   }
2753 }
2754 
2755 /* Gets def_sched_var ICV values */
2756 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2757   kmp_info_t *thread;
2758   enum sched_type th_type;
2759 
2760   KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2761   KMP_DEBUG_ASSERT(__kmp_init_serial);
2762 
2763   thread = __kmp_threads[gtid];
2764 
2765   th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2766 
2767   switch (th_type) {
2768   case kmp_sch_static:
2769   case kmp_sch_static_greedy:
2770   case kmp_sch_static_balanced:
2771     *kind = kmp_sched_static;
2772     *chunk = 0; // chunk was not set, try to show this fact via zero value
2773     return;
2774   case kmp_sch_static_chunked:
2775     *kind = kmp_sched_static;
2776     break;
2777   case kmp_sch_dynamic_chunked:
2778     *kind = kmp_sched_dynamic;
2779     break;
2780   case kmp_sch_guided_chunked:
2781   case kmp_sch_guided_iterative_chunked:
2782   case kmp_sch_guided_analytical_chunked:
2783     *kind = kmp_sched_guided;
2784     break;
2785   case kmp_sch_auto:
2786     *kind = kmp_sched_auto;
2787     break;
2788   case kmp_sch_trapezoidal:
2789     *kind = kmp_sched_trapezoidal;
2790     break;
2791 #if KMP_STATIC_STEAL_ENABLED
2792   case kmp_sch_static_steal:
2793     *kind = kmp_sched_static_steal;
2794     break;
2795 #endif
2796   default:
2797     KMP_FATAL(UnknownSchedulingType, th_type);
2798   }
2799 
2800   *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2801 }
2802 
2803 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2804 
2805   int ii, dd;
2806   kmp_team_t *team;
2807   kmp_info_t *thr;
2808 
2809   KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2810   KMP_DEBUG_ASSERT(__kmp_init_serial);
2811 
2812   // validate level
2813   if (level == 0)
2814     return 0;
2815   if (level < 0)
2816     return -1;
2817   thr = __kmp_threads[gtid];
2818   team = thr->th.th_team;
2819   ii = team->t.t_level;
2820   if (level > ii)
2821     return -1;
2822 
2823 #if OMP_40_ENABLED
2824   if (thr->th.th_teams_microtask) {
2825     // AC: we are in teams region where multiple nested teams have same level
2826     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2827     if (level <=
2828         tlevel) { // otherwise usual algorithm works (will not touch the teams)
2829       KMP_DEBUG_ASSERT(ii >= tlevel);
2830       // AC: As we need to pass by the teams league, we need to artificially
2831       // increase ii
2832       if (ii == tlevel) {
2833         ii += 2; // three teams have same level
2834       } else {
2835         ii++; // two teams have same level
2836       }
2837     }
2838   }
2839 #endif
2840 
2841   if (ii == level)
2842     return __kmp_tid_from_gtid(gtid);
2843 
2844   dd = team->t.t_serialized;
2845   level++;
2846   while (ii > level) {
2847     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2848     }
2849     if ((team->t.t_serialized) && (!dd)) {
2850       team = team->t.t_parent;
2851       continue;
2852     }
2853     if (ii > level) {
2854       team = team->t.t_parent;
2855       dd = team->t.t_serialized;
2856       ii--;
2857     }
2858   }
2859 
2860   return (dd > 1) ? (0) : (team->t.t_master_tid);
2861 }
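
// Semantics sketch for __kmp_get_ancestor_thread_num (derived from the code
// above, stated here for quick reference): level 0 always reports 0, a
// negative or too-deep level reports -1, the caller's own nesting level
// reports the caller's tid, and any intermediate level is answered by walking
// the t_parent chain, recovering the ancestor's tid from t_master_tid of the
// team reached at that level (or 0 if that team records more than one level
// of serialization).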
2862 
2863 int __kmp_get_team_size(int gtid, int level) {
2864 
2865   int ii, dd;
2866   kmp_team_t *team;
2867   kmp_info_t *thr;
2868 
2869   KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2870   KMP_DEBUG_ASSERT(__kmp_init_serial);
2871 
2872   // validate level
2873   if (level == 0)
2874     return 1;
2875   if (level < 0)
2876     return -1;
2877   thr = __kmp_threads[gtid];
2878   team = thr->th.th_team;
2879   ii = team->t.t_level;
2880   if (level > ii)
2881     return -1;
2882 
2883 #if OMP_40_ENABLED
2884   if (thr->th.th_teams_microtask) {
2885     // AC: we are in teams region where multiple nested teams have same level
2886     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2887     if (level <=
2888         tlevel) { // otherwise usual algorithm works (will not touch the teams)
2889       KMP_DEBUG_ASSERT(ii >= tlevel);
2890       // AC: As we need to pass by the teams league, we need to artificially
2891       // increase ii
2892       if (ii == tlevel) {
2893         ii += 2; // three teams have same level
2894       } else {
2895         ii++; // two teams have same level
2896       }
2897     }
2898   }
2899 #endif
2900 
2901   while (ii > level) {
2902     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2903     }
2904     if (team->t.t_serialized && (!dd)) {
2905       team = team->t.t_parent;
2906       continue;
2907     }
2908     if (ii > level) {
2909       team = team->t.t_parent;
2910       ii--;
2911     }
2912   }
2913 
2914   return team->t.t_nproc;
2915 }
2916 
2917 kmp_r_sched_t __kmp_get_schedule_global() {
  // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
  // independently, so the up-to-date schedule can be obtained here.
2921 
2922   kmp_r_sched_t r_sched;
2923 
2924   // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
2925   // __kmp_guided. __kmp_sched should keep original value, so that user can set
2926   // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
2927   // different roots (even in OMP 2.5)
2928   if (__kmp_sched == kmp_sch_static) {
2929     r_sched.r_sched_type = __kmp_static; // replace STATIC with more detailed
2930     // schedule (balanced or greedy)
2931   } else if (__kmp_sched == kmp_sch_guided_chunked) {
2932     r_sched.r_sched_type = __kmp_guided; // replace GUIDED with more detailed
2933     // schedule (iterative or analytical)
2934   } else {
2935     r_sched.r_sched_type =
2936         __kmp_sched; // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2937   }
2938 
2939   if (__kmp_chunk < KMP_DEFAULT_CHUNK) { // __kmp_chunk may be wrong here (if it
2940     // was not ever set)
2941     r_sched.chunk = KMP_DEFAULT_CHUNK;
2942   } else {
2943     r_sched.chunk = __kmp_chunk;
2944   }
2945 
2946   return r_sched;
2947 }
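
// Worked example for __kmp_get_schedule_global (illustrative only, assuming
// KMP_SCHEDULE left __kmp_static at its "greedy" detailed value): a global
// __kmp_sched of kmp_sch_static is reported as
// { __kmp_static /* e.g. kmp_sch_static_greedy */,
//   __kmp_chunk, or KMP_DEFAULT_CHUNK if the chunk was never set }.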
2948 
/* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
2950    at least argc number of *t_argv entries for the requested team. */
2951 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
2952 
2953   KMP_DEBUG_ASSERT(team);
2954   if (!realloc || argc > team->t.t_max_argc) {
2955 
2956     KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
2957                    "current entries=%d\n",
2958                    team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
2959     /* if previously allocated heap space for args, free them */
2960     if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
2961       __kmp_free((void *)team->t.t_argv);
2962 
2963     if (argc <= KMP_INLINE_ARGV_ENTRIES) {
2964       /* use unused space in the cache line for arguments */
2965       team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
2966       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
2967                      "argv entries\n",
2968                      team->t.t_id, team->t.t_max_argc));
2969       team->t.t_argv = &team->t.t_inline_argv[0];
2970       if (__kmp_storage_map) {
2971         __kmp_print_storage_map_gtid(
2972             -1, &team->t.t_inline_argv[0],
2973             &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
2974             (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
2975             team->t.t_id);
2976       }
2977     } else {
2978       /* allocate space for arguments in the heap */
2979       team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
2980                                ? KMP_MIN_MALLOC_ARGV_ENTRIES
2981                                : 2 * argc;
2982       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
2983                      "argv entries\n",
2984                      team->t.t_id, team->t.t_max_argc));
2985       team->t.t_argv =
2986           (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
2987       if (__kmp_storage_map) {
2988         __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
2989                                      &team->t.t_argv[team->t.t_max_argc],
2990                                      sizeof(void *) * team->t.t_max_argc,
2991                                      "team_%d.t_argv", team->t.t_id);
2992       }
2993     }
2994   }
2995 }
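
// Minimal model of the sizing policy implemented above (an illustrative
// sketch only; the parameter names are hypothetical and nothing in the
// runtime calls this): small argument lists reuse the inline cache-line
// space, larger ones get a heap block of at least min_malloc_entries,
// growing to 2 * argc once argc exceeds half of that minimum.
static inline int __kmp_model_argv_capacity(int argc, int inline_entries,
                                            int min_malloc_entries) {
  if (argc <= inline_entries)
    return inline_entries; // arguments fit in t_inline_argv
  return (argc <= (min_malloc_entries >> 1)) ? min_malloc_entries : 2 * argc;
}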
2996 
2997 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
2998   int i;
2999   int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3000   team->t.t_threads =
3001       (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3002   team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3003       sizeof(dispatch_shared_info_t) * num_disp_buff);
3004   team->t.t_dispatch =
3005       (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3006   team->t.t_implicit_task_taskdata =
3007       (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3008   team->t.t_max_nproc = max_nth;
3009 
3010   /* setup dispatch buffers */
3011   for (i = 0; i < num_disp_buff; ++i) {
3012     team->t.t_disp_buffer[i].buffer_index = i;
3013 #if OMP_45_ENABLED
3014     team->t.t_disp_buffer[i].doacross_buf_idx = i;
3015 #endif
3016   }
3017 }
3018 
3019 static void __kmp_free_team_arrays(kmp_team_t *team) {
3020   /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3021   int i;
3022   for (i = 0; i < team->t.t_max_nproc; ++i) {
3023     if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3024       __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3025       team->t.t_dispatch[i].th_disp_buffer = NULL;
3026     }; // if
3027   }; // for
3028   __kmp_free(team->t.t_threads);
3029   __kmp_free(team->t.t_disp_buffer);
3030   __kmp_free(team->t.t_dispatch);
3031   __kmp_free(team->t.t_implicit_task_taskdata);
3032   team->t.t_threads = NULL;
3033   team->t.t_disp_buffer = NULL;
3034   team->t.t_dispatch = NULL;
  team->t.t_implicit_task_taskdata = NULL;
3036 }
3037 
3038 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3039   kmp_info_t **oldThreads = team->t.t_threads;
3040 
3041   __kmp_free(team->t.t_disp_buffer);
3042   __kmp_free(team->t.t_dispatch);
3043   __kmp_free(team->t.t_implicit_task_taskdata);
3044   __kmp_allocate_team_arrays(team, max_nth);
3045 
3046   KMP_MEMCPY(team->t.t_threads, oldThreads,
3047              team->t.t_nproc * sizeof(kmp_info_t *));
3048 
3049   __kmp_free(oldThreads);
3050 }
3051 
3052 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3053 
3054   kmp_r_sched_t r_sched =
3055       __kmp_get_schedule_global(); // get current state of scheduling globals
3056 
3057 #if OMP_40_ENABLED
3058   KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3059 #endif /* OMP_40_ENABLED */
3060 
3061   kmp_internal_control_t g_icvs = {
3062     0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3063     (kmp_int8)__kmp_dflt_nested, // int nested; //internal control
3064     // for nested parallelism (per thread)
3065     (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3066     // adjustment of threads (per thread)
3067     (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3068     // whether blocktime is explicitly set
3069     __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3070 #if KMP_USE_MONITOR
3071     __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3072 // intervals
3073 #endif
3074     __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3075     // next parallel region (per thread)
3076     // (use a max ub on value if __kmp_parallel_initialize not called yet)
3077     __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3078     // for max_active_levels
3079     r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3080 // {sched,chunk} pair
3081 #if OMP_40_ENABLED
3082     __kmp_nested_proc_bind.bind_types[0],
3083     __kmp_default_device,
3084 #endif /* OMP_40_ENABLED */
3085     NULL // struct kmp_internal_control *next;
3086   };
3087 
3088   return g_icvs;
3089 }
3090 
3091 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3092 
3093   kmp_internal_control_t gx_icvs;
3094   gx_icvs.serial_nesting_level =
      0; // probably = team->t.t_serialized, as in __kmp_save_internal_controls
3096   copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3097   gx_icvs.next = NULL;
3098 
3099   return gx_icvs;
3100 }
3101 
3102 static void __kmp_initialize_root(kmp_root_t *root) {
3103   int f;
3104   kmp_team_t *root_team;
3105   kmp_team_t *hot_team;
3106   int hot_team_max_nth;
3107   kmp_r_sched_t r_sched =
3108       __kmp_get_schedule_global(); // get current state of scheduling globals
3109   kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3110   KMP_DEBUG_ASSERT(root);
3111   KMP_ASSERT(!root->r.r_begin);
3112 
3113   /* setup the root state structure */
3114   __kmp_init_lock(&root->r.r_begin_lock);
3115   root->r.r_begin = FALSE;
3116   root->r.r_active = FALSE;
3117   root->r.r_in_parallel = 0;
3118   root->r.r_blocktime = __kmp_dflt_blocktime;
3119   root->r.r_nested = __kmp_dflt_nested;
3120 
3121   /* setup the root team for this task */
3122   /* allocate the root team structure */
3123   KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3124 
3125   root_team =
3126       __kmp_allocate_team(root,
3127                           1, // new_nproc
3128                           1, // max_nproc
3129 #if OMPT_SUPPORT
3130                           0, // root parallel id
3131 #endif
3132 #if OMP_40_ENABLED
3133                           __kmp_nested_proc_bind.bind_types[0],
3134 #endif
3135                           &r_icvs,
3136                           0 // argc
3137                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3138                           );
3139 #if USE_DEBUGGER
3140   // Non-NULL value should be assigned to make the debugger display the root
3141   // team.
3142   TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3143 #endif
3144 
3145   KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3146 
3147   root->r.r_root_team = root_team;
3148   root_team->t.t_control_stack_top = NULL;
3149 
3150   /* initialize root team */
3151   root_team->t.t_threads[0] = NULL;
3152   root_team->t.t_nproc = 1;
3153   root_team->t.t_serialized = 1;
3154   // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3155   root_team->t.t_sched.r_sched_type = r_sched.r_sched_type;
3156   root_team->t.t_sched.chunk = r_sched.chunk;
3157   KA_TRACE(
3158       20,
3159       ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3160        root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3161 
3162   /* setup the  hot team for this task */
3163   /* allocate the hot team structure */
3164   KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3165 
3166   hot_team =
3167       __kmp_allocate_team(root,
3168                           1, // new_nproc
3169                           __kmp_dflt_team_nth_ub * 2, // max_nproc
3170 #if OMPT_SUPPORT
3171                           0, // root parallel id
3172 #endif
3173 #if OMP_40_ENABLED
3174                           __kmp_nested_proc_bind.bind_types[0],
3175 #endif
3176                           &r_icvs,
3177                           0 // argc
3178                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3179                           );
3180   KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3181 
3182   root->r.r_hot_team = hot_team;
3183   root_team->t.t_control_stack_top = NULL;
3184 
3185   /* first-time initialization */
3186   hot_team->t.t_parent = root_team;
3187 
3188   /* initialize hot team */
3189   hot_team_max_nth = hot_team->t.t_max_nproc;
3190   for (f = 0; f < hot_team_max_nth; ++f) {
3191     hot_team->t.t_threads[f] = NULL;
3192   }; // for
3193   hot_team->t.t_nproc = 1;
3194   // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3195   hot_team->t.t_sched.r_sched_type = r_sched.r_sched_type;
3196   hot_team->t.t_sched.chunk = r_sched.chunk;
3197   hot_team->t.t_size_changed = 0;
3198 }
3199 
3200 #ifdef KMP_DEBUG
3201 
3202 typedef struct kmp_team_list_item {
3203   kmp_team_p const *entry;
3204   struct kmp_team_list_item *next;
3205 } kmp_team_list_item_t;
3206 typedef kmp_team_list_item_t *kmp_team_list_t;
3207 
3208 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3209     kmp_team_list_t list, // List of teams.
3210     kmp_team_p const *team // Team to add.
3211     ) {
3212 
3213   // List must terminate with item where both entry and next are NULL.
3214   // Team is added to the list only once.
3215   // List is sorted in ascending order by team id.
3216   // Team id is *not* a key.
3217 
3218   kmp_team_list_t l;
3219 
3220   KMP_DEBUG_ASSERT(list != NULL);
3221   if (team == NULL) {
3222     return;
3223   }; // if
3224 
3225   __kmp_print_structure_team_accum(list, team->t.t_parent);
3226   __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3227 
3228   // Search list for the team.
3229   l = list;
3230   while (l->next != NULL && l->entry != team) {
3231     l = l->next;
3232   }; // while
3233   if (l->next != NULL) {
3234     return; // Team has been added before, exit.
3235   }; // if
3236 
3237   // Team is not found. Search list again for insertion point.
3238   l = list;
3239   while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3240     l = l->next;
3241   }; // while
3242 
3243   // Insert team.
3244   {
3245     kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3246         sizeof(kmp_team_list_item_t));
3247     *item = *l;
3248     l->entry = team;
3249     l->next = item;
3250   }
3251 }
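
// Note on the insertion above: copying *l into the freshly allocated item and
// then overwriting l in place effectively inserts the new team *before* node
// l without needing a back pointer; the NULL/NULL sentinel created in
// __kmp_print_structure() guarantees that such an l always exists.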
3252 
3253 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team
3254 
3255                                        ) {
3256   __kmp_printf("%s", title);
3257   if (team != NULL) {
3258     __kmp_printf("%2x %p\n", team->t.t_id, team);
3259   } else {
3260     __kmp_printf(" - (nil)\n");
3261   }; // if
3262 }
3263 
3264 static void __kmp_print_structure_thread(char const *title,
3265                                          kmp_info_p const *thread) {
3266   __kmp_printf("%s", title);
3267   if (thread != NULL) {
3268     __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3269   } else {
3270     __kmp_printf(" - (nil)\n");
3271   }; // if
3272 }
3273 
3274 void __kmp_print_structure(void) {
3275 
3276   kmp_team_list_t list;
3277 
3278   // Initialize list of teams.
3279   list =
3280       (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3281   list->entry = NULL;
3282   list->next = NULL;
3283 
3284   __kmp_printf("\n------------------------------\nGlobal Thread "
3285                "Table\n------------------------------\n");
3286   {
3287     int gtid;
3288     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3289       __kmp_printf("%2d", gtid);
3290       if (__kmp_threads != NULL) {
3291         __kmp_printf(" %p", __kmp_threads[gtid]);
3292       }; // if
3293       if (__kmp_root != NULL) {
3294         __kmp_printf(" %p", __kmp_root[gtid]);
3295       }; // if
3296       __kmp_printf("\n");
3297     }; // for gtid
3298   }
3299 
3300   // Print out __kmp_threads array.
3301   __kmp_printf("\n------------------------------\nThreads\n--------------------"
3302                "----------\n");
3303   if (__kmp_threads != NULL) {
3304     int gtid;
3305     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3306       kmp_info_t const *thread = __kmp_threads[gtid];
3307       if (thread != NULL) {
3308         __kmp_printf("GTID %2d %p:\n", gtid, thread);
3309         __kmp_printf("    Our Root:        %p\n", thread->th.th_root);
3310         __kmp_print_structure_team("    Our Team:     ", thread->th.th_team);
3311         __kmp_print_structure_team("    Serial Team:  ",
3312                                    thread->th.th_serial_team);
3313         __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
3314         __kmp_print_structure_thread("    Master:       ",
3315                                      thread->th.th_team_master);
3316         __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
3317         __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
3318 #if OMP_40_ENABLED
3319         __kmp_printf("    Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3320 #endif
3321         __kmp_print_structure_thread("    Next in pool: ",
3322                                      thread->th.th_next_pool);
3323         __kmp_printf("\n");
3324         __kmp_print_structure_team_accum(list, thread->th.th_team);
3325         __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3326       }; // if
3327     }; // for gtid
3328   } else {
3329     __kmp_printf("Threads array is not allocated.\n");
3330   }; // if
3331 
3332   // Print out __kmp_root array.
3333   __kmp_printf("\n------------------------------\nUbers\n----------------------"
3334                "--------\n");
3335   if (__kmp_root != NULL) {
3336     int gtid;
3337     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3338       kmp_root_t const *root = __kmp_root[gtid];
3339       if (root != NULL) {
3340         __kmp_printf("GTID %2d %p:\n", gtid, root);
3341         __kmp_print_structure_team("    Root Team:    ", root->r.r_root_team);
3342         __kmp_print_structure_team("    Hot Team:     ", root->r.r_hot_team);
3343         __kmp_print_structure_thread("    Uber Thread:  ",
3344                                      root->r.r_uber_thread);
3345         __kmp_printf("    Active?:      %2d\n", root->r.r_active);
3346         __kmp_printf("    Nested?:      %2d\n", root->r.r_nested);
3347         __kmp_printf("    In Parallel:  %2d\n", root->r.r_in_parallel);
3348         __kmp_printf("\n");
3349         __kmp_print_structure_team_accum(list, root->r.r_root_team);
3350         __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3351       }; // if
3352     }; // for gtid
3353   } else {
3354     __kmp_printf("Ubers array is not allocated.\n");
3355   }; // if
3356 
3357   __kmp_printf("\n------------------------------\nTeams\n----------------------"
3358                "--------\n");
3359   while (list->next != NULL) {
3360     kmp_team_p const *team = list->entry;
3361     int i;
3362     __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3363     __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
3364     __kmp_printf("    Master TID:       %2d\n", team->t.t_master_tid);
3365     __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
3366     __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
3367     __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
3368     for (i = 0; i < team->t.t_nproc; ++i) {
3369       __kmp_printf("    Thread %2d:      ", i);
3370       __kmp_print_structure_thread("", team->t.t_threads[i]);
3371     }; // for i
3372     __kmp_print_structure_team("    Next in pool:     ", team->t.t_next_pool);
3373     __kmp_printf("\n");
3374     list = list->next;
3375   }; // while
3376 
3377   // Print out __kmp_thread_pool and __kmp_team_pool.
3378   __kmp_printf("\n------------------------------\nPools\n----------------------"
3379                "--------\n");
3380   __kmp_print_structure_thread("Thread pool:          ",
3381                                (kmp_info_t *)__kmp_thread_pool);
3382   __kmp_print_structure_team("Team pool:            ",
3383                              (kmp_team_t *)__kmp_team_pool);
3384   __kmp_printf("\n");
3385 
3386   // Free team list.
3387   while (list != NULL) {
3388     kmp_team_list_item_t *item = list;
3389     list = list->next;
3390     KMP_INTERNAL_FREE(item);
3391   }; // while
3392 }
3393 
3394 #endif
3395 
3396 //---------------------------------------------------------------------------
3397 //  Stuff for per-thread fast random number generator
3398 //  Table of primes
3399 static const unsigned __kmp_primes[] = {
3400     0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3401     0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3402     0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3403     0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3404     0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3405     0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3406     0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3407     0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3408     0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3409     0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3410     0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3411 
3412 //---------------------------------------------------------------------------
3413 //  __kmp_get_random: Get a random number using a linear congruential method.
3414 unsigned short __kmp_get_random(kmp_info_t *thread) {
3415   unsigned x = thread->th.th_x;
3416   unsigned short r = x >> 16;
3417 
3418   thread->th.th_x = x * thread->th.th_a + 1;
3419 
3420   KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3421                 thread->th.th_info.ds.ds_tid, r));
3422 
3423   return r;
3424 }
3425 //--------------------------------------------------------
3426 // __kmp_init_random: Initialize a random number generator
3427 void __kmp_init_random(kmp_info_t *thread) {
3428   unsigned seed = thread->th.th_info.ds.ds_tid;
3429 
3430   thread->th.th_a =
3431       __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3432   thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3433   KA_TRACE(30,
3434            ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3435 }
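
// Stand-alone model of the generator above (an illustrative sketch; nothing
// in the runtime calls it): the per-thread state follows the linear
// congruential recurrence x(n+1) = a * x(n) + 1 (mod 2^32 on the usual 32-bit
// unsigned), with a taken from __kmp_primes, and each call returns the high
// 16 bits of the pre-update state.
static inline unsigned short __kmp_model_lcg_next(unsigned *state, unsigned a) {
  unsigned short r = (unsigned short)(*state >> 16); // high half is the output
  *state = *state * a + 1; // advance the recurrence
  return r;
}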
3436 
3437 #if KMP_OS_WINDOWS
3438 /* reclaim array entries for root threads that are already dead, returns number
3439  * reclaimed */
3440 static int __kmp_reclaim_dead_roots(void) {
3441   int i, r = 0;
3442 
3443   for (i = 0; i < __kmp_threads_capacity; ++i) {
3444     if (KMP_UBER_GTID(i) &&
3445         !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3446         !__kmp_root[i]
3447              ->r.r_active) { // AC: reclaim only roots died in non-active state
3448       r += __kmp_unregister_root_other_thread(i);
3449     }
3450   }
3451   return r;
3452 }
3453 #endif
3454 
3455 /* This function attempts to create free entries in __kmp_threads and
3456    __kmp_root, and returns the number of free entries generated.
3457 
3458    For Windows* OS static library, the first mechanism used is to reclaim array
3459    entries for root threads that are already dead.
3460 
   On all platforms, expansion is attempted on the arrays __kmp_threads and
3462    __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3463    capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3464    threadprivate cache array has been created. Synchronization with
3465    __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3466 
3467    After any dead root reclamation, if the clipping value allows array expansion
3468    to result in the generation of a total of nWish free slots, the function does
3469    that expansion. If not, but the clipping value allows array expansion to
3470    result in the generation of a total of nNeed free slots, the function does
3471    that expansion. Otherwise, nothing is done beyond the possible initial root
3472    thread reclamation. However, if nNeed is zero, a best-effort attempt is made
3473    to fulfil nWish as far as possible, i.e. the function will attempt to create
3474    as many free slots as possible up to nWish.
3475 
3476    If any argument is negative, the behavior is undefined. */
3477 static int __kmp_expand_threads(int nWish, int nNeed) {
3478   int added = 0;
3479   int old_tp_cached;
3480   int __kmp_actual_max_nth;
3481 
3482   if (nNeed > nWish) /* normalize the arguments */
3483     nWish = nNeed;
3484 #if KMP_OS_WINDOWS && !defined KMP_DYNAMIC_LIB
3485   /* only for Windows static library */
3486   /* reclaim array entries for root threads that are already dead */
3487   added = __kmp_reclaim_dead_roots();
3488 
3489   if (nNeed) {
3490     nNeed -= added;
3491     if (nNeed < 0)
3492       nNeed = 0;
3493   }
3494   if (nWish) {
3495     nWish -= added;
3496     if (nWish < 0)
3497       nWish = 0;
3498   }
3499 #endif
3500   if (nWish <= 0)
3501     return added;
3502 
3503   while (1) {
3504     int nTarget;
3505     int minimumRequiredCapacity;
3506     int newCapacity;
3507     kmp_info_t **newThreads;
3508     kmp_root_t **newRoot;
3509 
3510     // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3511     // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3512     // user via OMP_THREAD_LIMIT, then __kmp_threads_capacity may become
3513     // > __kmp_max_nth in one of two ways:
3514     //
3515     // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
    //    may not be reused by another thread, so we may need to increase
3517     //    __kmp_threads_capacity to __kmp_max_threads + 1.
3518     //
3519     // 2) New foreign root(s) are encountered.  We always register new foreign
3520     //    roots. This may cause a smaller # of threads to be allocated at
3521     //    subsequent parallel regions, but the worker threads hang around (and
3522     //    eventually go to sleep) and need slots in the __kmp_threads[] array.
3523     //
3524     // Anyway, that is the reason for moving the check to see if
    // __kmp_max_threads was exceeded into __kmp_reserve_threads()
3526     // instead of having it performed here. -BB
3527     old_tp_cached = __kmp_tp_cached;
3528     __kmp_actual_max_nth =
3529         old_tp_cached ? __kmp_tp_capacity : __kmp_sys_max_nth;
3530     KMP_DEBUG_ASSERT(__kmp_actual_max_nth >= __kmp_threads_capacity);
3531 
3532     /* compute expansion headroom to check if we can expand and whether to aim
3533        for nWish or nNeed */
3534     nTarget = nWish;
3535     if (__kmp_actual_max_nth - __kmp_threads_capacity < nTarget) {
3536       /* can't fulfil nWish, so try nNeed */
3537       if (nNeed) {
3538         nTarget = nNeed;
3539         if (__kmp_actual_max_nth - __kmp_threads_capacity < nTarget) {
3540           /* possible expansion too small -- give up */
3541           break;
3542         }
3543       } else {
3544         /* best-effort */
3545         nTarget = __kmp_actual_max_nth - __kmp_threads_capacity;
3546         if (!nTarget) {
          /* can't expand at all -- give up */
3548           break;
3549         }
3550       }
3551     }
3552     minimumRequiredCapacity = __kmp_threads_capacity + nTarget;
3553 
3554     newCapacity = __kmp_threads_capacity;
3555     do {
3556       newCapacity = newCapacity <= (__kmp_actual_max_nth >> 1)
3557                         ? (newCapacity << 1)
3558                         : __kmp_actual_max_nth;
3559     } while (newCapacity < minimumRequiredCapacity);
3560     newThreads = (kmp_info_t **)__kmp_allocate(
3561         (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity +
3562         CACHE_LINE);
3563     newRoot = (kmp_root_t **)((char *)newThreads +
3564                               sizeof(kmp_info_t *) * newCapacity);
3565     KMP_MEMCPY(newThreads, __kmp_threads,
3566                __kmp_threads_capacity * sizeof(kmp_info_t *));
3567     KMP_MEMCPY(newRoot, __kmp_root,
3568                __kmp_threads_capacity * sizeof(kmp_root_t *));
3569     memset(newThreads + __kmp_threads_capacity, 0,
3570            (newCapacity - __kmp_threads_capacity) * sizeof(kmp_info_t *));
3571     memset(newRoot + __kmp_threads_capacity, 0,
3572            (newCapacity - __kmp_threads_capacity) * sizeof(kmp_root_t *));
3573 
3574     if (!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3575       /* __kmp_tp_cached has changed, i.e. __kmpc_threadprivate_cached has
3576          allocated a threadprivate cache while we were allocating the expanded
3577          array, and our new capacity is larger than the threadprivate cache
3578          capacity, so we should deallocate the expanded arrays and try again.
3579          This is the first check of a double-check pair. */
3580       __kmp_free(newThreads);
3581       continue; /* start over and try again */
3582     }
3583     __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3584     if (!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3585       /* Same check as above, but this time with the lock so we can be sure if
3586          we can succeed. */
3587       __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3588       __kmp_free(newThreads);
3589       continue; /* start over and try again */
3590     } else {
3591       /* success */
3592       // __kmp_free( __kmp_threads ); // ATT: It leads to crash. Need to be
3593       // investigated.
3594       *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3595       *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3596       added += newCapacity - __kmp_threads_capacity;
3597       *(volatile int *)&__kmp_threads_capacity = newCapacity;
3598       __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3599       break; /* succeeded, so we can exit the loop */
3600     }
3601   }
3602   return added;
3603 }
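
// Minimal model of the capacity-growth rule used above (an illustrative
// sketch; the runtime does not call this). Preconditions mirror the checks in
// __kmp_expand_threads: capacity > 0 and required_min <= max_nth, otherwise
// the loop would not terminate.
static inline int __kmp_model_grow_capacity(int capacity, int required_min,
                                            int max_nth) {
  do {
    // double the capacity, clipping at the effective maximum
    capacity = (capacity <= (max_nth >> 1)) ? (capacity << 1) : max_nth;
  } while (capacity < required_min);
  return capacity;
}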
3604 
3605 /* Register the current thread as a root thread and obtain our gtid. We must
   have the __kmp_initz_lock held at this point. Argument TRUE only if we are
   the thread that calls from __kmp_do_serial_initialize() */
3608 int __kmp_register_root(int initial_thread) {
3609   kmp_info_t *root_thread;
3610   kmp_root_t *root;
3611   int gtid;
3612   int capacity;
3613   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3614   KA_TRACE(20, ("__kmp_register_root: entered\n"));
3615   KMP_MB();
3616 
3617   /* 2007-03-02:
     If the initial thread has not invoked the OpenMP RTL yet, and this thread
     is not the initial one, the "__kmp_all_nth >= __kmp_threads_capacity"
     condition does not work as expected -- it may return false (meaning there
     is at least one empty slot in the __kmp_threads array), but it is possible
     that the only free slot is #0, which is reserved for the initial thread
     and so cannot be used for this one. The following code works around this
     bug.

     However, the right solution seems to be not to reserve slot #0 for the
     initial thread, because:
     (1) there is no magic in slot #0, and
     (2) we cannot detect the initial thread reliably (the first thread that
        performs serial initialization may not be the real initial thread).
3630   */
3631   capacity = __kmp_threads_capacity;
3632   if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3633     --capacity;
3634   }; // if
3635 
3636   /* see if there are too many threads */
3637   if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1, 1)) {
3638     if (__kmp_tp_cached) {
3639       __kmp_msg(kmp_ms_fatal, KMP_MSG(CantRegisterNewThread),
3640                 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3641                 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3642     } else {
3643       __kmp_msg(kmp_ms_fatal, KMP_MSG(CantRegisterNewThread),
3644                 KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
3645     }
3646   }; // if
3647 
3648   /* find an available thread slot */
3649   /* Don't reassign the zero slot since we need that to only be used by initial
3650      thread */
3651   for (gtid = (initial_thread ? 0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL;
3652        gtid++)
3653     ;
3654   KA_TRACE(1,
3655            ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3656   KMP_ASSERT(gtid < __kmp_threads_capacity);
3657 
3658   /* update global accounting */
3659   __kmp_all_nth++;
3660   TCW_4(__kmp_nth, __kmp_nth + 1);
3661 
3662   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3663   // numbers of procs, and method #2 (keyed API call) for higher numbers.
3664   if (__kmp_adjust_gtid_mode) {
3665     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3666       if (TCR_4(__kmp_gtid_mode) != 2) {
3667         TCW_4(__kmp_gtid_mode, 2);
3668       }
3669     } else {
3670       if (TCR_4(__kmp_gtid_mode) != 1) {
3671         TCW_4(__kmp_gtid_mode, 1);
3672       }
3673     }
3674   }
3675 
3676 #ifdef KMP_ADJUST_BLOCKTIME
3677   /* Adjust blocktime to zero if necessary            */
3678   /* Middle initialization might not have occurred yet */
3679   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3680     if (__kmp_nth > __kmp_avail_proc) {
3681       __kmp_zero_bt = TRUE;
3682     }
3683   }
3684 #endif /* KMP_ADJUST_BLOCKTIME */
3685 
3686   /* setup this new hierarchy */
3687   if (!(root = __kmp_root[gtid])) {
3688     root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3689     KMP_DEBUG_ASSERT(!root->r.r_root_team);
3690   }
3691 
3692 #if KMP_STATS_ENABLED
3693   // Initialize stats as soon as possible (right after gtid assignment).
3694   __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3695   KMP_START_EXPLICIT_TIMER(OMP_worker_thread_life);
3696   KMP_SET_THREAD_STATE(SERIAL_REGION);
3697   KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3698 #endif
3699   __kmp_initialize_root(root);
3700 
3701   /* setup new root thread structure */
3702   if (root->r.r_uber_thread) {
3703     root_thread = root->r.r_uber_thread;
3704   } else {
3705     root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3706     if (__kmp_storage_map) {
3707       __kmp_print_thread_storage_map(root_thread, gtid);
3708     }
3709     root_thread->th.th_info.ds.ds_gtid = gtid;
3710     root_thread->th.th_root = root;
3711     if (__kmp_env_consistency_check) {
3712       root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3713     }
3714 #if USE_FAST_MEMORY
3715     __kmp_initialize_fast_memory(root_thread);
3716 #endif /* USE_FAST_MEMORY */
3717 
3718 #if KMP_USE_BGET
3719     KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3720     __kmp_initialize_bget(root_thread);
3721 #endif
3722     __kmp_init_random(root_thread); // Initialize random number generator
3723   }
3724 
3725   /* setup the serial team held in reserve by the root thread */
3726   if (!root_thread->th.th_serial_team) {
3727     kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3728     KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3729     root_thread->th.th_serial_team =
3730         __kmp_allocate_team(root, 1, 1,
3731 #if OMPT_SUPPORT
3732                             0, // root parallel id
3733 #endif
3734 #if OMP_40_ENABLED
3735                             proc_bind_default,
3736 #endif
3737                             &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3738   }
3739   KMP_ASSERT(root_thread->th.th_serial_team);
3740   KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3741                 root_thread->th.th_serial_team));
3742 
3743   /* drop root_thread into place */
3744   TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3745 
3746   root->r.r_root_team->t.t_threads[0] = root_thread;
3747   root->r.r_hot_team->t.t_threads[0] = root_thread;
3748   root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
  // AC: the team is created in reserve, not for execution (it is unused for now).
3750   root_thread->th.th_serial_team->t.t_serialized = 0;
3751   root->r.r_uber_thread = root_thread;
3752 
3753   /* initialize the thread, get it ready to go */
3754   __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3755   TCW_4(__kmp_init_gtid, TRUE);
3756 
3757   /* prepare the master thread for get_gtid() */
3758   __kmp_gtid_set_specific(gtid);
3759 
3760 #if USE_ITT_BUILD
3761   __kmp_itt_thread_name(gtid);
3762 #endif /* USE_ITT_BUILD */
3763 
3764 #ifdef KMP_TDATA_GTID
3765   __kmp_gtid = gtid;
3766 #endif
3767   __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3768   KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3769 
3770   KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3771                 "plain=%u\n",
3772                 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3773                 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3774                 KMP_INIT_BARRIER_STATE));
3775   { // Initialize barrier data.
3776     int b;
3777     for (b = 0; b < bs_last_barrier; ++b) {
3778       root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3779 #if USE_DEBUGGER
3780       root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3781 #endif
3782     }; // for
3783   }
3784   KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3785                    KMP_INIT_BARRIER_STATE);
3786 
3787 #if KMP_AFFINITY_SUPPORTED
3788 #if OMP_40_ENABLED
3789   root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3790   root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3791   root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3792   root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3793 #endif
3794 
3795   if (TCR_4(__kmp_init_middle)) {
3796     __kmp_affinity_set_init_mask(gtid, TRUE);
3797   }
3798 #endif /* KMP_AFFINITY_SUPPORTED */
3799 
3800   __kmp_root_counter++;
3801 
3802   KMP_MB();
3803   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3804 
3805   return gtid;
3806 }
3807 
3808 #if KMP_NESTED_HOT_TEAMS
3809 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3810                                 const int max_level) {
3811   int i, n, nth;
3812   kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3813   if (!hot_teams || !hot_teams[level].hot_team) {
3814     return 0;
3815   }
3816   KMP_DEBUG_ASSERT(level < max_level);
3817   kmp_team_t *team = hot_teams[level].hot_team;
3818   nth = hot_teams[level].hot_team_nth;
3819   n = nth - 1; // master is not freed
3820   if (level < max_level - 1) {
3821     for (i = 0; i < nth; ++i) {
3822       kmp_info_t *th = team->t.t_threads[i];
3823       n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3824       if (i > 0 && th->th.th_hot_teams) {
3825         __kmp_free(th->th.th_hot_teams);
3826         th->th.th_hot_teams = NULL;
3827       }
3828     }
3829   }
3830   __kmp_free_team(root, team, NULL);
3831   return n;
3832 }
3833 #endif
3834 
// Resets a root thread and clears its root and hot teams.
3836 // Returns the number of __kmp_threads entries directly and indirectly freed.
3837 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3838   kmp_team_t *root_team = root->r.r_root_team;
3839   kmp_team_t *hot_team = root->r.r_hot_team;
3840   int n = hot_team->t.t_nproc;
3841   int i;
3842 
3843   KMP_DEBUG_ASSERT(!root->r.r_active);
3844 
3845   root->r.r_root_team = NULL;
3846   root->r.r_hot_team = NULL;
3847   // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3848   // before call to __kmp_free_team().
3849   __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3850 #if KMP_NESTED_HOT_TEAMS
3851   if (__kmp_hot_teams_max_level >
3852       0) { // need to free nested hot teams and their threads if any
3853     for (i = 0; i < hot_team->t.t_nproc; ++i) {
3854       kmp_info_t *th = hot_team->t.t_threads[i];
3855       if (__kmp_hot_teams_max_level > 1) {
3856         n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3857       }
3858       if (th->th.th_hot_teams) {
3859         __kmp_free(th->th.th_hot_teams);
3860         th->th.th_hot_teams = NULL;
3861       }
3862     }
3863   }
3864 #endif
3865   __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3866 
3867   // Before we can reap the thread, we need to make certain that all other
3868   // threads in the teams that had this root as ancestor have stopped trying to
3869   // steal tasks.
3870   if (__kmp_tasking_mode != tskm_immediate_exec) {
3871     __kmp_wait_to_unref_task_teams();
3872   }
3873 
3874 #if KMP_OS_WINDOWS
3875   /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3876   KA_TRACE(
3877       10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3878            "\n",
3879            (LPVOID) & (root->r.r_uber_thread->th),
3880            root->r.r_uber_thread->th.th_info.ds.ds_thread));
3881   __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3882 #endif /* KMP_OS_WINDOWS */
3883 
3884 #if OMPT_SUPPORT
3885   if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_thread_end)) {
3886     int gtid = __kmp_get_gtid();
3887     __ompt_thread_end(ompt_thread_initial, gtid);
3888   }
3889 #endif
3890 
3891   TCW_4(__kmp_nth,
3892         __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3893   __kmp_reap_thread(root->r.r_uber_thread, 1);
3894 
  // We cannot put the root thread into __kmp_thread_pool, so we have to reap it
  // instead of freeing it.
3897   root->r.r_uber_thread = NULL;
3898   /* mark root as no longer in use */
3899   root->r.r_begin = FALSE;
3900 
3901   return n;
3902 }
3903 
3904 void __kmp_unregister_root_current_thread(int gtid) {
3905   KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
3906   /* this lock should be ok, since unregister_root_current_thread is never
3907      called during an abort, only during a normal close. furthermore, if you
3908      have the forkjoin lock, you should never try to get the initz lock */
3909   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3910   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
3911     KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
3912                   "exiting T#%d\n",
3913                   gtid));
3914     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3915     return;
3916   }
3917   kmp_root_t *root = __kmp_root[gtid];
3918 
3919   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3920   KMP_ASSERT(KMP_UBER_GTID(gtid));
3921   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3922   KMP_ASSERT(root->r.r_active == FALSE);
3923 
3924   KMP_MB();
3925 
3926 #if OMP_45_ENABLED
3927   kmp_info_t *thread = __kmp_threads[gtid];
3928   kmp_team_t *team = thread->th.th_team;
3929   kmp_task_team_t *task_team = thread->th.th_task_team;
3930 
3931   // we need to wait for the proxy tasks before finishing the thread
3932   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
3933 #if OMPT_SUPPORT
3934     // the runtime is shutting down so we won't report any events
3935     thread->th.ompt_thread_info.state = ompt_state_undefined;
3936 #endif
3937     __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
3938   }
3939 #endif
3940 
3941   __kmp_reset_root(gtid, root);
3942 
3943   /* free up this thread slot */
3944   __kmp_gtid_set_specific(KMP_GTID_DNE);
3945 #ifdef KMP_TDATA_GTID
3946   __kmp_gtid = KMP_GTID_DNE;
3947 #endif
3948 
3949   KMP_MB();
3950   KC_TRACE(10,
3951            ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
3952 
3953   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3954 }
3955 
3956 #if KMP_OS_WINDOWS
3957 /* __kmp_forkjoin_lock must be already held
3958    Unregisters a root thread that is not the current thread.  Returns the number
3959    of __kmp_threads entries freed as a result. */
3960 static int __kmp_unregister_root_other_thread(int gtid) {
3961   kmp_root_t *root = __kmp_root[gtid];
3962   int r;
3963 
3964   KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
3965   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3966   KMP_ASSERT(KMP_UBER_GTID(gtid));
3967   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3968   KMP_ASSERT(root->r.r_active == FALSE);
3969 
3970   r = __kmp_reset_root(gtid, root);
3971   KC_TRACE(10,
3972            ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
3973   return r;
3974 }
3975 #endif
3976 
3977 #if KMP_DEBUG
3978 void __kmp_task_info() {
3979 
3980   kmp_int32 gtid = __kmp_entry_gtid();
3981   kmp_int32 tid = __kmp_tid_from_gtid(gtid);
3982   kmp_info_t *this_thr = __kmp_threads[gtid];
3983   kmp_team_t *steam = this_thr->th.th_serial_team;
3984   kmp_team_t *team = this_thr->th.th_team;
3985 
3986   __kmp_printf("__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p curtask=%p "
3987                "ptask=%p\n",
3988                gtid, tid, this_thr, team, this_thr->th.th_current_task,
3989                team->t.t_implicit_task_taskdata[tid].td_parent);
3990 }
3991 #endif // KMP_DEBUG
3992 
3993 /* TODO optimize with one big memclr, take out what isn't needed, split
3994    responsibility to workers as much as possible, and delay initialization of
3995    features as much as possible  */
3996 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
3997                                   int tid, int gtid) {
3998   /* this_thr->th.th_info.ds.ds_gtid is setup in
3999      kmp_allocate_thread/create_worker.
4000      this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4001   kmp_info_t *master = team->t.t_threads[0];
4002   KMP_DEBUG_ASSERT(this_thr != NULL);
4003   KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4004   KMP_DEBUG_ASSERT(team);
4005   KMP_DEBUG_ASSERT(team->t.t_threads);
4006   KMP_DEBUG_ASSERT(team->t.t_dispatch);
4007   KMP_DEBUG_ASSERT(master);
4008   KMP_DEBUG_ASSERT(master->th.th_root);
4009 
4010   KMP_MB();
4011 
4012   TCW_SYNC_PTR(this_thr->th.th_team, team);
4013 
4014   this_thr->th.th_info.ds.ds_tid = tid;
4015   this_thr->th.th_set_nproc = 0;
4016   if (__kmp_tasking_mode != tskm_immediate_exec)
4017     // When tasking is possible, threads are not safe to reap until they are
4018     // done tasking; this will be set when tasking code is exited in wait
4019     this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4020   else // no tasking --> always safe to reap
4021     this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4022 #if OMP_40_ENABLED
4023   this_thr->th.th_set_proc_bind = proc_bind_default;
4024 #if KMP_AFFINITY_SUPPORTED
4025   this_thr->th.th_new_place = this_thr->th.th_current_place;
4026 #endif
4027 #endif
4028   this_thr->th.th_root = master->th.th_root;
4029 
4030   /* setup the thread's cache of the team structure */
4031   this_thr->th.th_team_nproc = team->t.t_nproc;
4032   this_thr->th.th_team_master = master;
4033   this_thr->th.th_team_serialized = team->t.t_serialized;
4034   TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4035 
4036   KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4037 
4038   KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4039                 tid, gtid, this_thr, this_thr->th.th_current_task));
4040 
4041   __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4042                            team, tid, TRUE);
4043 
4044   KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4045                 tid, gtid, this_thr, this_thr->th.th_current_task));
4046   // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4047   // __kmp_initialize_team()?
4048 
4049   /* TODO no worksharing in speculative threads */
4050   this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4051 
4052   this_thr->th.th_local.this_construct = 0;
4053 
4054 #ifdef BUILD_TV
4055   this_thr->th.th_local.tv_data = 0;
4056 #endif
4057 
4058   if (!this_thr->th.th_pri_common) {
4059     this_thr->th.th_pri_common =
4060         (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4061     if (__kmp_storage_map) {
4062       __kmp_print_storage_map_gtid(
4063           gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4064           sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4065     }; // if
4066     this_thr->th.th_pri_head = NULL;
4067   }; // if
4068 
4069   /* Initialize dynamic dispatch */
4070   {
4071     volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4072     // Use team max_nproc since this will never change for the team.
4073     size_t disp_size =
4074         sizeof(dispatch_private_info_t) *
4075         (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4076     KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4077                   team->t.t_max_nproc));
4078     KMP_ASSERT(dispatch);
4079     KMP_DEBUG_ASSERT(team->t.t_dispatch);
4080     KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4081 
4082     dispatch->th_disp_index = 0;
4083 #if OMP_45_ENABLED
4084     dispatch->th_doacross_buf_idx = 0;
4085 #endif
4086     if (!dispatch->th_disp_buffer) {
4087       dispatch->th_disp_buffer =
4088           (dispatch_private_info_t *)__kmp_allocate(disp_size);
4089 
4090       if (__kmp_storage_map) {
4091         __kmp_print_storage_map_gtid(
4092             gtid, &dispatch->th_disp_buffer[0],
4093             &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4094                                           ? 1
4095                                           : __kmp_dispatch_num_buffers],
4096             disp_size, "th_%d.th_dispatch.th_disp_buffer "
4097                        "(team_%d.t_dispatch[%d].th_disp_buffer)",
4098             gtid, team->t.t_id, gtid);
4099       }
4100     } else {
4101       memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4102     }
4103 
4104     dispatch->th_dispatch_pr_current = 0;
4105     dispatch->th_dispatch_sh_current = 0;
4106 
4107     dispatch->th_deo_fcn = 0; /* ORDERED     */
4108     dispatch->th_dxo_fcn = 0; /* END ORDERED */
4109   }
4110 
4111   this_thr->th.th_next_pool = NULL;
4112 
4113   if (!this_thr->th.th_task_state_memo_stack) {
4114     size_t i;
4115     this_thr->th.th_task_state_memo_stack =
4116         (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4117     this_thr->th.th_task_state_top = 0;
4118     this_thr->th.th_task_state_stack_sz = 4;
4119     for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4120          ++i) // zero init the stack
4121       this_thr->th.th_task_state_memo_stack[i] = 0;
4122   }
4123 
4124   KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4125   KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4126 
4127   KMP_MB();
4128 }
4129 
/* Allocate a new thread for the requesting team. This is only called from
   within a forkjoin critical section. We first try to get an available thread
   from the thread pool; if none is available, we fork a new one, which should
   always be possible because the caller is expected to check for sufficient
   capacity first. */
4135 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4136                                   int new_tid) {
4137   kmp_team_t *serial_team;
4138   kmp_info_t *new_thr;
4139   int new_gtid;
4140 
4141   KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4142   KMP_DEBUG_ASSERT(root && team);
4143 #if !KMP_NESTED_HOT_TEAMS
4144   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4145 #endif
4146   KMP_MB();
4147 
4148   /* first, try to get one from the thread pool */
4149   if (__kmp_thread_pool) {
4150 
4151     new_thr = (kmp_info_t *)__kmp_thread_pool;
4152     __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4153     if (new_thr == __kmp_thread_pool_insert_pt) {
4154       __kmp_thread_pool_insert_pt = NULL;
4155     }
4156     TCW_4(new_thr->th.th_in_pool, FALSE);
4157     // Don't touch th_active_in_pool or th_active.
4158     // The worker thread adjusts those flags as it sleeps/awakens.
4159     __kmp_thread_pool_nth--;
4160 
4161     KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4162                   __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4163     KMP_ASSERT(!new_thr->th.th_team);
4164     KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4165     KMP_DEBUG_ASSERT(__kmp_thread_pool_nth >= 0);
4166 
4167     /* setup the thread structure */
4168     __kmp_initialize_info(new_thr, team, new_tid,
4169                           new_thr->th.th_info.ds.ds_gtid);
4170     KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4171 
4172     TCW_4(__kmp_nth, __kmp_nth + 1);
4173 
4174     new_thr->th.th_task_state = 0;
4175     new_thr->th.th_task_state_top = 0;
4176     new_thr->th.th_task_state_stack_sz = 4;
4177 
4178 #ifdef KMP_ADJUST_BLOCKTIME
4179     /* Adjust blocktime back to zero if necessary */
4180     /* Middle initialization might not have occurred yet */
4181     if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4182       if (__kmp_nth > __kmp_avail_proc) {
4183         __kmp_zero_bt = TRUE;
4184       }
4185     }
4186 #endif /* KMP_ADJUST_BLOCKTIME */
4187 
4188 #if KMP_DEBUG
    // If the thread entered the pool via __kmp_free_thread, wait_flag should
    // not be KMP_BARRIER_PARENT_FLAG.
4191     int b;
4192     kmp_balign_t *balign = new_thr->th.th_bar;
4193     for (b = 0; b < bs_last_barrier; ++b)
4194       KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4195 #endif
4196 
4197     KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4198                   __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4199 
4200     KMP_MB();
4201     return new_thr;
4202   }
4203 
  /* no thread available in the pool, so fork a new one */
4205   KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4206   KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4207 
4208 #if KMP_USE_MONITOR
4209   // If this is the first worker thread the RTL is creating, then also
4210   // launch the monitor thread.  We try to do this as early as possible.
4211   if (!TCR_4(__kmp_init_monitor)) {
4212     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4213     if (!TCR_4(__kmp_init_monitor)) {
4214       KF_TRACE(10, ("before __kmp_create_monitor\n"));
4215       TCW_4(__kmp_init_monitor, 1);
4216       __kmp_create_monitor(&__kmp_monitor);
4217       KF_TRACE(10, ("after __kmp_create_monitor\n"));
4218 #if KMP_OS_WINDOWS
4219       // AC: wait until monitor has started. This is a fix for CQ232808.
      // The reason is that if the library is loaded/unloaded in a loop with
      // small (parallel) work in between, then there is a high probability
      // that the monitor thread starts after the library shutdown. At
      // shutdown it is too late to cope with the problem, because when the
      // master is in DllMain (process detach) the monitor has no chance to
      // start (it is blocked), and the master has no means to inform the
      // monitor that the library has gone, because all the memory the monitor
      // can access is going to be released/reset.
4228       while (TCR_4(__kmp_init_monitor) < 2) {
4229         KMP_YIELD(TRUE);
4230       }
4231       KF_TRACE(10, ("after monitor thread has started\n"));
4232 #endif
4233     }
4234     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4235   }
4236 #endif
4237 
4238   KMP_MB();
4239   for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
4240     KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4241   }
4242 
4243   /* allocate space for it. */
4244   new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4245 
4246   TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4247 
4248   if (__kmp_storage_map) {
4249     __kmp_print_thread_storage_map(new_thr, new_gtid);
4250   }
4251 
4252   // add the reserve serialized team, initialized from the team's master thread
4253   {
4254     kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4255     KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4256     new_thr->th.th_serial_team = serial_team =
4257         (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4258 #if OMPT_SUPPORT
4259                                           0, // root parallel id
4260 #endif
4261 #if OMP_40_ENABLED
4262                                           proc_bind_default,
4263 #endif
4264                                           &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
4265   }
4266   KMP_ASSERT(serial_team);
  serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not
  // for execution (it is unused for now).
4269   serial_team->t.t_threads[0] = new_thr;
4270   KF_TRACE(10,
4271            ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4272             new_thr));
4273 
4274   /* setup the thread structures */
4275   __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4276 
4277 #if USE_FAST_MEMORY
4278   __kmp_initialize_fast_memory(new_thr);
4279 #endif /* USE_FAST_MEMORY */
4280 
4281 #if KMP_USE_BGET
4282   KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4283   __kmp_initialize_bget(new_thr);
4284 #endif
4285 
4286   __kmp_init_random(new_thr); // Initialize random number generator
4287 
4288   /* Initialize these only once when thread is grabbed for a team allocation */
4289   KA_TRACE(20,
4290            ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4291             __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4292 
4293   int b;
4294   kmp_balign_t *balign = new_thr->th.th_bar;
4295   for (b = 0; b < bs_last_barrier; ++b) {
4296     balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4297     balign[b].bb.team = NULL;
4298     balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4299     balign[b].bb.use_oncore_barrier = 0;
4300   }
4301 
4302   new_thr->th.th_spin_here = FALSE;
4303   new_thr->th.th_next_waiting = 0;
4304 
4305 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4306   new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4307   new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4308   new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4309   new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4310 #endif
4311 
4312   TCW_4(new_thr->th.th_in_pool, FALSE);
4313   new_thr->th.th_active_in_pool = FALSE;
4314   TCW_4(new_thr->th.th_active, TRUE);
4315 
4316   /* adjust the global counters */
4317   __kmp_all_nth++;
4318   __kmp_nth++;
4319 
4320   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4321   // numbers of procs, and method #2 (keyed API call) for higher numbers.
4322   if (__kmp_adjust_gtid_mode) {
4323     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4324       if (TCR_4(__kmp_gtid_mode) != 2) {
4325         TCW_4(__kmp_gtid_mode, 2);
4326       }
4327     } else {
4328       if (TCR_4(__kmp_gtid_mode) != 1) {
4329         TCW_4(__kmp_gtid_mode, 1);
4330       }
4331     }
4332   }
4333 
4334 #ifdef KMP_ADJUST_BLOCKTIME
4335   /* Adjust blocktime back to zero if necessary       */
4336   /* Middle initialization might not have occurred yet */
4337   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4338     if (__kmp_nth > __kmp_avail_proc) {
4339       __kmp_zero_bt = TRUE;
4340     }
4341   }
4342 #endif /* KMP_ADJUST_BLOCKTIME */
4343 
4344   /* actually fork it and create the new worker thread */
4345   KF_TRACE(
4346       10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4347   __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4348   KF_TRACE(10,
4349            ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4350 
4351   KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4352                 new_gtid));
4353   KMP_MB();
4354   return new_thr;
4355 }
4356 
4357 /* Reinitialize team for reuse.
   The hot team code calls this routine at every fork barrier, so EPCC barrier
   tests are extremely sensitive to changes in it, especially writes to the
   team struct, which cause a cache invalidation in all threads.
4361    IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4362 static void __kmp_reinitialize_team(kmp_team_t *team,
4363                                     kmp_internal_control_t *new_icvs,
4364                                     ident_t *loc) {
4365   KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4366                 team->t.t_threads[0], team));
4367   KMP_DEBUG_ASSERT(team && new_icvs);
4368   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4369   KMP_CHECK_UPDATE(team->t.t_ident, loc);
4370 
4371   KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4372   // Copy ICVs to the master thread's implicit taskdata
4373   __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4374   copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4375 
4376   KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4377                 team->t.t_threads[0], team));
4378 }
4379 
4380 /* Initialize the team data structure.
4381    This assumes the t_threads and t_max_nproc are already set.
4382    Also, we don't touch the arguments */
4383 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4384                                   kmp_internal_control_t *new_icvs,
4385                                   ident_t *loc) {
4386   KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4387 
4388   /* verify */
4389   KMP_DEBUG_ASSERT(team);
4390   KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4391   KMP_DEBUG_ASSERT(team->t.t_threads);
4392   KMP_MB();
4393 
4394   team->t.t_master_tid = 0; /* not needed */
4395   /* team->t.t_master_bar;        not needed */
4396   team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4397   team->t.t_nproc = new_nproc;
4398 
4399   /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
4400   team->t.t_next_pool = NULL;
4401   /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4402    * up hot team */
4403 
4404   TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4405   team->t.t_invoke = NULL; /* not needed */
4406 
4407   // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
4408   team->t.t_sched = new_icvs->sched;
4409 
4410 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4411   team->t.t_fp_control_saved = FALSE; /* not needed */
4412   team->t.t_x87_fpu_control_word = 0; /* not needed */
4413   team->t.t_mxcsr = 0; /* not needed */
4414 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4415 
4416   team->t.t_construct = 0;
4417   __kmp_init_lock(&team->t.t_single_lock);
4418 
4419   team->t.t_ordered.dt.t_value = 0;
4420   team->t.t_master_active = FALSE;
4421 
4422   memset(&team->t.t_taskq, '\0', sizeof(kmp_taskq_t));
4423 
4424 #ifdef KMP_DEBUG
4425   team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4426 #endif
4427   team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4428 
4429   team->t.t_control_stack_top = NULL;
4430 
4431   __kmp_reinitialize_team(team, new_icvs, loc);
4432 
4433   KMP_MB();
4434   KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4435 }
4436 
4437 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
/* Sets the full affinity mask for the thread and returns the old mask; does
   not modify any runtime structures. */
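/* Illustrative usage, mirroring the hot-team growth path in
   __kmp_allocate_team() below (not a new API):

     kmp_affin_mask_t *old_mask;
     KMP_CPU_ALLOC(old_mask);
     __kmp_set_thread_affinity_mask_full_tmp(old_mask); // widen master's mask
     ... create the worker threads, which inherit the full mask ...
     __kmp_set_system_affinity(old_mask, TRUE); // restore master's mask
     KMP_CPU_FREE(old_mask);
*/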
4439 static void
4440 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4441   if (KMP_AFFINITY_CAPABLE()) {
4442     int status;
4443     if (old_mask != NULL) {
4444       status = __kmp_get_system_affinity(old_mask, TRUE);
4445       int error = errno;
4446       if (status != 0) {
4447         __kmp_msg(kmp_ms_fatal, KMP_MSG(ChangeThreadAffMaskError),
4448                   KMP_ERR(error), __kmp_msg_null);
4449       }
4450     }
4451     __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4452   }
4453 }
4454 #endif
4455 
4456 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4457 
4458 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
// It calculates the worker + master thread's partition based upon the parent
// thread's partition, and binds each worker to a place in its partition.
4461 // The master thread's partition should already include its current binding.
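// For illustration (assuming an affinity setup with places 0..7): with
// proc_bind_close, a 4-thread team whose master is bound to place 2 with
// partition [0,7] gets its workers bound to places 3, 4 and 5, each keeping
// the full [0,7] partition. With proc_bind_spread, the partition itself is
// subdivided so each thread receives a disjoint sub-range of places.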
4462 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
  // Copy the master thread's place partition to the team struct
4464   kmp_info_t *master_th = team->t.t_threads[0];
4465   KMP_DEBUG_ASSERT(master_th != NULL);
4466   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4467   int first_place = master_th->th.th_first_place;
4468   int last_place = master_th->th.th_last_place;
4469   int masters_place = master_th->th.th_current_place;
4470   team->t.t_first_place = first_place;
4471   team->t.t_last_place = last_place;
4472 
4473   KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4474                 "bound to place %d partition = [%d,%d]\n",
4475                 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4476                 team->t.t_id, masters_place, first_place, last_place));
4477 
4478   switch (proc_bind) {
4479 
4480   case proc_bind_default:
    // serial teams might have the proc_bind policy set to proc_bind_default.
    // It doesn't matter, as we don't rebind the master thread for any
    // proc_bind policy.
4483     KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4484     break;
4485 
4486   case proc_bind_master: {
4487     int f;
4488     int n_th = team->t.t_nproc;
4489     for (f = 1; f < n_th; f++) {
4490       kmp_info_t *th = team->t.t_threads[f];
4491       KMP_DEBUG_ASSERT(th != NULL);
4492       th->th.th_first_place = first_place;
4493       th->th.th_last_place = last_place;
4494       th->th.th_new_place = masters_place;
4495 
4496       KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
4497                      "partition = [%d,%d]\n",
4498                      __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4499                      f, masters_place, first_place, last_place));
4500     }
4501   } break;
4502 
4503   case proc_bind_close: {
4504     int f;
4505     int n_th = team->t.t_nproc;
4506     int n_places;
4507     if (first_place <= last_place) {
4508       n_places = last_place - first_place + 1;
4509     } else {
4510       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4511     }
4512     if (n_th <= n_places) {
4513       int place = masters_place;
4514       for (f = 1; f < n_th; f++) {
4515         kmp_info_t *th = team->t.t_threads[f];
4516         KMP_DEBUG_ASSERT(th != NULL);
4517 
4518         if (place == last_place) {
4519           place = first_place;
4520         } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4521           place = 0;
4522         } else {
4523           place++;
4524         }
4525         th->th.th_first_place = first_place;
4526         th->th.th_last_place = last_place;
4527         th->th.th_new_place = place;
4528 
4529         KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4530                        "partition = [%d,%d]\n",
4531                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4532                        team->t.t_id, f, place, first_place, last_place));
4533       }
4534     } else {
4535       int S, rem, gap, s_count;
4536       S = n_th / n_places;
4537       s_count = 0;
4538       rem = n_th - (S * n_places);
4539       gap = rem > 0 ? n_places / rem : n_places;
4540       int place = masters_place;
4541       int gap_ct = gap;
4542       for (f = 0; f < n_th; f++) {
4543         kmp_info_t *th = team->t.t_threads[f];
4544         KMP_DEBUG_ASSERT(th != NULL);
4545 
4546         th->th.th_first_place = first_place;
4547         th->th.th_last_place = last_place;
4548         th->th.th_new_place = place;
4549         s_count++;
4550 
4551         if ((s_count == S) && rem && (gap_ct == gap)) {
4552           // do nothing, add an extra thread to place on next iteration
4553         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4554           // we added an extra thread to this place; move to next place
4555           if (place == last_place) {
4556             place = first_place;
4557           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4558             place = 0;
4559           } else {
4560             place++;
4561           }
4562           s_count = 0;
4563           gap_ct = 1;
4564           rem--;
4565         } else if (s_count == S) { // place full; don't add extra
4566           if (place == last_place) {
4567             place = first_place;
4568           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4569             place = 0;
4570           } else {
4571             place++;
4572           }
4573           gap_ct++;
4574           s_count = 0;
4575         }
4576 
4577         KA_TRACE(100,
4578                  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4579                   "partition = [%d,%d]\n",
4580                   __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4581                   th->th.th_new_place, first_place, last_place));
4582       }
4583       KMP_DEBUG_ASSERT(place == masters_place);
4584     }
4585   } break;
4586 
4587   case proc_bind_spread: {
4588     int f;
4589     int n_th = team->t.t_nproc;
4590     int n_places;
4591     int thidx;
4592     if (first_place <= last_place) {
4593       n_places = last_place - first_place + 1;
4594     } else {
4595       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4596     }
4597     if (n_th <= n_places) {
4598       int place = masters_place;
4599       int S = n_places / n_th;
4600       int s_count, rem, gap, gap_ct;
4601       rem = n_places - n_th * S;
4602       gap = rem ? n_th / rem : 1;
4603       gap_ct = gap;
4604       thidx = n_th;
4605       if (update_master_only == 1)
4606         thidx = 1;
4607       for (f = 0; f < thidx; f++) {
4608         kmp_info_t *th = team->t.t_threads[f];
4609         KMP_DEBUG_ASSERT(th != NULL);
4610 
4611         th->th.th_first_place = place;
4612         th->th.th_new_place = place;
4613         s_count = 1;
4614         while (s_count < S) {
4615           if (place == last_place) {
4616             place = first_place;
4617           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4618             place = 0;
4619           } else {
4620             place++;
4621           }
4622           s_count++;
4623         }
4624         if (rem && (gap_ct == gap)) {
4625           if (place == last_place) {
4626             place = first_place;
4627           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4628             place = 0;
4629           } else {
4630             place++;
4631           }
4632           rem--;
4633           gap_ct = 0;
4634         }
4635         th->th.th_last_place = place;
4636         gap_ct++;
4637 
4638         if (place == last_place) {
4639           place = first_place;
4640         } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4641           place = 0;
4642         } else {
4643           place++;
4644         }
4645 
4646         KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4647                        "partition = [%d,%d]\n",
4648                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4649                        team->t.t_id, f, th->th.th_new_place,
4650                        th->th.th_first_place, th->th.th_last_place));
4651       }
4652       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4653     } else {
4654       int S, rem, gap, s_count;
4655       S = n_th / n_places;
4656       s_count = 0;
4657       rem = n_th - (S * n_places);
4658       gap = rem > 0 ? n_places / rem : n_places;
4659       int place = masters_place;
4660       int gap_ct = gap;
4661       thidx = n_th;
4662       if (update_master_only == 1)
4663         thidx = 1;
4664       for (f = 0; f < thidx; f++) {
4665         kmp_info_t *th = team->t.t_threads[f];
4666         KMP_DEBUG_ASSERT(th != NULL);
4667 
4668         th->th.th_first_place = place;
4669         th->th.th_last_place = place;
4670         th->th.th_new_place = place;
4671         s_count++;
4672 
4673         if ((s_count == S) && rem && (gap_ct == gap)) {
4674           // do nothing, add an extra thread to place on next iteration
4675         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4676           // we added an extra thread to this place; move on to next place
4677           if (place == last_place) {
4678             place = first_place;
4679           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4680             place = 0;
4681           } else {
4682             place++;
4683           }
4684           s_count = 0;
4685           gap_ct = 1;
4686           rem--;
4687         } else if (s_count == S) { // place is full; don't add extra thread
4688           if (place == last_place) {
4689             place = first_place;
4690           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4691             place = 0;
4692           } else {
4693             place++;
4694           }
4695           gap_ct++;
4696           s_count = 0;
4697         }
4698 
4699         KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4700                        "partition = [%d,%d]\n",
4701                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4702                        team->t.t_id, f, th->th.th_new_place,
4703                        th->th.th_first_place, th->th.th_last_place));
4704       }
4705       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4706     }
4707   } break;
4708 
4709   default:
4710     break;
4711   }
4712 
4713   KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4714 }
4715 
4716 #endif /* OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED */
4717 
/* Allocate a new team data structure to use; take one off the free pool if
   available. */
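/* Order of preference when satisfying a request:
     1) reuse the root's "hot" team (or the per-level hot team when
        KMP_NESTED_HOT_TEAMS is enabled),
     2) take a sufficiently large team from __kmp_team_pool,
     3) allocate and initialize a brand new kmp_team_t. */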
4720 kmp_team_t *
4721 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4722 #if OMPT_SUPPORT
4723                     ompt_parallel_id_t ompt_parallel_id,
4724 #endif
4725 #if OMP_40_ENABLED
4726                     kmp_proc_bind_t new_proc_bind,
4727 #endif
4728                     kmp_internal_control_t *new_icvs,
4729                     int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4730   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4731   int f;
4732   kmp_team_t *team;
4733   int use_hot_team = !root->r.r_active;
4734   int level = 0;
4735 
4736   KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4737   KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4738   KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4739   KMP_MB();
4740 
4741 #if KMP_NESTED_HOT_TEAMS
4742   kmp_hot_team_ptr_t *hot_teams;
4743   if (master) {
4744     team = master->th.th_team;
4745     level = team->t.t_active_level;
4746     if (master->th.th_teams_microtask) { // in teams construct?
4747       if (master->th.th_teams_size.nteams > 1 &&
4748           ( // #teams > 1
4749               team->t.t_pkfn ==
4750                   (microtask_t)__kmp_teams_master || // inner fork of the teams
4751               master->th.th_teams_level <
4752                   team->t.t_level)) { // or nested parallel inside the teams
        ++level; // don't increment if #teams==1, or for the outer fork of the
        // teams; increment otherwise
4755       }
4756     }
4757     hot_teams = master->th.th_hot_teams;
4758     if (level < __kmp_hot_teams_max_level && hot_teams &&
4759         hot_teams[level]
4760             .hot_team) { // hot team has already been allocated for given level
4761       use_hot_team = 1;
4762     } else {
4763       use_hot_team = 0;
4764     }
4765   }
4766 #endif
4767   // Optimization to use a "hot" team
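  // A "hot" team is kept alive across parallel regions (the root's r_hot_team,
  // or hot_teams[level] with nested hot teams) so that its threads and arrays
  // can be reused without reallocation; only the pieces that actually changed
  // (size, ICVs, proc_bind) are updated below.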
4768   if (use_hot_team && new_nproc > 1) {
4769     KMP_DEBUG_ASSERT(new_nproc == max_nproc);
4770 #if KMP_NESTED_HOT_TEAMS
4771     team = hot_teams[level].hot_team;
4772 #else
4773     team = root->r.r_hot_team;
4774 #endif
4775 #if KMP_DEBUG
4776     if (__kmp_tasking_mode != tskm_immediate_exec) {
4777       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
4778                     "task_team[1] = %p before reinit\n",
4779                     team->t.t_task_team[0], team->t.t_task_team[1]));
4780     }
4781 #endif
4782 
4783     // Has the number of threads changed?
4784     /* Let's assume the most common case is that the number of threads is
4785        unchanged, and put that case first. */
4786     if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4787       KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
4788       // This case can mean that omp_set_num_threads() was called and the hot
4789       // team size was already reduced, so we check the special flag
4790       if (team->t.t_size_changed == -1) {
4791         team->t.t_size_changed = 1;
4792       } else {
4793         KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
4794       }
4795 
4796       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4797       kmp_r_sched_t new_sched = new_icvs->sched;
4798       if (team->t.t_sched.r_sched_type != new_sched.r_sched_type ||
4799           team->t.t_sched.chunk != new_sched.chunk)
4800         team->t.t_sched =
4801             new_sched; // set master's schedule as new run-time schedule
4802 
4803       __kmp_reinitialize_team(team, new_icvs,
4804                               root->r.r_uber_thread->th.th_ident);
4805 
4806       KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
4807                     team->t.t_threads[0], team));
4808       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
4809 
4810 #if OMP_40_ENABLED
4811 #if KMP_AFFINITY_SUPPORTED
4812       if ((team->t.t_size_changed == 0) &&
4813           (team->t.t_proc_bind == new_proc_bind)) {
4814         if (new_proc_bind == proc_bind_spread) {
4815           __kmp_partition_places(
4816               team, 1); // add flag to update only master for spread
4817         }
4818         KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
4819                        "proc_bind = %d, partition = [%d,%d]\n",
4820                        team->t.t_id, new_proc_bind, team->t.t_first_place,
4821                        team->t.t_last_place));
4822       } else {
4823         KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4824         __kmp_partition_places(team);
4825       }
4826 #else
4827       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4828 #endif /* KMP_AFFINITY_SUPPORTED */
4829 #endif /* OMP_40_ENABLED */
4830     } else if (team->t.t_nproc > new_nproc) {
4831       KA_TRACE(20,
4832                ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
4833                 new_nproc));
4834 
4835       team->t.t_size_changed = 1;
4836 #if KMP_NESTED_HOT_TEAMS
4837       if (__kmp_hot_teams_mode == 0) {
4838         // AC: saved number of threads should correspond to team's value in this
4839         // mode, can be bigger in mode 1, when hot team has threads in reserve
4840         KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
4841         hot_teams[level].hot_team_nth = new_nproc;
4842 #endif // KMP_NESTED_HOT_TEAMS
4843         /* release the extra threads we don't need any more */
4844         for (f = new_nproc; f < team->t.t_nproc; f++) {
4845           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
4846           if (__kmp_tasking_mode != tskm_immediate_exec) {
4847             // When decreasing team size, threads no longer in the team should
4848             // unref task team.
4849             team->t.t_threads[f]->th.th_task_team = NULL;
4850           }
4851           __kmp_free_thread(team->t.t_threads[f]);
4852           team->t.t_threads[f] = NULL;
4853         }
4854 #if KMP_NESTED_HOT_TEAMS
4855       } // (__kmp_hot_teams_mode == 0)
4856       else {
4857         // When keeping extra threads in team, switch threads to wait on own
4858         // b_go flag
4859         for (f = new_nproc; f < team->t.t_nproc; ++f) {
4860           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
4861           kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
4862           for (int b = 0; b < bs_last_barrier; ++b) {
4863             if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
4864               balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
4865             }
4866             KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
4867           }
4868         }
4869       }
4870 #endif // KMP_NESTED_HOT_TEAMS
4871       team->t.t_nproc = new_nproc;
4872       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4873       if (team->t.t_sched.r_sched_type != new_icvs->sched.r_sched_type ||
4874           team->t.t_sched.chunk != new_icvs->sched.chunk)
4875         team->t.t_sched = new_icvs->sched;
4876       __kmp_reinitialize_team(team, new_icvs,
4877                               root->r.r_uber_thread->th.th_ident);
4878 
4879       /* update the remaining threads */
4880       for (f = 0; f < new_nproc; ++f) {
4881         team->t.t_threads[f]->th.th_team_nproc = new_nproc;
4882       }
4883       // restore the current task state of the master thread: should be the
4884       // implicit task
4885       KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
4886                     team->t.t_threads[0], team));
4887 
4888       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
4889 
4890 #ifdef KMP_DEBUG
4891       for (f = 0; f < team->t.t_nproc; f++) {
4892         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
4893                          team->t.t_threads[f]->th.th_team_nproc ==
4894                              team->t.t_nproc);
4895       }
4896 #endif
4897 
4898 #if OMP_40_ENABLED
4899       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4900 #if KMP_AFFINITY_SUPPORTED
4901       __kmp_partition_places(team);
4902 #endif
4903 #endif
4904     } else { // team->t.t_nproc < new_nproc
4905 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4906       kmp_affin_mask_t *old_mask;
4907       if (KMP_AFFINITY_CAPABLE()) {
4908         KMP_CPU_ALLOC(old_mask);
4909       }
4910 #endif
4911 
4912       KA_TRACE(20,
4913                ("__kmp_allocate_team: increasing hot team thread count to %d\n",
4914                 new_nproc));
4915 
4916       team->t.t_size_changed = 1;
4917 
4918 #if KMP_NESTED_HOT_TEAMS
4919       int avail_threads = hot_teams[level].hot_team_nth;
4920       if (new_nproc < avail_threads)
4921         avail_threads = new_nproc;
4922       kmp_info_t **other_threads = team->t.t_threads;
4923       for (f = team->t.t_nproc; f < avail_threads; ++f) {
4924         // Adjust barrier data of reserved threads (if any) of the team
4925         // Other data will be set in __kmp_initialize_info() below.
4926         int b;
4927         kmp_balign_t *balign = other_threads[f]->th.th_bar;
4928         for (b = 0; b < bs_last_barrier; ++b) {
4929           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
4930           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4931 #if USE_DEBUGGER
4932           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
4933 #endif
4934         }
4935       }
4936       if (hot_teams[level].hot_team_nth >= new_nproc) {
4937         // we have all needed threads in reserve, no need to allocate any
        // this is only possible in mode 1; mode 0 cannot have reserved threads
4939         KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
4940         team->t.t_nproc = new_nproc; // just get reserved threads involved
4941       } else {
4942         // we may have some threads in reserve, but not enough
4943         team->t.t_nproc =
4944             hot_teams[level]
4945                 .hot_team_nth; // get reserved threads involved if any
4946         hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
4947 #endif // KMP_NESTED_HOT_TEAMS
4948         if (team->t.t_max_nproc < new_nproc) {
4949           /* reallocate larger arrays */
4950           __kmp_reallocate_team_arrays(team, new_nproc);
4951           __kmp_reinitialize_team(team, new_icvs, NULL);
4952         }
4953 
4954 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
        /* Temporarily set full mask for master thread before creation of
           workers. The reason is that workers inherit the affinity from the
           master, so if a lot of workers are created quickly on a single core,
           they don't get a chance to set their own affinity for a long time. */
4959         __kmp_set_thread_affinity_mask_full_tmp(old_mask);
4960 #endif
4961 
4962         /* allocate new threads for the hot team */
4963         for (f = team->t.t_nproc; f < new_nproc; f++) {
4964           kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
4965           KMP_DEBUG_ASSERT(new_worker);
4966           team->t.t_threads[f] = new_worker;
4967 
4968           KA_TRACE(20,
4969                    ("__kmp_allocate_team: team %d init T#%d arrived: "
4970                     "join=%llu, plain=%llu\n",
4971                     team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
4972                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
4973                     team->t.t_bar[bs_plain_barrier].b_arrived));
4974 
4975           { // Initialize barrier data for new threads.
4976             int b;
4977             kmp_balign_t *balign = new_worker->th.th_bar;
4978             for (b = 0; b < bs_last_barrier; ++b) {
4979               balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
4980               KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
4981                                KMP_BARRIER_PARENT_FLAG);
4982 #if USE_DEBUGGER
4983               balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
4984 #endif
4985             }
4986           }
4987         }
4988 
4989 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4990         if (KMP_AFFINITY_CAPABLE()) {
4991           /* Restore initial master thread's affinity mask */
4992           __kmp_set_system_affinity(old_mask, TRUE);
4993           KMP_CPU_FREE(old_mask);
4994         }
4995 #endif
4996 #if KMP_NESTED_HOT_TEAMS
4997       } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
4998 #endif // KMP_NESTED_HOT_TEAMS
      /* make sure everyone is synchronized */
5000       int old_nproc = team->t.t_nproc; // save old value and use to update only
5001       // new threads below
5002       __kmp_initialize_team(team, new_nproc, new_icvs,
5003                             root->r.r_uber_thread->th.th_ident);
5004 
5005       /* reinitialize the threads */
5006       KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5007       for (f = 0; f < team->t.t_nproc; ++f)
5008         __kmp_initialize_info(team->t.t_threads[f], team, f,
5009                               __kmp_gtid_from_tid(f, team));
5010       if (level) { // set th_task_state for new threads in nested hot team
5011         // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5012         // only need to set the th_task_state for the new threads. th_task_state
5013         // for master thread will not be accurate until after this in
5014         // __kmp_fork_call(), so we look to the master's memo_stack to get the
5015         // correct value.
5016         for (f = old_nproc; f < team->t.t_nproc; ++f)
5017           team->t.t_threads[f]->th.th_task_state =
5018               team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5019       } else { // set th_task_state for new threads in non-nested hot team
5020         int old_state =
5021             team->t.t_threads[0]->th.th_task_state; // copy master's state
5022         for (f = old_nproc; f < team->t.t_nproc; ++f)
5023           team->t.t_threads[f]->th.th_task_state = old_state;
5024       }
5025 
5026 #ifdef KMP_DEBUG
5027       for (f = 0; f < team->t.t_nproc; ++f) {
5028         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5029                          team->t.t_threads[f]->th.th_team_nproc ==
5030                              team->t.t_nproc);
5031       }
5032 #endif
5033 
5034 #if OMP_40_ENABLED
5035       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5036 #if KMP_AFFINITY_SUPPORTED
5037       __kmp_partition_places(team);
5038 #endif
5039 #endif
5040     } // Check changes in number of threads
5041 
5042 #if OMP_40_ENABLED
5043     kmp_info_t *master = team->t.t_threads[0];
5044     if (master->th.th_teams_microtask) {
5045       for (f = 1; f < new_nproc; ++f) {
5046         // propagate teams construct specific info to workers
5047         kmp_info_t *thr = team->t.t_threads[f];
5048         thr->th.th_teams_microtask = master->th.th_teams_microtask;
5049         thr->th.th_teams_level = master->th.th_teams_level;
5050         thr->th.th_teams_size = master->th.th_teams_size;
5051       }
5052     }
5053 #endif /* OMP_40_ENABLED */
5054 #if KMP_NESTED_HOT_TEAMS
5055     if (level) {
5056       // Sync barrier state for nested hot teams, not needed for outermost hot
5057       // team.
5058       for (f = 1; f < new_nproc; ++f) {
5059         kmp_info_t *thr = team->t.t_threads[f];
5060         int b;
5061         kmp_balign_t *balign = thr->th.th_bar;
5062         for (b = 0; b < bs_last_barrier; ++b) {
5063           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5064           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5065 #if USE_DEBUGGER
5066           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5067 #endif
5068         }
5069       }
5070     }
5071 #endif // KMP_NESTED_HOT_TEAMS
5072 
5073     /* reallocate space for arguments if necessary */
5074     __kmp_alloc_argv_entries(argc, team, TRUE);
5075     KMP_CHECK_UPDATE(team->t.t_argc, argc);
5076     // The hot team re-uses the previous task team,
5077     // if untouched during the previous release->gather phase.
5078 
5079     KF_TRACE(10, (" hot_team = %p\n", team));
5080 
5081 #if KMP_DEBUG
5082     if (__kmp_tasking_mode != tskm_immediate_exec) {
5083       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5084                     "task_team[1] = %p after reinit\n",
5085                     team->t.t_task_team[0], team->t.t_task_team[1]));
5086     }
5087 #endif
5088 
5089 #if OMPT_SUPPORT
5090     __ompt_team_assign_id(team, ompt_parallel_id);
5091 #endif
5092 
5093     KMP_MB();
5094 
5095     return team;
5096   }
5097 
5098   /* next, let's try to take one from the team pool */
5099   KMP_MB();
5100   for (team = (kmp_team_t *)__kmp_team_pool; (team);) {
5101     /* TODO: consider resizing undersized teams instead of reaping them, now
5102        that we have a resizing mechanism */
5103     if (team->t.t_max_nproc >= max_nproc) {
5104       /* take this team from the team pool */
5105       __kmp_team_pool = team->t.t_next_pool;
5106 
5107       /* setup the team for fresh use */
5108       __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5109 
5110       KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5111                     "task_team[1] %p to NULL\n",
5112                     &team->t.t_task_team[0], &team->t.t_task_team[1]));
5113       team->t.t_task_team[0] = NULL;
5114       team->t.t_task_team[1] = NULL;
5115 
5116       /* reallocate space for arguments if necessary */
5117       __kmp_alloc_argv_entries(argc, team, TRUE);
5118       KMP_CHECK_UPDATE(team->t.t_argc, argc);
5119 
5120       KA_TRACE(
5121           20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5122                team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5123       { // Initialize barrier data.
5124         int b;
5125         for (b = 0; b < bs_last_barrier; ++b) {
5126           team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5127 #if USE_DEBUGGER
5128           team->t.t_bar[b].b_master_arrived = 0;
5129           team->t.t_bar[b].b_team_arrived = 0;
5130 #endif
5131         }
5132       }
5133 
5134 #if OMP_40_ENABLED
5135       team->t.t_proc_bind = new_proc_bind;
5136 #endif
5137 
5138       KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5139                     team->t.t_id));
5140 
5141 #if OMPT_SUPPORT
5142       __ompt_team_assign_id(team, ompt_parallel_id);
5143 #endif
5144 
5145       KMP_MB();
5146 
5147       return team;
5148     }
5149 
    /* reap team if it is too small, then loop back and check the next one */
    // not sure if this is wise, but it will be redone during the hot-teams
    // rewrite.
    /* TODO: Use technique to find the right size hot-team, don't reap them */
5153     team = __kmp_reap_team(team);
5154     __kmp_team_pool = team;
5155   }
5156 
5157   /* nothing available in the pool, no matter, make a new team! */
5158   KMP_MB();
5159   team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5160 
5161   /* and set it up */
5162   team->t.t_max_nproc = max_nproc;
  /* NOTE: for some reason, allocating one big buffer and dividing it up seems
     to hurt performance significantly on the P4, so we don't use that. */
5165   __kmp_allocate_team_arrays(team, max_nproc);
5166 
5167   KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5168   __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5169 
5170   KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5171                 "%p to NULL\n",
5172                 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5173   team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5174   // memory, no need to duplicate
5175   team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5176   // memory, no need to duplicate
5177 
5178   if (__kmp_storage_map) {
5179     __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5180   }
5181 
5182   /* allocate space for arguments */
5183   __kmp_alloc_argv_entries(argc, team, FALSE);
5184   team->t.t_argc = argc;
5185 
5186   KA_TRACE(20,
5187            ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5188             team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5189   { // Initialize barrier data.
5190     int b;
5191     for (b = 0; b < bs_last_barrier; ++b) {
5192       team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5193 #if USE_DEBUGGER
5194       team->t.t_bar[b].b_master_arrived = 0;
5195       team->t.t_bar[b].b_team_arrived = 0;
5196 #endif
5197     }
5198   }
5199 
5200 #if OMP_40_ENABLED
5201   team->t.t_proc_bind = new_proc_bind;
5202 #endif
5203 
5204 #if OMPT_SUPPORT
5205   __ompt_team_assign_id(team, ompt_parallel_id);
5206   team->t.ompt_serialized_team_info = NULL;
5207 #endif
5208 
5209   KMP_MB();
5210 
5211   KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5212                 team->t.t_id));
5213 
5214   return team;
5215 }
5216 
5217 /* TODO implement hot-teams at all levels */
5218 /* TODO implement lazy thread release on demand (disband request) */
5219 
/* Free the team: return it to the team pool and release all the threads
 * associated with it. */
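// Note that hot teams (the root's hot team, or a per-level hot team when
// KMP_NESTED_HOT_TEAMS is enabled) are not disbanded here; their threads stay
// attached so the team can be reused by the next parallel region.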
5222 void __kmp_free_team(kmp_root_t *root,
5223                      kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5224   int f;
5225   KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5226                 team->t.t_id));
5227 
5228   /* verify state */
5229   KMP_DEBUG_ASSERT(root);
5230   KMP_DEBUG_ASSERT(team);
5231   KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5232   KMP_DEBUG_ASSERT(team->t.t_threads);
5233 
5234   int use_hot_team = team == root->r.r_hot_team;
5235 #if KMP_NESTED_HOT_TEAMS
5236   int level;
5237   kmp_hot_team_ptr_t *hot_teams;
5238   if (master) {
5239     level = team->t.t_active_level - 1;
5240     if (master->th.th_teams_microtask) { // in teams construct?
5241       if (master->th.th_teams_size.nteams > 1) {
5242         ++level; // level was not increased in teams construct for
5243         // team_of_masters
5244       }
5245       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5246           master->th.th_teams_level == team->t.t_level) {
5247         ++level; // level was not increased in teams construct for
5248         // team_of_workers before the parallel
5249       } // team->t.t_level will be increased inside parallel
5250     }
5251     hot_teams = master->th.th_hot_teams;
5252     if (level < __kmp_hot_teams_max_level) {
5253       KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5254       use_hot_team = 1;
5255     }
5256   }
5257 #endif // KMP_NESTED_HOT_TEAMS
5258 
5259   /* team is done working */
5260   TCW_SYNC_PTR(team->t.t_pkfn,
5261                NULL); // Important for Debugging Support Library.
5262   team->t.t_copyin_counter = 0; // init counter for possible reuse
5263   // Do not reset pointer to parent team to NULL for hot teams.
5264 
5265   /* if we are non-hot team, release our threads */
5266   if (!use_hot_team) {
5267     if (__kmp_tasking_mode != tskm_immediate_exec) {
5268       // Wait for threads to reach reapable state
5269       for (f = 1; f < team->t.t_nproc; ++f) {
5270         KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5271         kmp_info_t *th = team->t.t_threads[f];
5272         volatile kmp_uint32 *state = &th->th.th_reap_state;
5273         while (*state != KMP_SAFE_TO_REAP) {
5274 #if KMP_OS_WINDOWS
5275           // On Windows a thread can be killed at any time, check this
5276           DWORD ecode;
5277           if (!__kmp_is_thread_alive(th, &ecode)) {
5278             *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5279             break;
5280           }
5281 #endif
5282           // first check if thread is sleeping
5283           kmp_flag_64 fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5284           if (fl.is_sleeping())
5285             fl.resume(__kmp_gtid_from_thread(th));
5286           KMP_CPU_PAUSE();
5287         }
5288       }
5289 
5290       // Delete task teams
5291       int tt_idx;
5292       for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5293         kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5294         if (task_team != NULL) {
5295           for (f = 0; f < team->t.t_nproc;
5296                ++f) { // Have all threads unref task teams
5297             team->t.t_threads[f]->th.th_task_team = NULL;
5298           }
5299           KA_TRACE(
5300               20,
5301               ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5302                __kmp_get_gtid(), task_team, team->t.t_id));
5303 #if KMP_NESTED_HOT_TEAMS
5304           __kmp_free_task_team(master, task_team);
5305 #endif
5306           team->t.t_task_team[tt_idx] = NULL;
5307         }
5308       }
5309     }
5310 
5311     // Reset pointer to parent team only for non-hot teams.
5312     team->t.t_parent = NULL;
5313     team->t.t_level = 0;
5314     team->t.t_active_level = 0;
5315 
5316     /* free the worker threads */
5317     for (f = 1; f < team->t.t_nproc; ++f) {
5318       KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5319       __kmp_free_thread(team->t.t_threads[f]);
5320       team->t.t_threads[f] = NULL;
5321     }
5322 
5323     /* put the team back in the team pool */
5324     /* TODO limit size of team pool, call reap_team if pool too large */
5325     team->t.t_next_pool = (kmp_team_t *)__kmp_team_pool;
5326     __kmp_team_pool = (volatile kmp_team_t *)team;
5327   }
5328 
5329   KMP_MB();
5330 }
5331 
5332 /* reap the team.  destroy it, reclaim all its resources and free its memory */
5333 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5334   kmp_team_t *next_pool = team->t.t_next_pool;
5335 
5336   KMP_DEBUG_ASSERT(team);
5337   KMP_DEBUG_ASSERT(team->t.t_dispatch);
5338   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5339   KMP_DEBUG_ASSERT(team->t.t_threads);
5340   KMP_DEBUG_ASSERT(team->t.t_argv);
5341 
5342   /* TODO clean the threads that are a part of this? */
5343 
5344   /* free stuff */
5345   __kmp_free_team_arrays(team);
5346   if (team->t.t_argv != &team->t.t_inline_argv[0])
5347     __kmp_free((void *)team->t.t_argv);
5348   __kmp_free(team);
5349 
5350   KMP_MB();
5351   return next_pool;
5352 }
5353 
5354 // Free the thread.  Don't reap it, just place it on the pool of available
5355 // threads.
5356 //
5357 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5358 // binding for the affinity mechanism to be useful.
5359 //
5360 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5361 // However, we want to avoid a potential performance problem by always
5362 // scanning through the list to find the correct point at which to insert
5363 // the thread (potential N**2 behavior).  To do this we keep track of the
5364 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5365 // With single-level parallelism, threads will always be added to the tail
5366 // of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
5367 // parallelism, all bets are off and we may need to scan through the entire
5368 // free list.
5369 //
5370 // This change also has a potentially large performance benefit, for some
5371 // applications.  Previously, as threads were freed from the hot team, they
5372 // would be placed back on the free list in inverse order.  If the hot team
// grew back to its original size, then the freed thread would be placed
5374 // back on the hot team in reverse order.  This could cause bad cache
5375 // locality problems on programs where the size of the hot team regularly
5376 // grew and shrunk.
5377 //
// Now, for single-level parallelism, the OMP tid is always == gtid.
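// Worked example (illustration only): if the pool currently holds gtids
// 2 -> 3 -> 5 and __kmp_thread_pool_insert_pt points at gtid 5, freeing gtid 7
// scans from the insert point and appends in one step (2 -> 3 -> 5 -> 7).
// Freeing gtid 4 afterwards finds the insert point's gtid (7) greater than 4,
// so the insert point is reset and the scan restarts from the head, giving
// 2 -> 3 -> 4 -> 5 -> 7.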
5379 void __kmp_free_thread(kmp_info_t *this_th) {
5380   int gtid;
5381   kmp_info_t **scan;
5382 
5383   KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5384                 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5385 
5386   KMP_DEBUG_ASSERT(this_th);
5387 
  // When moving a thread to the pool, switch it to wait on its own b_go flag
  // and on an uninitialized (NULL) team.
5390   int b;
5391   kmp_balign_t *balign = this_th->th.th_bar;
5392   for (b = 0; b < bs_last_barrier; ++b) {
5393     if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5394       balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5395     balign[b].bb.team = NULL;
5396     balign[b].bb.leaf_kids = 0;
5397   }
5398   this_th->th.th_task_state = 0;
5399 
5400   /* put thread back on the free pool */
5401   TCW_PTR(this_th->th.th_team, NULL);
5402   TCW_PTR(this_th->th.th_root, NULL);
5403   TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5404 
5405   // If the __kmp_thread_pool_insert_pt is already past the new insert
5406   // point, then we need to re-scan the entire list.
5407   gtid = this_th->th.th_info.ds.ds_gtid;
5408   if (__kmp_thread_pool_insert_pt != NULL) {
5409     KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5410     if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5411       __kmp_thread_pool_insert_pt = NULL;
5412     }
5413   }
5414 
5415   // Scan down the list to find the place to insert the thread.
5416   // scan is the address of a link in the list, possibly the address of
5417   // __kmp_thread_pool itself.
5418   //
  // In the absence of nested parallelism, the for loop will have 0 iterations.
5420   if (__kmp_thread_pool_insert_pt != NULL) {
5421     scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5422   } else {
5423     scan = (kmp_info_t **)&__kmp_thread_pool;
5424   }
5425   for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5426        scan = &((*scan)->th.th_next_pool))
5427     ;
5428 
5429   // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5430   // to its address.
5431   TCW_PTR(this_th->th.th_next_pool, *scan);
5432   __kmp_thread_pool_insert_pt = *scan = this_th;
5433   KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5434                    (this_th->th.th_info.ds.ds_gtid <
5435                     this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5436   TCW_4(this_th->th.th_in_pool, TRUE);
5437   __kmp_thread_pool_nth++;
5438 
5439   TCW_4(__kmp_nth, __kmp_nth - 1);
5440 
5441 #ifdef KMP_ADJUST_BLOCKTIME
5442   /* Adjust blocktime back to user setting or default if necessary */
5443   /* Middle initialization might never have occurred                */
5444   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5445     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5446     if (__kmp_nth <= __kmp_avail_proc) {
5447       __kmp_zero_bt = FALSE;
5448     }
5449   }
5450 #endif /* KMP_ADJUST_BLOCKTIME */
5451 
5452   KMP_MB();
5453 }
5454 
5455 /* ------------------------------------------------------------------------ */
5456 
5457 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5458   int gtid = this_thr->th.th_info.ds.ds_gtid;
5459   /*    void                 *stack_data;*/
5460   kmp_team_t *(*volatile pteam);
5461 
5462   KMP_MB();
5463   KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5464 
5465   if (__kmp_env_consistency_check) {
5466     this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5467   }
5468 
5469 #if OMPT_SUPPORT
5470   if (ompt_enabled) {
5471     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5472     this_thr->th.ompt_thread_info.wait_id = 0;
5473     this_thr->th.ompt_thread_info.idle_frame = __builtin_frame_address(0);
5474     if (ompt_callbacks.ompt_callback(ompt_event_thread_begin)) {
5475       __ompt_thread_begin(ompt_thread_worker, gtid);
5476     }
5477   }
5478 #endif
5479 
5480   /* This is the place where threads wait for work */
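  /* Each iteration of this loop is one worker lifecycle: sleep in the fork
     barrier until the master releases the team, run the team's microtask via
     t_invoke, then wait in the join barrier; the loop exits only once
     __kmp_global.g.g_done is set during library shutdown. */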
5481   while (!TCR_4(__kmp_global.g.g_done)) {
5482     KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5483     KMP_MB();
5484 
5485     /* wait for work to do */
5486     KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5487 
5488 #if OMPT_SUPPORT
5489     if (ompt_enabled) {
5490       this_thr->th.ompt_thread_info.state = ompt_state_idle;
5491     }
5492 #endif
5493 
5494     /* No tid yet since not part of a team */
5495     __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5496 
5497 #if OMPT_SUPPORT
5498     if (ompt_enabled) {
5499       this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5500     }
5501 #endif
5502 
5503     pteam = (kmp_team_t * (*))(&this_thr->th.th_team);
5504 
5505     /* have we been allocated? */
5506     if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5507 #if OMPT_SUPPORT
5508       ompt_task_info_t *task_info;
5509       ompt_parallel_id_t my_parallel_id;
5510       if (ompt_enabled) {
5511         task_info = __ompt_get_taskinfo(0);
5512         my_parallel_id = (*pteam)->t.ompt_team_info.parallel_id;
5513       }
5514 #endif
5515       /* we were just woken up, so run our new task */
5516       if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5517         int rc;
5518         KA_TRACE(20,
5519                  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5520                   gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5521                   (*pteam)->t.t_pkfn));
5522 
5523         updateHWFPControl(*pteam);
5524 
5525 #if OMPT_SUPPORT
5526         if (ompt_enabled) {
5527           this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5528           // Initialize OMPT task id for implicit task.
5529           int tid = __kmp_tid_from_gtid(gtid);
5530           task_info->task_id = __ompt_task_id_new(tid);
5531         }
5532 #endif
5533 
5534         {
5535           KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
5536           KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
5537           rc = (*pteam)->t.t_invoke(gtid);
5538         }
5539         KMP_ASSERT(rc);
5540 
5541 #if OMPT_SUPPORT
5542         if (ompt_enabled) {
5543           /* no frame set while outside task */
5544           task_info->frame.exit_runtime_frame = NULL;
5545 
5546           this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5547         }
5548 #endif
5549         KMP_MB();
5550         KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5551                       gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5552                       (*pteam)->t.t_pkfn));
5553       }
5554       /* join barrier after parallel region */
5555       __kmp_join_barrier(gtid);
5556 #if OMPT_SUPPORT && OMPT_TRACE
5557       if (ompt_enabled) {
5558         if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
5559           // don't access *pteam here: it may have already been freed
5560           // by the master thread behind the barrier (possible race)
5561           ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
5562               my_parallel_id, task_info->task_id);
5563         }
5564         task_info->frame.exit_runtime_frame = NULL;
5565         task_info->task_id = 0;
5566       }
5567 #endif
5568     }
5569   }
5570   TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5571 
5572 #if OMPT_SUPPORT
5573   if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_thread_end)) {
5574     __ompt_thread_end(ompt_thread_worker, gtid);
5575   }
5576 #endif
5577 
5578   this_thr->th.th_task_team = NULL;
5579   /* run the destructors for the threadprivate data for this thread */
5580   __kmp_common_destroy_gtid(gtid);
5581 
5582   KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5583   KMP_MB();
5584   return this_thr;
5585 }
5586 
5587 /* ------------------------------------------------------------------------ */
5588 
5589 void __kmp_internal_end_dest(void *specific_gtid) {
5590 #if KMP_COMPILER_ICC
5591 #pragma warning(push)
5592 #pragma warning(disable : 810) // conversion from "void *" to "int" may lose
5593 // significant bits
5594 #endif
5595   // Make sure no significant bits are lost
5596   int gtid = (kmp_intptr_t)specific_gtid - 1;
5597 #if KMP_COMPILER_ICC
5598 #pragma warning(pop)
5599 #endif
5600 
5601   KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5602   /* NOTE: the gtid is stored as gtid+1 in thread-local storage;
5603    * this is because 0 is reserved for the nothing-stored case */
5604 
5605   /* josh: One reason for setting the gtid specific data even when it is being
5606      destroyed by pthread is to allow gtid lookup through thread specific data
5607      (__kmp_gtid_get_specific).  Some of the code, especially stat code,
5608      that gets executed in the call to __kmp_internal_end_thread, actually
5609      gets the gtid through the thread specific data.  Setting it here seems
5610      rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread
5611      to run smoothly.
5612      todo: get rid of this after we remove the dependence on
5613      __kmp_gtid_get_specific  */
5614   if (gtid >= 0 && KMP_UBER_GTID(gtid))
5615     __kmp_gtid_set_specific(gtid);
5616 #ifdef KMP_TDATA_GTID
5617   __kmp_gtid = gtid;
5618 #endif
5619   __kmp_internal_end_thread(gtid);
5620 }
5621 
5622 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5623 
5624 // 2009-09-08 (lev): It looks like the destructor does not work. In simple test
5625 // cases destructors work perfectly, but in real libomp.so I have no evidence it
5626 // is ever called. However, the -fini linker option in makefile.mk works fine.
5627 
5628 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5629   __kmp_internal_end_atexit();
5630 }
5631 
5632 void __kmp_internal_end_fini(void) { __kmp_internal_end_atexit(); }
5633 
5634 #endif
5635 
5636 /* [Windows] josh: when the atexit handler is called, there may still be more
5637    than one thread alive */
5638 void __kmp_internal_end_atexit(void) {
5639   KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5640   /* [Windows]
5641      josh: ideally, we want to completely shut down the library in this atexit
5642      handler, but stat code that depends on thread specific data for gtid fails
5643      because that data becomes unavailable at some point during the shutdown, so
5644      we call __kmp_internal_end_thread instead. We should eventually remove the
5645      dependency on __kmp_get_specific_gtid in the stat code and use
5646      __kmp_internal_end_library to shut down the library cleanly.
5647 
5648      // TODO: Can some of this comment about GVS be removed?
5649      I suspect that the offending stat code is executed when the calling thread
5650      tries to clean up a dead root thread's data structures, resulting in GVS
5651      code trying to close the GVS structures for that thread. Since the stat
5652      code uses __kmp_get_specific_gtid to get the gtid under the assumption that
5653      the calling thread is cleaning up itself rather than another thread, it gets
5654      confused. This happens because allowing a thread to unregister and clean up
5655      another thread is a recent modification for addressing an issue.
5656      Based on the current design (20050722), a thread may end up
5657      trying to unregister another thread only if thread death does not trigger
5658      the calling of __kmp_internal_end_thread.  For Linux* OS, there is the
5659      thread specific data destructor function to detect thread death. For
5660      Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5661      is nothing.  Thus, the workaround is applicable only to the Windows static
5662      stat library. */
5663   __kmp_internal_end_library(-1);
5664 #if KMP_OS_WINDOWS
5665   __kmp_close_console();
5666 #endif
5667 }
5668 
5669 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5670   // It is assumed __kmp_forkjoin_lock is acquired.
5671 
5672   int gtid;
5673 
5674   KMP_DEBUG_ASSERT(thread != NULL);
5675 
5676   gtid = thread->th.th_info.ds.ds_gtid;
5677 
5678   if (!is_root) {
5679 
5680     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5681       /* Assume the threads are at the fork barrier here */
5682       KA_TRACE(
5683           20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5684                gtid));
5685       /* Need release fence here to prevent seg faults for tree forkjoin barrier
5686        * (GEH) */
5687       ANNOTATE_HAPPENS_BEFORE(thread);
5688       kmp_flag_64 flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
5689       __kmp_release_64(&flag);
5690     }; // if
5691 
5692     // Terminate OS thread.
5693     __kmp_reap_worker(thread);
5694 
5695     // The thread was killed asynchronously.  If it was actively
5696     // spinning in the thread pool, decrement the global count.
5697     //
5698     // There is a small timing hole here - if the worker thread was just waking
5699     // up after sleeping in the pool, had reset its th_active_in_pool flag but
5700     // not yet decremented the global counter __kmp_thread_pool_active_nth, then
5701     // the global counter might not get updated.
5702     //
5703     // Currently, this can only happen as the library is unloaded,
5704     // so there are no harmful side effects.
5705     if (thread->th.th_active_in_pool) {
5706       thread->th.th_active_in_pool = FALSE;
5707       KMP_TEST_THEN_DEC32((kmp_int32 *)&__kmp_thread_pool_active_nth);
5708       KMP_DEBUG_ASSERT(TCR_4(__kmp_thread_pool_active_nth) >= 0);
5709     }
5710 
5711     // Decrement # of [worker] threads in the pool.
5712     KMP_DEBUG_ASSERT(__kmp_thread_pool_nth > 0);
5713     --__kmp_thread_pool_nth;
5714   }; // if
5715 
5716   __kmp_free_implicit_task(thread);
5717 
5718 // Free the fast memory for tasking
5719 #if USE_FAST_MEMORY
5720   __kmp_free_fast_memory(thread);
5721 #endif /* USE_FAST_MEMORY */
5722 
5723   __kmp_suspend_uninitialize_thread(thread);
5724 
5725   KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5726   TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5727 
5728   --__kmp_all_nth;
5729 // __kmp_nth was decremented when the thread was added to the pool.
5730 
5731 #ifdef KMP_ADJUST_BLOCKTIME
5732   /* Adjust blocktime back to user setting or default if necessary */
5733   /* Middle initialization might never have occurred                */
5734   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5735     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5736     if (__kmp_nth <= __kmp_avail_proc) {
5737       __kmp_zero_bt = FALSE;
5738     }
5739   }
5740 #endif /* KMP_ADJUST_BLOCKTIME */
5741 
5742   /* free the memory being used */
5743   if (__kmp_env_consistency_check) {
5744     if (thread->th.th_cons) {
5745       __kmp_free_cons_stack(thread->th.th_cons);
5746       thread->th.th_cons = NULL;
5747     }; // if
5748   }
5749 
5750   if (thread->th.th_pri_common != NULL) {
5751     __kmp_free(thread->th.th_pri_common);
5752     thread->th.th_pri_common = NULL;
5753   }; // if
5754 
5755   if (thread->th.th_task_state_memo_stack != NULL) {
5756     __kmp_free(thread->th.th_task_state_memo_stack);
5757     thread->th.th_task_state_memo_stack = NULL;
5758   }
5759 
5760 #if KMP_USE_BGET
5761   if (thread->th.th_local.bget_data != NULL) {
5762     __kmp_finalize_bget(thread);
5763   }; // if
5764 #endif
5765 
5766 #if KMP_AFFINITY_SUPPORTED
5767   if (thread->th.th_affin_mask != NULL) {
5768     KMP_CPU_FREE(thread->th.th_affin_mask);
5769     thread->th.th_affin_mask = NULL;
5770   }; // if
5771 #endif /* KMP_AFFINITY_SUPPORTED */
5772 
5773   __kmp_reap_team(thread->th.th_serial_team);
5774   thread->th.th_serial_team = NULL;
5775   __kmp_free(thread);
5776 
5777   KMP_MB();
5778 
5779 } // __kmp_reap_thread
5780 
5781 static void __kmp_internal_end(void) {
5782   int i;
5783 
5784   /* First, unregister the library */
5785   __kmp_unregister_library();
5786 
5787 #if KMP_OS_WINDOWS
5788   /* In the Windows static library, we can't tell when a root actually dies, so
5789      we reclaim the data structures for any root threads that have died but not
5790      unregistered themselves, in order to shut down cleanly.
5791      In the Windows dynamic library we also can't tell when a thread dies.  */
5792   __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
5793 // dead roots
5794 #endif
5795 
5796   for (i = 0; i < __kmp_threads_capacity; i++)
5797     if (__kmp_root[i])
5798       if (__kmp_root[i]->r.r_active)
5799         break;
5800   KMP_MB(); /* Flush all pending memory write invalidates.  */
5801   TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
5802 
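  // At this point, i < __kmp_threads_capacity means the loop above found a root
  // that is still active; in that case only the monitor thread (if any) is
  // reaped and the full worker/team cleanup below is skipped.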
5803   if (i < __kmp_threads_capacity) {
5804 #if KMP_USE_MONITOR
5805     // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
5806     KMP_MB(); /* Flush all pending memory write invalidates.  */
5807 
5808 // Need to check that the monitor was initialized before reaping it. If we are
5809 // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
5810 // __kmp_monitor will appear to contain valid data, but it is only valid in the
5811 // parent process, not the child.
5812     // New behavior (201008): instead of keying off of the flag
5813     // __kmp_init_parallel, the monitor thread creation is keyed off
5814     // of the new flag __kmp_init_monitor.
5815     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
5816     if (TCR_4(__kmp_init_monitor)) {
5817       __kmp_reap_monitor(&__kmp_monitor);
5818       TCW_4(__kmp_init_monitor, 0);
5819     }
5820     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
5821     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
5822 #endif // KMP_USE_MONITOR
5823   } else {
5824 /* TODO move this to cleanup code */
5825 #ifdef KMP_DEBUG
5826     /* make sure that everything has properly ended */
5827     for (i = 0; i < __kmp_threads_capacity; i++) {
5828       if (__kmp_root[i]) {
5829         //                    KMP_ASSERT( ! KMP_UBER_GTID( i ) );         // AC:
5830         //                    there can be uber threads alive here
5831         KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
5832       }
5833     }
5834 #endif
5835 
5836     KMP_MB();
5837 
5838     // Reap the worker threads.
5839     // This is valid for now, but be careful if threads are reaped sooner.
5840     while (__kmp_thread_pool != NULL) { // Loop through all threads in the pool.
5841       // Get the next thread from the pool.
5842       kmp_info_t *thread = (kmp_info_t *)__kmp_thread_pool;
5843       __kmp_thread_pool = thread->th.th_next_pool;
5844       // Reap it.
5845       KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
5846       thread->th.th_next_pool = NULL;
5847       thread->th.th_in_pool = FALSE;
5848       __kmp_reap_thread(thread, 0);
5849     }; // while
5850     __kmp_thread_pool_insert_pt = NULL;
5851 
5852     // Reap teams.
5853     while (__kmp_team_pool != NULL) { // Loop through all the teams in the pool.
5854       // Get the next team from the pool.
5855       kmp_team_t *team = (kmp_team_t *)__kmp_team_pool;
5856       __kmp_team_pool = team->t.t_next_pool;
5857       // Reap it.
5858       team->t.t_next_pool = NULL;
5859       __kmp_reap_team(team);
5860     }; // while
5861 
5862     __kmp_reap_task_teams();
5863 
5864     for (i = 0; i < __kmp_threads_capacity; ++i) {
5865       // TBD: Add some checking...
5866       // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
5867     }
5868 
5869     /* Make sure all threadprivate destructors get run by joining with all
5870        worker threads before resetting this flag */
5871     TCW_SYNC_4(__kmp_init_common, FALSE);
5872 
5873     KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
5874     KMP_MB();
5875 
5876 #if KMP_USE_MONITOR
5877     // See note above: One of the possible fixes for CQ138434 / CQ140126
5878     //
5879     // FIXME: push both code fragments down and CSE them?
5880     // push them into __kmp_cleanup() ?
5881     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
5882     if (TCR_4(__kmp_init_monitor)) {
5883       __kmp_reap_monitor(&__kmp_monitor);
5884       TCW_4(__kmp_init_monitor, 0);
5885     }
5886     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
5887     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
5888 #endif
5889   } /* else !__kmp_global.t_active */
5890   TCW_4(__kmp_init_gtid, FALSE);
5891   KMP_MB(); /* Flush all pending memory write invalidates.  */
5892 
5893   __kmp_cleanup();
5894 #if OMPT_SUPPORT
5895   ompt_fini();
5896 #endif
5897 }
5898 
5899 void __kmp_internal_end_library(int gtid_req) {
5900   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
5901   /* this shouldn't be a race condition because __kmp_internal_end() is the
5902      only place to clear __kmp_serial_init */
5903   /* we'll check this later too, after we get the lock */
5904   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
5905   // redundant, because the next check will work in any case.
5906   if (__kmp_global.g.g_abort) {
5907     KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
5908     /* TODO abort? */
5909     return;
5910   }
5911   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
5912     KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
5913     return;
5914   }
5915 
5916   KMP_MB(); /* Flush all pending memory write invalidates.  */
5917 
5918   /* find out who we are and what we should do */
5919   {
5920     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
5921     KA_TRACE(
5922         10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req));
5923     if (gtid == KMP_GTID_SHUTDOWN) {
5924       KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
5925                     "already shutdown\n"));
5926       return;
5927     } else if (gtid == KMP_GTID_MONITOR) {
5928       KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
5929                     "registered, or system shutdown\n"));
5930       return;
5931     } else if (gtid == KMP_GTID_DNE) {
5932       KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
5933                     "shutdown\n"));
5934       /* we don't know who we are, but we may still shutdown the library */
5935     } else if (KMP_UBER_GTID(gtid)) {
5936       /* unregister ourselves as an uber thread.  gtid is no longer valid */
5937       if (__kmp_root[gtid]->r.r_active) {
5938         __kmp_global.g.g_abort = -1;
5939         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
5940         KA_TRACE(10,
5941                  ("__kmp_internal_end_library: root still active, abort T#%d\n",
5942                   gtid));
5943         return;
5944       } else {
5945         KA_TRACE(
5946             10,
5947             ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
5948         __kmp_unregister_root_current_thread(gtid);
5949       }
5950     } else {
5951 /* worker threads may call this function through the atexit handler, if they
5952  * call exit() */
5953 /* For now, skip the usual subsequent processing and just dump the debug buffer.
5954    TODO: do a thorough shutdown instead */
5955 #ifdef DUMP_DEBUG_ON_EXIT
5956       if (__kmp_debug_buf)
5957         __kmp_dump_debug_buffer();
5958 #endif
5959       return;
5960     }
5961   }
5962   /* synchronize the termination process */
5963   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
5964 
5965   /* have we already finished */
5966   if (__kmp_global.g.g_abort) {
5967     KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
5968     /* TODO abort? */
5969     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
5970     return;
5971   }
5972   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
5973     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
5974     return;
5975   }
5976 
5977   /* We need this lock to enforce mutex between this reading of
5978      __kmp_threads_capacity and the writing by __kmp_register_root.
5979      Alternatively, we can use a counter of roots that is atomically updated by
5980      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
5981      __kmp_internal_end_*.  */
5982   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
5983 
5984   /* now we can safely conduct the actual termination */
5985   __kmp_internal_end();
5986 
5987   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
5988   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
5989 
5990   KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
5991 
5992 #ifdef DUMP_DEBUG_ON_EXIT
5993   if (__kmp_debug_buf)
5994     __kmp_dump_debug_buffer();
5995 #endif
5996 
5997 #if KMP_OS_WINDOWS
5998   __kmp_close_console();
5999 #endif
6000 
6001   __kmp_fini_allocator();
6002 
6003 } // __kmp_internal_end_library
6004 
6005 void __kmp_internal_end_thread(int gtid_req) {
6006   int i;
6007 
6008   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6009   /* this shouldn't be a race condition because __kmp_internal_end() is the
6010    * only place to clear __kmp_serial_init */
6011   /* we'll check this later too, after we get the lock */
6012   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6013   // redundant, because the next check will work in any case.
6014   if (__kmp_global.g.g_abort) {
6015     KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6016     /* TODO abort? */
6017     return;
6018   }
6019   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6020     KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6021     return;
6022   }
6023 
6024   KMP_MB(); /* Flush all pending memory write invalidates.  */
6025 
6026   /* find out who we are and what we should do */
6027   {
6028     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6029     KA_TRACE(10,
6030              ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req));
6031     if (gtid == KMP_GTID_SHUTDOWN) {
6032       KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6033                     "already shutdown\n"));
6034       return;
6035     } else if (gtid == KMP_GTID_MONITOR) {
6036       KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6037                     "registered, or system shutdown\n"));
6038       return;
6039     } else if (gtid == KMP_GTID_DNE) {
6040       KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6041                     "shutdown\n"));
6042       return;
6043       /* we don't know who we are */
6044     } else if (KMP_UBER_GTID(gtid)) {
6045       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6046       if (__kmp_root[gtid]->r.r_active) {
6047         __kmp_global.g.g_abort = -1;
6048         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6049         KA_TRACE(10,
6050                  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6051                   gtid));
6052         return;
6053       } else {
6054         KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6055                       gtid));
6056         __kmp_unregister_root_current_thread(gtid);
6057       }
6058     } else {
6059       /* just a worker thread, let's leave */
6060       KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6061 
6062       if (gtid >= 0) {
6063         __kmp_threads[gtid]->th.th_task_team = NULL;
6064       }
6065 
6066       KA_TRACE(10,
6067                ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6068                 gtid));
6069       return;
6070     }
6071   }
6072 #if defined KMP_DYNAMIC_LIB
6073   // AC: let's not shut down the Linux* OS dynamic library at the exit of an uber
6074   // thread, because it is better to shut down later in the library destructor.
6075   // The reason for this change is a performance problem when a non-OpenMP thread
6076   // in a loop forks and joins many OpenMP threads. We can save a lot of time by
6077   // keeping worker threads alive until program shutdown.
6078   // OM: Removed the Linux* OS restriction to fix the crash on OS X* (DPD200239966)
6079   // and Windows (DPD200287443) that occurs when using critical sections from
6080   // foreign threads.
6081   KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6082   return;
6083 #endif
6084   /* synchronize the termination process */
6085   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6086 
6087   /* have we already finished */
6088   if (__kmp_global.g.g_abort) {
6089     KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6090     /* TODO abort? */
6091     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6092     return;
6093   }
6094   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6095     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6096     return;
6097   }
6098 
6099   /* We need this lock to enforce mutex between this reading of
6100      __kmp_threads_capacity and the writing by __kmp_register_root.
6101      Alternatively, we can use a counter of roots that is atomically updated by
6102      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6103      __kmp_internal_end_*.  */
6104 
6105   /* should we finish the run-time?  are all siblings done? */
6106   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6107 
6108   for (i = 0; i < __kmp_threads_capacity; ++i) {
6109     if (KMP_UBER_GTID(i)) {
6110       KA_TRACE(
6111           10,
6112           ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6113       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6114       __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6115       return;
6116     };
6117   }
6118 
6119   /* now we can safely conduct the actual termination */
6120 
6121   __kmp_internal_end();
6122 
6123   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6124   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6125 
6126   KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6127 
6128 #ifdef DUMP_DEBUG_ON_EXIT
6129   if (__kmp_debug_buf)
6130     __kmp_dump_debug_buffer();
6131 #endif
6132 } // __kmp_internal_end_thread
6133 
6134 // -----------------------------------------------------------------------------
6135 // Library registration stuff.
6136 
6137 static long __kmp_registration_flag = 0;
6138 // Random value used to indicate library initialization.
6139 static char *__kmp_registration_str = NULL;
6140 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
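// For example, a process with pid 12345 (illustrative) would use the env var
// "__KMP_REGISTERED_LIB_12345"; its value is built by
// __kmp_register_library_startup() below.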
6141 
6142 static inline char *__kmp_reg_status_name() {
6143   /* On RHEL 3u5, if linked statically, getpid() returns different values in
6144      each thread. If registration and unregistration happen in different threads
6145      (omp_misc_other_root_exit.cpp test case), the registered_lib_env env var
6146      cannot be found, because its name will contain a different pid. */
6147   return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6148 } // __kmp_reg_status_name
6149 
6150 void __kmp_register_library_startup(void) {
6151 
6152   char *name = __kmp_reg_status_name(); // Name of the environment variable.
6153   int done = 0;
6154   union {
6155     double dtime;
6156     long ltime;
6157   } time;
6158 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6159   __kmp_initialize_system_tick();
6160 #endif
6161   __kmp_read_system_time(&time.dtime);
6162   __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6163   __kmp_registration_str =
6164       __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6165                        __kmp_registration_flag, KMP_LIBRARY_FILE);
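  // The registration string encodes the address of __kmp_registration_flag, the
  // flag value in hex, and the library file name, e.g. (illustrative)
  // "0x7ffd1234abcd-cafe1a2b-libomp.so". The parsing code below splits a
  // neighbor's value on '-' to decide whether a previously registered copy of
  // the runtime is still alive.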
6166 
6167   KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6168                 __kmp_registration_str));
6169 
6170   while (!done) {
6171 
6172     char *value = NULL; // Actual value of the environment variable.
6173 
6174     // Set the environment variable, but do not overwrite it if it already exists.
6175     __kmp_env_set(name, __kmp_registration_str, 0);
6176     // Check that the variable was actually written.
6177     value = __kmp_env_get(name);
6178     if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6179 
6180       done = 1; // Ok, environment variable set successfully, exit the loop.
6181 
6182     } else {
6183 
6184       // Oops. Write failed. Another copy of the OpenMP RTL is in memory.
6185       // Check whether it is alive or dead.
6186       int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6187       char *tail = value;
6188       char *flag_addr_str = NULL;
6189       char *flag_val_str = NULL;
6190       char const *file_name = NULL;
6191       __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6192       __kmp_str_split(tail, '-', &flag_val_str, &tail);
6193       file_name = tail;
6194       if (tail != NULL) {
6195         long *flag_addr = 0;
6196         long flag_val = 0;
6197         KMP_SSCANF(flag_addr_str, "%p", &flag_addr);
6198         KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6199         if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6200           // First, check whether environment-encoded address is mapped into
6201           // addr space.
6202           // If so, dereference it to see if it still has the right value.
6203           if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6204             neighbor = 1;
6205           } else {
6206             // If not, then we know the other copy of the library is no longer
6207             // running.
6208             neighbor = 2;
6209           }; // if
6210         }; // if
6211       }; // if
6212       switch (neighbor) {
6213       case 0: // Cannot parse environment variable -- neighbor status unknown.
6214         // Assume it is the incompatible format of a future version of the
6215         // library. Assume the other library is alive.
6216         // WARN( ... ); // TODO: Issue a warning.
6217         file_name = "unknown library";
6218       // Attention! Falling through to the next case is intentional.
6219       case 1: { // Neighbor is alive.
6220         // Check it is allowed.
6221         char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6222         if (!__kmp_str_match_true(duplicate_ok)) {
6223           // That's not allowed. Issue fatal error.
6224           __kmp_msg(kmp_ms_fatal,
6225                     KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6226                     KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6227         }; // if
6228         KMP_INTERNAL_FREE(duplicate_ok);
6229         __kmp_duplicate_library_ok = 1;
6230         done = 1; // Exit the loop.
6231       } break;
6232       case 2: { // Neighbor is dead.
6233         // Clear the variable and try to register library again.
6234         __kmp_env_unset(name);
6235       } break;
6236       default: { KMP_DEBUG_ASSERT(0); } break;
6237       }; // switch
6238 
6239     }; // if
6240     KMP_INTERNAL_FREE((void *)value);
6241 
6242   }; // while
6243   KMP_INTERNAL_FREE((void *)name);
6244 
6245 } // func __kmp_register_library_startup
6246 
6247 void __kmp_unregister_library(void) {
6248 
6249   char *name = __kmp_reg_status_name();
6250   char *value = __kmp_env_get(name);
6251 
6252   KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6253   KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6254   if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6255     // Ok, this is our variable. Delete it.
6256     __kmp_env_unset(name);
6257   }; // if
6258 
6259   KMP_INTERNAL_FREE(__kmp_registration_str);
6260   KMP_INTERNAL_FREE(value);
6261   KMP_INTERNAL_FREE(name);
6262 
6263   __kmp_registration_flag = 0;
6264   __kmp_registration_str = NULL;
6265 
6266 } // __kmp_unregister_library
6267 
6268 // End of Library registration stuff.
6269 // -----------------------------------------------------------------------------
6270 
6271 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
6272 
6273 static void __kmp_check_mic_type() {
6274   kmp_cpuid_t cpuid_state = {0};
6275   kmp_cpuid_t *cs_p = &cpuid_state;
6276   __kmp_x86_cpuid(1, 0, cs_p);
6277   // We don't support mic1 at the moment
6278   if ((cs_p->eax & 0xff0) == 0xB10) {
6279     __kmp_mic_type = mic2;
6280   } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6281     __kmp_mic_type = mic3;
6282   } else {
6283     __kmp_mic_type = non_mic;
6284   }
6285 }
6286 
6287 #endif /* KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS) */
6288 
6289 static void __kmp_do_serial_initialize(void) {
6290   int i, gtid;
6291   int size;
6292 
6293   KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6294 
6295   KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6296   KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6297   KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6298   KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6299   KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6300 
6301 #if OMPT_SUPPORT
6302   ompt_pre_init();
6303 #endif
6304 
6305   __kmp_validate_locks();
6306 
6307   /* Initialize internal memory allocator */
6308   __kmp_init_allocator();
6309 
6310   /* Register the library startup via an environment variable and check to see
6311      whether another copy of the library is already registered. */
6312 
6313   __kmp_register_library_startup();
6314 
6315   /* TODO reinitialization of library */
6316   if (TCR_4(__kmp_global.g.g_done)) {
6317     KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6318   }
6319 
6320   __kmp_global.g.g_abort = 0;
6321   TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6322 
6323 /* initialize the locks */
6324 #if KMP_USE_ADAPTIVE_LOCKS
6325 #if KMP_DEBUG_ADAPTIVE_LOCKS
6326   __kmp_init_speculative_stats();
6327 #endif
6328 #endif
6329 #if KMP_STATS_ENABLED
6330   __kmp_stats_init();
6331 #endif
6332   __kmp_init_lock(&__kmp_global_lock);
6333   __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6334   __kmp_init_lock(&__kmp_debug_lock);
6335   __kmp_init_atomic_lock(&__kmp_atomic_lock);
6336   __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6337   __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6338   __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6339   __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6340   __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6341   __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6342   __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6343   __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6344   __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6345   __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6346   __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6347   __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6348   __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6349   __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6350 #if KMP_USE_MONITOR
6351   __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6352 #endif
6353   __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6354 
6355   /* conduct initialization and initial setup of configuration */
6356 
6357   __kmp_runtime_initialize();
6358 
6359 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
6360   __kmp_check_mic_type();
6361 #endif
6362 
6363 // Some global variable initialization moved here from kmp_env_initialize()
6364 #ifdef KMP_DEBUG
6365   kmp_diag = 0;
6366 #endif
6367   __kmp_abort_delay = 0;
6368 
6369   // From __kmp_init_dflt_team_nth()
6370   /* assume the entire machine will be used */
6371   __kmp_dflt_team_nth_ub = __kmp_xproc;
6372   if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6373     __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6374   }
6375   if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6376     __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6377   }
6378   __kmp_max_nth = __kmp_sys_max_nth;
6379 
6380   // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6381   // part
6382   __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6383 #if KMP_USE_MONITOR
6384   __kmp_monitor_wakeups =
6385       KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6386   __kmp_bt_intervals =
6387       KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6388 #endif
6389   // From "KMP_LIBRARY" part of __kmp_env_initialize()
6390   __kmp_library = library_throughput;
6391   // From KMP_SCHEDULE initialization
6392   __kmp_static = kmp_sch_static_balanced;
6393 // AC: do not use analytical here, because it is non-monotonous
6394 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6395 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6396 // need to repeat assignment
6397 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6398 // bit control and barrier method control parts
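// Note (assumption about how the barrier code consumes these defaults): a
// branch-bit value of b corresponds to a barrier-tree fan-out of 2^b, so the
// reduction-barrier default of 1 below selects a hyper barrier with fan-out 2.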
6399 #if KMP_FAST_REDUCTION_BARRIER
6400 #define kmp_reduction_barrier_gather_bb ((int)1)
6401 #define kmp_reduction_barrier_release_bb ((int)1)
6402 #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6403 #define kmp_reduction_barrier_release_pat bp_hyper_bar
6404 #endif // KMP_FAST_REDUCTION_BARRIER
6405   for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6406     __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6407     __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6408     __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6409     __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6410 #if KMP_FAST_REDUCTION_BARRIER
6411     if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
6412       // lin_64 ): hyper,1
6413       __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6414       __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6415       __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6416       __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6417     }
6418 #endif // KMP_FAST_REDUCTION_BARRIER
6419   }
6420 #if KMP_FAST_REDUCTION_BARRIER
6421 #undef kmp_reduction_barrier_release_pat
6422 #undef kmp_reduction_barrier_gather_pat
6423 #undef kmp_reduction_barrier_release_bb
6424 #undef kmp_reduction_barrier_gather_bb
6425 #endif // KMP_FAST_REDUCTION_BARRIER
6426 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
6427   if (__kmp_mic_type == mic2) { // KNC
6428     // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6429     __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6430     __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6431         1; // forkjoin release
6432     __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6433     __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6434   }
6435 #if KMP_FAST_REDUCTION_BARRIER
6436   if (__kmp_mic_type == mic2) { // KNC
6437     __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6438     __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6439   }
6440 #endif
6441 #endif
6442 
6443 // From KMP_CHECKS initialization
6444 #ifdef KMP_DEBUG
6445   __kmp_env_checks = TRUE; /* development versions have the extra checks */
6446 #else
6447   __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6448 #endif
6449 
6450   // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6451   __kmp_foreign_tp = TRUE;
6452 
6453   __kmp_global.g.g_dynamic = FALSE;
6454   __kmp_global.g.g_dynamic_mode = dynamic_default;
6455 
6456   __kmp_env_initialize(NULL);
6457 
6458 // Print all messages in the message catalog for testing purposes.
6459 #ifdef KMP_DEBUG
6460   char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6461   if (__kmp_str_match_true(val)) {
6462     kmp_str_buf_t buffer;
6463     __kmp_str_buf_init(&buffer);
6464     __kmp_i18n_dump_catalog(&buffer);
6465     __kmp_printf("%s", buffer.str);
6466     __kmp_str_buf_free(&buffer);
6467   }; // if
6468   __kmp_env_free(&val);
6469 #endif
6470 
6471   __kmp_threads_capacity =
6472       __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6473   // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6474   __kmp_tp_capacity = __kmp_default_tp_capacity(
6475       __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6476 
6477   // If the library is shut down properly, both pools must be NULL. Just in
6478   // case, set them to NULL -- some memory may leak, but subsequent code will
6479   // work even if pools are not freed.
6480   KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6481   KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6482   KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6483   __kmp_thread_pool = NULL;
6484   __kmp_thread_pool_insert_pt = NULL;
6485   __kmp_team_pool = NULL;
6486 
6487   /* Allocate all of the variable sized records */
6488   /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6489    * expandable */
6490   /* Since allocation is cache-aligned, just add extra padding at the end */
6491   size =
6492       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6493       CACHE_LINE;
6494   __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6495   __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6496                                sizeof(kmp_info_t *) * __kmp_threads_capacity);
6497 
6498   /* init thread counts */
6499   KMP_DEBUG_ASSERT(__kmp_all_nth ==
6500                    0); // Asserts fail if the library is reinitializing and
6501   KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
6502   __kmp_all_nth = 0;
6503   __kmp_nth = 0;
6504 
6505   /* setup the uber master thread and hierarchy */
6506   gtid = __kmp_register_root(TRUE);
6507   KA_TRACE(10, ("__kmp_do_serial_initialize  T#%d\n", gtid));
6508   KMP_ASSERT(KMP_UBER_GTID(gtid));
6509   KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6510 
6511   KMP_MB(); /* Flush all pending memory write invalidates.  */
6512 
6513   __kmp_common_initialize();
6514 
6515 #if KMP_OS_UNIX
6516   /* invoke the child fork handler */
6517   __kmp_register_atfork();
6518 #endif
6519 
6520 #if !defined KMP_DYNAMIC_LIB
6521   {
6522     /* Invoke the exit handler when the program finishes, only for static
6523        library. For dynamic library, we already have _fini and DllMain. */
6524     int rc = atexit(__kmp_internal_end_atexit);
6525     if (rc != 0) {
6526       __kmp_msg(kmp_ms_fatal, KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6527                 __kmp_msg_null);
6528     }; // if
6529   }
6530 #endif
6531 
6532 #if KMP_HANDLE_SIGNALS
6533 #if KMP_OS_UNIX
6534   /* NOTE: make sure that this is called before the user installs their own
6535      signal handlers so that the user handlers are called first. this way they
6536      can return false, not call our handler, avoid terminating the library, and
6537      continue execution where they left off. */
6538   __kmp_install_signals(FALSE);
6539 #endif /* KMP_OS_UNIX */
6540 #if KMP_OS_WINDOWS
6541   __kmp_install_signals(TRUE);
6542 #endif /* KMP_OS_WINDOWS */
6543 #endif
6544 
6545   /* we have finished the serial initialization */
6546   __kmp_init_counter++;
6547 
6548   __kmp_init_serial = TRUE;
6549 
6550   if (__kmp_settings) {
6551     __kmp_env_print();
6552   }
6553 
6554 #if OMP_40_ENABLED
6555   if (__kmp_display_env || __kmp_display_env_verbose) {
6556     __kmp_env_print_2();
6557   }
6558 #endif // OMP_40_ENABLED
6559 
6560 #if OMPT_SUPPORT
6561   ompt_post_init();
6562 #endif
6563 
6564   KMP_MB();
6565 
6566   KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
6567 }
6568 
6569 void __kmp_serial_initialize(void) {
6570   if (__kmp_init_serial) {
6571     return;
6572   }
6573   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6574   if (__kmp_init_serial) {
6575     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6576     return;
6577   }
6578   __kmp_do_serial_initialize();
6579   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6580 }
6581 
6582 static void __kmp_do_middle_initialize(void) {
6583   int i, j;
6584   int prev_dflt_team_nth;
6585 
6586   if (!__kmp_init_serial) {
6587     __kmp_do_serial_initialize();
6588   }
6589 
6590   KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
6591 
6592   // Save the previous value for the __kmp_dflt_team_nth so that
6593   // we can avoid some reinitialization if it hasn't changed.
6594   prev_dflt_team_nth = __kmp_dflt_team_nth;
6595 
6596 #if KMP_AFFINITY_SUPPORTED
6597   // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6598   // number of cores on the machine.
6599   __kmp_affinity_initialize();
6600 
6601   // Run through the __kmp_threads array and set the affinity mask
6602   // for each root thread that is currently registered with the RTL.
6603   for (i = 0; i < __kmp_threads_capacity; i++) {
6604     if (TCR_PTR(__kmp_threads[i]) != NULL) {
6605       __kmp_affinity_set_init_mask(i, TRUE);
6606     }
6607   }
6608 #endif /* KMP_AFFINITY_SUPPORTED */
6609 
6610   KMP_ASSERT(__kmp_xproc > 0);
6611   if (__kmp_avail_proc == 0) {
6612     __kmp_avail_proc = __kmp_xproc;
6613   }
6614 
6615   // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
6616   // correct them now
6617   j = 0;
6618   while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
6619     __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
6620         __kmp_avail_proc;
6621     j++;
6622   }
6623 
6624   if (__kmp_dflt_team_nth == 0) {
6625 #ifdef KMP_DFLT_NTH_CORES
6626     // Default #threads = #cores
6627     __kmp_dflt_team_nth = __kmp_ncores;
6628     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6629                   "__kmp_ncores (%d)\n",
6630                   __kmp_dflt_team_nth));
6631 #else
6632     // Default #threads = #available OS procs
6633     __kmp_dflt_team_nth = __kmp_avail_proc;
6634     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6635                   "__kmp_avail_proc(%d)\n",
6636                   __kmp_dflt_team_nth));
6637 #endif /* KMP_DFLT_NTH_CORES */
6638   }
6639 
6640   if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
6641     __kmp_dflt_team_nth = KMP_MIN_NTH;
6642   }
6643   if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
6644     __kmp_dflt_team_nth = __kmp_sys_max_nth;
6645   }
6646 
6647   // There's no harm in continuing if the following check fails,
6648   // but it indicates an error in the previous logic.
6649   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
6650 
6651   if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
6652     // Run through the __kmp_threads array and set the num threads icv for each
6653     // root thread that is currently registered with the RTL (which has not
6654     // already explicitly set its nthreads-var with a call to
6655     // omp_set_num_threads()).
6656     for (i = 0; i < __kmp_threads_capacity; i++) {
6657       kmp_info_t *thread = __kmp_threads[i];
6658       if (thread == NULL)
6659         continue;
6660       if (thread->th.th_current_task->td_icvs.nproc != 0)
6661         continue;
6662 
6663       set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
6664     }
6665   }
6666   KA_TRACE(
6667       20,
6668       ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
6669        __kmp_dflt_team_nth));
6670 
6671 #ifdef KMP_ADJUST_BLOCKTIME
6672   /* Adjust blocktime to zero if necessary  now that __kmp_avail_proc is set */
6673   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6674     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6675     if (__kmp_nth > __kmp_avail_proc) {
6676       __kmp_zero_bt = TRUE;
6677     }
6678   }
6679 #endif /* KMP_ADJUST_BLOCKTIME */
6680 
6681   /* we have finished middle initialization */
6682   TCW_SYNC_4(__kmp_init_middle, TRUE);
6683 
6684   KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
6685 }
6686 
6687 void __kmp_middle_initialize(void) {
6688   if (__kmp_init_middle) {
6689     return;
6690   }
6691   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6692   if (__kmp_init_middle) {
6693     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6694     return;
6695   }
6696   __kmp_do_middle_initialize();
6697   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6698 }
6699 
6700 void __kmp_parallel_initialize(void) {
6701   int gtid = __kmp_entry_gtid(); // this might be a new root
6702 
6703   /* synchronize parallel initialization (for sibling) */
6704   if (TCR_4(__kmp_init_parallel))
6705     return;
6706   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6707   if (TCR_4(__kmp_init_parallel)) {
6708     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6709     return;
6710   }
6711 
6712   /* TODO reinitialization after we have already shut down */
6713   if (TCR_4(__kmp_global.g.g_done)) {
6714     KA_TRACE(
6715         10,
6716         ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
6717     __kmp_infinite_loop();
6718   }
6719 
6720   /* jc: The lock __kmp_initz_lock is already held, so calling
6721      __kmp_serial_initialize would cause a deadlock.  So we call
6722      __kmp_do_serial_initialize directly. */
6723   if (!__kmp_init_middle) {
6724     __kmp_do_middle_initialize();
6725   }
6726 
6727   /* begin initialization */
6728   KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
6729   KMP_ASSERT(KMP_UBER_GTID(gtid));
6730 
6731 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6732   // Save the FP control regs.
6733   // Worker threads will set theirs to these values at thread startup.
6734   __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
6735   __kmp_store_mxcsr(&__kmp_init_mxcsr);
6736   __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
6737 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
6738 
6739 #if KMP_OS_UNIX
6740 #if KMP_HANDLE_SIGNALS
6741   /*  must be after __kmp_serial_initialize  */
6742   __kmp_install_signals(TRUE);
6743 #endif
6744 #endif
6745 
6746   __kmp_suspend_initialize();
6747 
6748 #if defined(USE_LOAD_BALANCE)
6749   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6750     __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
6751   }
6752 #else
6753   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6754     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
6755   }
6756 #endif
6757 
6758   if (__kmp_version) {
6759     __kmp_print_version_2();
6760   }
6761 
6762   /* we have finished parallel initialization */
6763   TCW_SYNC_4(__kmp_init_parallel, TRUE);
6764 
6765   KMP_MB();
6766   KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
6767 
6768   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6769 }
6770 
6771 /* ------------------------------------------------------------------------ */
6772 
6773 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6774                                    kmp_team_t *team) {
6775   kmp_disp_t *dispatch;
6776 
6777   KMP_MB();
6778 
6779   /* none of the threads have encountered any constructs, yet. */
6780   this_thr->th.th_local.this_construct = 0;
6781 #if KMP_CACHE_MANAGE
6782   KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
6783 #endif /* KMP_CACHE_MANAGE */
6784   dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
6785   KMP_DEBUG_ASSERT(dispatch);
6786   KMP_DEBUG_ASSERT(team->t.t_dispatch);
6787   // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
6788   // this_thr->th.th_info.ds.ds_tid ] );
6789 
6790   dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
6791 #if OMP_45_ENABLED
6792   dispatch->th_doacross_buf_idx =
6793       0; /* reset the doacross dispatch buffer counter */
6794 #endif
6795   if (__kmp_env_consistency_check)
6796     __kmp_push_parallel(gtid, team->t.t_ident);
6797 
6798   KMP_MB(); /* Flush all pending memory write invalidates.  */
6799 }
6800 
6801 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6802                                   kmp_team_t *team) {
6803   if (__kmp_env_consistency_check)
6804     __kmp_pop_parallel(gtid, team->t.t_ident);
6805 
6806   __kmp_finish_implicit_task(this_thr);
6807 }
6808 
6809 int __kmp_invoke_task_func(int gtid) {
6810   int rc;
6811   int tid = __kmp_tid_from_gtid(gtid);
6812   kmp_info_t *this_thr = __kmp_threads[gtid];
6813   kmp_team_t *team = this_thr->th.th_team;
6814 
6815   __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
6816 #if USE_ITT_BUILD
6817   if (__itt_stack_caller_create_ptr) {
6818     __kmp_itt_stack_callee_enter(
6819         (__itt_caller)
6820             team->t.t_stack_id); // inform ittnotify about entering user's code
6821   }
6822 #endif /* USE_ITT_BUILD */
6823 #if INCLUDE_SSC_MARKS
6824   SSC_MARK_INVOKING();
6825 #endif
6826 
6827 #if OMPT_SUPPORT
6828   void *dummy;
6829   void **exit_runtime_p;
6830   ompt_task_id_t my_task_id;
6831   ompt_parallel_id_t my_parallel_id;
6832 
6833   if (ompt_enabled) {
6834     exit_runtime_p = &(team->t.t_implicit_task_taskdata[tid]
6835                            .ompt_task_info.frame.exit_runtime_frame);
6836   } else {
6837     exit_runtime_p = &dummy;
6838   }
6839 
6840 #if OMPT_TRACE
6841   my_task_id = team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id;
6842   my_parallel_id = team->t.ompt_team_info.parallel_id;
6843   if (ompt_enabled &&
6844       ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
6845     ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(my_parallel_id,
6846                                                                  my_task_id);
6847   }
6848 #endif
6849 #endif
6850 
6851   {
6852     KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
6853     KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
6854     rc =
6855         __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
6856                                tid, (int)team->t.t_argc, (void **)team->t.t_argv
6857 #if OMPT_SUPPORT
6858                                ,
6859                                exit_runtime_p
6860 #endif
6861                                );
6862 #if OMPT_SUPPORT
6863     *exit_runtime_p = NULL;
6864 #endif
6865   }
6866 
6867 #if USE_ITT_BUILD
6868   if (__itt_stack_caller_create_ptr) {
6869     __kmp_itt_stack_callee_leave(
6870         (__itt_caller)
6871             team->t.t_stack_id); // inform ittnotify about leaving user's code
6872   }
6873 #endif /* USE_ITT_BUILD */
6874   __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
6875 
6876   return rc;
6877 }
6878 
6879 #if OMP_40_ENABLED
6880 void __kmp_teams_master(int gtid) {
6881   // This routine is called by all master threads in teams construct
6882   kmp_info_t *thr = __kmp_threads[gtid];
6883   kmp_team_t *team = thr->th.th_team;
6884   ident_t *loc = team->t.t_ident;
6885   thr->th.th_set_nproc = thr->th.th_teams_size.nth;
6886   KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
6887   KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
6888   KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
6889                 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
6890 // Launch the league of teams now, but do not let workers execute
6891 // (they hang on the fork barrier until the next parallel region)
6892 #if INCLUDE_SSC_MARKS
6893   SSC_MARK_FORKING();
6894 #endif
6895   __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
6896 #if OMPT_SUPPORT
6897                   (void *)thr->th.th_teams_microtask, // "unwrapped" task
6898 #endif
6899                   (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
6900                   VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
6901 #if INCLUDE_SSC_MARKS
6902   SSC_MARK_JOINING();
6903 #endif
6904 
6905   // AC: last parameter "1" eliminates join barrier which won't work because
6906   // worker threads are in a fork barrier waiting for more parallel regions
6907   __kmp_join_call(loc, gtid
6908 #if OMPT_SUPPORT
6909                   ,
6910                   fork_context_intel
6911 #endif
6912                   ,
6913                   1);
6914 }
6915 
6916 int __kmp_invoke_teams_master(int gtid) {
6917   kmp_info_t *this_thr = __kmp_threads[gtid];
6918   kmp_team_t *team = this_thr->th.th_team;
6919 #if KMP_DEBUG
6920   if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
6921     KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
6922                      (void *)__kmp_teams_master);
6923 #endif
6924   __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
6925   __kmp_teams_master(gtid);
6926   __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
6927   return 1;
6928 }
6929 #endif /* OMP_40_ENABLED */
6930 
6931 /* This sets the requested number of threads for the next parallel region
6932    encountered by this team. Since this should be enclosed in the forkjoin
6933    critical section, it should avoid race conditions with asymmetrical nested
6934    parallelism. */
6935 
6936 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
6937   kmp_info_t *thr = __kmp_threads[gtid];
6938 
6939   if (num_threads > 0)
6940     thr->th.th_set_nproc = num_threads;
6941 }
6942 
6943 #if OMP_40_ENABLED
6944 
6945 /* This sets the requested number of teams for the teams region and/or
6946    the number of threads for the next parallel region encountered. */
6947 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
6948                           int num_threads) {
6949   kmp_info_t *thr = __kmp_threads[gtid];
6950   KMP_DEBUG_ASSERT(num_teams >= 0);
6951   KMP_DEBUG_ASSERT(num_threads >= 0);
6952 
6953   if (num_teams == 0)
6954     num_teams = 1; // default number of teams is 1.
6955   if (num_teams > __kmp_max_nth) { // if too many teams requested?
6956     if (!__kmp_reserve_warn) {
6957       __kmp_reserve_warn = 1;
6958       __kmp_msg(kmp_ms_warning,
6959                 KMP_MSG(CantFormThrTeam, num_teams, __kmp_max_nth),
6960                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
6961     }
6962     num_teams = __kmp_max_nth;
6963   }
6964   // Set number of teams (number of threads in the outer "parallel" of the
6965   // teams)
6966   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
6967 
6968   // Remember the number of threads for inner parallel regions
6969   if (num_threads == 0) {
6970     if (!TCR_4(__kmp_init_middle))
6971       __kmp_middle_initialize(); // get __kmp_avail_proc calculated
6972     num_threads = __kmp_avail_proc / num_teams;
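    // e.g. (illustrative) __kmp_avail_proc == 16 and num_teams == 4 gives
    // 4 threads per team, before the __kmp_max_nth check below.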
6973     if (num_teams * num_threads > __kmp_max_nth) {
6974       // adjust num_threads w/o warning as it is not user setting
6975       num_threads = __kmp_max_nth / num_teams;
6976     }
6977   } else {
6978     if (num_teams * num_threads > __kmp_max_nth) {
6979       int new_threads = __kmp_max_nth / num_teams;
6980       if (!__kmp_reserve_warn) { // user asked for too many threads
6981         __kmp_reserve_warn = 1; // that conflicts with OMP_THREAD_LIMIT
6982         __kmp_msg(kmp_ms_warning,
6983                   KMP_MSG(CantFormThrTeam, num_threads, new_threads),
6984                   KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
6985       }
6986       num_threads = new_threads;
6987     }
6988   }
6989   thr->th.th_teams_size.nth = num_threads;
6990 }
6991 
6992 // Set the proc_bind var to use in the following parallel region.
6993 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
6994   kmp_info_t *thr = __kmp_threads[gtid];
6995   thr->th.th_set_proc_bind = proc_bind;
6996 }
6997 
6998 #endif /* OMP_40_ENABLED */
6999 
7000 /* Launch the worker threads into the microtask. */
7001 
7002 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7003   kmp_info_t *this_thr = __kmp_threads[gtid];
7004 
7005 #ifdef KMP_DEBUG
7006   int f;
7007 #endif /* KMP_DEBUG */
7008 
7009   KMP_DEBUG_ASSERT(team);
7010   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7011   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7012   KMP_MB(); /* Flush all pending memory write invalidates.  */
7013 
7014   team->t.t_construct = 0; /* no single directives seen yet */
7015   team->t.t_ordered.dt.t_value =
7016       0; /* thread 0 enters the ordered section first */
7017 
7018   /* Reset the identifiers on the dispatch buffer */
7019   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7020   if (team->t.t_max_nproc > 1) {
7021     int i;
7022     for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7023       team->t.t_disp_buffer[i].buffer_index = i;
7024 #if OMP_45_ENABLED
7025       team->t.t_disp_buffer[i].doacross_buf_idx = i;
7026 #endif
7027     }
7028   } else {
7029     team->t.t_disp_buffer[0].buffer_index = 0;
7030 #if OMP_45_ENABLED
7031     team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7032 #endif
7033   }
7034 
7035   KMP_MB(); /* Flush all pending memory write invalidates.  */
7036   KMP_ASSERT(this_thr->th.th_team == team);
7037 
7038 #ifdef KMP_DEBUG
7039   for (f = 0; f < team->t.t_nproc; f++) {
7040     KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7041                      team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7042   }
7043 #endif /* KMP_DEBUG */
7044 
7045   /* release the worker threads so they may begin working */
7046   __kmp_fork_barrier(gtid, 0);
7047 }
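
// Informal sketch of the buffer reset above (annotation): dynamically
// scheduled worksharing constructs select a dispatch buffer modulo
// __kmp_dispatch_num_buffers and wait until the buffer's recorded index
// matches the construct's own counter, so seeding buffer_index (and, under
// OMP 4.5, doacross_buf_idx) with 0..N-1 gives the first N constructs of the
// new region buffers that are already in the expected state.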
7048 
7049 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7050   kmp_info_t *this_thr = __kmp_threads[gtid];
7051 
7052   KMP_DEBUG_ASSERT(team);
7053   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7054   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7055   KMP_MB(); /* Flush all pending memory write invalidates.  */
7056 
7057 /* Join barrier after fork */
7058 
7059 #ifdef KMP_DEBUG
7060   if (__kmp_threads[gtid] &&
7061       __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7062     __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7063                  __kmp_threads[gtid]);
7064     __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7065                  "team->t.t_nproc=%d\n",
7066                  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7067                  team->t.t_nproc);
7068     __kmp_print_structure();
7069   }
7070   KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7071                    __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7072 #endif /* KMP_DEBUG */
7073 
7074   __kmp_join_barrier(gtid); /* wait for everyone */
7075 
7076   KMP_MB(); /* Flush all pending memory write invalidates.  */
7077   KMP_ASSERT(this_thr->th.th_team == team);
7078 }
7079 
7080 /* ------------------------------------------------------------------------ */
7081 
7082 #ifdef USE_LOAD_BALANCE
7083 
7084 // Return the number of worker threads actively spinning in the hot team,
7085 // if we are at the outermost level of parallelism. Otherwise, return 0.
7086 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7087   int i;
7088   int retval;
7089   kmp_team_t *hot_team;
7090 
7091   if (root->r.r_active) {
7092     return 0;
7093   }
7094   hot_team = root->r.r_hot_team;
7095   if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7096     return hot_team->t.t_nproc - 1; // Don't count master thread
7097   }
7098 
7099   // Skip the master thread - it is accounted for elsewhere.
7100   retval = 0;
7101   for (i = 1; i < hot_team->t.t_nproc; i++) {
7102     if (hot_team->t.t_threads[i]->th.th_active) {
7103       retval++;
7104     }
7105   }
7106   return retval;
7107 }
7108 
7109 // Perform an automatic adjustment to the number of
7110 // threads used by the next parallel region.
7111 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7112   int retval;
7113   int pool_active;
7114   int hot_team_active;
7115   int team_curr_active;
7116   int system_active;
7117 
7118   KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7119                 set_nproc));
7120   KMP_DEBUG_ASSERT(root);
7121   KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7122                        ->th.th_current_task->td_icvs.dynamic == TRUE);
7123   KMP_DEBUG_ASSERT(set_nproc > 1);
7124 
7125   if (set_nproc == 1) {
7126     KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7127     return 1;
7128   }
7129 
7130   // Threads active in the thread pool, threads active in the hot team for
7131   // this particular root (if we are at the outermost parallel level), and
7132   // the currently executing thread (which will become the master) are all
7133   // available to add to the new team, but they currently contribute to the
7134   // system load and must be accounted for.
7135   pool_active = TCR_4(__kmp_thread_pool_active_nth);
7136   hot_team_active = __kmp_active_hot_team_nproc(root);
7137   team_curr_active = pool_active + hot_team_active + 1;
7138 
7139   // Check the system load.
7140   system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7141   KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7142                 "hot team active = %d\n",
7143                 system_active, pool_active, hot_team_active));
7144 
7145   if (system_active < 0) {
7146     // There was an error reading the necessary info from /proc, so use the
7147     // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7148     // = dynamic_thread_limit, we shouldn't wind up getting back here.
7149     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7150     KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7151 
7152     // Make this call behave like the thread limit algorithm.
7153     retval = __kmp_avail_proc - __kmp_nth +
7154              (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7155     if (retval > set_nproc) {
7156       retval = set_nproc;
7157     }
7158     if (retval < KMP_MIN_NTH) {
7159       retval = KMP_MIN_NTH;
7160     }
7161 
7162     KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7163                   retval));
7164     return retval;
7165   }
7166 
7167   // There is a slight delay in the load balance algorithm in detecting new
7168   // running procs. The real system load at this instant should be at least
7169   // as large as the number of active OpenMP threads we can add to the team.
7170   if (system_active < team_curr_active) {
7171     system_active = team_curr_active;
7172   }
7173   retval = __kmp_avail_proc - system_active + team_curr_active;
7174   if (retval > set_nproc) {
7175     retval = set_nproc;
7176   }
7177   if (retval < KMP_MIN_NTH) {
7178     retval = KMP_MIN_NTH;
7179   }
7180 
7181   KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7182   return retval;
7183 } // __kmp_load_balance_nproc()
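
// Worked example (illustrative; all numbers are assumptions): suppose
// __kmp_avail_proc == 8, the thread pool has 2 active threads, the hot team
// contributes 1 active worker, and __kmp_get_load_balance() reports 7 running
// threads system-wide. Then team_curr_active = 2 + 1 + 1 = 4 and
// retval = 8 - 7 + 4 = 5, which is then clipped to set_nproc if the user
// asked for fewer threads and raised to KMP_MIN_NTH if the system is heavily
// oversubscribed.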
7184 
7185 #endif /* USE_LOAD_BALANCE */
7186 
7187 /* ------------------------------------------------------------------------ */
7188 
7189 /* NOTE: this is called with the __kmp_init_lock held */
7190 void __kmp_cleanup(void) {
7191   int f;
7192 
7193   KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7194 
7195   if (TCR_4(__kmp_init_parallel)) {
7196 #if KMP_HANDLE_SIGNALS
7197     __kmp_remove_signals();
7198 #endif
7199     TCW_4(__kmp_init_parallel, FALSE);
7200   }
7201 
7202   if (TCR_4(__kmp_init_middle)) {
7203 #if KMP_AFFINITY_SUPPORTED
7204     __kmp_affinity_uninitialize();
7205 #endif /* KMP_AFFINITY_SUPPORTED */
7206     __kmp_cleanup_hierarchy();
7207     TCW_4(__kmp_init_middle, FALSE);
7208   }
7209 
7210   KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7211 
7212   if (__kmp_init_serial) {
7213     __kmp_runtime_destroy();
7214     __kmp_init_serial = FALSE;
7215   }
7216 
7217   for (f = 0; f < __kmp_threads_capacity; f++) {
7218     if (__kmp_root[f] != NULL) {
7219       __kmp_free(__kmp_root[f]);
7220       __kmp_root[f] = NULL;
7221     }
7222   }
7223   __kmp_free(__kmp_threads);
7224   // __kmp_threads and __kmp_root were allocated as a single block, so there
7225   // is no need to free __kmp_root separately.
7226   __kmp_threads = NULL;
7227   __kmp_root = NULL;
7228   __kmp_threads_capacity = 0;
7229 
7230 #if KMP_USE_DYNAMIC_LOCK
7231   __kmp_cleanup_indirect_user_locks();
7232 #else
7233   __kmp_cleanup_user_locks();
7234 #endif
7235 
7236 #if KMP_AFFINITY_SUPPORTED
7237   KMP_INTERNAL_FREE((void *)__kmp_cpuinfo_file);
7238   __kmp_cpuinfo_file = NULL;
7239 #endif /* KMP_AFFINITY_SUPPORTED */
7240 
7241 #if KMP_USE_ADAPTIVE_LOCKS
7242 #if KMP_DEBUG_ADAPTIVE_LOCKS
7243   __kmp_print_speculative_stats();
7244 #endif
7245 #endif
7246   KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7247   __kmp_nested_nth.nth = NULL;
7248   __kmp_nested_nth.size = 0;
7249   __kmp_nested_nth.used = 0;
7250   KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7251   __kmp_nested_proc_bind.bind_types = NULL;
7252   __kmp_nested_proc_bind.size = 0;
7253   __kmp_nested_proc_bind.used = 0;
7254 
7255   __kmp_i18n_catclose();
7256 
7257 #if KMP_STATS_ENABLED
7258   __kmp_stats_fini();
7259 #endif
7260 
7261   KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7262 }
7263 
7264 /* ------------------------------------------------------------------------ */
7265 
7266 int __kmp_ignore_mppbeg(void) {
7267   char *env;
7268 
7269   if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7270     if (__kmp_str_match_false(env))
7271       return FALSE;
7272   }
7273   // By default, __kmpc_begin() is a no-op.
7274   return TRUE;
7275 }
7276 
7277 int __kmp_ignore_mppend(void) {
7278   char *env;
7279 
7280   if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7281     if (__kmp_str_match_false(env))
7282       return FALSE;
7283   }
7284   // By default, __kmpc_end() is a no-op.
7285   return TRUE;
7286 }
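
// Illustrative usage (annotation): by default both guards return TRUE, so
// __kmpc_begin()/__kmpc_end() do nothing. Running with, for example,
//   KMP_IGNORE_MPPEND=false ./app
// makes __kmp_ignore_mppend() return FALSE, letting __kmpc_end() perform its
// shutdown work; any spelling recognized by __kmp_str_match_false() has the
// same effect.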
7287 
7288 void __kmp_internal_begin(void) {
7289   int gtid;
7290   kmp_root_t *root;
7291 
7292   /* This is an important step: it registers new sibling threads and assigns
7293      each new uber thread a gtid. */
7294   gtid = __kmp_entry_gtid();
7295   root = __kmp_threads[gtid]->th.th_root;
7296   KMP_ASSERT(KMP_UBER_GTID(gtid));
7297 
7298   if (root->r.r_begin)
7299     return;
7300   __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7301   if (root->r.r_begin) {
7302     __kmp_release_lock(&root->r.r_begin_lock, gtid);
7303     return;
7304   }
7305 
7306   root->r.r_begin = TRUE;
7307 
7308   __kmp_release_lock(&root->r.r_begin_lock, gtid);
7309 }
7310 
7311 /* ------------------------------------------------------------------------ */
7312 
7313 void __kmp_user_set_library(enum library_type arg) {
7314   int gtid;
7315   kmp_root_t *root;
7316   kmp_info_t *thread;
7317 
7318   /* first, make sure we are initialized so we can get our gtid */
7319 
7320   gtid = __kmp_entry_gtid();
7321   thread = __kmp_threads[gtid];
7322 
7323   root = thread->th.th_root;
7324 
7325   KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7326                 library_serial));
7327   if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7328                                   thread */
7329     KMP_WARNING(SetLibraryIncorrectCall);
7330     return;
7331   }
7332 
7333   switch (arg) {
7334   case library_serial:
7335     thread->th.th_set_nproc = 0;
7336     set__nproc(thread, 1);
7337     break;
7338   case library_turnaround:
7339     thread->th.th_set_nproc = 0;
7340     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7341                                            : __kmp_dflt_team_nth_ub);
7342     break;
7343   case library_throughput:
7344     thread->th.th_set_nproc = 0;
7345     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7346                                            : __kmp_dflt_team_nth_ub);
7347     break;
7348   default:
7349     KMP_FATAL(UnknownLibraryType, arg);
7350   }
7351 
7352   __kmp_aux_set_library(arg);
7353 }
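
// Illustrative note (annotation): this routine typically sits behind the
// kmp_set_library()/kmp_set_library_serial()/... user entry points. Selecting
// library_turnaround or library_throughput clears any pending num_threads
// request and restores the default team size (__kmp_dflt_team_nth if set,
// otherwise __kmp_dflt_team_nth_ub), while library_serial pins the next
// parallel region to a single thread.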
7354 
7355 void __kmp_aux_set_stacksize(size_t arg) {
7356   if (!__kmp_init_serial)
7357     __kmp_serial_initialize();
7358 
7359 #if KMP_OS_DARWIN
7360   if (arg & (0x1000 - 1)) {
7361     arg &= ~(0x1000 - 1);
7362     if (arg + 0x1000) /* check for overflow if we round up */
7363       arg += 0x1000;
7364   }
7365 #endif
7366   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7367 
7368   /* only change the default stacksize before the first parallel region */
7369   if (!TCR_4(__kmp_init_parallel)) {
7370     size_t value = arg; /* argument is in bytes */
7371 
7372     if (value < __kmp_sys_min_stksize)
7373       value = __kmp_sys_min_stksize;
7374     else if (value > KMP_MAX_STKSIZE)
7375       value = KMP_MAX_STKSIZE;
7376 
7377     __kmp_stksize = value;
7378 
7379     __kmp_env_stksize = TRUE; /* treat as if KMP_STACKSIZE had been set */
7380   }
7381 
7382   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7383 }
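
// Worked example (illustrative): on Darwin a request of 5000 bytes is rounded
// up to the next 4 KiB boundary: 5000 & ~0xFFF == 4096, and adding 0x1000
// yields 8192. The result is still clamped into [__kmp_sys_min_stksize,
// KMP_MAX_STKSIZE], and the new value takes effect only if no parallel region
// has started yet (__kmp_init_parallel is still FALSE).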
7384 
7385 /* set the behaviour of the runtime library */
7386 /* TODO this can cause some odd behaviour with sibling parallelism... */
7387 void __kmp_aux_set_library(enum library_type arg) {
7388   __kmp_library = arg;
7389 
7390   switch (__kmp_library) {
7391   case library_serial: {
7392     KMP_INFORM(LibraryIsSerial);
7393     (void)__kmp_change_library(TRUE);
7394   } break;
7395   case library_turnaround:
7396     (void)__kmp_change_library(TRUE);
7397     break;
7398   case library_throughput:
7399     (void)__kmp_change_library(FALSE);
7400     break;
7401   default:
7402     KMP_FATAL(UnknownLibraryType, arg);
7403   }
7404 }
7405 
7406 /* ------------------------------------------------------------------------ */
7407 
7408 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
7409   int blocktime = arg; /* argument is in milliseconds */
7410 #if KMP_USE_MONITOR
7411   int bt_intervals;
7412 #endif
7413   int bt_set;
7414 
7415   __kmp_save_internal_controls(thread);
7416 
7417   /* Normalize and set blocktime for the teams */
7418   if (blocktime < KMP_MIN_BLOCKTIME)
7419     blocktime = KMP_MIN_BLOCKTIME;
7420   else if (blocktime > KMP_MAX_BLOCKTIME)
7421     blocktime = KMP_MAX_BLOCKTIME;
7422 
7423   set__blocktime_team(thread->th.th_team, tid, blocktime);
7424   set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
7425 
7426 #if KMP_USE_MONITOR
7427   /* Calculate and set blocktime intervals for the teams */
7428   bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
7429 
7430   set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
7431   set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
7432 #endif
7433 
7434   /* Record that blocktime has been explicitly set */
7435   bt_set = TRUE;
7436 
7437   set__bt_set_team(thread->th.th_team, tid, bt_set);
7438   set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
7439 #if KMP_USE_MONITOR
7440   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
7441                 "bt_intervals=%d, monitor_updates=%d\n",
7442                 __kmp_gtid_from_tid(tid, thread->th.th_team),
7443                 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
7444                 __kmp_monitor_wakeups));
7445 #else
7446   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
7447                 __kmp_gtid_from_tid(tid, thread->th.th_team),
7448                 thread->th.th_team->t.t_id, tid, blocktime));
7449 #endif
7450 }
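
// Illustrative note (annotation): this routine typically backs the
// kmp_set_blocktime() user API (see the trace tag above). A call such as
// kmp_set_blocktime(0) makes the calling thread's team members go to sleep
// immediately after finishing parallel work instead of busy-waiting, while an
// out-of-range request is first clamped into [KMP_MIN_BLOCKTIME,
// KMP_MAX_BLOCKTIME]; the normalized value is applied to both the current
// team slot and the calling thread's serial team.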
7451 
7452 void __kmp_aux_set_defaults(char const *str, int len) {
7453   if (!__kmp_init_serial) {
7454     __kmp_serial_initialize();
7455   }
7456   __kmp_env_initialize(str);
7457 
7458   if (__kmp_settings
7459 #if OMP_40_ENABLED
7460       || __kmp_display_env || __kmp_display_env_verbose
7461 #endif // OMP_40_ENABLED
7462       ) {
7463     __kmp_env_print();
7464   }
7465 } // __kmp_aux_set_defaults
7466 
7467 /* ------------------------------------------------------------------------ */
7468 /* internal fast reduction routines */
7469 
7470 PACKED_REDUCTION_METHOD_T
7471 __kmp_determine_reduction_method(
7472     ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
7473     void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
7474     kmp_critical_name *lck) {
7475 
7476   // Default reduction method: the critical construct (lck != NULL, as in the
7477   // current PAROPT).
7478   // If reduce_data != NULL and reduce_func != NULL, the RTL may select the
7479   // tree-reduction method.
7480   // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the RTL may select the
7481   // atomic-reduce method.
7482   // Ultimately it is up to the OpenMP RTL to decide which of the methods
7483   // generated by PAROPT to use.
7484 
7485   PACKED_REDUCTION_METHOD_T retval;
7486 
7487   int team_size;
7488 
7489   KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
7490   KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
7491 
7492 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
7493   ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
7494 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
7495 
7496   retval = critical_reduce_block;
7497 
7498   // Another way of getting the team size (one dynamic dereference) is slower.
7499   team_size = __kmp_get_team_num_threads(global_tid);
7500   if (team_size == 1) {
7501 
7502     retval = empty_reduce_block;
7503 
7504   } else {
7505 
7506     int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
7507     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
7508 
7509 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64
7510 
7511 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_WINDOWS ||       \
7512     KMP_OS_DARWIN
7513 
7514     int teamsize_cutoff = 4;
7515 
7516 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
7517     if (__kmp_mic_type != non_mic) {
7518       teamsize_cutoff = 8;
7519     }
7520 #endif
7521     if (tree_available) {
7522       if (team_size <= teamsize_cutoff) {
7523         if (atomic_available) {
7524           retval = atomic_reduce_block;
7525         }
7526       } else {
7527         retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
7528       }
7529     } else if (atomic_available) {
7530       retval = atomic_reduce_block;
7531     }
7532 #else
7533 #error "Unknown or unsupported OS"
7534 #endif // KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_WINDOWS ||
7535 // KMP_OS_DARWIN
7536 
7537 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
7538 
7539 #if KMP_OS_LINUX || KMP_OS_WINDOWS
7540 
7541     // basic tuning
7542 
7543     if (atomic_available) {
7544       if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
7545         retval = atomic_reduce_block;
7546       }
7547     } // otherwise: use critical section
7548 
7549 #elif KMP_OS_DARWIN
7550 
7551     if (atomic_available && (num_vars <= 3)) {
7552       retval = atomic_reduce_block;
7553     } else if (tree_available) {
7554       if ((reduce_size > (9 * sizeof(kmp_real64))) &&
7555           (reduce_size < (2000 * sizeof(kmp_real64)))) {
7556         retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
7557       }
7558     } // otherwise: use critical section
7559 
7560 #else
7561 #error "Unknown or unsupported OS"
7562 #endif
7563 
7564 #else
7565 #error "Unknown or unsupported architecture"
7566 #endif
7567   }
7568 
7569   // KMP_FORCE_REDUCTION
7570 
7571   // If the team is serialized (team_size == 1), ignore the forced reduction
7572   // method and stay with the unsynchronized method (empty_reduce_block)
7573   if (__kmp_force_reduction_method != reduction_method_not_defined &&
7574       team_size != 1) {
7575 
7576     PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
7577 
7578     int atomic_available, tree_available;
7579 
7580     switch ((forced_retval = __kmp_force_reduction_method)) {
7581     case critical_reduce_block:
7582       KMP_ASSERT(lck); // lck should be != 0
7583       break;
7584 
7585     case atomic_reduce_block:
7586       atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
7587       if (!atomic_available) {
7588         KMP_WARNING(RedMethodNotSupported, "atomic");
7589         forced_retval = critical_reduce_block;
7590       }
7591       break;
7592 
7593     case tree_reduce_block:
7594       tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
7595       if (!tree_available) {
7596         KMP_WARNING(RedMethodNotSupported, "tree");
7597         forced_retval = critical_reduce_block;
7598       } else {
7599 #if KMP_FAST_REDUCTION_BARRIER
7600         forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
7601 #endif
7602       }
7603       break;
7604 
7605     default:
7606       KMP_ASSERT(0); // "unsupported method specified"
7607     }
7608 
7609     retval = forced_retval;
7610   }
7611 
7612   KA_TRACE(10, ("reduction method selected=%08x\n", retval));
7613 
7614 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
7615 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
7616 
7617   return (retval);
7618 }
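
// Worked example (illustrative; platform and sizes are assumptions): on an
// x86_64 Linux host that is not a MIC, teamsize_cutoff is 4. Absent a
// KMP_FORCE_REDUCTION override, a reduction in a team of 16 threads with
// compiler-generated tree data (reduce_data and reduce_func non-NULL) selects
// TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER; the same reduction in a team of 2
// threads with KMP_IDENT_ATOMIC_REDUCE set in loc->flags selects
// atomic_reduce_block, and a serialized team (team_size == 1) always gets
// empty_reduce_block.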
7619 
7620 // This function is used for testing the set/get/determine reduce method.
7621 kmp_int32 __kmp_get_reduce_method(void) {
7622   return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
7623 }
7624