1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
29 #endif
30 
31 #if OMPT_SUPPORT
32 #include "ompt-specific.h"
33 #endif
34 
35 /* these are temporary issues to be dealt with */
36 #define KMP_USE_PRCTL 0
37 
38 #if KMP_OS_WINDOWS
39 #include <process.h>
40 #endif
41 
42 #include "tsan_annotations.h"
43 
44 #if KMP_OS_WINDOWS
45 // windows does not need include files as it doesn't use shared memory
46 #else
47 #include <sys/mman.h>
48 #include <sys/stat.h>
49 #include <fcntl.h>
50 #define SHM_SIZE 1024
51 #endif
52 
53 #if defined(KMP_GOMP_COMPAT)
54 char const __kmp_version_alt_comp[] =
55     KMP_VERSION_PREFIX "alternative compiler support: yes";
56 #endif /* defined(KMP_GOMP_COMPAT) */
57 
58 char const __kmp_version_omp_api[] =
59     KMP_VERSION_PREFIX "API version: 5.0 (201611)";
60 
61 #ifdef KMP_DEBUG
62 char const __kmp_version_lock[] =
63     KMP_VERSION_PREFIX "lock type: run time selectable";
64 #endif /* KMP_DEBUG */
65 
66 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
67 
68 /* ------------------------------------------------------------------------ */
69 
70 #if KMP_USE_MONITOR
71 kmp_info_t __kmp_monitor;
72 #endif
73 
74 /* Forward declarations */
75 
76 void __kmp_cleanup(void);
77 
78 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
79                                   int gtid);
80 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
81                                   kmp_internal_control_t *new_icvs,
82                                   ident_t *loc);
83 #if KMP_AFFINITY_SUPPORTED
84 static void __kmp_partition_places(kmp_team_t *team,
85                                    int update_master_only = 0);
86 #endif
87 static void __kmp_do_serial_initialize(void);
88 void __kmp_fork_barrier(int gtid, int tid);
89 void __kmp_join_barrier(int gtid);
90 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
91                           kmp_internal_control_t *new_icvs, ident_t *loc);
92 
93 #ifdef USE_LOAD_BALANCE
94 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
95 #endif
96 
97 static int __kmp_expand_threads(int nNeed);
98 #if KMP_OS_WINDOWS
99 static int __kmp_unregister_root_other_thread(int gtid);
100 #endif
101 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
102 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
103 
104 /* Calculate the identifier of the current thread */
105 /* fast (and somewhat portable) way to get unique identifier of executing
106    thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
107 int __kmp_get_global_thread_id() {
108   int i;
109   kmp_info_t **other_threads;
110   size_t stack_data;
111   char *stack_addr;
112   size_t stack_size;
113   char *stack_base;
114 
115   KA_TRACE(
116       1000,
117       ("*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
118        __kmp_nth, __kmp_all_nth));
119 
120   /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
121      a parallel region, made it return KMP_GTID_DNE to force serial_initialize
122      by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
123      __kmp_init_gtid for this to work. */
124 
125   if (!TCR_4(__kmp_init_gtid))
126     return KMP_GTID_DNE;
127 
128 #ifdef KMP_TDATA_GTID
129   if (TCR_4(__kmp_gtid_mode) >= 3) {
130     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
131     return __kmp_gtid;
132   }
133 #endif
134   if (TCR_4(__kmp_gtid_mode) >= 2) {
135     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
136     return __kmp_gtid_get_specific();
137   }
138   KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
139 
140   stack_addr = (char *)&stack_data;
141   other_threads = __kmp_threads;
142 
143   /* ATT: The code below is a source of potential bugs due to unsynchronized
144      access to __kmp_threads array. For example:
145      1. Current thread loads other_threads[i] to thr and checks it, it is
146         non-NULL.
147      2. Current thread is suspended by OS.
148      3. Another thread unregisters and finishes (debug versions of free()
149         may fill memory with something like 0xEF).
150      4. Current thread is resumed.
151      5. Current thread reads junk from *thr.
152      TODO: Fix it.  --ln  */
153 
154   for (i = 0; i < __kmp_threads_capacity; i++) {
155 
156     kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
157     if (!thr)
158       continue;
159 
160     stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
161     stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
162 
163     /* stack grows down -- search through all of the active threads */
164 
165     if (stack_addr <= stack_base) {
166       size_t stack_diff = stack_base - stack_addr;
167 
168       if (stack_diff <= stack_size) {
169         /* The only way we can be closer than the allocated */
170         /* stack size is if we are running on this thread. */
171         KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
172         return i;
173       }
174     }
175   }
176 
177   /* get specific to try and determine our gtid */
178   KA_TRACE(1000,
179            ("*** __kmp_get_global_thread_id: internal alg. failed to find "
180             "thread, using TLS\n"));
181   i = __kmp_gtid_get_specific();
182 
183   /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
184 
185   /* if we havn't been assigned a gtid, then return code */
186   if (i < 0)
187     return i;
188 
189   /* dynamically updated stack window for uber threads to avoid get_specific
190      call */
191   if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
192     KMP_FATAL(StackOverflow, i);
193   }
194 
195   stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
196   if (stack_addr > stack_base) {
197     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
198     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
199             other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
200                 stack_base);
201   } else {
202     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
203             stack_base - stack_addr);
204   }
205 
206   /* Reprint stack bounds for ubermaster since they have been refined */
207   if (__kmp_storage_map) {
208     char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
209     char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
210     __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
211                                  other_threads[i]->th.th_info.ds.ds_stacksize,
212                                  "th_%d stack (refinement)", i);
213   }
214   return i;
215 }
216 
217 int __kmp_get_global_thread_id_reg() {
218   int gtid;
219 
220   if (!__kmp_init_serial) {
221     gtid = KMP_GTID_DNE;
222   } else
223 #ifdef KMP_TDATA_GTID
224       if (TCR_4(__kmp_gtid_mode) >= 3) {
225     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
226     gtid = __kmp_gtid;
227   } else
228 #endif
229       if (TCR_4(__kmp_gtid_mode) >= 2) {
230     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
231     gtid = __kmp_gtid_get_specific();
232   } else {
233     KA_TRACE(1000,
234              ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
235     gtid = __kmp_get_global_thread_id();
236   }
237 
238   /* we must be a new uber master sibling thread */
239   if (gtid == KMP_GTID_DNE) {
240     KA_TRACE(10,
241              ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
242               "Registering a new gtid.\n"));
243     __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
244     if (!__kmp_init_serial) {
245       __kmp_do_serial_initialize();
246       gtid = __kmp_gtid_get_specific();
247     } else {
248       gtid = __kmp_register_root(FALSE);
249     }
250     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
251     /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
252   }
253 
254   KMP_DEBUG_ASSERT(gtid >= 0);
255 
256   return gtid;
257 }
258 
259 /* caller must hold forkjoin_lock */
260 void __kmp_check_stack_overlap(kmp_info_t *th) {
261   int f;
262   char *stack_beg = NULL;
263   char *stack_end = NULL;
264   int gtid;
265 
266   KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
267   if (__kmp_storage_map) {
268     stack_end = (char *)th->th.th_info.ds.ds_stackbase;
269     stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
270 
271     gtid = __kmp_gtid_from_thread(th);
272 
273     if (gtid == KMP_GTID_MONITOR) {
274       __kmp_print_storage_map_gtid(
275           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
276           "th_%s stack (%s)", "mon",
277           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
278     } else {
279       __kmp_print_storage_map_gtid(
280           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
281           "th_%d stack (%s)", gtid,
282           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
283     }
284   }
285 
286   /* No point in checking ubermaster threads since they use refinement and
287    * cannot overlap */
288   gtid = __kmp_gtid_from_thread(th);
289   if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
290     KA_TRACE(10,
291              ("__kmp_check_stack_overlap: performing extensive checking\n"));
292     if (stack_beg == NULL) {
293       stack_end = (char *)th->th.th_info.ds.ds_stackbase;
294       stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
295     }
296 
297     for (f = 0; f < __kmp_threads_capacity; f++) {
298       kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
299 
300       if (f_th && f_th != th) {
301         char *other_stack_end =
302             (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
303         char *other_stack_beg =
304             other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
305         if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
306             (stack_end > other_stack_beg && stack_end < other_stack_end)) {
307 
308           /* Print the other stack values before the abort */
309           if (__kmp_storage_map)
310             __kmp_print_storage_map_gtid(
311                 -1, other_stack_beg, other_stack_end,
312                 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
313                 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
314 
315           __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
316                       __kmp_msg_null);
317         }
318       }
319     }
320   }
321   KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
322 }
323 
324 /* ------------------------------------------------------------------------ */
325 
326 void __kmp_infinite_loop(void) {
327   static int done = FALSE;
328 
329   while (!done) {
330     KMP_YIELD(TRUE);
331   }
332 }
333 
334 #define MAX_MESSAGE 512
335 
336 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
337                                   char const *format, ...) {
338   char buffer[MAX_MESSAGE];
339   va_list ap;
340 
341   va_start(ap, format);
342   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
343                p2, (unsigned long)size, format);
344   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
345   __kmp_vprintf(kmp_err, buffer, ap);
346 #if KMP_PRINT_DATA_PLACEMENT
347   int node;
348   if (gtid >= 0) {
349     if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
350       if (__kmp_storage_map_verbose) {
351         node = __kmp_get_host_node(p1);
352         if (node < 0) /* doesn't work, so don't try this next time */
353           __kmp_storage_map_verbose = FALSE;
354         else {
355           char *last;
356           int lastNode;
357           int localProc = __kmp_get_cpu_from_gtid(gtid);
358 
359           const int page_size = KMP_GET_PAGE_SIZE();
360 
361           p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
362           p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
363           if (localProc >= 0)
364             __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
365                                  localProc >> 1);
366           else
367             __kmp_printf_no_lock("  GTID %d\n", gtid);
368 #if KMP_USE_PRCTL
369           /* The more elaborate format is disabled for now because of the prctl
370            * hanging bug. */
371           do {
372             last = p1;
373             lastNode = node;
374             /* This loop collates adjacent pages with the same host node. */
375             do {
376               (char *)p1 += page_size;
377             } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
378             __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
379                                  lastNode);
380           } while (p1 <= p2);
381 #else
382           __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
383                                (char *)p1 + (page_size - 1),
384                                __kmp_get_host_node(p1));
385           if (p1 < p2) {
386             __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
387                                  (char *)p2 + (page_size - 1),
388                                  __kmp_get_host_node(p2));
389           }
390 #endif
391         }
392       }
393     } else
394       __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
395   }
396 #endif /* KMP_PRINT_DATA_PLACEMENT */
397   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
398 }
399 
400 void __kmp_warn(char const *format, ...) {
401   char buffer[MAX_MESSAGE];
402   va_list ap;
403 
404   if (__kmp_generate_warnings == kmp_warnings_off) {
405     return;
406   }
407 
408   va_start(ap, format);
409 
410   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
411   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
412   __kmp_vprintf(kmp_err, buffer, ap);
413   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
414 
415   va_end(ap);
416 }
417 
418 void __kmp_abort_process() {
419   // Later threads may stall here, but that's ok because abort() will kill them.
420   __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
421 
422   if (__kmp_debug_buf) {
423     __kmp_dump_debug_buffer();
424   }
425 
426   if (KMP_OS_WINDOWS) {
427     // Let other threads know of abnormal termination and prevent deadlock
428     // if abort happened during library initialization or shutdown
429     __kmp_global.g.g_abort = SIGABRT;
430 
431     /* On Windows* OS by default abort() causes pop-up error box, which stalls
432        nightly testing. Unfortunately, we cannot reliably suppress pop-up error
433        boxes. _set_abort_behavior() works well, but this function is not
434        available in VS7 (this is not problem for DLL, but it is a problem for
435        static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not
436        help, at least in some versions of MS C RTL.
437 
438        It seems following sequence is the only way to simulate abort() and
439        avoid pop-up error box. */
440     raise(SIGABRT);
441     _exit(3); // Just in case, if signal ignored, exit anyway.
442   } else {
443     __kmp_unregister_library();
444     abort();
445   }
446 
447   __kmp_infinite_loop();
448   __kmp_release_bootstrap_lock(&__kmp_exit_lock);
449 
450 } // __kmp_abort_process
451 
452 void __kmp_abort_thread(void) {
453   // TODO: Eliminate g_abort global variable and this function.
454   // In case of abort just call abort(), it will kill all the threads.
455   __kmp_infinite_loop();
456 } // __kmp_abort_thread
457 
458 /* Print out the storage map for the major kmp_info_t thread data structures
459    that are allocated together. */
460 
461 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
462   __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
463                                gtid);
464 
465   __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
466                                sizeof(kmp_desc_t), "th_%d.th_info", gtid);
467 
468   __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
469                                sizeof(kmp_local_t), "th_%d.th_local", gtid);
470 
471   __kmp_print_storage_map_gtid(
472       gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
473       sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
474 
475   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
476                                &thr->th.th_bar[bs_plain_barrier + 1],
477                                sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
478                                gtid);
479 
480   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
481                                &thr->th.th_bar[bs_forkjoin_barrier + 1],
482                                sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
483                                gtid);
484 
485 #if KMP_FAST_REDUCTION_BARRIER
486   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
487                                &thr->th.th_bar[bs_reduction_barrier + 1],
488                                sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
489                                gtid);
490 #endif // KMP_FAST_REDUCTION_BARRIER
491 }
492 
493 /* Print out the storage map for the major kmp_team_t team data structures
494    that are allocated together. */
495 
496 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
497                                          int team_id, int num_thr) {
498   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
499   __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
500                                header, team_id);
501 
502   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
503                                &team->t.t_bar[bs_last_barrier],
504                                sizeof(kmp_balign_team_t) * bs_last_barrier,
505                                "%s_%d.t_bar", header, team_id);
506 
507   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
508                                &team->t.t_bar[bs_plain_barrier + 1],
509                                sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
510                                header, team_id);
511 
512   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
513                                &team->t.t_bar[bs_forkjoin_barrier + 1],
514                                sizeof(kmp_balign_team_t),
515                                "%s_%d.t_bar[forkjoin]", header, team_id);
516 
517 #if KMP_FAST_REDUCTION_BARRIER
518   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
519                                &team->t.t_bar[bs_reduction_barrier + 1],
520                                sizeof(kmp_balign_team_t),
521                                "%s_%d.t_bar[reduction]", header, team_id);
522 #endif // KMP_FAST_REDUCTION_BARRIER
523 
524   __kmp_print_storage_map_gtid(
525       -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
526       sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
527 
528   __kmp_print_storage_map_gtid(
529       -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
530       sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
531 
532   __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
533                                &team->t.t_disp_buffer[num_disp_buff],
534                                sizeof(dispatch_shared_info_t) * num_disp_buff,
535                                "%s_%d.t_disp_buffer", header, team_id);
536 }
537 
538 static void __kmp_init_allocator() { __kmp_init_memkind(); }
539 static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
540 
541 /* ------------------------------------------------------------------------ */
542 
543 #if KMP_DYNAMIC_LIB
544 #if KMP_OS_WINDOWS
545 
546 static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
547   // TODO: Change to __kmp_break_bootstrap_lock().
548   __kmp_init_bootstrap_lock(lck); // make the lock released
549 }
550 
551 static void __kmp_reset_locks_on_process_detach(int gtid_req) {
552   int i;
553   int thread_count;
554 
555   // PROCESS_DETACH is expected to be called by a thread that executes
556   // ProcessExit() or FreeLibrary(). OS terminates other threads (except the one
557   // calling ProcessExit or FreeLibrary). So, it might be safe to access the
558   // __kmp_threads[] without taking the forkjoin_lock. However, in fact, some
559   // threads can be still alive here, although being about to be terminated. The
560   // threads in the array with ds_thread==0 are most suspicious. Actually, it
561   // can be not safe to access the __kmp_threads[].
562 
563   // TODO: does it make sense to check __kmp_roots[] ?
564 
565   // Let's check that there are no other alive threads registered with the OMP
566   // lib.
567   while (1) {
568     thread_count = 0;
569     for (i = 0; i < __kmp_threads_capacity; ++i) {
570       if (!__kmp_threads)
571         continue;
572       kmp_info_t *th = __kmp_threads[i];
573       if (th == NULL)
574         continue;
575       int gtid = th->th.th_info.ds.ds_gtid;
576       if (gtid == gtid_req)
577         continue;
578       if (gtid < 0)
579         continue;
580       DWORD exit_val;
581       int alive = __kmp_is_thread_alive(th, &exit_val);
582       if (alive) {
583         ++thread_count;
584       }
585     }
586     if (thread_count == 0)
587       break; // success
588   }
589 
590   // Assume that I'm alone. Now it might be safe to check and reset locks.
591   // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
592   __kmp_reset_lock(&__kmp_forkjoin_lock);
593 #ifdef KMP_DEBUG
594   __kmp_reset_lock(&__kmp_stdio_lock);
595 #endif // KMP_DEBUG
596 }
597 
598 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
599   //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
600 
601   switch (fdwReason) {
602 
603   case DLL_PROCESS_ATTACH:
604     KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
605 
606     return TRUE;
607 
608   case DLL_PROCESS_DETACH:
609     KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
610 
611     if (lpReserved != NULL) {
612       // lpReserved is used for telling the difference:
613       //   lpReserved == NULL when FreeLibrary() was called,
614       //   lpReserved != NULL when the process terminates.
615       // When FreeLibrary() is called, worker threads remain alive. So they will
616       // release the forkjoin lock by themselves. When the process terminates,
617       // worker threads disappear triggering the problem of unreleased forkjoin
618       // lock as described below.
619 
620       // A worker thread can take the forkjoin lock. The problem comes up if
621       // that worker thread becomes dead before it releases the forkjoin lock.
622       // The forkjoin lock remains taken, while the thread executing
623       // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try
624       // to take the forkjoin lock and will always fail, so that the application
625       // will never finish [normally]. This scenario is possible if
626       // __kmpc_end() has not been executed. It looks like it's not a corner
627       // case, but common cases:
628       // - the main function was compiled by an alternative compiler;
629       // - the main function was compiled by icl but without /Qopenmp
630       //   (application with plugins);
631       // - application terminates by calling C exit(), Fortran CALL EXIT() or
632       //   Fortran STOP.
633       // - alive foreign thread prevented __kmpc_end from doing cleanup.
634       //
635       // This is a hack to work around the problem.
636       // TODO: !!! figure out something better.
637       __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
638     }
639 
640     __kmp_internal_end_library(__kmp_gtid_get_specific());
641 
642     return TRUE;
643 
644   case DLL_THREAD_ATTACH:
645     KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
646 
647     /* if we want to register new siblings all the time here call
648      * __kmp_get_gtid(); */
649     return TRUE;
650 
651   case DLL_THREAD_DETACH:
652     KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
653 
654     __kmp_internal_end_thread(__kmp_gtid_get_specific());
655     return TRUE;
656   }
657 
658   return TRUE;
659 }
660 
661 #endif /* KMP_OS_WINDOWS */
662 #endif /* KMP_DYNAMIC_LIB */
663 
664 /* __kmp_parallel_deo -- Wait until it's our turn. */
665 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
666   int gtid = *gtid_ref;
667 #ifdef BUILD_PARALLEL_ORDERED
668   kmp_team_t *team = __kmp_team_from_gtid(gtid);
669 #endif /* BUILD_PARALLEL_ORDERED */
670 
671   if (__kmp_env_consistency_check) {
672     if (__kmp_threads[gtid]->th.th_root->r.r_active)
673 #if KMP_USE_DYNAMIC_LOCK
674       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
675 #else
676       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
677 #endif
678   }
679 #ifdef BUILD_PARALLEL_ORDERED
680   if (!team->t.t_serialized) {
681     KMP_MB();
682     KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
683              NULL);
684     KMP_MB();
685   }
686 #endif /* BUILD_PARALLEL_ORDERED */
687 }
688 
689 /* __kmp_parallel_dxo -- Signal the next task. */
690 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
691   int gtid = *gtid_ref;
692 #ifdef BUILD_PARALLEL_ORDERED
693   int tid = __kmp_tid_from_gtid(gtid);
694   kmp_team_t *team = __kmp_team_from_gtid(gtid);
695 #endif /* BUILD_PARALLEL_ORDERED */
696 
697   if (__kmp_env_consistency_check) {
698     if (__kmp_threads[gtid]->th.th_root->r.r_active)
699       __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
700   }
701 #ifdef BUILD_PARALLEL_ORDERED
702   if (!team->t.t_serialized) {
703     KMP_MB(); /* Flush all pending memory write invalidates.  */
704 
705     /* use the tid of the next thread in this team */
706     /* TODO replace with general release procedure */
707     team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
708 
709     KMP_MB(); /* Flush all pending memory write invalidates.  */
710   }
711 #endif /* BUILD_PARALLEL_ORDERED */
712 }
713 
714 /* ------------------------------------------------------------------------ */
715 /* The BARRIER for a SINGLE process section is always explicit   */
716 
717 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
718   int status;
719   kmp_info_t *th;
720   kmp_team_t *team;
721 
722   if (!TCR_4(__kmp_init_parallel))
723     __kmp_parallel_initialize();
724   __kmp_resume_if_soft_paused();
725 
726   th = __kmp_threads[gtid];
727   team = th->th.th_team;
728   status = 0;
729 
730   th->th.th_ident = id_ref;
731 
732   if (team->t.t_serialized) {
733     status = 1;
734   } else {
735     kmp_int32 old_this = th->th.th_local.this_construct;
736 
737     ++th->th.th_local.this_construct;
738     /* try to set team count to thread count--success means thread got the
739        single block */
740     /* TODO: Should this be acquire or release? */
741     if (team->t.t_construct == old_this) {
742       status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
743                                               th->th.th_local.this_construct);
744     }
745 #if USE_ITT_BUILD
746     if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
747         KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
748         team->t.t_active_level ==
749             1) { // Only report metadata by master of active team at level 1
750       __kmp_itt_metadata_single(id_ref);
751     }
752 #endif /* USE_ITT_BUILD */
753   }
754 
755   if (__kmp_env_consistency_check) {
756     if (status && push_ws) {
757       __kmp_push_workshare(gtid, ct_psingle, id_ref);
758     } else {
759       __kmp_check_workshare(gtid, ct_psingle, id_ref);
760     }
761   }
762 #if USE_ITT_BUILD
763   if (status) {
764     __kmp_itt_single_start(gtid);
765   }
766 #endif /* USE_ITT_BUILD */
767   return status;
768 }
769 
770 void __kmp_exit_single(int gtid) {
771 #if USE_ITT_BUILD
772   __kmp_itt_single_end(gtid);
773 #endif /* USE_ITT_BUILD */
774   if (__kmp_env_consistency_check)
775     __kmp_pop_workshare(gtid, ct_psingle, NULL);
776 }
777 
778 /* determine if we can go parallel or must use a serialized parallel region and
779  * how many threads we can use
780  * set_nproc is the number of threads requested for the team
781  * returns 0 if we should serialize or only use one thread,
782  * otherwise the number of threads to use
783  * The forkjoin lock is held by the caller. */
784 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
785                                  int master_tid, int set_nthreads,
786                                  int enter_teams) {
787   int capacity;
788   int new_nthreads;
789   KMP_DEBUG_ASSERT(__kmp_init_serial);
790   KMP_DEBUG_ASSERT(root && parent_team);
791   kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
792 
793   // If dyn-var is set, dynamically adjust the number of desired threads,
794   // according to the method specified by dynamic_mode.
795   new_nthreads = set_nthreads;
796   if (!get__dynamic_2(parent_team, master_tid)) {
797     ;
798   }
799 #ifdef USE_LOAD_BALANCE
800   else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
801     new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
802     if (new_nthreads == 1) {
803       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
804                     "reservation to 1 thread\n",
805                     master_tid));
806       return 1;
807     }
808     if (new_nthreads < set_nthreads) {
809       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
810                     "reservation to %d threads\n",
811                     master_tid, new_nthreads));
812     }
813   }
814 #endif /* USE_LOAD_BALANCE */
815   else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
816     new_nthreads = __kmp_avail_proc - __kmp_nth +
817                    (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
818     if (new_nthreads <= 1) {
819       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
820                     "reservation to 1 thread\n",
821                     master_tid));
822       return 1;
823     }
824     if (new_nthreads < set_nthreads) {
825       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
826                     "reservation to %d threads\n",
827                     master_tid, new_nthreads));
828     } else {
829       new_nthreads = set_nthreads;
830     }
831   } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
832     if (set_nthreads > 2) {
833       new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
834       new_nthreads = (new_nthreads % set_nthreads) + 1;
835       if (new_nthreads == 1) {
836         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
837                       "reservation to 1 thread\n",
838                       master_tid));
839         return 1;
840       }
841       if (new_nthreads < set_nthreads) {
842         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
843                       "reservation to %d threads\n",
844                       master_tid, new_nthreads));
845       }
846     }
847   } else {
848     KMP_ASSERT(0);
849   }
850 
851   // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
852   if (__kmp_nth + new_nthreads -
853           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
854       __kmp_max_nth) {
855     int tl_nthreads = __kmp_max_nth - __kmp_nth +
856                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
857     if (tl_nthreads <= 0) {
858       tl_nthreads = 1;
859     }
860 
861     // If dyn-var is false, emit a 1-time warning.
862     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
863       __kmp_reserve_warn = 1;
864       __kmp_msg(kmp_ms_warning,
865                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
866                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
867     }
868     if (tl_nthreads == 1) {
869       KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
870                     "reduced reservation to 1 thread\n",
871                     master_tid));
872       return 1;
873     }
874     KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
875                   "reservation to %d threads\n",
876                   master_tid, tl_nthreads));
877     new_nthreads = tl_nthreads;
878   }
879 
880   // Respect OMP_THREAD_LIMIT
881   int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
882   int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
883   if (cg_nthreads + new_nthreads -
884           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
885       max_cg_threads) {
886     int tl_nthreads = max_cg_threads - cg_nthreads +
887                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
888     if (tl_nthreads <= 0) {
889       tl_nthreads = 1;
890     }
891 
892     // If dyn-var is false, emit a 1-time warning.
893     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
894       __kmp_reserve_warn = 1;
895       __kmp_msg(kmp_ms_warning,
896                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
897                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
898     }
899     if (tl_nthreads == 1) {
900       KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
901                     "reduced reservation to 1 thread\n",
902                     master_tid));
903       return 1;
904     }
905     KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
906                   "reservation to %d threads\n",
907                   master_tid, tl_nthreads));
908     new_nthreads = tl_nthreads;
909   }
910 
911   // Check if the threads array is large enough, or needs expanding.
912   // See comment in __kmp_register_root() about the adjustment if
913   // __kmp_threads[0] == NULL.
914   capacity = __kmp_threads_capacity;
915   if (TCR_PTR(__kmp_threads[0]) == NULL) {
916     --capacity;
917   }
918   if (__kmp_nth + new_nthreads -
919           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
920       capacity) {
921     // Expand the threads array.
922     int slotsRequired = __kmp_nth + new_nthreads -
923                         (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
924                         capacity;
925     int slotsAdded = __kmp_expand_threads(slotsRequired);
926     if (slotsAdded < slotsRequired) {
927       // The threads array was not expanded enough.
928       new_nthreads -= (slotsRequired - slotsAdded);
929       KMP_ASSERT(new_nthreads >= 1);
930 
931       // If dyn-var is false, emit a 1-time warning.
932       if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
933         __kmp_reserve_warn = 1;
934         if (__kmp_tp_cached) {
935           __kmp_msg(kmp_ms_warning,
936                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
937                     KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
938                     KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
939         } else {
940           __kmp_msg(kmp_ms_warning,
941                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
942                     KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
943         }
944       }
945     }
946   }
947 
948 #ifdef KMP_DEBUG
949   if (new_nthreads == 1) {
950     KC_TRACE(10,
951              ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
952               "dead roots and rechecking; requested %d threads\n",
953               __kmp_get_gtid(), set_nthreads));
954   } else {
955     KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
956                   " %d threads\n",
957                   __kmp_get_gtid(), new_nthreads, set_nthreads));
958   }
959 #endif // KMP_DEBUG
960   return new_nthreads;
961 }
962 
963 /* Allocate threads from the thread pool and assign them to the new team. We are
964    assured that there are enough threads available, because we checked on that
965    earlier within critical section forkjoin */
966 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
967                                     kmp_info_t *master_th, int master_gtid) {
968   int i;
969   int use_hot_team;
970 
971   KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
972   KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
973   KMP_MB();
974 
975   /* first, let's setup the master thread */
976   master_th->th.th_info.ds.ds_tid = 0;
977   master_th->th.th_team = team;
978   master_th->th.th_team_nproc = team->t.t_nproc;
979   master_th->th.th_team_master = master_th;
980   master_th->th.th_team_serialized = FALSE;
981   master_th->th.th_dispatch = &team->t.t_dispatch[0];
982 
983 /* make sure we are not the optimized hot team */
984 #if KMP_NESTED_HOT_TEAMS
985   use_hot_team = 0;
986   kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
987   if (hot_teams) { // hot teams array is not allocated if
988     // KMP_HOT_TEAMS_MAX_LEVEL=0
989     int level = team->t.t_active_level - 1; // index in array of hot teams
990     if (master_th->th.th_teams_microtask) { // are we inside the teams?
991       if (master_th->th.th_teams_size.nteams > 1) {
992         ++level; // level was not increased in teams construct for
993         // team_of_masters
994       }
995       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
996           master_th->th.th_teams_level == team->t.t_level) {
997         ++level; // level was not increased in teams construct for
998         // team_of_workers before the parallel
999       } // team->t.t_level will be increased inside parallel
1000     }
1001     if (level < __kmp_hot_teams_max_level) {
1002       if (hot_teams[level].hot_team) {
1003         // hot team has already been allocated for given level
1004         KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1005         use_hot_team = 1; // the team is ready to use
1006       } else {
1007         use_hot_team = 0; // AC: threads are not allocated yet
1008         hot_teams[level].hot_team = team; // remember new hot team
1009         hot_teams[level].hot_team_nth = team->t.t_nproc;
1010       }
1011     } else {
1012       use_hot_team = 0;
1013     }
1014   }
1015 #else
1016   use_hot_team = team == root->r.r_hot_team;
1017 #endif
1018   if (!use_hot_team) {
1019 
1020     /* install the master thread */
1021     team->t.t_threads[0] = master_th;
1022     __kmp_initialize_info(master_th, team, 0, master_gtid);
1023 
1024     /* now, install the worker threads */
1025     for (i = 1; i < team->t.t_nproc; i++) {
1026 
1027       /* fork or reallocate a new thread and install it in team */
1028       kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1029       team->t.t_threads[i] = thr;
1030       KMP_DEBUG_ASSERT(thr);
1031       KMP_DEBUG_ASSERT(thr->th.th_team == team);
1032       /* align team and thread arrived states */
1033       KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1034                     "T#%d(%d:%d) join =%llu, plain=%llu\n",
1035                     __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1036                     __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1037                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1038                     team->t.t_bar[bs_plain_barrier].b_arrived));
1039       thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1040       thr->th.th_teams_level = master_th->th.th_teams_level;
1041       thr->th.th_teams_size = master_th->th.th_teams_size;
1042       { // Initialize threads' barrier data.
1043         int b;
1044         kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1045         for (b = 0; b < bs_last_barrier; ++b) {
1046           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1047           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1048 #if USE_DEBUGGER
1049           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1050 #endif
1051         }
1052       }
1053     }
1054 
1055 #if KMP_AFFINITY_SUPPORTED
1056     __kmp_partition_places(team);
1057 #endif
1058   }
1059 
1060   if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1061     for (i = 0; i < team->t.t_nproc; i++) {
1062       kmp_info_t *thr = team->t.t_threads[i];
1063       if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1064           thr->th.th_prev_level != team->t.t_level) {
1065         team->t.t_display_affinity = 1;
1066         break;
1067       }
1068     }
1069   }
1070 
1071   KMP_MB();
1072 }
1073 
1074 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1075 // Propagate any changes to the floating point control registers out to the team
1076 // We try to avoid unnecessary writes to the relevant cache line in the team
1077 // structure, so we don't make changes unless they are needed.
1078 inline static void propagateFPControl(kmp_team_t *team) {
1079   if (__kmp_inherit_fp_control) {
1080     kmp_int16 x87_fpu_control_word;
1081     kmp_uint32 mxcsr;
1082 
1083     // Get master values of FPU control flags (both X87 and vector)
1084     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1085     __kmp_store_mxcsr(&mxcsr);
1086     mxcsr &= KMP_X86_MXCSR_MASK;
1087 
1088     // There is no point looking at t_fp_control_saved here.
1089     // If it is TRUE, we still have to update the values if they are different
1090     // from those we now have. If it is FALSE we didn't save anything yet, but
1091     // our objective is the same. We have to ensure that the values in the team
1092     // are the same as those we have.
1093     // So, this code achieves what we need whether or not t_fp_control_saved is
1094     // true. By checking whether the value needs updating we avoid unnecessary
1095     // writes that would put the cache-line into a written state, causing all
1096     // threads in the team to have to read it again.
1097     KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1098     KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1099     // Although we don't use this value, other code in the runtime wants to know
1100     // whether it should restore them. So we must ensure it is correct.
1101     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1102   } else {
1103     // Similarly here. Don't write to this cache-line in the team structure
1104     // unless we have to.
1105     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1106   }
1107 }
1108 
1109 // Do the opposite, setting the hardware registers to the updated values from
1110 // the team.
1111 inline static void updateHWFPControl(kmp_team_t *team) {
1112   if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1113     // Only reset the fp control regs if they have been changed in the team.
1114     // the parallel region that we are exiting.
1115     kmp_int16 x87_fpu_control_word;
1116     kmp_uint32 mxcsr;
1117     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1118     __kmp_store_mxcsr(&mxcsr);
1119     mxcsr &= KMP_X86_MXCSR_MASK;
1120 
1121     if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1122       __kmp_clear_x87_fpu_status_word();
1123       __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1124     }
1125 
1126     if (team->t.t_mxcsr != mxcsr) {
1127       __kmp_load_mxcsr(&team->t.t_mxcsr);
1128     }
1129   }
1130 }
1131 #else
// Fallbacks for targets other than x86/x86_64: both hooks expand to a no-op
// expression, so call sites can invoke them unconditionally.
#define propagateFPControl(x) ((void)0)
#define updateHWFPControl(x) ((void)0)
1134 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1135 
1136 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1137                                      int realloc); // forward declaration
1138 
1139 /* Run a parallel region that has been serialized, so runs only in a team of the
1140    single master thread. */
1141 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1142   kmp_info_t *this_thr;
1143   kmp_team_t *serial_team;
1144 
1145   KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1146 
1147   /* Skip all this code for autopar serialized loops since it results in
1148      unacceptable overhead */
1149   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1150     return;
1151 
1152   if (!TCR_4(__kmp_init_parallel))
1153     __kmp_parallel_initialize();
1154   __kmp_resume_if_soft_paused();
1155 
1156   this_thr = __kmp_threads[global_tid];
1157   serial_team = this_thr->th.th_serial_team;
1158 
1159   /* utilize the serialized team held by this thread */
1160   KMP_DEBUG_ASSERT(serial_team);
1161   KMP_MB();
1162 
1163   if (__kmp_tasking_mode != tskm_immediate_exec) {
1164     KMP_DEBUG_ASSERT(
1165         this_thr->th.th_task_team ==
1166         this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1167     KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1168                      NULL);
1169     KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1170                   "team %p, new task_team = NULL\n",
1171                   global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1172     this_thr->th.th_task_team = NULL;
1173   }
1174 
1175   kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1176   if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1177     proc_bind = proc_bind_false;
1178   } else if (proc_bind == proc_bind_default) {
1179     // No proc_bind clause was specified, so use the current value
1180     // of proc-bind-var for this parallel region.
1181     proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1182   }
1183   // Reset for next parallel region
1184   this_thr->th.th_set_proc_bind = proc_bind_default;
1185 
1186 #if OMPT_SUPPORT
1187   ompt_data_t ompt_parallel_data = ompt_data_none;
1188   ompt_data_t *implicit_task_data;
1189   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1190   if (ompt_enabled.enabled &&
1191       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1192 
1193     ompt_task_info_t *parent_task_info;
1194     parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1195 
1196     parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1197     if (ompt_enabled.ompt_callback_parallel_begin) {
1198       int team_size = 1;
1199 
1200       ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1201           &(parent_task_info->task_data), &(parent_task_info->frame),
1202           &ompt_parallel_data, team_size,
1203           ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1204     }
1205   }
1206 #endif // OMPT_SUPPORT
1207 
1208   if (this_thr->th.th_team != serial_team) {
1209     // Nested level will be an index in the nested nthreads array
1210     int level = this_thr->th.th_team->t.t_level;
1211 
1212     if (serial_team->t.t_serialized) {
1213       /* this serial team was already used
1214          TODO increase performance by making this locks more specific */
1215       kmp_team_t *new_team;
1216 
1217       __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1218 
1219       new_team =
1220           __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1221 #if OMPT_SUPPORT
1222                               ompt_parallel_data,
1223 #endif
1224                               proc_bind, &this_thr->th.th_current_task->td_icvs,
1225                               0 USE_NESTED_HOT_ARG(NULL));
1226       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1227       KMP_ASSERT(new_team);
1228 
1229       /* setup new serialized team and install it */
1230       new_team->t.t_threads[0] = this_thr;
1231       new_team->t.t_parent = this_thr->th.th_team;
1232       serial_team = new_team;
1233       this_thr->th.th_serial_team = serial_team;
1234 
1235       KF_TRACE(
1236           10,
1237           ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1238            global_tid, serial_team));
1239 
1240       /* TODO the above breaks the requirement that if we run out of resources,
1241          then we can still guarantee that serialized teams are ok, since we may
1242          need to allocate a new one */
1243     } else {
1244       KF_TRACE(
1245           10,
1246           ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1247            global_tid, serial_team));
1248     }
1249 
1250     /* we have to initialize this serial team */
1251     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1252     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1253     KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1254     serial_team->t.t_ident = loc;
1255     serial_team->t.t_serialized = 1;
1256     serial_team->t.t_nproc = 1;
1257     serial_team->t.t_parent = this_thr->th.th_team;
1258     serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1259     this_thr->th.th_team = serial_team;
1260     serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1261 
1262     KF_TRACE(10, ("__kmpc_serialized_parallel: T#d curtask=%p\n", global_tid,
1263                   this_thr->th.th_current_task));
1264     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1265     this_thr->th.th_current_task->td_flags.executing = 0;
1266 
1267     __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1268 
1269     /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1270        implicit task for each serialized task represented by
1271        team->t.t_serialized? */
1272     copy_icvs(&this_thr->th.th_current_task->td_icvs,
1273               &this_thr->th.th_current_task->td_parent->td_icvs);
1274 
1275     // Thread value exists in the nested nthreads array for the next nested
1276     // level
1277     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1278       this_thr->th.th_current_task->td_icvs.nproc =
1279           __kmp_nested_nth.nth[level + 1];
1280     }
1281 
1282     if (__kmp_nested_proc_bind.used &&
1283         (level + 1 < __kmp_nested_proc_bind.used)) {
1284       this_thr->th.th_current_task->td_icvs.proc_bind =
1285           __kmp_nested_proc_bind.bind_types[level + 1];
1286     }
1287 
1288 #if USE_DEBUGGER
1289     serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1290 #endif
1291     this_thr->th.th_info.ds.ds_tid = 0;
1292 
1293     /* set thread cache values */
1294     this_thr->th.th_team_nproc = 1;
1295     this_thr->th.th_team_master = this_thr;
1296     this_thr->th.th_team_serialized = 1;
1297 
1298     serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1299     serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1300     serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1301 
1302     propagateFPControl(serial_team);
1303 
1304     /* check if we need to allocate dispatch buffers stack */
1305     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1306     if (!serial_team->t.t_dispatch->th_disp_buffer) {
1307       serial_team->t.t_dispatch->th_disp_buffer =
1308           (dispatch_private_info_t *)__kmp_allocate(
1309               sizeof(dispatch_private_info_t));
1310     }
1311     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1312 
1313     KMP_MB();
1314 
1315   } else {
1316     /* this serialized team is already being used,
1317      * that's fine, just add another nested level */
1318     KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1319     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1320     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1321     ++serial_team->t.t_serialized;
1322     this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1323 
1324     // Nested level will be an index in the nested nthreads array
1325     int level = this_thr->th.th_team->t.t_level;
1326     // Thread value exists in the nested nthreads array for the next nested
1327     // level
1328     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1329       this_thr->th.th_current_task->td_icvs.nproc =
1330           __kmp_nested_nth.nth[level + 1];
1331     }
1332     serial_team->t.t_level++;
1333     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1334                   "of serial team %p to %d\n",
1335                   global_tid, serial_team, serial_team->t.t_level));
1336 
1337     /* allocate/push dispatch buffers stack */
1338     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1339     {
1340       dispatch_private_info_t *disp_buffer =
1341           (dispatch_private_info_t *)__kmp_allocate(
1342               sizeof(dispatch_private_info_t));
1343       disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1344       serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1345     }
1346     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1347 
1348     KMP_MB();
1349   }
1350   KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1351 
1352   // Perform the display affinity functionality for
1353   // serialized parallel regions
1354   if (__kmp_display_affinity) {
1355     if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1356         this_thr->th.th_prev_num_threads != 1) {
1357       // NULL means use the affinity-format-var ICV
1358       __kmp_aux_display_affinity(global_tid, NULL);
1359       this_thr->th.th_prev_level = serial_team->t.t_level;
1360       this_thr->th.th_prev_num_threads = 1;
1361     }
1362   }
1363 
1364   if (__kmp_env_consistency_check)
1365     __kmp_push_parallel(global_tid, NULL);
1366 #if OMPT_SUPPORT
1367   serial_team->t.ompt_team_info.master_return_address = codeptr;
1368   if (ompt_enabled.enabled &&
1369       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1370     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1371 
1372     ompt_lw_taskteam_t lw_taskteam;
1373     __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1374                             &ompt_parallel_data, codeptr);
1375 
1376     __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1377     // don't use lw_taskteam after linking. content was swaped
1378 
1379     /* OMPT implicit task begin */
1380     implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
1381     if (ompt_enabled.ompt_callback_implicit_task) {
1382       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1383           ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1384           OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1385       OMPT_CUR_TASK_INFO(this_thr)
1386           ->thread_num = __kmp_tid_from_gtid(global_tid);
1387     }
1388 
1389     /* OMPT state */
1390     this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1391     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1392   }
1393 #endif
1394 }
1395 
1396 /* most of the work for a fork */
1397 /* return true if we really went parallel, false if serialized */
1398 int __kmp_fork_call(ident_t *loc, int gtid,
1399                     enum fork_context_e call_context, // Intel, GNU, ...
1400                     kmp_int32 argc, microtask_t microtask, launch_t invoker,
1401                     kmp_va_list ap) {
1402   void **argv;
1403   int i;
1404   int master_tid;
1405   int master_this_cons;
1406   kmp_team_t *team;
1407   kmp_team_t *parent_team;
1408   kmp_info_t *master_th;
1409   kmp_root_t *root;
1410   int nthreads;
1411   int master_active;
1412   int master_set_numthreads;
1413   int level;
1414   int active_level;
1415   int teams_level;
1416 #if KMP_NESTED_HOT_TEAMS
1417   kmp_hot_team_ptr_t **p_hot_teams;
1418 #endif
1419   { // KMP_TIME_BLOCK
1420     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1421     KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1422 
1423     KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1424     if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1425       /* Some systems prefer the stack for the root thread(s) to start with */
1426       /* some gap from the parent stack to prevent false sharing. */
1427       void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1428       /* These 2 lines below are so this does not get optimized out */
1429       if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1430         __kmp_stkpadding += (short)((kmp_int64)dummy);
1431     }
1432 
1433     /* initialize if needed */
1434     KMP_DEBUG_ASSERT(
1435         __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1436     if (!TCR_4(__kmp_init_parallel))
1437       __kmp_parallel_initialize();
1438     __kmp_resume_if_soft_paused();
1439 
1440     /* setup current data */
1441     master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1442     // shutdown
1443     parent_team = master_th->th.th_team;
1444     master_tid = master_th->th.th_info.ds.ds_tid;
1445     master_this_cons = master_th->th.th_local.this_construct;
1446     root = master_th->th.th_root;
1447     master_active = root->r.r_active;
1448     master_set_numthreads = master_th->th.th_set_nproc;
1449 
1450 #if OMPT_SUPPORT
1451     ompt_data_t ompt_parallel_data = ompt_data_none;
1452     ompt_data_t *parent_task_data;
1453     ompt_frame_t *ompt_frame;
1454     ompt_data_t *implicit_task_data;
1455     void *return_address = NULL;
1456 
1457     if (ompt_enabled.enabled) {
1458       __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1459                                     NULL, NULL);
1460       return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1461     }
1462 #endif
1463 
1464     // Nested level will be an index in the nested nthreads array
1465     level = parent_team->t.t_level;
1466     // used to launch non-serial teams even if nested is not allowed
1467     active_level = parent_team->t.t_active_level;
1468     // needed to check nesting inside the teams
1469     teams_level = master_th->th.th_teams_level;
1470 #if KMP_NESTED_HOT_TEAMS
1471     p_hot_teams = &master_th->th.th_hot_teams;
1472     if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1473       *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1474           sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1475       (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1476       // it is either actual or not needed (when active_level > 0)
1477       (*p_hot_teams)[0].hot_team_nth = 1;
1478     }
1479 #endif
1480 
1481 #if OMPT_SUPPORT
1482     if (ompt_enabled.enabled) {
1483       if (ompt_enabled.ompt_callback_parallel_begin) {
1484         int team_size = master_set_numthreads
1485                             ? master_set_numthreads
1486                             : get__nproc_2(parent_team, master_tid);
1487         int flags = OMPT_INVOKER(call_context) |
1488                     ((microtask == (microtask_t)__kmp_teams_master)
1489                          ? ompt_parallel_league
1490                          : ompt_parallel_team);
1491         ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1492             parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1493             return_address);
1494       }
1495       master_th->th.ompt_thread_info.state = ompt_state_overhead;
1496     }
1497 #endif
1498 
1499     master_th->th.th_ident = loc;
1500 
1501     if (master_th->th.th_teams_microtask && ap &&
1502         microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1503       // AC: This is start of parallel that is nested inside teams construct.
1504       // The team is actual (hot), all workers are ready at the fork barrier.
1505       // No lock needed to initialize the team a bit, then free workers.
1506       parent_team->t.t_ident = loc;
1507       __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1508       parent_team->t.t_argc = argc;
1509       argv = (void **)parent_team->t.t_argv;
1510       for (i = argc - 1; i >= 0; --i)
1511         *argv++ = va_arg(kmp_va_deref(ap), void *);
1512       // Increment our nested depth levels, but not increase the serialization
1513       if (parent_team == master_th->th.th_serial_team) {
1514         // AC: we are in serialized parallel
1515         __kmpc_serialized_parallel(loc, gtid);
1516         KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1517 
1518         if (call_context == fork_context_gnu) {
1519           // AC: need to decrement t_serialized for enquiry functions to work
1520           // correctly, will restore at join time
1521           parent_team->t.t_serialized--;
1522           return TRUE;
1523         }
1524 
1525 #if OMPT_SUPPORT
1526         void *dummy;
1527         void **exit_frame_p;
1528 
1529         ompt_lw_taskteam_t lw_taskteam;
1530 
1531         if (ompt_enabled.enabled) {
1532           __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1533                                   &ompt_parallel_data, return_address);
1534           exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1535 
1536           __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1537           // don't use lw_taskteam after linking. content was swaped
1538 
1539           /* OMPT implicit task begin */
1540           implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1541           if (ompt_enabled.ompt_callback_implicit_task) {
1542             OMPT_CUR_TASK_INFO(master_th)
1543                 ->thread_num = __kmp_tid_from_gtid(gtid);
1544             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1545                 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1546                 implicit_task_data, 1,
1547                 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1548           }
1549 
1550           /* OMPT state */
1551           master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1552         } else {
1553           exit_frame_p = &dummy;
1554         }
1555 #endif
1556         // AC: need to decrement t_serialized for enquiry functions to work
1557         // correctly, will restore at join time
1558         parent_team->t.t_serialized--;
1559 
1560         {
1561           KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1562           KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1563           __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1564 #if OMPT_SUPPORT
1565                                  ,
1566                                  exit_frame_p
1567 #endif
1568                                  );
1569         }
1570 
1571 #if OMPT_SUPPORT
1572         if (ompt_enabled.enabled) {
1573           *exit_frame_p = NULL;
1574           OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1575           if (ompt_enabled.ompt_callback_implicit_task) {
1576             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1577                 ompt_scope_end, NULL, implicit_task_data, 1,
1578                 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1579           }
1580           ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1581           __ompt_lw_taskteam_unlink(master_th);
1582           if (ompt_enabled.ompt_callback_parallel_end) {
1583             ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1584                 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1585                 OMPT_INVOKER(call_context) | ompt_parallel_team,
1586                 return_address);
1587           }
1588           master_th->th.ompt_thread_info.state = ompt_state_overhead;
1589         }
1590 #endif
1591         return TRUE;
1592       }
1593 
1594       parent_team->t.t_pkfn = microtask;
1595       parent_team->t.t_invoke = invoker;
1596       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1597       parent_team->t.t_active_level++;
1598       parent_team->t.t_level++;
1599       parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1600 
1601 #if OMPT_SUPPORT
1602       if (ompt_enabled.enabled) {
1603         ompt_lw_taskteam_t lw_taskteam;
1604         __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1605                                 &ompt_parallel_data, return_address);
1606         __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1607       }
1608 #endif
1609 
1610       /* Change number of threads in the team if requested */
1611       if (master_set_numthreads) { // The parallel has num_threads clause
1612         if (master_set_numthreads < master_th->th.th_teams_size.nth) {
1613           // AC: only can reduce number of threads dynamically, can't increase
1614           kmp_info_t **other_threads = parent_team->t.t_threads;
1615           parent_team->t.t_nproc = master_set_numthreads;
1616           for (i = 0; i < master_set_numthreads; ++i) {
1617             other_threads[i]->th.th_team_nproc = master_set_numthreads;
1618           }
1619           // Keep extra threads hot in the team for possible next parallels
1620         }
1621         master_th->th.th_set_nproc = 0;
1622       }
1623 
1624 #if USE_DEBUGGER
1625       if (__kmp_debugging) { // Let debugger override number of threads.
1626         int nth = __kmp_omp_num_threads(loc);
1627         if (nth > 0) { // 0 means debugger doesn't want to change num threads
1628           master_set_numthreads = nth;
1629         }
1630       }
1631 #endif
1632 
1633 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1634       if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1635            KMP_ITT_DEBUG) &&
1636           __kmp_forkjoin_frames_mode == 3 &&
1637           parent_team->t.t_active_level == 1 // only report frames at level 1
1638           && master_th->th.th_teams_size.nteams == 1) {
1639         kmp_uint64 tmp_time = __itt_get_timestamp();
1640         master_th->th.th_frame_time = tmp_time;
1641         parent_team->t.t_region_time = tmp_time;
1642       }
1643       if (__itt_stack_caller_create_ptr) {
1644         // create new stack stitching id before entering fork barrier
1645         parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1646       }
1647 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1648 
1649       KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1650                     "master_th=%p, gtid=%d\n",
1651                     root, parent_team, master_th, gtid));
1652       __kmp_internal_fork(loc, gtid, parent_team);
1653       KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1654                     "master_th=%p, gtid=%d\n",
1655                     root, parent_team, master_th, gtid));
1656 
1657       if (call_context == fork_context_gnu)
1658         return TRUE;
1659 
1660       /* Invoke microtask for MASTER thread */
1661       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1662                     parent_team->t.t_id, parent_team->t.t_pkfn));
1663 
1664       if (!parent_team->t.t_invoke(gtid)) {
1665         KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
1666       }
1667       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1668                     parent_team->t.t_id, parent_team->t.t_pkfn));
1669       KMP_MB(); /* Flush all pending memory write invalidates.  */
1670 
1671       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1672 
1673       return TRUE;
1674     } // Parallel closely nested in teams construct
1675 
1676 #if KMP_DEBUG
1677     if (__kmp_tasking_mode != tskm_immediate_exec) {
1678       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1679                        parent_team->t.t_task_team[master_th->th.th_task_state]);
1680     }
1681 #endif
1682 
1683     if (parent_team->t.t_active_level >=
1684         master_th->th.th_current_task->td_icvs.max_active_levels) {
1685       nthreads = 1;
1686     } else {
1687       int enter_teams = ((ap == NULL && active_level == 0) ||
1688                          (ap && teams_level > 0 && teams_level == level));
1689       nthreads =
1690           master_set_numthreads
1691               ? master_set_numthreads
1692               : get__nproc_2(
1693                     parent_team,
1694                     master_tid); // TODO: get nproc directly from current task
1695 
1696       // Check if we need to take forkjoin lock? (no need for serialized
1697       // parallel out of teams construct). This code moved here from
1698       // __kmp_reserve_threads() to speedup nested serialized parallels.
1699       if (nthreads > 1) {
1700         if ((get__max_active_levels(master_th) == 1 &&
1701              (root->r.r_in_parallel && !enter_teams)) ||
1702             (__kmp_library == library_serial)) {
1703           KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1704                         " threads\n",
1705                         gtid, nthreads));
1706           nthreads = 1;
1707         }
1708       }
1709       if (nthreads > 1) {
1710         /* determine how many new threads we can use */
1711         __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1712         /* AC: If we execute teams from parallel region (on host), then teams
1713            should be created but each can only have 1 thread if nesting is
1714            disabled. If teams called from serial region, then teams and their
1715            threads should be created regardless of the nesting setting. */
1716         nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
1717                                          nthreads, enter_teams);
1718         if (nthreads == 1) {
1719           // Free lock for single thread execution here; for multi-thread
1720           // execution it will be freed later after team of threads created
1721           // and initialized
1722           __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1723         }
1724       }
1725     }
1726     KMP_DEBUG_ASSERT(nthreads > 0);
1727 
1728     // If we temporarily changed the set number of threads then restore it now
1729     master_th->th.th_set_nproc = 0;
1730 
1731     /* create a serialized parallel region? */
1732     if (nthreads == 1) {
1733 /* josh todo: hypothetical question: what do we do for OS X*? */
1734 #if KMP_OS_LINUX &&                                                            \
1735     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1736       void *args[argc];
1737 #else
1738       void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1739 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1740           KMP_ARCH_AARCH64) */
1741 
1742       KA_TRACE(20,
1743                ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1744 
1745       __kmpc_serialized_parallel(loc, gtid);
1746 
1747       if (call_context == fork_context_intel) {
1748         /* TODO this sucks, use the compiler itself to pass args! :) */
1749         master_th->th.th_serial_team->t.t_ident = loc;
1750         if (!ap) {
1751           // revert change made in __kmpc_serialized_parallel()
1752           master_th->th.th_serial_team->t.t_level--;
1753 // Get args from parent team for teams construct
1754 
1755 #if OMPT_SUPPORT
1756           void *dummy;
1757           void **exit_frame_p;
1758           ompt_task_info_t *task_info;
1759 
1760           ompt_lw_taskteam_t lw_taskteam;
1761 
1762           if (ompt_enabled.enabled) {
1763             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1764                                     &ompt_parallel_data, return_address);
1765 
1766             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1767             // don't use lw_taskteam after linking. content was swaped
1768 
1769             task_info = OMPT_CUR_TASK_INFO(master_th);
1770             exit_frame_p = &(task_info->frame.exit_frame.ptr);
1771             if (ompt_enabled.ompt_callback_implicit_task) {
1772               OMPT_CUR_TASK_INFO(master_th)
1773                   ->thread_num = __kmp_tid_from_gtid(gtid);
1774               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1775                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1776                   &(task_info->task_data), 1,
1777                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1778                   ompt_task_implicit);
1779             }
1780 
1781             /* OMPT state */
1782             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1783           } else {
1784             exit_frame_p = &dummy;
1785           }
1786 #endif
1787 
1788           {
1789             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1790             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1791             __kmp_invoke_microtask(microtask, gtid, 0, argc,
1792                                    parent_team->t.t_argv
1793 #if OMPT_SUPPORT
1794                                    ,
1795                                    exit_frame_p
1796 #endif
1797                                    );
1798           }
1799 
1800 #if OMPT_SUPPORT
1801           if (ompt_enabled.enabled) {
1802             *exit_frame_p = NULL;
1803             if (ompt_enabled.ompt_callback_implicit_task) {
1804               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1805                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1806                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1807                   ompt_task_implicit);
1808             }
1809             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1810             __ompt_lw_taskteam_unlink(master_th);
1811             if (ompt_enabled.ompt_callback_parallel_end) {
1812               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1813                   &ompt_parallel_data, parent_task_data,
1814                   OMPT_INVOKER(call_context) | ompt_parallel_team,
1815                   return_address);
1816             }
1817             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1818           }
1819 #endif
1820         } else if (microtask == (microtask_t)__kmp_teams_master) {
1821           KMP_DEBUG_ASSERT(master_th->th.th_team ==
1822                            master_th->th.th_serial_team);
1823           team = master_th->th.th_team;
1824           // team->t.t_pkfn = microtask;
1825           team->t.t_invoke = invoker;
1826           __kmp_alloc_argv_entries(argc, team, TRUE);
1827           team->t.t_argc = argc;
1828           argv = (void **)team->t.t_argv;
1829           if (ap) {
1830             for (i = argc - 1; i >= 0; --i)
1831               *argv++ = va_arg(kmp_va_deref(ap), void *);
1832           } else {
1833             for (i = 0; i < argc; ++i)
1834               // Get args from parent team for teams construct
1835               argv[i] = parent_team->t.t_argv[i];
1836           }
1837           // AC: revert change made in __kmpc_serialized_parallel()
1838           //     because initial code in teams should have level=0
1839           team->t.t_level--;
1840           // AC: call special invoker for outer "parallel" of teams construct
1841           invoker(gtid);
1842 #if OMPT_SUPPORT
1843           if (ompt_enabled.enabled) {
1844             ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1845             if (ompt_enabled.ompt_callback_implicit_task) {
1846               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1847                   ompt_scope_end, NULL, &(task_info->task_data), 0,
1848                   OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1849             }
1850             if (ompt_enabled.ompt_callback_parallel_end) {
1851               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1852                   &ompt_parallel_data, parent_task_data,
1853                   OMPT_INVOKER(call_context) | ompt_parallel_league,
1854                   return_address);
1855             }
1856             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1857           }
1858 #endif
1859         } else {
1860           argv = args;
1861           for (i = argc - 1; i >= 0; --i)
1862             *argv++ = va_arg(kmp_va_deref(ap), void *);
1863           KMP_MB();
1864 
1865 #if OMPT_SUPPORT
1866           void *dummy;
1867           void **exit_frame_p;
1868           ompt_task_info_t *task_info;
1869 
1870           ompt_lw_taskteam_t lw_taskteam;
1871 
1872           if (ompt_enabled.enabled) {
1873             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1874                                     &ompt_parallel_data, return_address);
1875             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1876             // don't use lw_taskteam after linking. content was swaped
1877             task_info = OMPT_CUR_TASK_INFO(master_th);
1878             exit_frame_p = &(task_info->frame.exit_frame.ptr);
1879 
1880             /* OMPT implicit task begin */
1881             implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1882             if (ompt_enabled.ompt_callback_implicit_task) {
1883               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1884                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1885                   implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1886                   ompt_task_implicit);
1887               OMPT_CUR_TASK_INFO(master_th)
1888                   ->thread_num = __kmp_tid_from_gtid(gtid);
1889             }
1890 
1891             /* OMPT state */
1892             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1893           } else {
1894             exit_frame_p = &dummy;
1895           }
1896 #endif
1897 
1898           {
1899             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1900             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1901             __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1902 #if OMPT_SUPPORT
1903                                    ,
1904                                    exit_frame_p
1905 #endif
1906                                    );
1907           }
1908 
1909 #if OMPT_SUPPORT
1910           if (ompt_enabled.enabled) {
1911             *exit_frame_p = NULL;
1912             if (ompt_enabled.ompt_callback_implicit_task) {
1913               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1914                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1915                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1916                   ompt_task_implicit);
1917             }
1918 
1919             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1920             __ompt_lw_taskteam_unlink(master_th);
1921             if (ompt_enabled.ompt_callback_parallel_end) {
1922               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1923                   &ompt_parallel_data, parent_task_data,
1924                   OMPT_INVOKER(call_context) | ompt_parallel_team,
1925                   return_address);
1926             }
1927             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1928           }
1929 #endif
1930         }
1931       } else if (call_context == fork_context_gnu) {
1932 #if OMPT_SUPPORT
1933         ompt_lw_taskteam_t lwt;
1934         __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1935                                 return_address);
1936 
1937         lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1938         __ompt_lw_taskteam_link(&lwt, master_th, 1);
1939 // don't use lw_taskteam after linking. content was swaped
1940 #endif
1941 
1942         // we were called from GNU native code
1943         KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1944         return FALSE;
1945       } else {
1946         KMP_ASSERT2(call_context < fork_context_last,
1947                     "__kmp_fork_call: unknown fork_context parameter");
1948       }
1949 
1950       KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1951       KMP_MB();
1952       return FALSE;
1953     } // if (nthreads == 1)
1954 
1955     // GEH: only modify the executing flag in the case when not serialized
1956     //      serialized case is handled in kmpc_serialized_parallel
1957     KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1958                   "curtask=%p, curtask_max_aclevel=%d\n",
1959                   parent_team->t.t_active_level, master_th,
1960                   master_th->th.th_current_task,
1961                   master_th->th.th_current_task->td_icvs.max_active_levels));
1962     // TODO: GEH - cannot do this assertion because root thread not set up as
1963     // executing
1964     // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1965     master_th->th.th_current_task->td_flags.executing = 0;
1966 
1967     if (!master_th->th.th_teams_microtask || level > teams_level) {
1968       /* Increment our nested depth level */
1969       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1970     }
1971 
1972     // See if we need to make a copy of the ICVs.
1973     int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1974     if ((level + 1 < __kmp_nested_nth.used) &&
1975         (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1976       nthreads_icv = __kmp_nested_nth.nth[level + 1];
1977     } else {
1978       nthreads_icv = 0; // don't update
1979     }
1980 
1981     // Figure out the proc_bind_policy for the new team.
1982     kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1983     kmp_proc_bind_t proc_bind_icv =
1984         proc_bind_default; // proc_bind_default means don't update
1985     if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1986       proc_bind = proc_bind_false;
1987     } else {
1988       if (proc_bind == proc_bind_default) {
1989         // No proc_bind clause specified; use current proc-bind-var for this
1990         // parallel region
1991         proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1992       }
1993       /* else: The proc_bind policy was specified explicitly on parallel clause.
1994          This overrides proc-bind-var for this parallel region, but does not
1995          change proc-bind-var. */
1996       // Figure the value of proc-bind-var for the child threads.
1997       if ((level + 1 < __kmp_nested_proc_bind.used) &&
1998           (__kmp_nested_proc_bind.bind_types[level + 1] !=
1999            master_th->th.th_current_task->td_icvs.proc_bind)) {
2000         proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2001       }
2002     }
2003 
2004     // Reset for next parallel region
2005     master_th->th.th_set_proc_bind = proc_bind_default;
2006 
2007     if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
2008       kmp_internal_control_t new_icvs;
2009       copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2010       new_icvs.next = NULL;
2011       if (nthreads_icv > 0) {
2012         new_icvs.nproc = nthreads_icv;
2013       }
2014       if (proc_bind_icv != proc_bind_default) {
2015         new_icvs.proc_bind = proc_bind_icv;
2016       }
2017 
2018       /* allocate a new parallel team */
2019       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2020       team = __kmp_allocate_team(root, nthreads, nthreads,
2021 #if OMPT_SUPPORT
2022                                  ompt_parallel_data,
2023 #endif
2024                                  proc_bind, &new_icvs,
2025                                  argc USE_NESTED_HOT_ARG(master_th));
2026     } else {
2027       /* allocate a new parallel team */
2028       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2029       team = __kmp_allocate_team(root, nthreads, nthreads,
2030 #if OMPT_SUPPORT
2031                                  ompt_parallel_data,
2032 #endif
2033                                  proc_bind,
2034                                  &master_th->th.th_current_task->td_icvs,
2035                                  argc USE_NESTED_HOT_ARG(master_th));
2036     }
2037     KF_TRACE(
2038         10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2039 
2040     /* setup the new team */
2041     KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2042     KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2043     KMP_CHECK_UPDATE(team->t.t_ident, loc);
2044     KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2045     KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2046 #if OMPT_SUPPORT
2047     KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2048                           return_address);
2049 #endif
2050     KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2051     // TODO: parent_team->t.t_level == INT_MAX ???
2052     if (!master_th->th.th_teams_microtask || level > teams_level) {
2053       int new_level = parent_team->t.t_level + 1;
2054       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2055       new_level = parent_team->t.t_active_level + 1;
2056       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2057     } else {
2058       // AC: Do not increase parallel level at start of the teams construct
2059       int new_level = parent_team->t.t_level;
2060       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2061       new_level = parent_team->t.t_active_level;
2062       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2063     }
2064     kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2065     // set master's schedule as new run-time schedule
2066     KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2067 
2068     KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2069     KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2070 
2071     // Update the floating point rounding in the team if required.
2072     propagateFPControl(team);
2073 
2074     if (__kmp_tasking_mode != tskm_immediate_exec) {
2075       // Set master's task team to team's task team. Unless this is hot team, it
2076       // should be NULL.
2077       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2078                        parent_team->t.t_task_team[master_th->th.th_task_state]);
2079       KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
2080                     "%p, new task_team %p / team %p\n",
2081                     __kmp_gtid_from_thread(master_th),
2082                     master_th->th.th_task_team, parent_team,
2083                     team->t.t_task_team[master_th->th.th_task_state], team));
2084 
2085       if (active_level || master_th->th.th_task_team) {
2086         // Take a memo of master's task_state
2087         KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2088         if (master_th->th.th_task_state_top >=
2089             master_th->th.th_task_state_stack_sz) { // increase size
2090           kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2091           kmp_uint8 *old_stack, *new_stack;
2092           kmp_uint32 i;
2093           new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2094           for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2095             new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2096           }
2097           for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2098                ++i) { // zero-init rest of stack
2099             new_stack[i] = 0;
2100           }
2101           old_stack = master_th->th.th_task_state_memo_stack;
2102           master_th->th.th_task_state_memo_stack = new_stack;
2103           master_th->th.th_task_state_stack_sz = new_size;
2104           __kmp_free(old_stack);
2105         }
2106         // Store master's task_state on stack
2107         master_th->th
2108             .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2109             master_th->th.th_task_state;
2110         master_th->th.th_task_state_top++;
2111 #if KMP_NESTED_HOT_TEAMS
2112         if (master_th->th.th_hot_teams &&
2113             active_level < __kmp_hot_teams_max_level &&
2114             team == master_th->th.th_hot_teams[active_level].hot_team) {
2115           // Restore master's nested state if nested hot team
2116           master_th->th.th_task_state =
2117               master_th->th
2118                   .th_task_state_memo_stack[master_th->th.th_task_state_top];
2119         } else {
2120 #endif
2121           master_th->th.th_task_state = 0;
2122 #if KMP_NESTED_HOT_TEAMS
2123         }
2124 #endif
2125       }
2126 #if !KMP_NESTED_HOT_TEAMS
2127       KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2128                        (team == root->r.r_hot_team));
2129 #endif
2130     }
2131 
2132     KA_TRACE(
2133         20,
2134         ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2135          gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2136          team->t.t_nproc));
2137     KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2138                      (team->t.t_master_tid == 0 &&
2139                       (team->t.t_parent == root->r.r_root_team ||
2140                        team->t.t_parent->t.t_serialized)));
2141     KMP_MB();
2142 
2143     /* now, setup the arguments */
2144     argv = (void **)team->t.t_argv;
2145     if (ap) {
2146       for (i = argc - 1; i >= 0; --i) {
2147         void *new_argv = va_arg(kmp_va_deref(ap), void *);
2148         KMP_CHECK_UPDATE(*argv, new_argv);
2149         argv++;
2150       }
2151     } else {
2152       for (i = 0; i < argc; ++i) {
2153         // Get args from parent team for teams construct
2154         KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2155       }
2156     }
2157 
2158     /* now actually fork the threads */
2159     KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2160     if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2161       root->r.r_active = TRUE;
2162 
2163     __kmp_fork_team_threads(root, team, master_th, gtid);
2164     __kmp_setup_icv_copy(team, nthreads,
2165                          &master_th->th.th_current_task->td_icvs, loc);
2166 
2167 #if OMPT_SUPPORT
2168     master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2169 #endif
2170 
2171     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2172 
2173 #if USE_ITT_BUILD
2174     if (team->t.t_active_level == 1 // only report frames at level 1
2175         && !master_th->th.th_teams_microtask) { // not in teams construct
2176 #if USE_ITT_NOTIFY
2177       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2178           (__kmp_forkjoin_frames_mode == 3 ||
2179            __kmp_forkjoin_frames_mode == 1)) {
2180         kmp_uint64 tmp_time = 0;
2181         if (__itt_get_timestamp_ptr)
2182           tmp_time = __itt_get_timestamp();
2183         // Internal fork - report frame begin
2184         master_th->th.th_frame_time = tmp_time;
2185         if (__kmp_forkjoin_frames_mode == 3)
2186           team->t.t_region_time = tmp_time;
2187       } else
2188 // only one notification scheme (either "submit" or "forking/joined", not both)
2189 #endif /* USE_ITT_NOTIFY */
2190           if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2191               __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2192         // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2193         __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2194       }
2195     }
2196 #endif /* USE_ITT_BUILD */
2197 
2198     /* now go on and do the work */
2199     KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2200     KMP_MB();
2201     KF_TRACE(10,
2202              ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2203               root, team, master_th, gtid));
2204 
2205 #if USE_ITT_BUILD
2206     if (__itt_stack_caller_create_ptr) {
2207       team->t.t_stack_id =
2208           __kmp_itt_stack_caller_create(); // create new stack stitching id
2209       // before entering fork barrier
2210     }
2211 #endif /* USE_ITT_BUILD */
2212 
2213     // AC: skip __kmp_internal_fork at teams construct, let only master
2214     // threads execute
2215     if (ap) {
2216       __kmp_internal_fork(loc, gtid, team);
2217       KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2218                     "master_th=%p, gtid=%d\n",
2219                     root, team, master_th, gtid));
2220     }
2221 
2222     if (call_context == fork_context_gnu) {
2223       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2224       return TRUE;
2225     }
2226 
2227     /* Invoke microtask for MASTER thread */
2228     KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2229                   team->t.t_id, team->t.t_pkfn));
2230   } // END of timer KMP_fork_call block
2231 
2232 #if KMP_STATS_ENABLED
2233   // If beginning a teams construct, then change thread state
2234   stats_state_e previous_state = KMP_GET_THREAD_STATE();
2235   if (!ap) {
2236     KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2237   }
2238 #endif
2239 
2240   if (!team->t.t_invoke(gtid)) {
2241     KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
2242   }
2243 
2244 #if KMP_STATS_ENABLED
2245   // If was beginning of a teams construct, then reset thread state
2246   if (!ap) {
2247     KMP_SET_THREAD_STATE(previous_state);
2248   }
2249 #endif
2250 
2251   KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2252                 team->t.t_id, team->t.t_pkfn));
2253   KMP_MB(); /* Flush all pending memory write invalidates.  */
2254 
2255   KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2256 
2257 #if OMPT_SUPPORT
2258   if (ompt_enabled.enabled) {
2259     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2260   }
2261 #endif
2262 
2263   return TRUE;
2264 }
2265 
2266 #if OMPT_SUPPORT
2267 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2268                                             kmp_team_t *team) {
2269   // restore state outside the region
2270   thread->th.ompt_thread_info.state =
2271       ((team->t.t_serialized) ? ompt_state_work_serial
2272                               : ompt_state_work_parallel);
2273 }
2274 
2275 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2276                                    kmp_team_t *team, ompt_data_t *parallel_data,
2277                                    int flags, void *codeptr) {
2278   ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2279   if (ompt_enabled.ompt_callback_parallel_end) {
2280     ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2281         parallel_data, &(task_info->task_data), flags, codeptr);
2282   }
2283 
2284   task_info->frame.enter_frame = ompt_data_none;
2285   __kmp_join_restore_state(thread, team);
2286 }
2287 #endif
2288 
/* Join side of a fork/join pair, executed by the thread looked up through
   __kmp_threads[gtid] (called "master" throughout).

   loc          source location; stored into the thread's th_ident
   gtid         global thread id of the calling thread
   fork_context (OMPT builds only) interface that forked the region; used to
                build the OMPT invoker flags and to special-case serialized
                regions forked through the GNU interface
   exit_teams   nonzero when leaving a teams construct: the join barrier is
                skipped and th_task_state is reset to 0

   The function has three exits:
     1. Serialized team: adjust teams-related counters if needed and delegate
        to __kmpc_end_serialized_parallel.
     2. Parallel region nested directly inside a teams construct: the team is
        kept; only nesting levels are decremented and, if the team had been
        shrunk, its size and the per-thread barrier/task state are restored.
     3. Otherwise: under __kmp_forkjoin_lock the team is handed to
        __kmp_free_team and the thread is switched back to the parent team. */
void __kmp_join_call(ident_t *loc, int gtid
#if OMPT_SUPPORT
                     ,
                     enum fork_context_e fork_context
#endif
                     ,
                     int exit_teams) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
  kmp_team_t *team;
  kmp_team_t *parent_team;
  kmp_info_t *master_th;
  kmp_root_t *root;
  int master_active;

  KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));

  /* setup current data */
  master_th = __kmp_threads[gtid];
  root = master_th->th.th_root;
  team = master_th->th.th_team;
  parent_team = team->t.t_parent;

  master_th->th.th_ident = loc;

#if OMPT_SUPPORT
  // Cache the team's microtask pointer; it is compared against
  // __kmp_teams_master further down to pick the OMPT task/parallel flags.
  void *team_microtask = (void *)team->t.t_pkfn;
  // For GOMP interface with serialized parallel, need the
  // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task
  // and end-parallel events.
  if (ompt_enabled.enabled &&
      !(team->t.t_serialized && fork_context == fork_context_gnu)) {
    master_th->th.ompt_thread_info.state = ompt_state_overhead;
  }
#endif

#if KMP_DEBUG
  // Debug-only consistency check: the thread's task team must match the
  // team's task team for the thread's current task state.
  if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
    KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
                  "th_task_team = %p\n",
                  __kmp_gtid_from_thread(master_th), team,
                  team->t.t_task_team[master_th->th.th_task_state],
                  master_th->th.th_task_team));
    KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
                     team->t.t_task_team[master_th->th.th_task_state]);
  }
#endif

  // Exit 1: serialized team.
  if (team->t.t_serialized) {
    if (master_th->th.th_teams_microtask) {
      // We are in teams construct
      int level = team->t.t_level;
      int tlevel = master_th->th.th_teams_level;
      if (level == tlevel) {
        // AC: we haven't incremented it earlier at start of teams construct,
        //     so do it here - at the end of teams construct
        team->t.t_level++;
      } else if (level == tlevel + 1) {
        // AC: we are exiting parallel inside teams, need to increment
        // serialization in order to restore it in the next call to
        // __kmpc_end_serialized_parallel
        team->t.t_serialized++;
      }
    }
    __kmpc_end_serialized_parallel(loc, gtid);

#if OMPT_SUPPORT
    if (ompt_enabled.enabled) {
      __kmp_join_restore_state(master_th, parent_team);
    }
#endif

    return;
  }

  // Remember the team's t_master_active; it is written back to
  // root->r.r_active during the cleanup below.
  master_active = team->t.t_master_active;

  if (!exit_teams) {
    // AC: No barrier for internal teams at exit from teams construct.
    //     But there is barrier for external team (league).
    __kmp_internal_join(loc, gtid, team);
  } else {
    master_th->th.th_task_state =
        0; // AC: no tasking in teams (out of any parallel)
  }

  KMP_MB();

#if OMPT_SUPPORT
  // Values handed to the OMPT parallel-end callback via __kmp_join_ompt.
  ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
  void *codeptr = team->t.ompt_team_info.master_return_address;
#endif

#if USE_ITT_BUILD
  if (__itt_stack_caller_create_ptr) {
    // destroy the stack stitching id after join barrier
    __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
  }
  // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
  if (team->t.t_active_level == 1 &&
      (!master_th->th.th_teams_microtask || /* not in teams construct */
       master_th->th.th_teams_size.nteams == 1)) {
    master_th->th.th_ident = loc;
    // only one notification scheme (either "submit" or "forking/joined", not
    // both)
    if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
        __kmp_forkjoin_frames_mode == 3)
      __kmp_itt_frame_submit(gtid, team->t.t_region_time,
                             master_th->th.th_frame_time, 0, loc,
                             master_th->th.th_team_nproc, 1);
    else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
             !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
      __kmp_itt_region_joined(gtid);
  } // active_level == 1
#endif /* USE_ITT_BUILD */

  // Exit 2: end of a parallel region one level below the teams construct
  // (and not the teams master microtask itself).
  if (master_th->th.th_teams_microtask && !exit_teams &&
      team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
      team->t.t_level == master_th->th.th_teams_level + 1) {
// AC: We need to leave the team structure intact at the end of parallel
// inside the teams construct, so that at the next parallel same (hot) team
// works, only adjust nesting levels
#if OMPT_SUPPORT
    ompt_data_t ompt_parallel_data = ompt_data_none;
    if (ompt_enabled.enabled) {
      ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
      if (ompt_enabled.ompt_callback_implicit_task) {
        int ompt_team_size = team->t.t_nproc;
        ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
            ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
            OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
      }
      task_info->frame.exit_frame = ompt_data_none;
      task_info->task_data = ompt_data_none;
      // Copy the team's parallel data by value before unlinking the
      // lightweight task team; the copy is what gets reported below.
      ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
      __ompt_lw_taskteam_unlink(master_th);
    }
#endif
    /* Decrement our nested depth level */
    team->t.t_level--;
    team->t.t_active_level--;
    KMP_ATOMIC_DEC(&root->r.r_in_parallel);

    // Restore number of threads in the team if needed. This code relies on
    // the proper adjustment of th_teams_size.nth after the fork in
    // __kmp_teams_master on each teams master in the case that
    // __kmp_reserve_threads reduced it.
    if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
      int old_num = master_th->th.th_team_nproc;
      int new_num = master_th->th.th_teams_size.nth;
      kmp_info_t **other_threads = team->t.t_threads;
      team->t.t_nproc = new_num;
      // Threads that took part in the region only need the new team size.
      for (int i = 0; i < old_num; ++i) {
        other_threads[i]->th.th_team_nproc = new_num;
      }
      // Adjust states of non-used threads of the team
      for (int i = old_num; i < new_num; ++i) {
        // Re-initialize thread's barrier data.
        KMP_DEBUG_ASSERT(other_threads[i]);
        kmp_balign_t *balign = other_threads[i]->th.th_bar;
        for (int b = 0; b < bs_last_barrier; ++b) {
          balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
          KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
#if USE_DEBUGGER
          balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
#endif
        }
        if (__kmp_tasking_mode != tskm_immediate_exec) {
          // Synchronize thread's task state
          other_threads[i]->th.th_task_state = master_th->th.th_task_state;
        }
      }
    }

#if OMPT_SUPPORT
    if (ompt_enabled.enabled) {
      __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
                      OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
    }
#endif

    return;
  }

  // Exit 3: regular join -- release the team and return to the parent team.

  /* do cleanup and restore the parent team */
  master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
  master_th->th.th_local.this_construct = team->t.t_master_this_cons;

  master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];

  /* jc: The following lock has instructions with REL and ACQ semantics,
     separating the parallel user code called in this parallel region
     from the serial user code called after this function returns. */
  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);

  if (!master_th->th.th_teams_microtask ||
      team->t.t_level > master_th->th.th_teams_level) {
    /* Decrement our nested depth level */
    KMP_ATOMIC_DEC(&root->r.r_in_parallel);
  }
  KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);

#if OMPT_SUPPORT
  if (ompt_enabled.enabled) {
    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
    if (ompt_enabled.ompt_callback_implicit_task) {
      // A team run by __kmp_teams_master is reported as an initial task with
      // team size 0; any other team as an implicit task with t_nproc threads.
      int flags = (team_microtask == (void *)__kmp_teams_master)
                      ? ompt_task_initial
                      : ompt_task_implicit;
      int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
          OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
    }
    task_info->frame.exit_frame = ompt_data_none;
    task_info->task_data = ompt_data_none;
  }
#endif

  KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
                master_th, team));
  __kmp_pop_current_task_from_thread(master_th);

#if KMP_AFFINITY_SUPPORTED
  // Restore master thread's partition.
  master_th->th.th_first_place = team->t.t_first_place;
  master_th->th.th_last_place = team->t.t_last_place;
#endif // KMP_AFFINITY_SUPPORTED
  master_th->th.th_def_allocator = team->t.t_def_allocator;

  updateHWFPControl(team);

  if (root->r.r_active != master_active)
    root->r.r_active = master_active;

  __kmp_free_team(root, team USE_NESTED_HOT_ARG(
                            master_th)); // this will free worker threads

  /* this race was fun to find. make sure the following is in the critical
     region otherwise assertions may fail occasionally since the old team may be
     reallocated and the hierarchy appears inconsistent. it is actually safe to
     run and won't cause any bugs, but will cause those assertion failures. it's
     only one deref&assign so might as well put this in the critical region */
  master_th->th.th_team = parent_team;
  master_th->th.th_team_nproc = parent_team->t.t_nproc;
  master_th->th.th_team_master = parent_team->t.t_threads[0];
  master_th->th.th_team_serialized = parent_team->t.t_serialized;

  /* restore serialized team, if need be */
  if (parent_team->t.t_serialized &&
      parent_team != master_th->th.th_serial_team &&
      parent_team != root->r.r_root_team) {
    __kmp_free_team(root,
                    master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
    master_th->th.th_serial_team = parent_team;
  }

  if (__kmp_tasking_mode != tskm_immediate_exec) {
    if (master_th->th.th_task_state_top >
        0) { // Restore task state from memo stack
      KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
      // Remember master's state if we re-use this nested hot team
      master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
          master_th->th.th_task_state;
      --master_th->th.th_task_state_top; // pop
      // Now restore state at this level
      master_th->th.th_task_state =
          master_th->th
              .th_task_state_memo_stack[master_th->th.th_task_state_top];
    }
    // Copy the task team from the parent team to the master thread
    master_th->th.th_task_team =
        parent_team->t.t_task_team[master_th->th.th_task_state];
    KA_TRACE(20,
             ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
              __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
              parent_team));
  }

  // TODO: GEH - cannot do this assertion because root thread not set up as
  // executing
  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
  master_th->th.th_current_task->td_flags.executing = 1;

  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);

#if OMPT_SUPPORT
  // Report a league when the joined team ran __kmp_teams_master, a team
  // otherwise; the invoker bits come from fork_context.
  int flags =
      OMPT_INVOKER(fork_context) |
      ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
                                                      : ompt_parallel_team);
  if (ompt_enabled.enabled) {
    __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
                    codeptr);
  }
#endif

  KMP_MB();
  KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
}
2588 
2589 /* Check whether we should push an internal control record onto the
2590    serial team stack.  If so, do it.  */
2591 void __kmp_save_internal_controls(kmp_info_t *thread) {
2592 
2593   if (thread->th.th_team != thread->th.th_serial_team) {
2594     return;
2595   }
2596   if (thread->th.th_team->t.t_serialized > 1) {
2597     int push = 0;
2598 
2599     if (thread->th.th_team->t.t_control_stack_top == NULL) {
2600       push = 1;
2601     } else {
2602       if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2603           thread->th.th_team->t.t_serialized) {
2604         push = 1;
2605       }
2606     }
2607     if (push) { /* push a record on the serial team's stack */
2608       kmp_internal_control_t *control =
2609           (kmp_internal_control_t *)__kmp_allocate(
2610               sizeof(kmp_internal_control_t));
2611 
2612       copy_icvs(control, &thread->th.th_current_task->td_icvs);
2613 
2614       control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2615 
2616       control->next = thread->th.th_team->t.t_control_stack_top;
2617       thread->th.th_team->t.t_control_stack_top = control;
2618     }
2619   }
2620 }
2621 
2622 /* Changes set_nproc */
2623 void __kmp_set_num_threads(int new_nth, int gtid) {
2624   kmp_info_t *thread;
2625   kmp_root_t *root;
2626 
2627   KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2628   KMP_DEBUG_ASSERT(__kmp_init_serial);
2629 
2630   if (new_nth < 1)
2631     new_nth = 1;
2632   else if (new_nth > __kmp_max_nth)
2633     new_nth = __kmp_max_nth;
2634 
2635   KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2636   thread = __kmp_threads[gtid];
2637   if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2638     return; // nothing to do
2639 
2640   __kmp_save_internal_controls(thread);
2641 
2642   set__nproc(thread, new_nth);
2643 
2644   // If this omp_set_num_threads() call will cause the hot team size to be
2645   // reduced (in the absence of a num_threads clause), then reduce it now,
2646   // rather than waiting for the next parallel region.
2647   root = thread->th.th_root;
2648   if (__kmp_init_parallel && (!root->r.r_active) &&
2649       (root->r.r_hot_team->t.t_nproc > new_nth)
2650 #if KMP_NESTED_HOT_TEAMS
2651       && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2652 #endif
2653       ) {
2654     kmp_team_t *hot_team = root->r.r_hot_team;
2655     int f;
2656 
2657     __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2658 
2659     // Release the extra threads we don't need any more.
2660     for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2661       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2662       if (__kmp_tasking_mode != tskm_immediate_exec) {
2663         // When decreasing team size, threads no longer in the team should unref
2664         // task team.
2665         hot_team->t.t_threads[f]->th.th_task_team = NULL;
2666       }
2667       __kmp_free_thread(hot_team->t.t_threads[f]);
2668       hot_team->t.t_threads[f] = NULL;
2669     }
2670     hot_team->t.t_nproc = new_nth;
2671 #if KMP_NESTED_HOT_TEAMS
2672     if (thread->th.th_hot_teams) {
2673       KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2674       thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2675     }
2676 #endif
2677 
2678     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2679 
2680     // Update the t_nproc field in the threads that are still active.
2681     for (f = 0; f < new_nth; f++) {
2682       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2683       hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2684     }
2685     // Special flag in case omp_set_num_threads() call
2686     hot_team->t.t_size_changed = -1;
2687   }
2688 }
2689 
2690 /* Changes max_active_levels */
2691 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2692   kmp_info_t *thread;
2693 
2694   KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2695                 "%d = (%d)\n",
2696                 gtid, max_active_levels));
2697   KMP_DEBUG_ASSERT(__kmp_init_serial);
2698 
2699   // validate max_active_levels
2700   if (max_active_levels < 0) {
2701     KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2702     // We ignore this call if the user has specified a negative value.
2703     // The current setting won't be changed. The last valid setting will be
2704     // used. A warning will be issued (if warnings are allowed as controlled by
2705     // the KMP_WARNINGS env var).
2706     KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2707                   "max_active_levels for thread %d = (%d)\n",
2708                   gtid, max_active_levels));
2709     return;
2710   }
2711   if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2712     // it's OK, the max_active_levels is within the valid range: [ 0;
2713     // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2714     // We allow a zero value. (implementation defined behavior)
2715   } else {
2716     KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2717                 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2718     max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2719     // Current upper limit is MAX_INT. (implementation defined behavior)
2720     // If the input exceeds the upper limit, we correct the input to be the
2721     // upper limit. (implementation defined behavior)
2722     // Actually, the flow should never get here until we use MAX_INT limit.
2723   }
2724   KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2725                 "max_active_levels for thread %d = (%d)\n",
2726                 gtid, max_active_levels));
2727 
2728   thread = __kmp_threads[gtid];
2729 
2730   __kmp_save_internal_controls(thread);
2731 
2732   set__max_active_levels(thread, max_active_levels);
2733 }
2734 
2735 /* Gets max_active_levels */
2736 int __kmp_get_max_active_levels(int gtid) {
2737   kmp_info_t *thread;
2738 
2739   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2740   KMP_DEBUG_ASSERT(__kmp_init_serial);
2741 
2742   thread = __kmp_threads[gtid];
2743   KMP_DEBUG_ASSERT(thread->th.th_current_task);
2744   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2745                 "curtask_maxaclevel=%d\n",
2746                 gtid, thread->th.th_current_task,
2747                 thread->th.th_current_task->td_icvs.max_active_levels));
2748   return thread->th.th_current_task->td_icvs.max_active_levels;
2749 }
2750 
// Build-time checks: both schedule enumeration types must have the size of
// int.
// NOTE(review): presumably needed because schedule kinds are exchanged as
// plain ints with callers of the schedule routines below -- confirm.
KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2753 
2754 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2755 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2756   kmp_info_t *thread;
2757   kmp_sched_t orig_kind;
2758   //    kmp_team_t *team;
2759 
2760   KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2761                 gtid, (int)kind, chunk));
2762   KMP_DEBUG_ASSERT(__kmp_init_serial);
2763 
2764   // Check if the kind parameter is valid, correct if needed.
2765   // Valid parameters should fit in one of two intervals - standard or extended:
2766   //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2767   // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
2768   orig_kind = kind;
2769   kind = __kmp_sched_without_mods(kind);
2770 
2771   if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2772       (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2773     // TODO: Hint needs attention in case we change the default schedule.
2774     __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2775               KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2776               __kmp_msg_null);
2777     kind = kmp_sched_default;
2778     chunk = 0; // ignore chunk value in case of bad kind
2779   }
2780 
2781   thread = __kmp_threads[gtid];
2782 
2783   __kmp_save_internal_controls(thread);
2784 
2785   if (kind < kmp_sched_upper_std) {
2786     if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2787       // differ static chunked vs. unchunked:  chunk should be invalid to
2788       // indicate unchunked schedule (which is the default)
2789       thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2790     } else {
2791       thread->th.th_current_task->td_icvs.sched.r_sched_type =
2792           __kmp_sch_map[kind - kmp_sched_lower - 1];
2793     }
2794   } else {
2795     //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2796     //    kmp_sched_lower - 2 ];
2797     thread->th.th_current_task->td_icvs.sched.r_sched_type =
2798         __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2799                       kmp_sched_lower - 2];
2800   }
2801   __kmp_sched_apply_mods_intkind(
2802       orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2803   if (kind == kmp_sched_auto || chunk < 1) {
2804     // ignore parameter chunk for schedule auto
2805     thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2806   } else {
2807     thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2808   }
2809 }
2810 
2811 /* Gets def_sched_var ICV values */
2812 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2813   kmp_info_t *thread;
2814   enum sched_type th_type;
2815 
2816   KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2817   KMP_DEBUG_ASSERT(__kmp_init_serial);
2818 
2819   thread = __kmp_threads[gtid];
2820 
2821   th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2822   switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2823   case kmp_sch_static:
2824   case kmp_sch_static_greedy:
2825   case kmp_sch_static_balanced:
2826     *kind = kmp_sched_static;
2827     __kmp_sched_apply_mods_stdkind(kind, th_type);
2828     *chunk = 0; // chunk was not set, try to show this fact via zero value
2829     return;
2830   case kmp_sch_static_chunked:
2831     *kind = kmp_sched_static;
2832     break;
2833   case kmp_sch_dynamic_chunked:
2834     *kind = kmp_sched_dynamic;
2835     break;
2836   case kmp_sch_guided_chunked:
2837   case kmp_sch_guided_iterative_chunked:
2838   case kmp_sch_guided_analytical_chunked:
2839     *kind = kmp_sched_guided;
2840     break;
2841   case kmp_sch_auto:
2842     *kind = kmp_sched_auto;
2843     break;
2844   case kmp_sch_trapezoidal:
2845     *kind = kmp_sched_trapezoidal;
2846     break;
2847 #if KMP_STATIC_STEAL_ENABLED
2848   case kmp_sch_static_steal:
2849     *kind = kmp_sched_static_steal;
2850     break;
2851 #endif
2852   default:
2853     KMP_FATAL(UnknownSchedulingType, th_type);
2854   }
2855 
2856   __kmp_sched_apply_mods_stdkind(kind, th_type);
2857   *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2858 }
2859 
2860 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2861 
2862   int ii, dd;
2863   kmp_team_t *team;
2864   kmp_info_t *thr;
2865 
2866   KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2867   KMP_DEBUG_ASSERT(__kmp_init_serial);
2868 
2869   // validate level
2870   if (level == 0)
2871     return 0;
2872   if (level < 0)
2873     return -1;
2874   thr = __kmp_threads[gtid];
2875   team = thr->th.th_team;
2876   ii = team->t.t_level;
2877   if (level > ii)
2878     return -1;
2879 
2880   if (thr->th.th_teams_microtask) {
2881     // AC: we are in teams region where multiple nested teams have same level
2882     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2883     if (level <=
2884         tlevel) { // otherwise usual algorithm works (will not touch the teams)
2885       KMP_DEBUG_ASSERT(ii >= tlevel);
2886       // AC: As we need to pass by the teams league, we need to artificially
2887       // increase ii
2888       if (ii == tlevel) {
2889         ii += 2; // three teams have same level
2890       } else {
2891         ii++; // two teams have same level
2892       }
2893     }
2894   }
2895 
2896   if (ii == level)
2897     return __kmp_tid_from_gtid(gtid);
2898 
2899   dd = team->t.t_serialized;
2900   level++;
2901   while (ii > level) {
2902     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2903     }
2904     if ((team->t.t_serialized) && (!dd)) {
2905       team = team->t.t_parent;
2906       continue;
2907     }
2908     if (ii > level) {
2909       team = team->t.t_parent;
2910       dd = team->t.t_serialized;
2911       ii--;
2912     }
2913   }
2914 
2915   return (dd > 1) ? (0) : (team->t.t_master_tid);
2916 }
2917 
2918 int __kmp_get_team_size(int gtid, int level) {
2919 
2920   int ii, dd;
2921   kmp_team_t *team;
2922   kmp_info_t *thr;
2923 
2924   KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2925   KMP_DEBUG_ASSERT(__kmp_init_serial);
2926 
2927   // validate level
2928   if (level == 0)
2929     return 1;
2930   if (level < 0)
2931     return -1;
2932   thr = __kmp_threads[gtid];
2933   team = thr->th.th_team;
2934   ii = team->t.t_level;
2935   if (level > ii)
2936     return -1;
2937 
2938   if (thr->th.th_teams_microtask) {
2939     // AC: we are in teams region where multiple nested teams have same level
2940     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2941     if (level <=
2942         tlevel) { // otherwise usual algorithm works (will not touch the teams)
2943       KMP_DEBUG_ASSERT(ii >= tlevel);
2944       // AC: As we need to pass by the teams league, we need to artificially
2945       // increase ii
2946       if (ii == tlevel) {
2947         ii += 2; // three teams have same level
2948       } else {
2949         ii++; // two teams have same level
2950       }
2951     }
2952   }
2953 
2954   while (ii > level) {
2955     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2956     }
2957     if (team->t.t_serialized && (!dd)) {
2958       team = team->t.t_parent;
2959       continue;
2960     }
2961     if (ii > level) {
2962       team = team->t.t_parent;
2963       ii--;
2964     }
2965   }
2966 
2967   return team->t.t_nproc;
2968 }
2969 
2970 kmp_r_sched_t __kmp_get_schedule_global() {
2971   // This routine created because pairs (__kmp_sched, __kmp_chunk) and
2972   // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
2973   // independently. So one can get the updated schedule here.
2974 
2975   kmp_r_sched_t r_sched;
2976 
2977   // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
2978   // __kmp_guided. __kmp_sched should keep original value, so that user can set
2979   // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
2980   // different roots (even in OMP 2.5)
2981   enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
2982   enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
2983   if (s == kmp_sch_static) {
2984     // replace STATIC with more detailed schedule (balanced or greedy)
2985     r_sched.r_sched_type = __kmp_static;
2986   } else if (s == kmp_sch_guided_chunked) {
2987     // replace GUIDED with more detailed schedule (iterative or analytical)
2988     r_sched.r_sched_type = __kmp_guided;
2989   } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2990     r_sched.r_sched_type = __kmp_sched;
2991   }
2992   SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
2993 
2994   if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
2995     // __kmp_chunk may be wrong here (if it was not ever set)
2996     r_sched.chunk = KMP_DEFAULT_CHUNK;
2997   } else {
2998     r_sched.chunk = __kmp_chunk;
2999   }
3000 
3001   return r_sched;
3002 }
3003 
3004 /* Allocate (realloc == FALSE) * or reallocate (realloc == TRUE)
3005    at least argc number of *t_argv entries for the requested team. */
3006 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3007 
3008   KMP_DEBUG_ASSERT(team);
3009   if (!realloc || argc > team->t.t_max_argc) {
3010 
3011     KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3012                    "current entries=%d\n",
3013                    team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3014     /* if previously allocated heap space for args, free them */
3015     if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3016       __kmp_free((void *)team->t.t_argv);
3017 
3018     if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3019       /* use unused space in the cache line for arguments */
3020       team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3021       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3022                      "argv entries\n",
3023                      team->t.t_id, team->t.t_max_argc));
3024       team->t.t_argv = &team->t.t_inline_argv[0];
3025       if (__kmp_storage_map) {
3026         __kmp_print_storage_map_gtid(
3027             -1, &team->t.t_inline_argv[0],
3028             &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3029             (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3030             team->t.t_id);
3031       }
3032     } else {
3033       /* allocate space for arguments in the heap */
3034       team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3035                                ? KMP_MIN_MALLOC_ARGV_ENTRIES
3036                                : 2 * argc;
3037       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3038                      "argv entries\n",
3039                      team->t.t_id, team->t.t_max_argc));
3040       team->t.t_argv =
3041           (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3042       if (__kmp_storage_map) {
3043         __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3044                                      &team->t.t_argv[team->t.t_max_argc],
3045                                      sizeof(void *) * team->t.t_max_argc,
3046                                      "team_%d.t_argv", team->t.t_id);
3047       }
3048     }
3049   }
3050 }
3051 
3052 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3053   int i;
3054   int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3055   team->t.t_threads =
3056       (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3057   team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3058       sizeof(dispatch_shared_info_t) * num_disp_buff);
3059   team->t.t_dispatch =
3060       (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3061   team->t.t_implicit_task_taskdata =
3062       (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3063   team->t.t_max_nproc = max_nth;
3064 
3065   /* setup dispatch buffers */
3066   for (i = 0; i < num_disp_buff; ++i) {
3067     team->t.t_disp_buffer[i].buffer_index = i;
3068     team->t.t_disp_buffer[i].doacross_buf_idx = i;
3069   }
3070 }
3071 
3072 static void __kmp_free_team_arrays(kmp_team_t *team) {
3073   /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3074   int i;
3075   for (i = 0; i < team->t.t_max_nproc; ++i) {
3076     if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3077       __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3078       team->t.t_dispatch[i].th_disp_buffer = NULL;
3079     }
3080   }
3081 #if KMP_USE_HIER_SCHED
3082   __kmp_dispatch_free_hierarchies(team);
3083 #endif
3084   __kmp_free(team->t.t_threads);
3085   __kmp_free(team->t.t_disp_buffer);
3086   __kmp_free(team->t.t_dispatch);
3087   __kmp_free(team->t.t_implicit_task_taskdata);
3088   team->t.t_threads = NULL;
3089   team->t.t_disp_buffer = NULL;
3090   team->t.t_dispatch = NULL;
3091   team->t.t_implicit_task_taskdata = 0;
3092 }
3093 
3094 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3095   kmp_info_t **oldThreads = team->t.t_threads;
3096 
3097   __kmp_free(team->t.t_disp_buffer);
3098   __kmp_free(team->t.t_dispatch);
3099   __kmp_free(team->t.t_implicit_task_taskdata);
3100   __kmp_allocate_team_arrays(team, max_nth);
3101 
3102   KMP_MEMCPY(team->t.t_threads, oldThreads,
3103              team->t.t_nproc * sizeof(kmp_info_t *));
3104 
3105   __kmp_free(oldThreads);
3106 }
3107 
3108 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3109 
3110   kmp_r_sched_t r_sched =
3111       __kmp_get_schedule_global(); // get current state of scheduling globals
3112 
3113   KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3114 
3115   kmp_internal_control_t g_icvs = {
3116     0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3117     (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3118     // adjustment of threads (per thread)
3119     (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3120     // whether blocktime is explicitly set
3121     __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3122 #if KMP_USE_MONITOR
3123     __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3124 // intervals
3125 #endif
3126     __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3127     // next parallel region (per thread)
3128     // (use a max ub on value if __kmp_parallel_initialize not called yet)
3129     __kmp_cg_max_nth, // int thread_limit;
3130     __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3131     // for max_active_levels
3132     r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3133     // {sched,chunk} pair
3134     __kmp_nested_proc_bind.bind_types[0],
3135     __kmp_default_device,
3136     NULL // struct kmp_internal_control *next;
3137   };
3138 
3139   return g_icvs;
3140 }
3141 
3142 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3143 
3144   kmp_internal_control_t gx_icvs;
3145   gx_icvs.serial_nesting_level =
3146       0; // probably =team->t.t_serial like in save_inter_controls
3147   copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3148   gx_icvs.next = NULL;
3149 
3150   return gx_icvs;
3151 }
3152 
3153 static void __kmp_initialize_root(kmp_root_t *root) {
3154   int f;
3155   kmp_team_t *root_team;
3156   kmp_team_t *hot_team;
3157   int hot_team_max_nth;
3158   kmp_r_sched_t r_sched =
3159       __kmp_get_schedule_global(); // get current state of scheduling globals
3160   kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3161   KMP_DEBUG_ASSERT(root);
3162   KMP_ASSERT(!root->r.r_begin);
3163 
3164   /* setup the root state structure */
3165   __kmp_init_lock(&root->r.r_begin_lock);
3166   root->r.r_begin = FALSE;
3167   root->r.r_active = FALSE;
3168   root->r.r_in_parallel = 0;
3169   root->r.r_blocktime = __kmp_dflt_blocktime;
3170 
3171   /* setup the root team for this task */
3172   /* allocate the root team structure */
3173   KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3174 
3175   root_team =
3176       __kmp_allocate_team(root,
3177                           1, // new_nproc
3178                           1, // max_nproc
3179 #if OMPT_SUPPORT
3180                           ompt_data_none, // root parallel id
3181 #endif
3182                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3183                           0 // argc
3184                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3185                           );
3186 #if USE_DEBUGGER
3187   // Non-NULL value should be assigned to make the debugger display the root
3188   // team.
3189   TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3190 #endif
3191 
3192   KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3193 
3194   root->r.r_root_team = root_team;
3195   root_team->t.t_control_stack_top = NULL;
3196 
3197   /* initialize root team */
3198   root_team->t.t_threads[0] = NULL;
3199   root_team->t.t_nproc = 1;
3200   root_team->t.t_serialized = 1;
3201   // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3202   root_team->t.t_sched.sched = r_sched.sched;
3203   KA_TRACE(
3204       20,
3205       ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3206        root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3207 
3208   /* setup the  hot team for this task */
3209   /* allocate the hot team structure */
3210   KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3211 
3212   hot_team =
3213       __kmp_allocate_team(root,
3214                           1, // new_nproc
3215                           __kmp_dflt_team_nth_ub * 2, // max_nproc
3216 #if OMPT_SUPPORT
3217                           ompt_data_none, // root parallel id
3218 #endif
3219                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3220                           0 // argc
3221                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3222                           );
3223   KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3224 
3225   root->r.r_hot_team = hot_team;
3226   root_team->t.t_control_stack_top = NULL;
3227 
3228   /* first-time initialization */
3229   hot_team->t.t_parent = root_team;
3230 
3231   /* initialize hot team */
3232   hot_team_max_nth = hot_team->t.t_max_nproc;
3233   for (f = 0; f < hot_team_max_nth; ++f) {
3234     hot_team->t.t_threads[f] = NULL;
3235   }
3236   hot_team->t.t_nproc = 1;
3237   // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3238   hot_team->t.t_sched.sched = r_sched.sched;
3239   hot_team->t.t_size_changed = 0;
3240 }
3241 
3242 #ifdef KMP_DEBUG
3243 
3244 typedef struct kmp_team_list_item {
3245   kmp_team_p const *entry;
3246   struct kmp_team_list_item *next;
3247 } kmp_team_list_item_t;
3248 typedef kmp_team_list_item_t *kmp_team_list_t;
3249 
3250 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3251     kmp_team_list_t list, // List of teams.
3252     kmp_team_p const *team // Team to add.
3253     ) {
3254 
3255   // List must terminate with item where both entry and next are NULL.
3256   // Team is added to the list only once.
3257   // List is sorted in ascending order by team id.
3258   // Team id is *not* a key.
3259 
3260   kmp_team_list_t l;
3261 
3262   KMP_DEBUG_ASSERT(list != NULL);
3263   if (team == NULL) {
3264     return;
3265   }
3266 
3267   __kmp_print_structure_team_accum(list, team->t.t_parent);
3268   __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3269 
3270   // Search list for the team.
3271   l = list;
3272   while (l->next != NULL && l->entry != team) {
3273     l = l->next;
3274   }
3275   if (l->next != NULL) {
3276     return; // Team has been added before, exit.
3277   }
3278 
3279   // Team is not found. Search list again for insertion point.
3280   l = list;
3281   while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3282     l = l->next;
3283   }
3284 
3285   // Insert team.
3286   {
3287     kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3288         sizeof(kmp_team_list_item_t));
3289     *item = *l;
3290     l->entry = team;
3291     l->next = item;
3292   }
3293 }
3294 
3295 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team
3296 
3297                                        ) {
3298   __kmp_printf("%s", title);
3299   if (team != NULL) {
3300     __kmp_printf("%2x %p\n", team->t.t_id, team);
3301   } else {
3302     __kmp_printf(" - (nil)\n");
3303   }
3304 }
3305 
3306 static void __kmp_print_structure_thread(char const *title,
3307                                          kmp_info_p const *thread) {
3308   __kmp_printf("%s", title);
3309   if (thread != NULL) {
3310     __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3311   } else {
3312     __kmp_printf(" - (nil)\n");
3313   }
3314 }
3315 
3316 void __kmp_print_structure(void) {
3317 
3318   kmp_team_list_t list;
3319 
3320   // Initialize list of teams.
3321   list =
3322       (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3323   list->entry = NULL;
3324   list->next = NULL;
3325 
3326   __kmp_printf("\n------------------------------\nGlobal Thread "
3327                "Table\n------------------------------\n");
3328   {
3329     int gtid;
3330     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3331       __kmp_printf("%2d", gtid);
3332       if (__kmp_threads != NULL) {
3333         __kmp_printf(" %p", __kmp_threads[gtid]);
3334       }
3335       if (__kmp_root != NULL) {
3336         __kmp_printf(" %p", __kmp_root[gtid]);
3337       }
3338       __kmp_printf("\n");
3339     }
3340   }
3341 
3342   // Print out __kmp_threads array.
3343   __kmp_printf("\n------------------------------\nThreads\n--------------------"
3344                "----------\n");
3345   if (__kmp_threads != NULL) {
3346     int gtid;
3347     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3348       kmp_info_t const *thread = __kmp_threads[gtid];
3349       if (thread != NULL) {
3350         __kmp_printf("GTID %2d %p:\n", gtid, thread);
3351         __kmp_printf("    Our Root:        %p\n", thread->th.th_root);
3352         __kmp_print_structure_team("    Our Team:     ", thread->th.th_team);
3353         __kmp_print_structure_team("    Serial Team:  ",
3354                                    thread->th.th_serial_team);
3355         __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
3356         __kmp_print_structure_thread("    Master:       ",
3357                                      thread->th.th_team_master);
3358         __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
3359         __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
3360         __kmp_printf("    Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3361         __kmp_print_structure_thread("    Next in pool: ",
3362                                      thread->th.th_next_pool);
3363         __kmp_printf("\n");
3364         __kmp_print_structure_team_accum(list, thread->th.th_team);
3365         __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3366       }
3367     }
3368   } else {
3369     __kmp_printf("Threads array is not allocated.\n");
3370   }
3371 
3372   // Print out __kmp_root array.
3373   __kmp_printf("\n------------------------------\nUbers\n----------------------"
3374                "--------\n");
3375   if (__kmp_root != NULL) {
3376     int gtid;
3377     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3378       kmp_root_t const *root = __kmp_root[gtid];
3379       if (root != NULL) {
3380         __kmp_printf("GTID %2d %p:\n", gtid, root);
3381         __kmp_print_structure_team("    Root Team:    ", root->r.r_root_team);
3382         __kmp_print_structure_team("    Hot Team:     ", root->r.r_hot_team);
3383         __kmp_print_structure_thread("    Uber Thread:  ",
3384                                      root->r.r_uber_thread);
3385         __kmp_printf("    Active?:      %2d\n", root->r.r_active);
3386         __kmp_printf("    In Parallel:  %2d\n",
3387                      KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3388         __kmp_printf("\n");
3389         __kmp_print_structure_team_accum(list, root->r.r_root_team);
3390         __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3391       }
3392     }
3393   } else {
3394     __kmp_printf("Ubers array is not allocated.\n");
3395   }
3396 
3397   __kmp_printf("\n------------------------------\nTeams\n----------------------"
3398                "--------\n");
3399   while (list->next != NULL) {
3400     kmp_team_p const *team = list->entry;
3401     int i;
3402     __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3403     __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
3404     __kmp_printf("    Master TID:       %2d\n", team->t.t_master_tid);
3405     __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
3406     __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
3407     __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
3408     for (i = 0; i < team->t.t_nproc; ++i) {
3409       __kmp_printf("    Thread %2d:      ", i);
3410       __kmp_print_structure_thread("", team->t.t_threads[i]);
3411     }
3412     __kmp_print_structure_team("    Next in pool:     ", team->t.t_next_pool);
3413     __kmp_printf("\n");
3414     list = list->next;
3415   }
3416 
3417   // Print out __kmp_thread_pool and __kmp_team_pool.
3418   __kmp_printf("\n------------------------------\nPools\n----------------------"
3419                "--------\n");
3420   __kmp_print_structure_thread("Thread pool:          ",
3421                                CCAST(kmp_info_t *, __kmp_thread_pool));
3422   __kmp_print_structure_team("Team pool:            ",
3423                              CCAST(kmp_team_t *, __kmp_team_pool));
3424   __kmp_printf("\n");
3425 
3426   // Free team list.
3427   while (list != NULL) {
3428     kmp_team_list_item_t *item = list;
3429     list = list->next;
3430     KMP_INTERNAL_FREE(item);
3431   }
3432 }
3433 
3434 #endif
3435 
//---------------------------------------------------------------------------
//  Support for the per-thread fast random number generator.
//  Table of primes; __kmp_init_random selects one entry per thread as the
//  multiplier of that thread's generator.
static const unsigned __kmp_primes[] = {
    0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5,
    0xba5703f5, 0xb495a877, 0xe1626741, 0x79695e6b,
    0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
    0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b,
    0xbe4d6fe9, 0x5f15e201, 0x99afc3fd, 0xf3f16801,
    0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
    0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed,
    0x085a3d61, 0x46eb5ea7, 0x3d9910ed, 0x2e687b5b,
    0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
    0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7,
    0x54581edb, 0xf2480f45, 0x0bb9288f, 0xef1affc7,
    0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
    0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b,
    0xfc411073, 0xc3749363, 0xb892d829, 0x3549366b,
    0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
    0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3451 
3452 //---------------------------------------------------------------------------
3453 //  __kmp_get_random: Get a random number using a linear congruential method.
3454 unsigned short __kmp_get_random(kmp_info_t *thread) {
3455   unsigned x = thread->th.th_x;
3456   unsigned short r = (unsigned short)(x >> 16);
3457 
3458   thread->th.th_x = x * thread->th.th_a + 1;
3459 
3460   KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3461                 thread->th.th_info.ds.ds_tid, r));
3462 
3463   return r;
3464 }
3465 //--------------------------------------------------------
3466 // __kmp_init_random: Initialize a random number generator
3467 void __kmp_init_random(kmp_info_t *thread) {
3468   unsigned seed = thread->th.th_info.ds.ds_tid;
3469 
3470   thread->th.th_a =
3471       __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3472   thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3473   KA_TRACE(30,
3474            ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3475 }
3476 
3477 #if KMP_OS_WINDOWS
3478 /* reclaim array entries for root threads that are already dead, returns number
3479  * reclaimed */
3480 static int __kmp_reclaim_dead_roots(void) {
3481   int i, r = 0;
3482 
3483   for (i = 0; i < __kmp_threads_capacity; ++i) {
3484     if (KMP_UBER_GTID(i) &&
3485         !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3486         !__kmp_root[i]
3487              ->r.r_active) { // AC: reclaim only roots died in non-active state
3488       r += __kmp_unregister_root_other_thread(i);
3489     }
3490   }
3491   return r;
3492 }
3493 #endif
3494 
3495 /* This function attempts to create free entries in __kmp_threads and
3496    __kmp_root, and returns the number of free entries generated.
3497 
3498    For Windows* OS static library, the first mechanism used is to reclaim array
3499    entries for root threads that are already dead.
3500 
3501    On all platforms, expansion is attempted on the arrays __kmp_threads_ and
3502    __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3503    capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3504    threadprivate cache array has been created. Synchronization with
3505    __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3506 
3507    After any dead root reclamation, if the clipping value allows array expansion
3508    to result in the generation of a total of nNeed free slots, the function does
3509    that expansion. If not, nothing is done beyond the possible initial root
3510    thread reclamation.
3511 
3512    If any argument is negative, the behavior is undefined. */
3513 static int __kmp_expand_threads(int nNeed) {
3514   int added = 0;
3515   int minimumRequiredCapacity;
3516   int newCapacity;
3517   kmp_info_t **newThreads;
3518   kmp_root_t **newRoot;
3519 
3520 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3521 // resizing __kmp_threads does not need additional protection if foreign
3522 // threads are present
3523 
3524 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3525   /* only for Windows static library */
3526   /* reclaim array entries for root threads that are already dead */
3527   added = __kmp_reclaim_dead_roots();
3528 
3529   if (nNeed) {
3530     nNeed -= added;
3531     if (nNeed < 0)
3532       nNeed = 0;
3533   }
3534 #endif
3535   if (nNeed <= 0)
3536     return added;
3537 
3538   // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3539   // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3540   // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3541   // > __kmp_max_nth in one of two ways:
3542   //
3543   // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
3544   //    may not be reused by another thread, so we may need to increase
3545   //    __kmp_threads_capacity to __kmp_max_nth + 1.
3546   //
3547   // 2) New foreign root(s) are encountered.  We always register new foreign
3548   //    roots. This may cause a smaller # of threads to be allocated at
3549   //    subsequent parallel regions, but the worker threads hang around (and
3550   //    eventually go to sleep) and need slots in the __kmp_threads[] array.
3551   //
3552   // Anyway, that is the reason for moving the check to see if
3553   // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3554   // instead of having it performed here. -BB
3555 
3556   KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3557 
3558   /* compute expansion headroom to check if we can expand */
3559   if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3560     /* possible expansion too small -- give up */
3561     return added;
3562   }
3563   minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3564 
3565   newCapacity = __kmp_threads_capacity;
3566   do {
3567     newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3568                                                           : __kmp_sys_max_nth;
3569   } while (newCapacity < minimumRequiredCapacity);
3570   newThreads = (kmp_info_t **)__kmp_allocate(
3571       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3572   newRoot =
3573       (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3574   KMP_MEMCPY(newThreads, __kmp_threads,
3575              __kmp_threads_capacity * sizeof(kmp_info_t *));
3576   KMP_MEMCPY(newRoot, __kmp_root,
3577              __kmp_threads_capacity * sizeof(kmp_root_t *));
3578 
3579   kmp_info_t **temp_threads = __kmp_threads;
3580   *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3581   *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3582   __kmp_free(temp_threads);
3583   added += newCapacity - __kmp_threads_capacity;
3584   *(volatile int *)&__kmp_threads_capacity = newCapacity;
3585 
3586   if (newCapacity > __kmp_tp_capacity) {
3587     __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3588     if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3589       __kmp_threadprivate_resize_cache(newCapacity);
3590     } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3591       *(volatile int *)&__kmp_tp_capacity = newCapacity;
3592     }
3593     __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3594   }
3595 
3596   return added;
3597 }
3598 
3599 /* Register the current thread as a root thread and obtain our gtid. We must
3600    have the __kmp_initz_lock held at this point. Argument TRUE only if are the
3601    thread that calls from __kmp_do_serial_initialize() */
3602 int __kmp_register_root(int initial_thread) {
3603   kmp_info_t *root_thread;
3604   kmp_root_t *root;
3605   int gtid;
3606   int capacity;
3607   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3608   KA_TRACE(20, ("__kmp_register_root: entered\n"));
3609   KMP_MB();
3610 
3611   /* 2007-03-02:
3612      If initial thread did not invoke OpenMP RTL yet, and this thread is not an
3613      initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not
3614      work as expected -- it may return false (that means there is at least one
3615      empty slot in __kmp_threads array), but it is possible the only free slot
3616      is #0, which is reserved for initial thread and so cannot be used for this
3617      one. Following code workarounds this bug.
3618 
3619      However, right solution seems to be not reserving slot #0 for initial
3620      thread because:
3621      (1) there is no magic in slot #0,
3622      (2) we cannot detect initial thread reliably (the first thread which does
3623         serial initialization may be not a real initial thread).
3624   */
3625   capacity = __kmp_threads_capacity;
3626   if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3627     --capacity;
3628   }
3629 
3630   /* see if there are too many threads */
3631   if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3632     if (__kmp_tp_cached) {
3633       __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3634                   KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3635                   KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3636     } else {
3637       __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3638                   __kmp_msg_null);
3639     }
3640   }
3641 
3642   // When hidden helper task is enabled, __kmp_threads is organized as follows:
3643   // 0: initial thread, also a regular OpenMP thread.
3644   // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3645   // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3646   // regular OpenMP threads.
3647   if (TCR_4(__kmp_init_hidden_helper_threads)) {
3648     // Find an available thread slot for hidden helper thread. Slots for hidden
3649     // helper threads start from 1 to __kmp_hidden_helper_threads_num.
3650     for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3651                    gtid <= __kmp_hidden_helper_threads_num;
3652          gtid++)
3653       ;
3654     KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3655     KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3656                  "hidden helper thread: T#%d\n",
3657                  gtid));
3658   } else {
3659     /* find an available thread slot */
3660     // Don't reassign the zero slot since we need that to only be used by
3661     // initial thread. Slots for hidden helper threads should also be skipped.
3662     if (initial_thread && __kmp_threads[0] == NULL) {
3663       gtid = 0;
3664     } else {
3665       for (gtid = __kmp_hidden_helper_threads_num + 1;
3666            TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3667         ;
3668     }
3669     KA_TRACE(
3670         1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3671     KMP_ASSERT(gtid < __kmp_threads_capacity);
3672   }
3673 
3674   /* update global accounting */
3675   __kmp_all_nth++;
3676   TCW_4(__kmp_nth, __kmp_nth + 1);
3677 
3678   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3679   // numbers of procs, and method #2 (keyed API call) for higher numbers.
3680   if (__kmp_adjust_gtid_mode) {
3681     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3682       if (TCR_4(__kmp_gtid_mode) != 2) {
3683         TCW_4(__kmp_gtid_mode, 2);
3684       }
3685     } else {
3686       if (TCR_4(__kmp_gtid_mode) != 1) {
3687         TCW_4(__kmp_gtid_mode, 1);
3688       }
3689     }
3690   }
3691 
3692 #ifdef KMP_ADJUST_BLOCKTIME
3693   /* Adjust blocktime to zero if necessary            */
3694   /* Middle initialization might not have occurred yet */
3695   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3696     if (__kmp_nth > __kmp_avail_proc) {
3697       __kmp_zero_bt = TRUE;
3698     }
3699   }
3700 #endif /* KMP_ADJUST_BLOCKTIME */
3701 
3702   /* setup this new hierarchy */
3703   if (!(root = __kmp_root[gtid])) {
3704     root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3705     KMP_DEBUG_ASSERT(!root->r.r_root_team);
3706   }
3707 
3708 #if KMP_STATS_ENABLED
3709   // Initialize stats as soon as possible (right after gtid assignment).
3710   __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3711   __kmp_stats_thread_ptr->startLife();
3712   KMP_SET_THREAD_STATE(SERIAL_REGION);
3713   KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3714 #endif
3715   __kmp_initialize_root(root);
3716 
3717   /* setup new root thread structure */
3718   if (root->r.r_uber_thread) {
3719     root_thread = root->r.r_uber_thread;
3720   } else {
3721     root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3722     if (__kmp_storage_map) {
3723       __kmp_print_thread_storage_map(root_thread, gtid);
3724     }
3725     root_thread->th.th_info.ds.ds_gtid = gtid;
3726 #if OMPT_SUPPORT
3727     root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3728 #endif
3729     root_thread->th.th_root = root;
3730     if (__kmp_env_consistency_check) {
3731       root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3732     }
3733 #if USE_FAST_MEMORY
3734     __kmp_initialize_fast_memory(root_thread);
3735 #endif /* USE_FAST_MEMORY */
3736 
3737 #if KMP_USE_BGET
3738     KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3739     __kmp_initialize_bget(root_thread);
3740 #endif
3741     __kmp_init_random(root_thread); // Initialize random number generator
3742   }
3743 
3744   /* setup the serial team held in reserve by the root thread */
3745   if (!root_thread->th.th_serial_team) {
3746     kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3747     KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3748     root_thread->th.th_serial_team = __kmp_allocate_team(
3749         root, 1, 1,
3750 #if OMPT_SUPPORT
3751         ompt_data_none, // root parallel id
3752 #endif
3753         proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3754   }
3755   KMP_ASSERT(root_thread->th.th_serial_team);
3756   KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3757                 root_thread->th.th_serial_team));
3758 
3759   /* drop root_thread into place */
3760   TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3761 
3762   root->r.r_root_team->t.t_threads[0] = root_thread;
3763   root->r.r_hot_team->t.t_threads[0] = root_thread;
3764   root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3765   // AC: the team created in reserve, not for execution (it is unused for now).
3766   root_thread->th.th_serial_team->t.t_serialized = 0;
3767   root->r.r_uber_thread = root_thread;
3768 
3769   /* initialize the thread, get it ready to go */
3770   __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3771   TCW_4(__kmp_init_gtid, TRUE);
3772 
3773   /* prepare the master thread for get_gtid() */
3774   __kmp_gtid_set_specific(gtid);
3775 
3776 #if USE_ITT_BUILD
3777   __kmp_itt_thread_name(gtid);
3778 #endif /* USE_ITT_BUILD */
3779 
3780 #ifdef KMP_TDATA_GTID
3781   __kmp_gtid = gtid;
3782 #endif
3783   __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3784   KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3785 
3786   KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3787                 "plain=%u\n",
3788                 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3789                 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3790                 KMP_INIT_BARRIER_STATE));
3791   { // Initialize barrier data.
3792     int b;
3793     for (b = 0; b < bs_last_barrier; ++b) {
3794       root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3795 #if USE_DEBUGGER
3796       root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3797 #endif
3798     }
3799   }
3800   KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3801                    KMP_INIT_BARRIER_STATE);
3802 
3803 #if KMP_AFFINITY_SUPPORTED
3804   root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3805   root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3806   root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3807   root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3808   if (TCR_4(__kmp_init_middle)) {
3809     __kmp_affinity_set_init_mask(gtid, TRUE);
3810   }
3811 #endif /* KMP_AFFINITY_SUPPORTED */
3812   root_thread->th.th_def_allocator = __kmp_def_allocator;
3813   root_thread->th.th_prev_level = 0;
3814   root_thread->th.th_prev_num_threads = 1;
3815 
3816   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3817   tmp->cg_root = root_thread;
3818   tmp->cg_thread_limit = __kmp_cg_max_nth;
3819   tmp->cg_nthreads = 1;
3820   KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3821                  " cg_nthreads init to 1\n",
3822                  root_thread, tmp));
3823   tmp->up = NULL;
3824   root_thread->th.th_cg_roots = tmp;
3825 
3826   __kmp_root_counter++;
3827 
3828 #if OMPT_SUPPORT
3829   if (!initial_thread && ompt_enabled.enabled) {
3830 
3831     kmp_info_t *root_thread = ompt_get_thread();
3832 
3833     ompt_set_thread_state(root_thread, ompt_state_overhead);
3834 
3835     if (ompt_enabled.ompt_callback_thread_begin) {
3836       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3837           ompt_thread_initial, __ompt_get_thread_data_internal());
3838     }
3839     ompt_data_t *task_data;
3840     ompt_data_t *parallel_data;
3841     __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
3842     if (ompt_enabled.ompt_callback_implicit_task) {
3843       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3844           ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3845     }
3846 
3847     ompt_set_thread_state(root_thread, ompt_state_work_serial);
3848   }
3849 #endif
3850 
3851   KMP_MB();
3852   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3853 
3854   return gtid;
3855 }
3856 
3857 #if KMP_NESTED_HOT_TEAMS
3858 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3859                                 const int max_level) {
3860   int i, n, nth;
3861   kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3862   if (!hot_teams || !hot_teams[level].hot_team) {
3863     return 0;
3864   }
3865   KMP_DEBUG_ASSERT(level < max_level);
3866   kmp_team_t *team = hot_teams[level].hot_team;
3867   nth = hot_teams[level].hot_team_nth;
3868   n = nth - 1; // master is not freed
3869   if (level < max_level - 1) {
3870     for (i = 0; i < nth; ++i) {
3871       kmp_info_t *th = team->t.t_threads[i];
3872       n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3873       if (i > 0 && th->th.th_hot_teams) {
3874         __kmp_free(th->th.th_hot_teams);
3875         th->th.th_hot_teams = NULL;
3876       }
3877     }
3878   }
3879   __kmp_free_team(root, team, NULL);
3880   return n;
3881 }
3882 #endif
3883 
// Resets a root thread and clears its root and hot teams.
//
// gtid is not referenced anywhere in this body; root is the root descriptor
// being torn down. The sequence is: detach and free the root team and the hot
// team (plus nested hot teams when KMP_NESTED_HOT_TEAMS is enabled), wait for
// task teams to be unreferenced when tasking is enabled, fire the OMPT end
// callbacks, remove the uber thread from __kmp_nth and from its contention
// group, reap the uber thread, and finally mark the root as unused.
//
// Returns the number of __kmp_threads entries directly and indirectly freed.
static int __kmp_reset_root(int gtid, kmp_root_t *root) {
  kmp_team_t *root_team = root->r.r_root_team;
  kmp_team_t *hot_team = root->r.r_hot_team;
  // Start the count with the hot team's size as captured on entry.
  int n = hot_team->t.t_nproc;
  int i;

  KMP_DEBUG_ASSERT(!root->r.r_active);

  root->r.r_root_team = NULL;
  root->r.r_hot_team = NULL;
  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
  // before call to __kmp_free_team().
  __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
#if KMP_NESTED_HOT_TEAMS
  if (__kmp_hot_teams_max_level >
      0) { // need to free nested hot teams and their threads if any
    for (i = 0; i < hot_team->t.t_nproc; ++i) {
      kmp_info_t *th = hot_team->t.t_threads[i];
      // Levels below the first only exist when more than one level is allowed.
      if (__kmp_hot_teams_max_level > 1) {
        n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
      }
      // Release the per-thread hot-team array itself.
      if (th->th.th_hot_teams) {
        __kmp_free(th->th.th_hot_teams);
        th->th.th_hot_teams = NULL;
      }
    }
  }
#endif
  __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));

  // Before we can reap the thread, we need to make certain that all other
  // threads in the teams that had this root as ancestor have stopped trying to
  // steal tasks.
  if (__kmp_tasking_mode != tskm_immediate_exec) {
    __kmp_wait_to_unref_task_teams();
  }

#if KMP_OS_WINDOWS
  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
  KA_TRACE(
      10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
           "\n",
           (LPVOID) & (root->r.r_uber_thread->th),
           root->r.r_uber_thread->th.th_info.ds.ds_thread));
  __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
#endif /* KMP_OS_WINDOWS */

#if OMPT_SUPPORT
  // Report the end of the initial implicit task, then the end of the thread,
  // to any registered OMPT tool callbacks.
  ompt_data_t *task_data;
  ompt_data_t *parallel_data;
  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
  if (ompt_enabled.ompt_callback_implicit_task) {
    ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
        ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
  }
  if (ompt_enabled.ompt_callback_thread_end) {
    ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
        &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
  }
#endif

  TCW_4(__kmp_nth,
        __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
  // i receives the contention-group thread count from BEFORE the decrement,
  // so i == 1 means the uber thread was the group's last member.
  i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
  KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
                 " to %d\n",
                 root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
                 root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
  if (i == 1) {
    // need to free contention group structure
    KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
                     root->r.r_uber_thread->th.th_cg_roots->cg_root);
    KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
    __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
    root->r.r_uber_thread->th.th_cg_roots = NULL;
  }
  __kmp_reap_thread(root->r.r_uber_thread, 1);

  // We cannot put root thread to __kmp_thread_pool, so we have to reap it
  // instead of freeing.
  root->r.r_uber_thread = NULL;
  /* mark root as no longer in use */
  root->r.r_begin = FALSE;

  return n;
}
3972 
3973 void __kmp_unregister_root_current_thread(int gtid) {
3974   KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
3975   /* this lock should be ok, since unregister_root_current_thread is never
3976      called during an abort, only during a normal close. furthermore, if you
3977      have the forkjoin lock, you should never try to get the initz lock */
3978   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3979   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
3980     KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
3981                   "exiting T#%d\n",
3982                   gtid));
3983     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3984     return;
3985   }
3986   kmp_root_t *root = __kmp_root[gtid];
3987 
3988   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3989   KMP_ASSERT(KMP_UBER_GTID(gtid));
3990   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3991   KMP_ASSERT(root->r.r_active == FALSE);
3992 
3993   KMP_MB();
3994 
3995   kmp_info_t *thread = __kmp_threads[gtid];
3996   kmp_team_t *team = thread->th.th_team;
3997   kmp_task_team_t *task_team = thread->th.th_task_team;
3998 
3999   // we need to wait for the proxy tasks before finishing the thread
4000   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
4001 #if OMPT_SUPPORT
4002     // the runtime is shutting down so we won't report any events
4003     thread->th.ompt_thread_info.state = ompt_state_undefined;
4004 #endif
4005     __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4006   }
4007 
4008   __kmp_reset_root(gtid, root);
4009 
4010   KMP_MB();
4011   KC_TRACE(10,
4012            ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4013 
4014   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4015 }
4016 
4017 #if KMP_OS_WINDOWS
4018 /* __kmp_forkjoin_lock must be already held
4019    Unregisters a root thread that is not the current thread.  Returns the number
4020    of __kmp_threads entries freed as a result. */
4021 static int __kmp_unregister_root_other_thread(int gtid) {
4022   kmp_root_t *root = __kmp_root[gtid];
4023   int r;
4024 
4025   KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4026   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4027   KMP_ASSERT(KMP_UBER_GTID(gtid));
4028   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4029   KMP_ASSERT(root->r.r_active == FALSE);
4030 
4031   r = __kmp_reset_root(gtid, root);
4032   KC_TRACE(10,
4033            ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4034   return r;
4035 }
4036 #endif
4037 
4038 #if KMP_DEBUG
4039 void __kmp_task_info() {
4040 
4041   kmp_int32 gtid = __kmp_entry_gtid();
4042   kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4043   kmp_info_t *this_thr = __kmp_threads[gtid];
4044   kmp_team_t *steam = this_thr->th.th_serial_team;
4045   kmp_team_t *team = this_thr->th.th_team;
4046 
4047   __kmp_printf(
4048       "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4049       "ptask=%p\n",
4050       gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4051       team->t.t_implicit_task_taskdata[tid].td_parent);
4052 }
4053 #endif // KMP_DEBUG
4054 
4055 /* TODO optimize with one big memclr, take out what isn't needed, split
4056    responsibility to workers as much as possible, and delay initialization of
4057    features as much as possible  */
4058 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4059                                   int tid, int gtid) {
4060   /* this_thr->th.th_info.ds.ds_gtid is setup in
4061      kmp_allocate_thread/create_worker.
4062      this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4063   kmp_info_t *master = team->t.t_threads[0];
4064   KMP_DEBUG_ASSERT(this_thr != NULL);
4065   KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4066   KMP_DEBUG_ASSERT(team);
4067   KMP_DEBUG_ASSERT(team->t.t_threads);
4068   KMP_DEBUG_ASSERT(team->t.t_dispatch);
4069   KMP_DEBUG_ASSERT(master);
4070   KMP_DEBUG_ASSERT(master->th.th_root);
4071 
4072   KMP_MB();
4073 
4074   TCW_SYNC_PTR(this_thr->th.th_team, team);
4075 
4076   this_thr->th.th_info.ds.ds_tid = tid;
4077   this_thr->th.th_set_nproc = 0;
4078   if (__kmp_tasking_mode != tskm_immediate_exec)
4079     // When tasking is possible, threads are not safe to reap until they are
4080     // done tasking; this will be set when tasking code is exited in wait
4081     this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4082   else // no tasking --> always safe to reap
4083     this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4084   this_thr->th.th_set_proc_bind = proc_bind_default;
4085 #if KMP_AFFINITY_SUPPORTED
4086   this_thr->th.th_new_place = this_thr->th.th_current_place;
4087 #endif
4088   this_thr->th.th_root = master->th.th_root;
4089 
4090   /* setup the thread's cache of the team structure */
4091   this_thr->th.th_team_nproc = team->t.t_nproc;
4092   this_thr->th.th_team_master = master;
4093   this_thr->th.th_team_serialized = team->t.t_serialized;
4094   TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4095 
4096   KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4097 
4098   KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4099                 tid, gtid, this_thr, this_thr->th.th_current_task));
4100 
4101   __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4102                            team, tid, TRUE);
4103 
4104   KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4105                 tid, gtid, this_thr, this_thr->th.th_current_task));
4106   // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4107   // __kmp_initialize_team()?
4108 
4109   /* TODO no worksharing in speculative threads */
4110   this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4111 
4112   this_thr->th.th_local.this_construct = 0;
4113 
4114   if (!this_thr->th.th_pri_common) {
4115     this_thr->th.th_pri_common =
4116         (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4117     if (__kmp_storage_map) {
4118       __kmp_print_storage_map_gtid(
4119           gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4120           sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4121     }
4122     this_thr->th.th_pri_head = NULL;
4123   }
4124 
4125   if (this_thr != master && // Master's CG root is initialized elsewhere
4126       this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4127     // Make new thread's CG root same as master's
4128     KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4129     kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4130     if (tmp) {
4131       // worker changes CG, need to check if old CG should be freed
4132       int i = tmp->cg_nthreads--;
4133       KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4134                      " on node %p of thread %p to %d\n",
4135                      this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4136       if (i == 1) {
4137         __kmp_free(tmp); // last thread left CG --> free it
4138       }
4139     }
4140     this_thr->th.th_cg_roots = master->th.th_cg_roots;
4141     // Increment new thread's CG root's counter to add the new thread
4142     this_thr->th.th_cg_roots->cg_nthreads++;
4143     KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4144                    " node %p of thread %p to %d\n",
4145                    this_thr, this_thr->th.th_cg_roots,
4146                    this_thr->th.th_cg_roots->cg_root,
4147                    this_thr->th.th_cg_roots->cg_nthreads));
4148     this_thr->th.th_current_task->td_icvs.thread_limit =
4149         this_thr->th.th_cg_roots->cg_thread_limit;
4150   }
4151 
4152   /* Initialize dynamic dispatch */
4153   {
4154     volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4155     // Use team max_nproc since this will never change for the team.
4156     size_t disp_size =
4157         sizeof(dispatch_private_info_t) *
4158         (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4159     KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4160                   team->t.t_max_nproc));
4161     KMP_ASSERT(dispatch);
4162     KMP_DEBUG_ASSERT(team->t.t_dispatch);
4163     KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4164 
4165     dispatch->th_disp_index = 0;
4166     dispatch->th_doacross_buf_idx = 0;
4167     if (!dispatch->th_disp_buffer) {
4168       dispatch->th_disp_buffer =
4169           (dispatch_private_info_t *)__kmp_allocate(disp_size);
4170 
4171       if (__kmp_storage_map) {
4172         __kmp_print_storage_map_gtid(
4173             gtid, &dispatch->th_disp_buffer[0],
4174             &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4175                                           ? 1
4176                                           : __kmp_dispatch_num_buffers],
4177             disp_size, "th_%d.th_dispatch.th_disp_buffer "
4178                        "(team_%d.t_dispatch[%d].th_disp_buffer)",
4179             gtid, team->t.t_id, gtid);
4180       }
4181     } else {
4182       memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4183     }
4184 
4185     dispatch->th_dispatch_pr_current = 0;
4186     dispatch->th_dispatch_sh_current = 0;
4187 
4188     dispatch->th_deo_fcn = 0; /* ORDERED     */
4189     dispatch->th_dxo_fcn = 0; /* END ORDERED */
4190   }
4191 
4192   this_thr->th.th_next_pool = NULL;
4193 
4194   if (!this_thr->th.th_task_state_memo_stack) {
4195     size_t i;
4196     this_thr->th.th_task_state_memo_stack =
4197         (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4198     this_thr->th.th_task_state_top = 0;
4199     this_thr->th.th_task_state_stack_sz = 4;
4200     for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4201          ++i) // zero init the stack
4202       this_thr->th.th_task_state_memo_stack[i] = 0;
4203   }
4204 
4205   KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4206   KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4207 
4208   KMP_MB();
4209 }
4210 
/* allocate a new thread for the requesting team. this is only called from
   within a forkjoin critical section. we will first try to get an available
   thread from the thread pool. if none is available, we will fork a new one
   assuming we are able to create a new one. this should be assured, as the
   caller should check on this first.

   root    - root passed to __kmp_allocate_team() for the reserve serial team.
   team    - team the thread will belong to.
   new_tid - the thread's index within team.
   Returns the descriptor of the pooled or newly created thread. */
kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
                                  int new_tid) {
  kmp_team_t *serial_team;
  kmp_info_t *new_thr;
  int new_gtid;

  KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
  KMP_DEBUG_ASSERT(root && team);
#if !KMP_NESTED_HOT_TEAMS
  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
#endif
  KMP_MB();

  /* first, try to get one from the thread pool */
  if (__kmp_thread_pool) {
    // Pop the head of the pool list.
    new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
    __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
    // The insertion hint must not keep pointing at a thread that left the pool.
    if (new_thr == __kmp_thread_pool_insert_pt) {
      __kmp_thread_pool_insert_pt = NULL;
    }
    TCW_4(new_thr->th.th_in_pool, FALSE);
    __kmp_suspend_initialize_thread(new_thr);
    // Under the thread's suspend mutex, stop counting it as active-in-pool.
    __kmp_lock_suspend_mx(new_thr);
    if (new_thr->th.th_active_in_pool == TRUE) {
      KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
      KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
      new_thr->th.th_active_in_pool = FALSE;
    }
    __kmp_unlock_suspend_mx(new_thr);

    KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
                  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
    KMP_ASSERT(!new_thr->th.th_team);
    KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);

    /* setup the thread structure */
    __kmp_initialize_info(new_thr, team, new_tid,
                          new_thr->th.th_info.ds.ds_gtid);
    KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);

    // A pooled thread is already counted in __kmp_all_nth; only the number of
    // in-use threads grows here.
    TCW_4(__kmp_nth, __kmp_nth + 1);

    // Reset the task-state bookkeeping for the reused descriptor.
    new_thr->th.th_task_state = 0;
    new_thr->th.th_task_state_top = 0;
    new_thr->th.th_task_state_stack_sz = 4;

#ifdef KMP_ADJUST_BLOCKTIME
    /* Adjust blocktime back to zero if necessary */
    /* Middle initialization might not have occurred yet */
    if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
      if (__kmp_nth > __kmp_avail_proc) {
        __kmp_zero_bt = TRUE;
      }
    }
#endif /* KMP_ADJUST_BLOCKTIME */

#if KMP_DEBUG
    // If thread entered pool via __kmp_free_thread, wait_flag should !=
    // KMP_BARRIER_PARENT_FLAG.
    int b;
    kmp_balign_t *balign = new_thr->th.th_bar;
    for (b = 0; b < bs_last_barrier; ++b)
      KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
#endif

    KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
                  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));

    KMP_MB();
    return new_thr;
  }

  /* no, well fork a new one */
  // With an empty pool every existing thread is in use, and the caller must
  // have left room in __kmp_threads for one more entry.
  KMP_ASSERT(__kmp_nth == __kmp_all_nth);
  KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);

#if KMP_USE_MONITOR
  // If this is the first worker thread the RTL is creating, then also
  // launch the monitor thread.  We try to do this as early as possible.
  // The flag is tested again after taking __kmp_monitor_lock so that only one
  // caller creates the monitor.
  if (!TCR_4(__kmp_init_monitor)) {
    __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
    if (!TCR_4(__kmp_init_monitor)) {
      KF_TRACE(10, ("before __kmp_create_monitor\n"));
      TCW_4(__kmp_init_monitor, 1);
      __kmp_create_monitor(&__kmp_monitor);
      KF_TRACE(10, ("after __kmp_create_monitor\n"));
#if KMP_OS_WINDOWS
      // AC: wait until monitor has started. This is a fix for CQ232808.
      // The reason is that if the library is loaded/unloaded in a loop with
      // small (parallel) work in between, then there is high probability that
      // monitor thread started after the library shutdown. At shutdown it is
      // too late to cope with the problem, because when the master is in
      // DllMain (process detach) the monitor has no chances to start (it is
      // blocked), and master has no means to inform the monitor that the
      // library has gone, because all the memory which the monitor can access
      // is going to be released/reset.
      while (TCR_4(__kmp_init_monitor) < 2) {
        KMP_YIELD(TRUE);
      }
      KF_TRACE(10, ("after monitor thread has started\n"));
#endif
    }
    __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
  }
#endif

  KMP_MB();

  {
    // Pick the first gtid to probe: 1 while __kmp_init_hidden_helper_threads
    // is set, otherwise the first id past the hidden-helper range.
    int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
                             ? 1
                             : __kmp_hidden_helper_threads_num + 1;

    // Linear scan for the first unused __kmp_threads slot.
    for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
         ++new_gtid) {
      KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
    }

    if (TCR_4(__kmp_init_hidden_helper_threads)) {
      KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
    }
  }

  /* allocate space for it. */
  new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));

  // Publish the descriptor in the global thread table.
  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);

#if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
  // suppress race conditions detection on synchronization flags in debug mode
  // this helps to analyze library internals eliminating false positives
  __itt_suppress_mark_range(
      __itt_suppress_range, __itt_suppress_threading_errors,
      &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
  __itt_suppress_mark_range(
      __itt_suppress_range, __itt_suppress_threading_errors,
      &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
#if KMP_OS_WINDOWS
  __itt_suppress_mark_range(
      __itt_suppress_range, __itt_suppress_threading_errors,
      &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
#else
  __itt_suppress_mark_range(__itt_suppress_range,
                            __itt_suppress_threading_errors,
                            &new_thr->th.th_suspend_init_count,
                            sizeof(new_thr->th.th_suspend_init_count));
#endif
  // TODO: check if we need to also suppress b_arrived flags
  __itt_suppress_mark_range(__itt_suppress_range,
                            __itt_suppress_threading_errors,
                            CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
                            sizeof(new_thr->th.th_bar[0].bb.b_go));
  __itt_suppress_mark_range(__itt_suppress_range,
                            __itt_suppress_threading_errors,
                            CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
                            sizeof(new_thr->th.th_bar[1].bb.b_go));
  __itt_suppress_mark_range(__itt_suppress_range,
                            __itt_suppress_threading_errors,
                            CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
                            sizeof(new_thr->th.th_bar[2].bb.b_go));
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
  if (__kmp_storage_map) {
    __kmp_print_thread_storage_map(new_thr, new_gtid);
  }

  // add the reserve serialized team, initialized from the team's master thread
  {
    kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
    KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
    new_thr->th.th_serial_team = serial_team =
        (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
#if OMPT_SUPPORT
                                          ompt_data_none, // root parallel id
#endif
                                          proc_bind_default, &r_icvs,
                                          0 USE_NESTED_HOT_ARG(NULL));
  }
  KMP_ASSERT(serial_team);
  serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for
  // execution (it is unused for now).
  serial_team->t.t_threads[0] = new_thr;
  KF_TRACE(10,
           ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
            new_thr));

  /* setup the thread structures */
  __kmp_initialize_info(new_thr, team, new_tid, new_gtid);

#if USE_FAST_MEMORY
  __kmp_initialize_fast_memory(new_thr);
#endif /* USE_FAST_MEMORY */

#if KMP_USE_BGET
  KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
  __kmp_initialize_bget(new_thr);
#endif

  __kmp_init_random(new_thr); // Initialize random number generator

  /* Initialize these only once when thread is grabbed for a team allocation */
  KA_TRACE(20,
           ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
            __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));

  // Put every barrier slot of the new thread into its initial, not-waiting
  // state.
  int b;
  kmp_balign_t *balign = new_thr->th.th_bar;
  for (b = 0; b < bs_last_barrier; ++b) {
    balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
    balign[b].bb.team = NULL;
    balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
    balign[b].bb.use_oncore_barrier = 0;
  }

  new_thr->th.th_spin_here = FALSE;
  new_thr->th.th_next_waiting = 0;
#if KMP_OS_UNIX
  new_thr->th.th_blocking = false;
#endif

#if KMP_AFFINITY_SUPPORTED
  // No place information is known for the new thread yet.
  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
#endif
  new_thr->th.th_def_allocator = __kmp_def_allocator;
  new_thr->th.th_prev_level = 0;
  new_thr->th.th_prev_num_threads = 1;

  TCW_4(new_thr->th.th_in_pool, FALSE);
  new_thr->th.th_active_in_pool = FALSE;
  TCW_4(new_thr->th.th_active, TRUE);

  /* adjust the global counters */
  __kmp_all_nth++;
  __kmp_nth++;

  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
  // numbers of procs, and method #2 (keyed API call) for higher numbers.
  if (__kmp_adjust_gtid_mode) {
    if (__kmp_all_nth >= __kmp_tls_gtid_min) {
      if (TCR_4(__kmp_gtid_mode) != 2) {
        TCW_4(__kmp_gtid_mode, 2);
      }
    } else {
      if (TCR_4(__kmp_gtid_mode) != 1) {
        TCW_4(__kmp_gtid_mode, 1);
      }
    }
  }

#ifdef KMP_ADJUST_BLOCKTIME
  /* Adjust blocktime back to zero if necessary       */
  /* Middle initialization might not have occurred yet */
  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
    if (__kmp_nth > __kmp_avail_proc) {
      __kmp_zero_bt = TRUE;
    }
  }
#endif /* KMP_ADJUST_BLOCKTIME */

  /* actually fork it and create the new worker thread */
  KF_TRACE(
      10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
  __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
  KF_TRACE(10,
           ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));

  KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
                new_gtid));
  KMP_MB();
  return new_thr;
}
4489 
4490 /* Reinitialize team for reuse.
4491    The hot team code calls this case at every fork barrier, so EPCC barrier
4492    test are extremely sensitive to changes in it, esp. writes to the team
4493    struct, which cause a cache invalidation in all threads.
4494    IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4495 static void __kmp_reinitialize_team(kmp_team_t *team,
4496                                     kmp_internal_control_t *new_icvs,
4497                                     ident_t *loc) {
4498   KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4499                 team->t.t_threads[0], team));
4500   KMP_DEBUG_ASSERT(team && new_icvs);
4501   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4502   KMP_CHECK_UPDATE(team->t.t_ident, loc);
4503 
4504   KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4505   // Copy ICVs to the master thread's implicit taskdata
4506   __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4507   copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4508 
4509   KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4510                 team->t.t_threads[0], team));
4511 }
4512 
4513 /* Initialize the team data structure.
4514    This assumes the t_threads and t_max_nproc are already set.
4515    Also, we don't touch the arguments */
4516 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4517                                   kmp_internal_control_t *new_icvs,
4518                                   ident_t *loc) {
4519   KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4520 
4521   /* verify */
4522   KMP_DEBUG_ASSERT(team);
4523   KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4524   KMP_DEBUG_ASSERT(team->t.t_threads);
4525   KMP_MB();
4526 
4527   team->t.t_master_tid = 0; /* not needed */
4528   /* team->t.t_master_bar;        not needed */
4529   team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4530   team->t.t_nproc = new_nproc;
4531 
4532   /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
4533   team->t.t_next_pool = NULL;
4534   /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4535    * up hot team */
4536 
4537   TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4538   team->t.t_invoke = NULL; /* not needed */
4539 
4540   // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
4541   team->t.t_sched.sched = new_icvs->sched.sched;
4542 
4543 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4544   team->t.t_fp_control_saved = FALSE; /* not needed */
4545   team->t.t_x87_fpu_control_word = 0; /* not needed */
4546   team->t.t_mxcsr = 0; /* not needed */
4547 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4548 
4549   team->t.t_construct = 0;
4550 
4551   team->t.t_ordered.dt.t_value = 0;
4552   team->t.t_master_active = FALSE;
4553 
4554 #ifdef KMP_DEBUG
4555   team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4556 #endif
4557 #if KMP_OS_WINDOWS
4558   team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4559 #endif
4560 
4561   team->t.t_control_stack_top = NULL;
4562 
4563   __kmp_reinitialize_team(team, new_icvs, loc);
4564 
4565   KMP_MB();
4566   KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4567 }
4568 
4569 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
4570 /* Sets full mask for thread and returns old mask, no changes to structures. */
4571 static void
4572 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4573   if (KMP_AFFINITY_CAPABLE()) {
4574     int status;
4575     if (old_mask != NULL) {
4576       status = __kmp_get_system_affinity(old_mask, TRUE);
4577       int error = errno;
4578       if (status != 0) {
4579         __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4580                     __kmp_msg_null);
4581       }
4582     }
4583     __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4584   }
4585 }
4586 #endif
4587 
4588 #if KMP_AFFINITY_SUPPORTED
4589 
4590 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4591 // It calculates the worker + master thread's partition based upon the parent
4592 // thread's partition, and binds each worker to a thread in their partition.
4593 // The master thread's partition should already include its current binding.
4594 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4595   // Copy the master thread's place partition to the team struct
4596   kmp_info_t *master_th = team->t.t_threads[0];
4597   KMP_DEBUG_ASSERT(master_th != NULL);
4598   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4599   int first_place = master_th->th.th_first_place;
4600   int last_place = master_th->th.th_last_place;
4601   int masters_place = master_th->th.th_current_place;
4602   team->t.t_first_place = first_place;
4603   team->t.t_last_place = last_place;
4604 
4605   KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4606                 "bound to place %d partition = [%d,%d]\n",
4607                 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4608                 team->t.t_id, masters_place, first_place, last_place));
4609 
4610   switch (proc_bind) {
4611 
4612   case proc_bind_default:
4613     // serial teams might have the proc_bind policy set to proc_bind_default. It
4614     // doesn't matter, as we don't rebind master thread for any proc_bind policy
4615     KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4616     break;
4617 
4618   case proc_bind_master: {
4619     int f;
4620     int n_th = team->t.t_nproc;
4621     for (f = 1; f < n_th; f++) {
4622       kmp_info_t *th = team->t.t_threads[f];
4623       KMP_DEBUG_ASSERT(th != NULL);
4624       th->th.th_first_place = first_place;
4625       th->th.th_last_place = last_place;
4626       th->th.th_new_place = masters_place;
4627       if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4628           team->t.t_display_affinity != 1) {
4629         team->t.t_display_affinity = 1;
4630       }
4631 
4632       KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
4633                      "partition = [%d,%d]\n",
4634                      __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4635                      f, masters_place, first_place, last_place));
4636     }
4637   } break;
4638 
4639   case proc_bind_close: {
4640     int f;
4641     int n_th = team->t.t_nproc;
4642     int n_places;
4643     if (first_place <= last_place) {
4644       n_places = last_place - first_place + 1;
4645     } else {
4646       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4647     }
4648     if (n_th <= n_places) {
4649       int place = masters_place;
4650       for (f = 1; f < n_th; f++) {
4651         kmp_info_t *th = team->t.t_threads[f];
4652         KMP_DEBUG_ASSERT(th != NULL);
4653 
4654         if (place == last_place) {
4655           place = first_place;
4656         } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4657           place = 0;
4658         } else {
4659           place++;
4660         }
4661         th->th.th_first_place = first_place;
4662         th->th.th_last_place = last_place;
4663         th->th.th_new_place = place;
4664         if (__kmp_display_affinity && place != th->th.th_current_place &&
4665             team->t.t_display_affinity != 1) {
4666           team->t.t_display_affinity = 1;
4667         }
4668 
4669         KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4670                        "partition = [%d,%d]\n",
4671                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4672                        team->t.t_id, f, place, first_place, last_place));
4673       }
4674     } else {
4675       int S, rem, gap, s_count;
4676       S = n_th / n_places;
4677       s_count = 0;
4678       rem = n_th - (S * n_places);
4679       gap = rem > 0 ? n_places / rem : n_places;
4680       int place = masters_place;
4681       int gap_ct = gap;
4682       for (f = 0; f < n_th; f++) {
4683         kmp_info_t *th = team->t.t_threads[f];
4684         KMP_DEBUG_ASSERT(th != NULL);
4685 
4686         th->th.th_first_place = first_place;
4687         th->th.th_last_place = last_place;
4688         th->th.th_new_place = place;
4689         if (__kmp_display_affinity && place != th->th.th_current_place &&
4690             team->t.t_display_affinity != 1) {
4691           team->t.t_display_affinity = 1;
4692         }
4693         s_count++;
4694 
4695         if ((s_count == S) && rem && (gap_ct == gap)) {
4696           // do nothing, add an extra thread to place on next iteration
4697         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4698           // we added an extra thread to this place; move to next place
4699           if (place == last_place) {
4700             place = first_place;
4701           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4702             place = 0;
4703           } else {
4704             place++;
4705           }
4706           s_count = 0;
4707           gap_ct = 1;
4708           rem--;
4709         } else if (s_count == S) { // place full; don't add extra
4710           if (place == last_place) {
4711             place = first_place;
4712           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4713             place = 0;
4714           } else {
4715             place++;
4716           }
4717           gap_ct++;
4718           s_count = 0;
4719         }
4720 
4721         KA_TRACE(100,
4722                  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4723                   "partition = [%d,%d]\n",
4724                   __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4725                   th->th.th_new_place, first_place, last_place));
4726       }
4727       KMP_DEBUG_ASSERT(place == masters_place);
4728     }
4729   } break;
4730 
4731   case proc_bind_spread: {
4732     int f;
4733     int n_th = team->t.t_nproc;
4734     int n_places;
4735     int thidx;
4736     if (first_place <= last_place) {
4737       n_places = last_place - first_place + 1;
4738     } else {
4739       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4740     }
4741     if (n_th <= n_places) {
4742       int place = -1;
4743 
4744       if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4745         int S = n_places / n_th;
4746         int s_count, rem, gap, gap_ct;
4747 
4748         place = masters_place;
4749         rem = n_places - n_th * S;
4750         gap = rem ? n_th / rem : 1;
4751         gap_ct = gap;
4752         thidx = n_th;
4753         if (update_master_only == 1)
4754           thidx = 1;
4755         for (f = 0; f < thidx; f++) {
4756           kmp_info_t *th = team->t.t_threads[f];
4757           KMP_DEBUG_ASSERT(th != NULL);
4758 
4759           th->th.th_first_place = place;
4760           th->th.th_new_place = place;
4761           if (__kmp_display_affinity && place != th->th.th_current_place &&
4762               team->t.t_display_affinity != 1) {
4763             team->t.t_display_affinity = 1;
4764           }
4765           s_count = 1;
4766           while (s_count < S) {
4767             if (place == last_place) {
4768               place = first_place;
4769             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4770               place = 0;
4771             } else {
4772               place++;
4773             }
4774             s_count++;
4775           }
4776           if (rem && (gap_ct == gap)) {
4777             if (place == last_place) {
4778               place = first_place;
4779             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4780               place = 0;
4781             } else {
4782               place++;
4783             }
4784             rem--;
4785             gap_ct = 0;
4786           }
4787           th->th.th_last_place = place;
4788           gap_ct++;
4789 
4790           if (place == last_place) {
4791             place = first_place;
4792           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4793             place = 0;
4794           } else {
4795             place++;
4796           }
4797 
4798           KA_TRACE(100,
4799                    ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4800                     "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4801                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4802                     f, th->th.th_new_place, th->th.th_first_place,
4803                     th->th.th_last_place, __kmp_affinity_num_masks));
4804         }
4805       } else {
4806         /* Having uniform space of available computation places I can create
4807            T partitions of round(P/T) size and put threads into the first
4808            place of each partition. */
4809         double current = static_cast<double>(masters_place);
4810         double spacing =
4811             (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
4812         int first, last;
4813         kmp_info_t *th;
4814 
4815         thidx = n_th + 1;
4816         if (update_master_only == 1)
4817           thidx = 1;
4818         for (f = 0; f < thidx; f++) {
4819           first = static_cast<int>(current);
4820           last = static_cast<int>(current + spacing) - 1;
4821           KMP_DEBUG_ASSERT(last >= first);
4822           if (first >= n_places) {
4823             if (masters_place) {
4824               first -= n_places;
4825               last -= n_places;
4826               if (first == (masters_place + 1)) {
4827                 KMP_DEBUG_ASSERT(f == n_th);
4828                 first--;
4829               }
4830               if (last == masters_place) {
4831                 KMP_DEBUG_ASSERT(f == (n_th - 1));
4832                 last--;
4833               }
4834             } else {
4835               KMP_DEBUG_ASSERT(f == n_th);
4836               first = 0;
4837               last = 0;
4838             }
4839           }
4840           if (last >= n_places) {
4841             last = (n_places - 1);
4842           }
4843           place = first;
4844           current += spacing;
4845           if (f < n_th) {
4846             KMP_DEBUG_ASSERT(0 <= first);
4847             KMP_DEBUG_ASSERT(n_places > first);
4848             KMP_DEBUG_ASSERT(0 <= last);
4849             KMP_DEBUG_ASSERT(n_places > last);
4850             KMP_DEBUG_ASSERT(last_place >= first_place);
4851             th = team->t.t_threads[f];
4852             KMP_DEBUG_ASSERT(th);
4853             th->th.th_first_place = first;
4854             th->th.th_new_place = place;
4855             th->th.th_last_place = last;
4856             if (__kmp_display_affinity && place != th->th.th_current_place &&
4857                 team->t.t_display_affinity != 1) {
4858               team->t.t_display_affinity = 1;
4859             }
4860             KA_TRACE(100,
4861                      ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4862                       "partition = [%d,%d], spacing = %.4f\n",
4863                       __kmp_gtid_from_thread(team->t.t_threads[f]),
4864                       team->t.t_id, f, th->th.th_new_place,
4865                       th->th.th_first_place, th->th.th_last_place, spacing));
4866           }
4867         }
4868       }
4869       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4870     } else {
4871       int S, rem, gap, s_count;
4872       S = n_th / n_places;
4873       s_count = 0;
4874       rem = n_th - (S * n_places);
4875       gap = rem > 0 ? n_places / rem : n_places;
4876       int place = masters_place;
4877       int gap_ct = gap;
4878       thidx = n_th;
4879       if (update_master_only == 1)
4880         thidx = 1;
4881       for (f = 0; f < thidx; f++) {
4882         kmp_info_t *th = team->t.t_threads[f];
4883         KMP_DEBUG_ASSERT(th != NULL);
4884 
4885         th->th.th_first_place = place;
4886         th->th.th_last_place = place;
4887         th->th.th_new_place = place;
4888         if (__kmp_display_affinity && place != th->th.th_current_place &&
4889             team->t.t_display_affinity != 1) {
4890           team->t.t_display_affinity = 1;
4891         }
4892         s_count++;
4893 
4894         if ((s_count == S) && rem && (gap_ct == gap)) {
4895           // do nothing, add an extra thread to place on next iteration
4896         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4897           // we added an extra thread to this place; move on to next place
4898           if (place == last_place) {
4899             place = first_place;
4900           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4901             place = 0;
4902           } else {
4903             place++;
4904           }
4905           s_count = 0;
4906           gap_ct = 1;
4907           rem--;
4908         } else if (s_count == S) { // place is full; don't add extra thread
4909           if (place == last_place) {
4910             place = first_place;
4911           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4912             place = 0;
4913           } else {
4914             place++;
4915           }
4916           gap_ct++;
4917           s_count = 0;
4918         }
4919 
4920         KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4921                        "partition = [%d,%d]\n",
4922                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4923                        team->t.t_id, f, th->th.th_new_place,
4924                        th->th.th_first_place, th->th.th_last_place));
4925       }
4926       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4927     }
4928   } break;
4929 
4930   default:
4931     break;
4932   }
4933 
4934   KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4935 }
4936 
4937 #endif // KMP_AFFINITY_SUPPORTED
4938 
4939 /* allocate a new team data structure to use.  take one off of the free pool if
4940    available */
4941 kmp_team_t *
4942 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4943 #if OMPT_SUPPORT
4944                     ompt_data_t ompt_parallel_data,
4945 #endif
4946                     kmp_proc_bind_t new_proc_bind,
4947                     kmp_internal_control_t *new_icvs,
4948                     int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4949   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4950   int f;
4951   kmp_team_t *team;
4952   int use_hot_team = !root->r.r_active;
4953   int level = 0;
4954 
4955   KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4956   KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4957   KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4958   KMP_MB();
4959 
4960 #if KMP_NESTED_HOT_TEAMS
4961   kmp_hot_team_ptr_t *hot_teams;
4962   if (master) {
4963     team = master->th.th_team;
4964     level = team->t.t_active_level;
4965     if (master->th.th_teams_microtask) { // in teams construct?
4966       if (master->th.th_teams_size.nteams > 1 &&
4967           ( // #teams > 1
4968               team->t.t_pkfn ==
4969                   (microtask_t)__kmp_teams_master || // inner fork of the teams
4970               master->th.th_teams_level <
4971                   team->t.t_level)) { // or nested parallel inside the teams
4972         ++level; // not increment if #teams==1, or for outer fork of the teams;
4973         // increment otherwise
4974       }
4975     }
4976     hot_teams = master->th.th_hot_teams;
4977     if (level < __kmp_hot_teams_max_level && hot_teams &&
4978         hot_teams[level].hot_team) {
4979       // hot team has already been allocated for given level
4980       use_hot_team = 1;
4981     } else {
4982       use_hot_team = 0;
4983     }
4984   } else {
4985     // check we won't access uninitialized hot_teams, just in case
4986     KMP_DEBUG_ASSERT(new_nproc == 1);
4987   }
4988 #endif
4989   // Optimization to use a "hot" team
4990   if (use_hot_team && new_nproc > 1) {
4991     KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
4992 #if KMP_NESTED_HOT_TEAMS
4993     team = hot_teams[level].hot_team;
4994 #else
4995     team = root->r.r_hot_team;
4996 #endif
4997 #if KMP_DEBUG
4998     if (__kmp_tasking_mode != tskm_immediate_exec) {
4999       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5000                     "task_team[1] = %p before reinit\n",
5001                     team->t.t_task_team[0], team->t.t_task_team[1]));
5002     }
5003 #endif
5004 
5005     // Has the number of threads changed?
5006     /* Let's assume the most common case is that the number of threads is
5007        unchanged, and put that case first. */
5008     if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5009       KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5010       // This case can mean that omp_set_num_threads() was called and the hot
5011       // team size was already reduced, so we check the special flag
5012       if (team->t.t_size_changed == -1) {
5013         team->t.t_size_changed = 1;
5014       } else {
5015         KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5016       }
5017 
5018       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5019       kmp_r_sched_t new_sched = new_icvs->sched;
5020       // set master's schedule as new run-time schedule
5021       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5022 
5023       __kmp_reinitialize_team(team, new_icvs,
5024                               root->r.r_uber_thread->th.th_ident);
5025 
5026       KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5027                     team->t.t_threads[0], team));
5028       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5029 
5030 #if KMP_AFFINITY_SUPPORTED
5031       if ((team->t.t_size_changed == 0) &&
5032           (team->t.t_proc_bind == new_proc_bind)) {
5033         if (new_proc_bind == proc_bind_spread) {
5034           __kmp_partition_places(
5035               team, 1); // add flag to update only master for spread
5036         }
5037         KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5038                        "proc_bind = %d, partition = [%d,%d]\n",
5039                        team->t.t_id, new_proc_bind, team->t.t_first_place,
5040                        team->t.t_last_place));
5041       } else {
5042         KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5043         __kmp_partition_places(team);
5044       }
5045 #else
5046       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5047 #endif /* KMP_AFFINITY_SUPPORTED */
5048     } else if (team->t.t_nproc > new_nproc) {
5049       KA_TRACE(20,
5050                ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5051                 new_nproc));
5052 
5053       team->t.t_size_changed = 1;
5054 #if KMP_NESTED_HOT_TEAMS
5055       if (__kmp_hot_teams_mode == 0) {
5056         // AC: saved number of threads should correspond to team's value in this
5057         // mode, can be bigger in mode 1, when hot team has threads in reserve
5058         KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5059         hot_teams[level].hot_team_nth = new_nproc;
5060 #endif // KMP_NESTED_HOT_TEAMS
5061         /* release the extra threads we don't need any more */
5062         for (f = new_nproc; f < team->t.t_nproc; f++) {
5063           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5064           if (__kmp_tasking_mode != tskm_immediate_exec) {
5065             // When decreasing team size, threads no longer in the team should
5066             // unref task team.
5067             team->t.t_threads[f]->th.th_task_team = NULL;
5068           }
5069           __kmp_free_thread(team->t.t_threads[f]);
5070           team->t.t_threads[f] = NULL;
5071         }
5072 #if KMP_NESTED_HOT_TEAMS
5073       } // (__kmp_hot_teams_mode == 0)
5074       else {
5075         // When keeping extra threads in team, switch threads to wait on own
5076         // b_go flag
5077         for (f = new_nproc; f < team->t.t_nproc; ++f) {
5078           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5079           kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5080           for (int b = 0; b < bs_last_barrier; ++b) {
5081             if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5082               balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5083             }
5084             KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5085           }
5086         }
5087       }
5088 #endif // KMP_NESTED_HOT_TEAMS
5089       team->t.t_nproc = new_nproc;
5090       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5091       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5092       __kmp_reinitialize_team(team, new_icvs,
5093                               root->r.r_uber_thread->th.th_ident);
5094 
5095       // Update remaining threads
5096       for (f = 0; f < new_nproc; ++f) {
5097         team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5098       }
5099 
5100       // restore the current task state of the master thread: should be the
5101       // implicit task
5102       KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5103                     team->t.t_threads[0], team));
5104 
5105       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5106 
5107 #ifdef KMP_DEBUG
5108       for (f = 0; f < team->t.t_nproc; f++) {
5109         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5110                          team->t.t_threads[f]->th.th_team_nproc ==
5111                              team->t.t_nproc);
5112       }
5113 #endif
5114 
5115       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5116 #if KMP_AFFINITY_SUPPORTED
5117       __kmp_partition_places(team);
5118 #endif
5119     } else { // team->t.t_nproc < new_nproc
5120 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5121       kmp_affin_mask_t *old_mask;
5122       if (KMP_AFFINITY_CAPABLE()) {
5123         KMP_CPU_ALLOC(old_mask);
5124       }
5125 #endif
5126 
5127       KA_TRACE(20,
5128                ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5129                 new_nproc));
5130 
5131       team->t.t_size_changed = 1;
5132 
5133 #if KMP_NESTED_HOT_TEAMS
5134       int avail_threads = hot_teams[level].hot_team_nth;
5135       if (new_nproc < avail_threads)
5136         avail_threads = new_nproc;
5137       kmp_info_t **other_threads = team->t.t_threads;
5138       for (f = team->t.t_nproc; f < avail_threads; ++f) {
5139         // Adjust barrier data of reserved threads (if any) of the team
5140         // Other data will be set in __kmp_initialize_info() below.
5141         int b;
5142         kmp_balign_t *balign = other_threads[f]->th.th_bar;
5143         for (b = 0; b < bs_last_barrier; ++b) {
5144           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5145           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5146 #if USE_DEBUGGER
5147           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5148 #endif
5149         }
5150       }
5151       if (hot_teams[level].hot_team_nth >= new_nproc) {
5152         // we have all needed threads in reserve, no need to allocate any
5153         // this only possible in mode 1, cannot have reserved threads in mode 0
5154         KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5155         team->t.t_nproc = new_nproc; // just get reserved threads involved
5156       } else {
5157         // we may have some threads in reserve, but not enough
5158         team->t.t_nproc =
5159             hot_teams[level]
5160                 .hot_team_nth; // get reserved threads involved if any
5161         hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5162 #endif // KMP_NESTED_HOT_TEAMS
5163         if (team->t.t_max_nproc < new_nproc) {
5164           /* reallocate larger arrays */
5165           __kmp_reallocate_team_arrays(team, new_nproc);
5166           __kmp_reinitialize_team(team, new_icvs, NULL);
5167         }
5168 
5169 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5170         /* Temporarily set full mask for master thread before creation of
5171            workers. The reason is that workers inherit the affinity from master,
5172            so if a lot of workers are created on the single core quickly, they
5173            don't get a chance to set their own affinity for a long time. */
5174         __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5175 #endif
5176 
5177         /* allocate new threads for the hot team */
5178         for (f = team->t.t_nproc; f < new_nproc; f++) {
5179           kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5180           KMP_DEBUG_ASSERT(new_worker);
5181           team->t.t_threads[f] = new_worker;
5182 
5183           KA_TRACE(20,
5184                    ("__kmp_allocate_team: team %d init T#%d arrived: "
5185                     "join=%llu, plain=%llu\n",
5186                     team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5187                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5188                     team->t.t_bar[bs_plain_barrier].b_arrived));
5189 
5190           { // Initialize barrier data for new threads.
5191             int b;
5192             kmp_balign_t *balign = new_worker->th.th_bar;
5193             for (b = 0; b < bs_last_barrier; ++b) {
5194               balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5195               KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5196                                KMP_BARRIER_PARENT_FLAG);
5197 #if USE_DEBUGGER
5198               balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5199 #endif
5200             }
5201           }
5202         }
5203 
5204 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5205         if (KMP_AFFINITY_CAPABLE()) {
5206           /* Restore initial master thread's affinity mask */
5207           __kmp_set_system_affinity(old_mask, TRUE);
5208           KMP_CPU_FREE(old_mask);
5209         }
5210 #endif
5211 #if KMP_NESTED_HOT_TEAMS
5212       } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5213 #endif // KMP_NESTED_HOT_TEAMS
5214       /* make sure everyone is syncronized */
5215       int old_nproc = team->t.t_nproc; // save old value and use to update only
5216       // new threads below
5217       __kmp_initialize_team(team, new_nproc, new_icvs,
5218                             root->r.r_uber_thread->th.th_ident);
5219 
5220       /* reinitialize the threads */
5221       KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5222       for (f = 0; f < team->t.t_nproc; ++f)
5223         __kmp_initialize_info(team->t.t_threads[f], team, f,
5224                               __kmp_gtid_from_tid(f, team));
5225 
5226       if (level) { // set th_task_state for new threads in nested hot team
5227         // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5228         // only need to set the th_task_state for the new threads. th_task_state
5229         // for master thread will not be accurate until after this in
5230         // __kmp_fork_call(), so we look to the master's memo_stack to get the
5231         // correct value.
5232         for (f = old_nproc; f < team->t.t_nproc; ++f)
5233           team->t.t_threads[f]->th.th_task_state =
5234               team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5235       } else { // set th_task_state for new threads in non-nested hot team
5236         kmp_uint8 old_state =
5237             team->t.t_threads[0]->th.th_task_state; // copy master's state
5238         for (f = old_nproc; f < team->t.t_nproc; ++f)
5239           team->t.t_threads[f]->th.th_task_state = old_state;
5240       }
5241 
5242 #ifdef KMP_DEBUG
5243       for (f = 0; f < team->t.t_nproc; ++f) {
5244         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5245                          team->t.t_threads[f]->th.th_team_nproc ==
5246                              team->t.t_nproc);
5247       }
5248 #endif
5249 
5250       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5251 #if KMP_AFFINITY_SUPPORTED
5252       __kmp_partition_places(team);
5253 #endif
5254     } // Check changes in number of threads
5255 
5256     kmp_info_t *master = team->t.t_threads[0];
5257     if (master->th.th_teams_microtask) {
5258       for (f = 1; f < new_nproc; ++f) {
5259         // propagate teams construct specific info to workers
5260         kmp_info_t *thr = team->t.t_threads[f];
5261         thr->th.th_teams_microtask = master->th.th_teams_microtask;
5262         thr->th.th_teams_level = master->th.th_teams_level;
5263         thr->th.th_teams_size = master->th.th_teams_size;
5264       }
5265     }
5266 #if KMP_NESTED_HOT_TEAMS
5267     if (level) {
5268       // Sync barrier state for nested hot teams, not needed for outermost hot
5269       // team.
5270       for (f = 1; f < new_nproc; ++f) {
5271         kmp_info_t *thr = team->t.t_threads[f];
5272         int b;
5273         kmp_balign_t *balign = thr->th.th_bar;
5274         for (b = 0; b < bs_last_barrier; ++b) {
5275           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5276           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5277 #if USE_DEBUGGER
5278           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5279 #endif
5280         }
5281       }
5282     }
5283 #endif // KMP_NESTED_HOT_TEAMS
5284 
5285     /* reallocate space for arguments if necessary */
5286     __kmp_alloc_argv_entries(argc, team, TRUE);
5287     KMP_CHECK_UPDATE(team->t.t_argc, argc);
5288     // The hot team re-uses the previous task team,
5289     // if untouched during the previous release->gather phase.
5290 
5291     KF_TRACE(10, (" hot_team = %p\n", team));
5292 
5293 #if KMP_DEBUG
5294     if (__kmp_tasking_mode != tskm_immediate_exec) {
5295       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5296                     "task_team[1] = %p after reinit\n",
5297                     team->t.t_task_team[0], team->t.t_task_team[1]));
5298     }
5299 #endif
5300 
5301 #if OMPT_SUPPORT
5302     __ompt_team_assign_id(team, ompt_parallel_data);
5303 #endif
5304 
5305     KMP_MB();
5306 
5307     return team;
5308   }
5309 
5310   /* next, let's try to take one from the team pool */
5311   KMP_MB();
5312   for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5313     /* TODO: consider resizing undersized teams instead of reaping them, now
5314        that we have a resizing mechanism */
5315     if (team->t.t_max_nproc >= max_nproc) {
5316       /* take this team from the team pool */
5317       __kmp_team_pool = team->t.t_next_pool;
5318 
5319       /* setup the team for fresh use */
5320       __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5321 
5322       KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5323                     "task_team[1] %p to NULL\n",
5324                     &team->t.t_task_team[0], &team->t.t_task_team[1]));
5325       team->t.t_task_team[0] = NULL;
5326       team->t.t_task_team[1] = NULL;
5327 
5328       /* reallocate space for arguments if necessary */
5329       __kmp_alloc_argv_entries(argc, team, TRUE);
5330       KMP_CHECK_UPDATE(team->t.t_argc, argc);
5331 
5332       KA_TRACE(
5333           20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5334                team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5335       { // Initialize barrier data.
5336         int b;
5337         for (b = 0; b < bs_last_barrier; ++b) {
5338           team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5339 #if USE_DEBUGGER
5340           team->t.t_bar[b].b_master_arrived = 0;
5341           team->t.t_bar[b].b_team_arrived = 0;
5342 #endif
5343         }
5344       }
5345 
5346       team->t.t_proc_bind = new_proc_bind;
5347 
5348       KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5349                     team->t.t_id));
5350 
5351 #if OMPT_SUPPORT
5352       __ompt_team_assign_id(team, ompt_parallel_data);
5353 #endif
5354 
5355       KMP_MB();
5356 
5357       return team;
5358     }
5359 
5360     /* reap team if it is too small, then loop back and check the next one */
5361     // not sure if this is wise, but, will be redone during the hot-teams
5362     // rewrite.
5363     /* TODO: Use technique to find the right size hot-team, don't reap them */
5364     team = __kmp_reap_team(team);
5365     __kmp_team_pool = team;
5366   }
5367 
5368   /* nothing available in the pool, no matter, make a new team! */
5369   KMP_MB();
5370   team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5371 
5372   /* and set it up */
5373   team->t.t_max_nproc = max_nproc;
5374   /* NOTE well, for some reason allocating one big buffer and dividing it up
5375      seems to really hurt performance a lot on the P4, so, let's not use this */
5376   __kmp_allocate_team_arrays(team, max_nproc);
5377 
5378   KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5379   __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5380 
5381   KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5382                 "%p to NULL\n",
5383                 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5384   team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5385   // memory, no need to duplicate
5386   team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5387   // memory, no need to duplicate
5388 
5389   if (__kmp_storage_map) {
5390     __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5391   }
5392 
5393   /* allocate space for arguments */
5394   __kmp_alloc_argv_entries(argc, team, FALSE);
5395   team->t.t_argc = argc;
5396 
5397   KA_TRACE(20,
5398            ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5399             team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5400   { // Initialize barrier data.
5401     int b;
5402     for (b = 0; b < bs_last_barrier; ++b) {
5403       team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5404 #if USE_DEBUGGER
5405       team->t.t_bar[b].b_master_arrived = 0;
5406       team->t.t_bar[b].b_team_arrived = 0;
5407 #endif
5408     }
5409   }
5410 
5411   team->t.t_proc_bind = new_proc_bind;
5412 
5413 #if OMPT_SUPPORT
5414   __ompt_team_assign_id(team, ompt_parallel_data);
5415   team->t.ompt_serialized_team_info = NULL;
5416 #endif
5417 
5418   KMP_MB();
5419 
5420   KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5421                 team->t.t_id));
5422 
5423   return team;
5424 }
5425 
5426 /* TODO implement hot-teams at all levels */
5427 /* TODO implement lazy thread release on demand (disband request) */
5428 
/* Free the team: return it to the team pool and release all the threads
 * associated with it, unless it is a hot team, which is kept for reuse.
 *
 * root   - root whose r_hot_team is compared against `team` to decide whether
 *          the team is hot.
 * team   - team to free; its t_pkfn is reset to NULL in every case.
 * master - passed through the USE_NESTED_HOT_ARG macro; referenced here only
 *          inside KMP_NESTED_HOT_TEAMS sections (nested hot-team lookup and
 *          task-team release).
 */
void __kmp_free_team(kmp_root_t *root,
                     kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
  int f;
  KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
                team->t.t_id));

  /* verify state */
  KMP_DEBUG_ASSERT(root);
  KMP_DEBUG_ASSERT(team);
  KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
  KMP_DEBUG_ASSERT(team->t.t_threads);

  // The root's own hot team is never torn down here.
  int use_hot_team = team == root->r.r_hot_team;
#if KMP_NESTED_HOT_TEAMS
  // With nested hot teams, a team below the configured maximum level is also
  // treated as hot. The level is derived from the team's active level and
  // adjusted for the teams construct, where it was not incremented.
  int level;
  kmp_hot_team_ptr_t *hot_teams;
  if (master) {
    level = team->t.t_active_level - 1;
    if (master->th.th_teams_microtask) { // in teams construct?
      if (master->th.th_teams_size.nteams > 1) {
        ++level; // level was not increased in teams construct for
        // team_of_masters
      }
      if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
          master->th.th_teams_level == team->t.t_level) {
        ++level; // level was not increased in teams construct for
        // team_of_workers before the parallel
      } // team->t.t_level will be increased inside parallel
    }
    hot_teams = master->th.th_hot_teams;
    if (level < __kmp_hot_teams_max_level) {
      KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
      use_hot_team = 1;
    }
  }
#endif // KMP_NESTED_HOT_TEAMS

  /* team is done working */
  TCW_SYNC_PTR(team->t.t_pkfn,
               NULL); // Important for Debugging Support Library.
#if KMP_OS_WINDOWS
  team->t.t_copyin_counter = 0; // init counter for possible reuse
#endif
  // Do not reset pointer to parent team to NULL for hot teams.

  /* if we are non-hot team, release our threads */
  if (!use_hot_team) {
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      // Wait for threads to reach reapable state
      // (spin on each worker's th_reap_state, waking sleepers so they can
      // make progress towards KMP_SAFE_TO_REAP).
      for (f = 1; f < team->t.t_nproc; ++f) {
        KMP_DEBUG_ASSERT(team->t.t_threads[f]);
        kmp_info_t *th = team->t.t_threads[f];
        volatile kmp_uint32 *state = &th->th.th_reap_state;
        while (*state != KMP_SAFE_TO_REAP) {
#if KMP_OS_WINDOWS
          // On Windows a thread can be killed at any time, check this
          DWORD ecode;
          if (!__kmp_is_thread_alive(th, &ecode)) {
            *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
            break;
          }
#endif
          // first check if thread is sleeping
          kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
          if (fl.is_sleeping())
            fl.resume(__kmp_gtid_from_thread(th));
          KMP_CPU_PAUSE();
        }
      }

      // Delete task teams
      // (both slots of t_task_team are examined; every thread of the team,
      // including index 0, drops its th_task_team reference first).
      int tt_idx;
      for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
        kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
        if (task_team != NULL) {
          for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
            KMP_DEBUG_ASSERT(team->t.t_threads[f]);
            team->t.t_threads[f]->th.th_task_team = NULL;
          }
          KA_TRACE(
              20,
              ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
               __kmp_get_gtid(), task_team, team->t.t_id));
#if KMP_NESTED_HOT_TEAMS
          // The task team is handed to __kmp_free_task_team only in this
          // configuration; otherwise only the team's pointer is cleared.
          __kmp_free_task_team(master, task_team);
#endif
          team->t.t_task_team[tt_idx] = NULL;
        }
      }
    }

    // Reset pointer to parent team only for non-hot teams.
    team->t.t_parent = NULL;
    team->t.t_level = 0;
    team->t.t_active_level = 0;

    /* free the worker threads */
    // Index 0 is skipped: only threads 1..t_nproc-1 go back to the thread pool.
    for (f = 1; f < team->t.t_nproc; ++f) {
      KMP_DEBUG_ASSERT(team->t.t_threads[f]);
      __kmp_free_thread(team->t.t_threads[f]);
      team->t.t_threads[f] = NULL;
    }

    /* put the team back in the team pool */
    /* TODO limit size of team pool, call reap_team if pool too large */
    // The team becomes the new head of the singly linked __kmp_team_pool list.
    team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
    __kmp_team_pool = (volatile kmp_team_t *)team;
  } else { // Check if team was created for the masters in a teams construct
    // See if first worker is a CG root
    // NOTE(review): this reads t_threads[1] unconditionally, so it presumes
    // the hot team has at least one worker -- confirm against callers.
    KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
                     team->t.t_threads[1]->th.th_cg_roots);
    if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
      // Clean up the CG root nodes on workers so that this team can be re-used
      for (f = 1; f < team->t.t_nproc; ++f) {
        kmp_info_t *thr = team->t.t_threads[f];
        KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
                         thr->th.th_cg_roots->cg_root == thr);
        // Pop current CG root off list
        kmp_cg_root_t *tmp = thr->th.th_cg_roots;
        thr->th.th_cg_roots = tmp->up;
        KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
                       " up to node %p. cg_nthreads was %d\n",
                       thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
        // Post-decrement: i holds the count before this thread left the group.
        int i = tmp->cg_nthreads--;
        if (i == 1) {
          __kmp_free(tmp); // free CG if we are the last thread in it
        }
        // Restore current task's thread_limit from CG root
        if (thr->th.th_cg_roots)
          thr->th.th_current_task->td_icvs.thread_limit =
              thr->th.th_cg_roots->cg_thread_limit;
      }
    }
  }

  KMP_MB();
}
5568 
5569 /* reap the team.  destroy it, reclaim all its resources and free its memory */
5570 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5571   kmp_team_t *next_pool = team->t.t_next_pool;
5572 
5573   KMP_DEBUG_ASSERT(team);
5574   KMP_DEBUG_ASSERT(team->t.t_dispatch);
5575   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5576   KMP_DEBUG_ASSERT(team->t.t_threads);
5577   KMP_DEBUG_ASSERT(team->t.t_argv);
5578 
5579   /* TODO clean the threads that are a part of this? */
5580 
5581   /* free stuff */
5582   __kmp_free_team_arrays(team);
5583   if (team->t.t_argv != &team->t.t_inline_argv[0])
5584     __kmp_free((void *)team->t.t_argv);
5585   __kmp_free(team);
5586 
5587   KMP_MB();
5588   return next_pool;
5589 }
5590 
5591 // Free the thread.  Don't reap it, just place it on the pool of available
5592 // threads.
5593 //
5594 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5595 // binding for the affinity mechanism to be useful.
5596 //
5597 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5598 // However, we want to avoid a potential performance problem by always
5599 // scanning through the list to find the correct point at which to insert
5600 // the thread (potential N**2 behavior).  To do this we keep track of the
5601 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5602 // With single-level parallelism, threads will always be added to the tail
5603 // of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
5604 // parallelism, all bets are off and we may need to scan through the entire
5605 // free list.
5606 //
5607 // This change also has a potentially large performance benefit, for some
5608 // applications.  Previously, as threads were freed from the hot team, they
5609 // would be placed back on the free list in inverse order.  If the hot team
// grew back to its original size, then the freed thread would be placed
5611 // back on the hot team in reverse order.  This could cause bad cache
5612 // locality problems on programs where the size of the hot team regularly
5613 // grew and shrunk.
5614 //
5615 // Now, for single-level parallelism, the OMP tid is always == gtid.
void __kmp_free_thread(kmp_info_t *this_th) {
  // Detaches this_th from its team/root, unwinds its contention-group list,
  // releases its implicit task, inserts it into the gtid-sorted
  // __kmp_thread_pool and decrements __kmp_nth.
  int gtid;
  kmp_info_t **scan; // address of the list link where this_th will be inserted

  KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
                __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));

  KMP_DEBUG_ASSERT(this_th);

  // When moving thread to pool, switch thread to wait on own b_go flag, and
  // uninitialized (NULL team).
  int b;
  kmp_balign_t *balign = this_th->th.th_bar;
  for (b = 0; b < bs_last_barrier; ++b) {
    if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
      balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
    balign[b].bb.team = NULL;
    balign[b].bb.leaf_kids = 0;
  }
  this_th->th.th_task_state = 0;
  this_th->th.th_reap_state = KMP_SAFE_TO_REAP;

  /* put thread back on the free pool */
  TCW_PTR(this_th->th.th_team, NULL);
  TCW_PTR(this_th->th.th_root, NULL);
  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */

  // Leave every contention group (CG) this thread belongs to. Nodes the
  // thread roots itself are freed and the walk continues upwards; the first
  // node rooted by another thread ends the walk.
  while (this_th->th.th_cg_roots) {
    this_th->th.th_cg_roots->cg_nthreads--;
    KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
                   " %p of thread  %p to %d\n",
                   this_th, this_th->th.th_cg_roots,
                   this_th->th.th_cg_roots->cg_root,
                   this_th->th.th_cg_roots->cg_nthreads));
    kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
    if (tmp->cg_root == this_th) { // Thread is a cg_root
      KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
      KA_TRACE(
          5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
      this_th->th.th_cg_roots = tmp->up;
      __kmp_free(tmp);
    } else { // Worker thread
      if (tmp->cg_nthreads == 0) { // last thread leaves contention group
        __kmp_free(tmp);
      }
      this_th->th.th_cg_roots = NULL;
      break;
    }
  }

  /* If the implicit task assigned to this thread can be used by other threads
   * -> multiple threads can share the data and try to free the task at
   * __kmp_reap_thread at exit. This duplicate use of the task data can happen
   * with higher probability when hot team is disabled but can occurs even when
   * the hot team is enabled */
  __kmp_free_implicit_task(this_th);
  this_th->th.th_current_task = NULL;

  // If the __kmp_thread_pool_insert_pt is already past the new insert
  // point, then we need to re-scan the entire list.
  gtid = this_th->th.th_info.ds.ds_gtid;
  if (__kmp_thread_pool_insert_pt != NULL) {
    KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
    if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
      __kmp_thread_pool_insert_pt = NULL;
    }
  }

  // Scan down the list to find the place to insert the thread.
  // scan is the address of a link in the list, possibly the address of
  // __kmp_thread_pool itself.
  //
  // In the absence of nested parallelism, the for loop will have 0 iterations.
  if (__kmp_thread_pool_insert_pt != NULL) {
    scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
  } else {
    scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
  }
  // Advance until the link points at NULL or at a thread with gtid >= ours.
  for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
       scan = &((*scan)->th.th_next_pool))
    ;

  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
  // to its address.
  TCW_PTR(this_th->th.th_next_pool, *scan);
  __kmp_thread_pool_insert_pt = *scan = this_th;
  KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
                   (this_th->th.th_info.ds.ds_gtid <
                    this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
  TCW_4(this_th->th.th_in_pool, TRUE);
  // th_active is examined and th_active_in_pool updated while holding the
  // thread's suspend mutex.
  __kmp_suspend_initialize_thread(this_th);
  __kmp_lock_suspend_mx(this_th);
  if (this_th->th.th_active == TRUE) {
    KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
    this_th->th.th_active_in_pool = TRUE;
  }
#if KMP_DEBUG
  else {
    KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
  }
#endif
  __kmp_unlock_suspend_mx(this_th);

  // The thread no longer counts towards __kmp_nth once it sits in the pool.
  TCW_4(__kmp_nth, __kmp_nth - 1);

#ifdef KMP_ADJUST_BLOCKTIME
  /* Adjust blocktime back to user setting or default if necessary */
  /* Middle initialization might never have occurred                */
  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
    KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
    if (__kmp_nth <= __kmp_avail_proc) {
      __kmp_zero_bt = FALSE;
    }
  }
#endif /* KMP_ADJUST_BLOCKTIME */

  KMP_MB();
}
5734 
5735 /* ------------------------------------------------------------------------ */
5736 
5737 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5738   int gtid = this_thr->th.th_info.ds.ds_gtid;
5739   /*    void                 *stack_data;*/
5740   kmp_team_t **volatile pteam;
5741 
5742   KMP_MB();
5743   KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5744 
5745   if (__kmp_env_consistency_check) {
5746     this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5747   }
5748 
5749 #if OMPT_SUPPORT
5750   ompt_data_t *thread_data;
5751   if (ompt_enabled.enabled) {
5752     thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5753     *thread_data = ompt_data_none;
5754 
5755     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5756     this_thr->th.ompt_thread_info.wait_id = 0;
5757     this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5758     this_thr->th.ompt_thread_info.parallel_flags = 0;
5759     if (ompt_enabled.ompt_callback_thread_begin) {
5760       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5761           ompt_thread_worker, thread_data);
5762     }
5763     this_thr->th.ompt_thread_info.state = ompt_state_idle;
5764   }
5765 #endif
5766 
5767   /* This is the place where threads wait for work */
5768   while (!TCR_4(__kmp_global.g.g_done)) {
5769     KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5770     KMP_MB();
5771 
5772     /* wait for work to do */
5773     KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5774 
5775     /* No tid yet since not part of a team */
5776     __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5777 
5778 #if OMPT_SUPPORT
5779     if (ompt_enabled.enabled) {
5780       this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5781     }
5782 #endif
5783 
5784     pteam = &this_thr->th.th_team;
5785 
5786     /* have we been allocated? */
5787     if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5788       /* we were just woken up, so run our new task */
5789       if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5790         int rc;
5791         KA_TRACE(20,
5792                  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5793                   gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5794                   (*pteam)->t.t_pkfn));
5795 
5796         updateHWFPControl(*pteam);
5797 
5798 #if OMPT_SUPPORT
5799         if (ompt_enabled.enabled) {
5800           this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5801         }
5802 #endif
5803 
5804         rc = (*pteam)->t.t_invoke(gtid);
5805         KMP_ASSERT(rc);
5806 
5807         KMP_MB();
5808         KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5809                       gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5810                       (*pteam)->t.t_pkfn));
5811       }
5812 #if OMPT_SUPPORT
5813       if (ompt_enabled.enabled) {
5814         /* no frame set while outside task */
5815         __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
5816 
5817         this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5818       }
5819 #endif
5820       /* join barrier after parallel region */
5821       __kmp_join_barrier(gtid);
5822     }
5823   }
5824   TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5825 
5826 #if OMPT_SUPPORT
5827   if (ompt_enabled.ompt_callback_thread_end) {
5828     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5829   }
5830 #endif
5831 
5832   this_thr->th.th_task_team = NULL;
5833   /* run the destructors for the threadprivate data for this thread */
5834   __kmp_common_destroy_gtid(gtid);
5835 
5836   KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5837   KMP_MB();
5838   return this_thr;
5839 }
5840 
5841 /* ------------------------------------------------------------------------ */
5842 
5843 void __kmp_internal_end_dest(void *specific_gtid) {
5844   // Make sure no significant bits are lost
5845   int gtid;
5846   __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
5847 
5848   KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5849   /* NOTE: the gtid is stored as gitd+1 in the thread-local-storage
5850    * this is because 0 is reserved for the nothing-stored case */
5851 
5852   __kmp_internal_end_thread(gtid);
5853 }
5854 
5855 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5856 
// Library destructor: marked with the compiler's destructor attribute and
// forwards straight to __kmp_internal_end_atexit(). Compiled only when both
// KMP_OS_UNIX and KMP_DYNAMIC_LIB are set (see the enclosing #if).
__attribute__((destructor)) void __kmp_internal_end_dtor(void) {
  __kmp_internal_end_atexit();
}
5860 
5861 #endif
5862 
/* Exit-time shutdown entry point: shuts the library down through
   __kmp_internal_end_library(-1) and, on Windows, closes the console.
   [Windows] josh: when the atexit handler is called, there may still be more
   than one thread alive */
void __kmp_internal_end_atexit(void) {
  KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
  /* [Windows]
     josh: ideally, we want to completely shutdown the library in this atexit
     handler, but stat code that depends on thread specific data for gtid fails
     because that data becomes unavailable at some point during the shutdown, so
     we call __kmp_internal_end_thread instead. We should eventually remove the
     dependency on __kmp_get_specific_gtid in the stat code and use
     __kmp_internal_end_library to cleanly shutdown the library.

     NOTE(review): the paragraph above says __kmp_internal_end_thread is
     called, but the code below calls __kmp_internal_end_library(-1); the
     comment appears to be stale.

     // TODO: Can some of this comment about GVS be removed?
     I suspect that the offending stat code is executed when the calling thread
     tries to clean up a dead root thread's data structures, resulting in GVS
     code trying to close the GVS structures for that thread, but since the stat
     code uses __kmp_get_specific_gtid to get the gtid with the assumption that
     the calling thread is cleaning up itself instead of another thread, it gets
     confused. This happens because allowing a thread to unregister and cleanup
     another thread is a recent modification for addressing an issue.
     Based on the current design (20050722), a thread may end up
     trying to unregister another thread only if thread death does not trigger
     the calling of __kmp_internal_end_thread.  For Linux* OS, there is the
     thread specific data destructor function to detect thread death. For
     Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
     is nothing.  Thus, the workaround is applicable only for Windows static
     stat library. */
  // -1 is not a valid gtid request; __kmp_internal_end_library() then looks the
  // gtid up itself via __kmp_gtid_get_specific().
  __kmp_internal_end_library(-1);
#if KMP_OS_WINDOWS
  __kmp_close_console();
#endif
}
5895 
// Reap a thread: for non-root threads, release it from the fork barrier (when
// needed) and terminate the OS thread; then free every resource attached to
// the kmp_info_t, clear its __kmp_threads slot and free the descriptor.
//
// thread  - descriptor to destroy; must be non-NULL. Invalid after return.
// is_root - non-zero skips the barrier release / OS-thread termination and
//           the thread-pool active-count adjustment.
static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
  // It is assumed __kmp_forkjoin_lock is acquired.

  int gtid;

  KMP_DEBUG_ASSERT(thread != NULL);

  gtid = thread->th.th_info.ds.ds_gtid;

  if (!is_root) {
    // The explicit release is skipped only when the default blocktime is
    // KMP_MAX_BLOCKTIME.
    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
      /* Assume the threads are at the fork barrier here */
      KA_TRACE(
          20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
               gtid));
      /* Need release fence here to prevent seg faults for tree forkjoin barrier
       * (GEH) */
      ANNOTATE_HAPPENS_BEFORE(thread);
      kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
                         thread);
      __kmp_release_64(&flag);
    }

    // Terminate OS thread.
    __kmp_reap_worker(thread);

    // The thread was killed asynchronously.  If it was actively
    // spinning in the thread pool, decrement the global count.
    //
    // There is a small timing hole here - if the worker thread was just waking
    // up after sleeping in the pool, had reset it's th_active_in_pool flag but
    // not decremented the global counter __kmp_thread_pool_active_nth yet, then
    // the global counter might not get updated.
    //
    // Currently, this can only happen as the library is unloaded,
    // so there are no harmful side effects.
    if (thread->th.th_active_in_pool) {
      thread->th.th_active_in_pool = FALSE;
      KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
      KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
    }
  }

  __kmp_free_implicit_task(thread);

// Free the fast memory for tasking
#if USE_FAST_MEMORY
  __kmp_free_fast_memory(thread);
#endif /* USE_FAST_MEMORY */

  __kmp_suspend_uninitialize_thread(thread);

  // Unpublish the descriptor from the global thread table before freeing it.
  KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);

  --__kmp_all_nth;
// __kmp_nth was decremented when thread is added to the pool.

#ifdef KMP_ADJUST_BLOCKTIME
  /* Adjust blocktime back to user setting or default if necessary */
  /* Middle initialization might never have occurred                */
  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
    KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
    if (__kmp_nth <= __kmp_avail_proc) {
      __kmp_zero_bt = FALSE;
    }
  }
#endif /* KMP_ADJUST_BLOCKTIME */

  /* free the memory being used */
  // Each optional member is freed only if present and then reset to NULL.
  if (__kmp_env_consistency_check) {
    if (thread->th.th_cons) {
      __kmp_free_cons_stack(thread->th.th_cons);
      thread->th.th_cons = NULL;
    }
  }

  if (thread->th.th_pri_common != NULL) {
    __kmp_free(thread->th.th_pri_common);
    thread->th.th_pri_common = NULL;
  }

  if (thread->th.th_task_state_memo_stack != NULL) {
    __kmp_free(thread->th.th_task_state_memo_stack);
    thread->th.th_task_state_memo_stack = NULL;
  }

#if KMP_USE_BGET
  if (thread->th.th_local.bget_data != NULL) {
    __kmp_finalize_bget(thread);
  }
#endif

#if KMP_AFFINITY_SUPPORTED
  if (thread->th.th_affin_mask != NULL) {
    KMP_CPU_FREE(thread->th.th_affin_mask);
    thread->th.th_affin_mask = NULL;
  }
#endif /* KMP_AFFINITY_SUPPORTED */

#if KMP_USE_HIER_SCHED
  if (thread->th.th_hier_bar_data != NULL) {
    __kmp_free(thread->th.th_hier_bar_data);
    thread->th.th_hier_bar_data = NULL;
  }
#endif

  // The thread's serial team is destroyed with it; the returned pool link is
  // not used here.
  __kmp_reap_team(thread->th.th_serial_team);
  thread->th.th_serial_team = NULL;
  __kmp_free(thread);

  KMP_MB();

} // __kmp_reap_thread
6010 
// Common shutdown worker. Unregisters the library, sets g_done, and then:
//  - if some root is still active, only reaps the monitor thread (when
//    KMP_USE_MONITOR is set);
//  - otherwise reaps every pooled worker thread and team, the task teams and
//    (when KMP_USE_MONITOR is set) the monitor.
// In both cases it finishes by clearing __kmp_init_gtid and calling
// __kmp_cleanup() (plus ompt_fini() under OMPT_SUPPORT).
static void __kmp_internal_end(void) {
  int i;

  /* First, unregister the library */
  __kmp_unregister_library();

#if KMP_OS_WINDOWS
  /* In Win static library, we can't tell when a root actually dies, so we
     reclaim the data structures for any root threads that have died but not
     unregistered themselves, in order to shut down cleanly.
     In Win dynamic library we also can't tell when a thread dies.  */
  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
// dead roots
#endif

  // Look for the first active root; i == __kmp_threads_capacity afterwards
  // means none was found.
  for (i = 0; i < __kmp_threads_capacity; i++)
    if (__kmp_root[i])
      if (__kmp_root[i]->r.r_active)
        break;
  KMP_MB(); /* Flush all pending memory write invalidates.  */
  // g_done is raised regardless of whether an active root was found.
  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);

  if (i < __kmp_threads_capacity) {
#if KMP_USE_MONITOR
    // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
    KMP_MB(); /* Flush all pending memory write invalidates.  */

    // Need to check that monitor was initialized before reaping it. If we are
    // called form __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
    // __kmp_monitor will appear to contain valid data, but it is only valid in
    // the parent process, not the child.
    // New behavior (201008): instead of keying off of the flag
    // __kmp_init_parallel, the monitor thread creation is keyed off
    // of the new flag __kmp_init_monitor.
    __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
    if (TCR_4(__kmp_init_monitor)) {
      __kmp_reap_monitor(&__kmp_monitor);
      TCW_4(__kmp_init_monitor, 0);
    }
    __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
    KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
#endif // KMP_USE_MONITOR
  } else {
/* TODO move this to cleanup code */
#ifdef KMP_DEBUG
    /* make sure that everything has properly ended */
    for (i = 0; i < __kmp_threads_capacity; i++) {
      if (__kmp_root[i]) {
        //                    KMP_ASSERT( ! KMP_UBER_GTID( i ) );         // AC:
        //                    there can be uber threads alive here
        KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
      }
    }
#endif

    KMP_MB();

    // Reap the worker threads.
    // This is valid for now, but be careful if threads are reaped sooner.
    while (__kmp_thread_pool != NULL) { // Loop thru all the thread in the pool.
      // Get the next thread from the pool.
      kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
      __kmp_thread_pool = thread->th.th_next_pool;
      // Reap it.
      KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
      thread->th.th_next_pool = NULL;
      thread->th.th_in_pool = FALSE;
      __kmp_reap_thread(thread, 0);
    }
    // The pool is empty now, so the cached insertion point is no longer valid.
    __kmp_thread_pool_insert_pt = NULL;

    // Reap teams.
    while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
      // Get the next team from the pool.
      kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
      __kmp_team_pool = team->t.t_next_pool;
      // Reap it.
      team->t.t_next_pool = NULL;
      __kmp_reap_team(team);
    }

    __kmp_reap_task_teams();

#if KMP_OS_UNIX
    // Threads that are not reaped should not access any resources since they
    // are going to be deallocated soon, so the shutdown sequence should wait
    // until all threads either exit the final spin-waiting loop or begin
    // sleeping after the given blocktime.
    for (i = 0; i < __kmp_threads_capacity; i++) {
      kmp_info_t *thr = __kmp_threads[i];
      while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
        KMP_CPU_PAUSE();
    }
#endif

    // Placeholder loop: intentionally empty until the check below is added.
    for (i = 0; i < __kmp_threads_capacity; ++i) {
      // TBD: Add some checking...
      // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
    }

    /* Make sure all threadprivate destructors get run by joining with all
       worker threads before resetting this flag */
    TCW_SYNC_4(__kmp_init_common, FALSE);

    KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
    KMP_MB();

#if KMP_USE_MONITOR
    // See note above: One of the possible fixes for CQ138434 / CQ140126
    //
    // FIXME: push both code fragments down and CSE them?
    // push them into __kmp_cleanup() ?
    __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
    if (TCR_4(__kmp_init_monitor)) {
      __kmp_reap_monitor(&__kmp_monitor);
      TCW_4(__kmp_init_monitor, 0);
    }
    __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
    KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
#endif
  } /* else !__kmp_global.t_active */
  TCW_4(__kmp_init_gtid, FALSE);
  KMP_MB(); /* Flush all pending memory write invalidates.  */

  __kmp_cleanup();
#if OMPT_SUPPORT
  ompt_fini();
#endif
}
6140 
// Shut the runtime library down on behalf of the calling thread.
//
// gtid_req: global thread id of the caller when non-negative; otherwise the
//           id is looked up with __kmp_gtid_get_specific().
//
// Returns immediately if g_abort or g_done is already set or the serial
// initialization has not been done. Otherwise the action depends on the
// caller:
//  - KMP_GTID_SHUTDOWN or KMP_GTID_MONITOR: return without doing anything.
//  - KMP_GTID_DNE (thread not registered): go on with the full shutdown.
//  - uber thread whose root is still active: set g_abort and g_done,
//    unregister the library and return without the full shutdown.
//  - uber thread whose root is not active: unregister the root, then go on
//    with the full shutdown.
//  - any other (worker) thread: dump the debug buffer when
//    DUMP_DEBUG_ON_EXIT is defined, unregister the library and return.
// The full shutdown calls __kmp_internal_end() while holding both
// __kmp_initz_lock and __kmp_forkjoin_lock, then dumps the debug buffer
// (DUMP_DEBUG_ON_EXIT only), closes the console (Windows only) and calls
// __kmp_fini_allocator().
void __kmp_internal_end_library(int gtid_req) {
  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
  /* this shouldn't be a race condition because __kmp_internal_end() is the
     only place to clear __kmp_serial_init */
  /* we'll check this later too, after we get the lock */
  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
  // redundant, because the next check will work in any case.
  if (__kmp_global.g.g_abort) {
    KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
    /* TODO abort? */
    return;
  }
  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
    KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
    return;
  }

  KMP_MB(); /* Flush all pending memory write invalidates.  */
  /* find out who we are and what we should do */
  {
    // An explicit non-negative request wins over the thread-specific lookup.
    int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
    KA_TRACE(
        10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req));
    if (gtid == KMP_GTID_SHUTDOWN) {
      KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
                    "already shutdown\n"));
      return;
    } else if (gtid == KMP_GTID_MONITOR) {
      KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
                    "registered, or system shutdown\n"));
      return;
    } else if (gtid == KMP_GTID_DNE) {
      KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
                    "shutdown\n"));
      /* we don't know who we are, but we may still shutdown the library */
    } else if (KMP_UBER_GTID(gtid)) {
      /* unregister ourselves as an uber thread.  gtid is no longer valid */
      if (__kmp_root[gtid]->r.r_active) {
        // The root is still active: flag abort/done and only remove the
        // library registration; the full shutdown below is skipped.
        __kmp_global.g.g_abort = -1;
        TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
        __kmp_unregister_library();
        KA_TRACE(10,
                 ("__kmp_internal_end_library: root still active, abort T#%d\n",
                  gtid));
        return;
      } else {
        KA_TRACE(
            10,
            ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
        __kmp_unregister_root_current_thread(gtid);
      }
    } else {
/* worker threads may call this function through the atexit handler, if they
 * call exit() */
/* For now, skip the usual subsequent processing and just dump the debug buffer.
   TODO: do a thorough shutdown instead */
#ifdef DUMP_DEBUG_ON_EXIT
      if (__kmp_debug_buf)
        __kmp_dump_debug_buffer();
#endif
      // added unregister library call here when we switch to shm linux
      // if we don't, it will leave lots of files in /dev/shm
      // cleanup shared memory file before exiting.
      __kmp_unregister_library();
      return;
    }
  }
  /* synchronize the termination process */
  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);

  /* have we already finished */
  // Same checks as at function entry, repeated now that the lock is held.
  if (__kmp_global.g.g_abort) {
    KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
    /* TODO abort? */
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
    return;
  }
  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
    return;
  }

  /* We need this lock to enforce mutex between this reading of
     __kmp_threads_capacity and the writing by __kmp_register_root.
     Alternatively, we can use a counter of roots that is atomically updated by
     __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
     __kmp_internal_end_*.  */
  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);

  /* now we can safely conduct the actual termination */
  __kmp_internal_end();

  // Release in the reverse order of acquisition.
  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
  __kmp_release_bootstrap_lock(&__kmp_initz_lock);

  KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));

#ifdef DUMP_DEBUG_ON_EXIT
  if (__kmp_debug_buf)
    __kmp_dump_debug_buffer();
#endif

#if KMP_OS_WINDOWS
  __kmp_close_console();
#endif

  __kmp_fini_allocator();

} // __kmp_internal_end_library
6250 
// Handle the termination of the calling thread and, when no uber thread
// remains, shut the runtime library down.
//
// gtid_req: global thread id of the caller when non-negative; otherwise the
//           id is looked up with __kmp_gtid_get_specific().
//
// Returns immediately if g_abort or g_done is already set or the serial
// initialization has not been done. If the hidden helper team has been
// initialized it is told to finish and waited for. After that the action
// depends on the caller:
//  - KMP_GTID_SHUTDOWN, KMP_GTID_MONITOR or KMP_GTID_DNE: return.
//  - uber thread whose root is still active: set g_abort and g_done, return.
//  - uber thread whose root is not active: unregister the root and continue.
//  - any other (worker) thread: clear its th_task_team (when gtid >= 0) and
//    return.
// When built as a dynamic library the function then returns unless the
// runtime is hard-paused. Otherwise, holding __kmp_initz_lock and
// __kmp_forkjoin_lock, it returns if any gtid below __kmp_threads_capacity is
// still an uber thread, and calls __kmp_internal_end() if none is.
void __kmp_internal_end_thread(int gtid_req) {
  int i;

  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
  /* this shouldn't be a race condition because __kmp_internal_end() is the
   * only place to clear __kmp_serial_init */
  /* we'll check this later too, after we get the lock */
  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
  // redundant, because the next check will work in any case.
  if (__kmp_global.g.g_abort) {
    KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
    /* TODO abort? */
    return;
  }
  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
    KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
    return;
  }

  // If hidden helper team has been initialized, we need to deinit it
  if (TCR_4(__kmp_init_hidden_helper)) {
    TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
    // First release the main thread to let it continue its work
    __kmp_hidden_helper_main_thread_release();
    // Wait until the hidden helper team has been destroyed
    __kmp_hidden_helper_threads_deinitz_wait();
  }

  KMP_MB(); /* Flush all pending memory write invalidates.  */

  /* find out who we are and what we should do */
  {
    // An explicit non-negative request wins over the thread-specific lookup.
    int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
    KA_TRACE(10,
             ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req));
    if (gtid == KMP_GTID_SHUTDOWN) {
      KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
                    "already shutdown\n"));
      return;
    } else if (gtid == KMP_GTID_MONITOR) {
      KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
                    "registered, or system shutdown\n"));
      return;
    } else if (gtid == KMP_GTID_DNE) {
      KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
                    "shutdown\n"));
      return;
      /* we don't know who we are */
    } else if (KMP_UBER_GTID(gtid)) {
      /* unregister ourselves as an uber thread.  gtid is no longer valid */
      if (__kmp_root[gtid]->r.r_active) {
        // The root is still active: flag abort/done and skip the shutdown.
        __kmp_global.g.g_abort = -1;
        TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
        KA_TRACE(10,
                 ("__kmp_internal_end_thread: root still active, abort T#%d\n",
                  gtid));
        return;
      } else {
        KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
                      gtid));
        __kmp_unregister_root_current_thread(gtid);
      }
    } else {
      /* just a worker thread, let's leave */
      KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));

      // Negative gtids are not valid indices into __kmp_threads.
      if (gtid >= 0) {
        __kmp_threads[gtid]->th.th_task_team = NULL;
      }

      KA_TRACE(10,
               ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
                gtid));
      return;
    }
  }
#if KMP_DYNAMIC_LIB
  if (__kmp_pause_status != kmp_hard_paused)
  // AC: lets not shutdown the dynamic library at the exit of uber thread,
  // because we will better shutdown later in the library destructor.
  {
    KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
    return;
  }
#endif
  /* synchronize the termination process */
  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);

  /* have we already finished */
  // Same checks as at function entry, repeated now that the lock is held.
  if (__kmp_global.g.g_abort) {
    KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
    /* TODO abort? */
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
    return;
  }
  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
    return;
  }

  /* We need this lock to enforce mutex between this reading of
     __kmp_threads_capacity and the writing by __kmp_register_root.
     Alternatively, we can use a counter of roots that is atomically updated by
     __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
     __kmp_internal_end_*.  */

  /* should we finish the run-time?  are all siblings done? */
  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);

  // Any gtid that is still an uber thread means the runtime must stay up.
  for (i = 0; i < __kmp_threads_capacity; ++i) {
    if (KMP_UBER_GTID(i)) {
      KA_TRACE(
          10,
          ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
      __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
      __kmp_release_bootstrap_lock(&__kmp_initz_lock);
      return;
    }
  }

  /* now we can safely conduct the actual termination */

  __kmp_internal_end();

  // Release in the reverse order of acquisition.
  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
  __kmp_release_bootstrap_lock(&__kmp_initz_lock);

  KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));

#ifdef DUMP_DEBUG_ON_EXIT
  if (__kmp_debug_buf)
    __kmp_dump_debug_buffer();
#endif
} // __kmp_internal_end_thread
6385 
6386 // -----------------------------------------------------------------------------
6387 // Library registration stuff.
6388 
// Marker showing that this copy of the library is registered.
// __kmp_register_library_startup() sets it to 0xCAFE0000 combined with the low
// 16 bits of the current system time; __kmp_unregister_library() resets it to
// 0. Another copy of the library finds this variable through the address
// stored in the registration record and compares its contents with the
// recorded value to tell whether this copy is still alive.
static long __kmp_registration_flag = 0;
// Registration record of this copy, formatted as
// "<address of __kmp_registration_flag>-<flag value in hex>-<library file>".
// It is the value stored under the name returned by __kmp_reg_status_name()
// (__KMP_REGISTERED_LIB_<pid>, optionally followed by _<uid>), either in a
// shared memory object or in an environment variable. Allocated by
// __kmp_register_library_startup(), freed by __kmp_unregister_library().
static char *__kmp_registration_str = NULL;
6393 
6394 static inline char *__kmp_reg_status_name() {
6395 /* On RHEL 3u5 if linked statically, getpid() returns different values in
6396    each thread. If registration and unregistration go in different threads
6397    (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env
6398    env var can not be found, because the name will contain different pid. */
6399 // macOS* complains about name being too long with additional getuid()
6400 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6401   return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6402                           (int)getuid());
6403 #else
6404   return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6405 #endif
6406 } // __kmp_reg_status_get
6407 
6408 void __kmp_register_library_startup(void) {
6409 
6410   char *name = __kmp_reg_status_name(); // Name of the environment variable.
6411   int done = 0;
6412   union {
6413     double dtime;
6414     long ltime;
6415   } time;
6416 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6417   __kmp_initialize_system_tick();
6418 #endif
6419   __kmp_read_system_time(&time.dtime);
6420   __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6421   __kmp_registration_str =
6422       __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6423                        __kmp_registration_flag, KMP_LIBRARY_FILE);
6424 
6425   KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6426                 __kmp_registration_str));
6427 
6428   while (!done) {
6429 
6430     char *value = NULL; // Actual value of the environment variable.
6431 
6432 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6433     char *shm_name = __kmp_str_format("/%s", name);
6434     int shm_preexist = 0;
6435     char *data1;
6436     int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6437     if ((fd1 == -1) && (errno == EEXIST)) {
6438       // file didn't open because it already exists.
6439       // try opening existing file
6440       fd1 = shm_open(shm_name, O_RDWR, 0666);
6441       if (fd1 == -1) { // file didn't open
6442         // error out here
6443         __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0),
6444                     __kmp_msg_null);
6445       } else {
6446         // able to open existing file
6447         shm_preexist = 1;
6448       }
6449     } else if (fd1 == -1) { // SHM didn't open; it was due to error other than
6450       // already exists.
6451       // error out here.
6452       __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno),
6453                   __kmp_msg_null);
6454     }
6455     if (shm_preexist == 0) {
6456       // we created SHM now set size
6457       if (ftruncate(fd1, SHM_SIZE) == -1) {
6458         // error occured setting size;
6459         __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"),
6460                     KMP_ERR(errno), __kmp_msg_null);
6461       }
6462     }
6463     data1 =
6464         (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
6465     if (data1 == MAP_FAILED) {
6466       // failed to map shared memory
6467       __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno),
6468                   __kmp_msg_null);
6469     }
6470     if (shm_preexist == 0) { // set data to SHM, set value
6471       KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6472     }
6473     // Read value from either what we just wrote or existing file.
6474     value = __kmp_str_format("%s", data1); // read value from SHM
6475     munmap(data1, SHM_SIZE);
6476     close(fd1);
6477 #else // Windows and unix with static library
6478     // Set environment variable, but do not overwrite if it is exist.
6479     __kmp_env_set(name, __kmp_registration_str, 0);
6480     // read value to see if it got set
6481     value = __kmp_env_get(name);
6482 #endif
6483 
6484     if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6485       done = 1; // Ok, environment variable set successfully, exit the loop.
6486     } else {
6487       // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6488       // Check whether it alive or dead.
6489       int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6490       char *tail = value;
6491       char *flag_addr_str = NULL;
6492       char *flag_val_str = NULL;
6493       char const *file_name = NULL;
6494       __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6495       __kmp_str_split(tail, '-', &flag_val_str, &tail);
6496       file_name = tail;
6497       if (tail != NULL) {
6498         long *flag_addr = 0;
6499         long flag_val = 0;
6500         KMP_SSCANF(flag_addr_str, "%p", RCAST(void**, &flag_addr));
6501         KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6502         if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6503           // First, check whether environment-encoded address is mapped into
6504           // addr space.
6505           // If so, dereference it to see if it still has the right value.
6506           if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6507             neighbor = 1;
6508           } else {
6509             // If not, then we know the other copy of the library is no longer
6510             // running.
6511             neighbor = 2;
6512           }
6513         }
6514       }
6515       switch (neighbor) {
6516       case 0: // Cannot parse environment variable -- neighbor status unknown.
6517         // Assume it is the incompatible format of future version of the
6518         // library. Assume the other library is alive.
6519         // WARN( ... ); // TODO: Issue a warning.
6520         file_name = "unknown library";
6521         KMP_FALLTHROUGH();
6522       // Attention! Falling to the next case. That's intentional.
6523       case 1: { // Neighbor is alive.
6524         // Check it is allowed.
6525         char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6526         if (!__kmp_str_match_true(duplicate_ok)) {
6527           // That's not allowed. Issue fatal error.
6528           __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6529                       KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6530         }
6531         KMP_INTERNAL_FREE(duplicate_ok);
6532         __kmp_duplicate_library_ok = 1;
6533         done = 1; // Exit the loop.
6534       } break;
6535       case 2: { // Neighbor is dead.
6536 
6537 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6538         // close shared memory.
6539         shm_unlink(shm_name); // this removes file in /dev/shm
6540 #else
6541         // Clear the variable and try to register library again.
6542         __kmp_env_unset(name);
6543 #endif
6544       } break;
6545       default: { KMP_DEBUG_ASSERT(0); } break;
6546       }
6547     }
6548     KMP_INTERNAL_FREE((void *)value);
6549 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6550     KMP_INTERNAL_FREE((void *)shm_name);
6551 #endif
6552   } // while
6553   KMP_INTERNAL_FREE((void *)name);
6554 
6555 } // func __kmp_register_library_startup
6556 
6557 void __kmp_unregister_library(void) {
6558 
6559   char *name = __kmp_reg_status_name();
6560   char *value = NULL;
6561 
6562 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6563   char *shm_name = __kmp_str_format("/%s", name);
6564   int fd1 = shm_open(shm_name, O_RDONLY, 0666);
6565   if (fd1 == -1) {
6566     // file did not open. return.
6567     return;
6568   }
6569   char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6570   if (data1 != MAP_FAILED) {
6571     value = __kmp_str_format("%s", data1); // read value from SHM
6572     munmap(data1, SHM_SIZE);
6573   }
6574   close(fd1);
6575 #else
6576   value = __kmp_env_get(name);
6577 #endif
6578 
6579   KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6580   KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6581   if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6582 //  Ok, this is our variable. Delete it.
6583 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6584     shm_unlink(shm_name); // this removes file in /dev/shm
6585 #else
6586     __kmp_env_unset(name);
6587 #endif
6588   }
6589 
6590 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6591   KMP_INTERNAL_FREE(shm_name);
6592 #endif
6593 
6594   KMP_INTERNAL_FREE(__kmp_registration_str);
6595   KMP_INTERNAL_FREE(value);
6596   KMP_INTERNAL_FREE(name);
6597 
6598   __kmp_registration_flag = 0;
6599   __kmp_registration_str = NULL;
6600 
6601 } // __kmp_unregister_library
6602 
6603 // End of Library registration stuff.
6604 // -----------------------------------------------------------------------------
6605 
6606 #if KMP_MIC_SUPPORTED
6607 
6608 static void __kmp_check_mic_type() {
6609   kmp_cpuid_t cpuid_state = {0};
6610   kmp_cpuid_t *cs_p = &cpuid_state;
6611   __kmp_x86_cpuid(1, 0, cs_p);
6612   // We don't support mic1 at the moment
6613   if ((cs_p->eax & 0xff0) == 0xB10) {
6614     __kmp_mic_type = mic2;
6615   } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6616     __kmp_mic_type = mic3;
6617   } else {
6618     __kmp_mic_type = non_mic;
6619   }
6620 }
6621 
6622 #endif /* KMP_MIC_SUPPORTED */
6623 
6624 #if KMP_HAVE_UMWAIT
6625 static void __kmp_user_level_mwait_init() {
6626   struct kmp_cpuid buf;
6627   __kmp_x86_cpuid(7, 0, &buf);
6628   __kmp_umwait_enabled = ((buf.ecx >> 5) & 1) && __kmp_user_level_mwait;
6629   KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
6630                 __kmp_umwait_enabled));
6631 }
6632 #elif KMP_HAVE_MWAIT
#ifndef AT_INTELPHIUSERMWAIT
// Placeholder used when the build environment does not define the real key.
// Spurious, non-existent value that should always fail to return anything.
// Will be replaced with the correct value when we know that.
#define AT_INTELPHIUSERMWAIT 10000
#endif
// getauxval() function is available in RHEL7 and SLES12. If a system with an
// earlier OS is used to build the RTL, we'll use the following internal
// function when the entry is not found.
// The internal version ignores its argument and always returns 0, so the
// caller sees no auxiliary-vector flag set.
// NOTE(review): KMP_WEAK_ATTRIBUTE_EXTERNAL presumably makes this definition
// a weak symbol so that the system getauxval() wins when present -- confirm
// against its definition.
unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
unsigned long getauxval(unsigned long) { return 0; }
6643 
6644 static void __kmp_user_level_mwait_init() {
6645   // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available
6646   // use them to find if the user-level mwait is enabled. Otherwise, forcibly
6647   // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable
6648   // KMP_USER_LEVEL_MWAIT was set to TRUE.
6649   if (__kmp_mic_type == mic3) {
6650     unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
6651     if ((res & 0x1) || __kmp_user_level_mwait) {
6652       __kmp_mwait_enabled = TRUE;
6653       if (__kmp_user_level_mwait) {
6654         KMP_INFORM(EnvMwaitWarn);
6655       }
6656     } else {
6657       __kmp_mwait_enabled = FALSE;
6658     }
6659   }
6660   KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
6661                 "__kmp_mwait_enabled = %d\n",
6662                 __kmp_mic_type, __kmp_mwait_enabled));
6663 }
6664 #endif /* KMP_HAVE_UMWAIT */
6665 
6666 static void __kmp_do_serial_initialize(void) {
6667   int i, gtid;
6668   size_t size;
6669 
6670   KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6671 
6672   KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6673   KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6674   KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6675   KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6676   KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6677 
6678 #if OMPT_SUPPORT
6679   ompt_pre_init();
6680 #endif
6681 
6682   __kmp_validate_locks();
6683 
6684   /* Initialize internal memory allocator */
6685   __kmp_init_allocator();
6686 
6687   /* Register the library startup via an environment variable and check to see
6688      whether another copy of the library is already registered. */
6689 
6690   __kmp_register_library_startup();
6691 
6692   /* TODO reinitialization of library */
6693   if (TCR_4(__kmp_global.g.g_done)) {
6694     KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6695   }
6696 
6697   __kmp_global.g.g_abort = 0;
6698   TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6699 
6700 /* initialize the locks */
6701 #if KMP_USE_ADAPTIVE_LOCKS
6702 #if KMP_DEBUG_ADAPTIVE_LOCKS
6703   __kmp_init_speculative_stats();
6704 #endif
6705 #endif
6706 #if KMP_STATS_ENABLED
6707   __kmp_stats_init();
6708 #endif
6709   __kmp_init_lock(&__kmp_global_lock);
6710   __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6711   __kmp_init_lock(&__kmp_debug_lock);
6712   __kmp_init_atomic_lock(&__kmp_atomic_lock);
6713   __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6714   __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6715   __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6716   __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6717   __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6718   __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6719   __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6720   __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6721   __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6722   __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6723   __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6724   __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6725   __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6726   __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6727 #if KMP_USE_MONITOR
6728   __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6729 #endif
6730   __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6731 
6732   /* conduct initialization and initial setup of configuration */
6733 
6734   __kmp_runtime_initialize();
6735 
6736 #if KMP_MIC_SUPPORTED
6737   __kmp_check_mic_type();
6738 #endif
6739 
6740 // Some global variable initialization moved here from kmp_env_initialize()
6741 #ifdef KMP_DEBUG
6742   kmp_diag = 0;
6743 #endif
6744   __kmp_abort_delay = 0;
6745 
6746   // From __kmp_init_dflt_team_nth()
6747   /* assume the entire machine will be used */
6748   __kmp_dflt_team_nth_ub = __kmp_xproc;
6749   if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6750     __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6751   }
6752   if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6753     __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6754   }
6755   __kmp_max_nth = __kmp_sys_max_nth;
6756   __kmp_cg_max_nth = __kmp_sys_max_nth;
6757   __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6758   if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6759     __kmp_teams_max_nth = __kmp_sys_max_nth;
6760   }
6761 
6762   // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6763   // part
6764   __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6765 #if KMP_USE_MONITOR
6766   __kmp_monitor_wakeups =
6767       KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6768   __kmp_bt_intervals =
6769       KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6770 #endif
6771   // From "KMP_LIBRARY" part of __kmp_env_initialize()
6772   __kmp_library = library_throughput;
6773   // From KMP_SCHEDULE initialization
6774   __kmp_static = kmp_sch_static_balanced;
6775 // AC: do not use analytical here, because it is non-monotonous
6776 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6777 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6778 // need to repeat assignment
6779 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6780 // bit control and barrier method control parts
6781 #if KMP_FAST_REDUCTION_BARRIER
6782 #define kmp_reduction_barrier_gather_bb ((int)1)
6783 #define kmp_reduction_barrier_release_bb ((int)1)
6784 #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6785 #define kmp_reduction_barrier_release_pat bp_hyper_bar
6786 #endif // KMP_FAST_REDUCTION_BARRIER
6787   for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6788     __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6789     __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6790     __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6791     __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6792 #if KMP_FAST_REDUCTION_BARRIER
6793     if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
6794       // lin_64 ): hyper,1
6795       __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6796       __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6797       __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6798       __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6799     }
6800 #endif // KMP_FAST_REDUCTION_BARRIER
6801   }
6802 #if KMP_FAST_REDUCTION_BARRIER
6803 #undef kmp_reduction_barrier_release_pat
6804 #undef kmp_reduction_barrier_gather_pat
6805 #undef kmp_reduction_barrier_release_bb
6806 #undef kmp_reduction_barrier_gather_bb
6807 #endif // KMP_FAST_REDUCTION_BARRIER
6808 #if KMP_MIC_SUPPORTED
6809   if (__kmp_mic_type == mic2) { // KNC
6810     // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6811     __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6812     __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6813         1; // forkjoin release
6814     __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6815     __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6816   }
6817 #if KMP_FAST_REDUCTION_BARRIER
6818   if (__kmp_mic_type == mic2) { // KNC
6819     __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6820     __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6821   }
6822 #endif // KMP_FAST_REDUCTION_BARRIER
6823 #endif // KMP_MIC_SUPPORTED
6824 
6825 // From KMP_CHECKS initialization
6826 #ifdef KMP_DEBUG
6827   __kmp_env_checks = TRUE; /* development versions have the extra checks */
6828 #else
6829   __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6830 #endif
6831 
6832   // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6833   __kmp_foreign_tp = TRUE;
6834 
6835   __kmp_global.g.g_dynamic = FALSE;
6836   __kmp_global.g.g_dynamic_mode = dynamic_default;
6837 
6838   __kmp_env_initialize(NULL);
6839 
6840 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
6841   __kmp_user_level_mwait_init();
6842 #endif
6843 // Print all messages in message catalog for testing purposes.
6844 #ifdef KMP_DEBUG
6845   char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6846   if (__kmp_str_match_true(val)) {
6847     kmp_str_buf_t buffer;
6848     __kmp_str_buf_init(&buffer);
6849     __kmp_i18n_dump_catalog(&buffer);
6850     __kmp_printf("%s", buffer.str);
6851     __kmp_str_buf_free(&buffer);
6852   }
6853   __kmp_env_free(&val);
6854 #endif
6855 
6856   __kmp_threads_capacity =
6857       __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6858   // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6859   __kmp_tp_capacity = __kmp_default_tp_capacity(
6860       __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6861 
6862   // If the library is shut down properly, both pools must be NULL. Just in
6863   // case, set them to NULL -- some memory may leak, but subsequent code will
6864   // work even if pools are not freed.
6865   KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6866   KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6867   KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6868   __kmp_thread_pool = NULL;
6869   __kmp_thread_pool_insert_pt = NULL;
6870   __kmp_team_pool = NULL;
6871 
6872   /* Allocate all of the variable sized records */
6873   /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6874    * expandable */
6875   /* Since allocation is cache-aligned, just add extra padding at the end */
6876   size =
6877       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6878       CACHE_LINE;
6879   __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6880   __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6881                                sizeof(kmp_info_t *) * __kmp_threads_capacity);
6882 
6883   /* init thread counts */
6884   KMP_DEBUG_ASSERT(__kmp_all_nth ==
6885                    0); // Asserts fail if the library is reinitializing and
6886   KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
6887   __kmp_all_nth = 0;
6888   __kmp_nth = 0;
6889 
6890   /* setup the uber master thread and hierarchy */
6891   gtid = __kmp_register_root(TRUE);
6892   KA_TRACE(10, ("__kmp_do_serial_initialize  T#%d\n", gtid));
6893   KMP_ASSERT(KMP_UBER_GTID(gtid));
6894   KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6895 
6896   KMP_MB(); /* Flush all pending memory write invalidates.  */
6897 
6898   __kmp_common_initialize();
6899 
6900 #if KMP_OS_UNIX
6901   /* invoke the child fork handler */
6902   __kmp_register_atfork();
6903 #endif
6904 
6905 #if !KMP_DYNAMIC_LIB
6906   {
6907     /* Invoke the exit handler when the program finishes, only for static
6908        library. For dynamic library, we already have _fini and DllMain. */
6909     int rc = atexit(__kmp_internal_end_atexit);
6910     if (rc != 0) {
6911       __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6912                   __kmp_msg_null);
6913     }
6914   }
6915 #endif
6916 
6917 #if KMP_HANDLE_SIGNALS
6918 #if KMP_OS_UNIX
6919   /* NOTE: make sure that this is called before the user installs their own
6920      signal handlers so that the user handlers are called first. this way they
6921      can return false, not call our handler, avoid terminating the library, and
6922      continue execution where they left off. */
6923   __kmp_install_signals(FALSE);
6924 #endif /* KMP_OS_UNIX */
6925 #if KMP_OS_WINDOWS
6926   __kmp_install_signals(TRUE);
6927 #endif /* KMP_OS_WINDOWS */
6928 #endif
6929 
6930   /* we have finished the serial initialization */
6931   __kmp_init_counter++;
6932 
6933   __kmp_init_serial = TRUE;
6934 
6935   if (__kmp_settings) {
6936     __kmp_env_print();
6937   }
6938 
6939   if (__kmp_display_env || __kmp_display_env_verbose) {
6940     __kmp_env_print_2();
6941   }
6942 
6943 #if OMPT_SUPPORT
6944   ompt_post_init();
6945 #endif
6946 
6947   KMP_MB();
6948 
6949   KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
6950 }
6951 
6952 void __kmp_serial_initialize(void) {
6953   if (__kmp_init_serial) {
6954     return;
6955   }
6956   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6957   if (__kmp_init_serial) {
6958     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6959     return;
6960   }
6961   __kmp_do_serial_initialize();
6962   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6963 }
6964 
// Performs the "middle" stage of runtime initialization:
//   1. runs serial initialization first if __kmp_init_serial is not yet set;
//   2. initializes affinity (when KMP_AFFINITY_SUPPORTED) and applies the
//      initial mask to every registered thread slot;
//   3. resolves __kmp_avail_proc and the default team size
//      (__kmp_dflt_team_nth), clamping it to [KMP_MIN_NTH, __kmp_sys_max_nth];
//   4. pushes a changed default to every registered thread whose nproc ICV is
//      still 0;
//   5. sets __kmp_init_middle.
// This routine takes no lock itself; the callers visible in this part of the
// file (__kmp_middle_initialize and __kmp_parallel_initialize) invoke it while
// holding __kmp_initz_lock.
static void __kmp_do_middle_initialize(void) {
  int i, j;
  int prev_dflt_team_nth;

  if (!__kmp_init_serial) {
    __kmp_do_serial_initialize();
  }

  KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));

  // Save the previous value for the __kmp_dflt_team_nth so that
  // we can avoid some reinitialization if it hasn't changed.
  prev_dflt_team_nth = __kmp_dflt_team_nth;

#if KMP_AFFINITY_SUPPORTED
  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
  // number of cores on the machine.
  __kmp_affinity_initialize();

  // Run through the __kmp_threads array and set the affinity mask
  // for each root thread that is currently registered with the RTL.
  for (i = 0; i < __kmp_threads_capacity; i++) {
    if (TCR_PTR(__kmp_threads[i]) != NULL) {
      __kmp_affinity_set_init_mask(i, TRUE);
    }
  }
#endif /* KMP_AFFINITY_SUPPORTED */

  KMP_ASSERT(__kmp_xproc > 0);
  // Fall back to the total processor count when no available-processor count
  // has been established yet.
  if (__kmp_avail_proc == 0) {
    __kmp_avail_proc = __kmp_xproc;
  }

  // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
  // correct them now
  // Only the leading run of zero entries is rewritten; the loop stops at the
  // first non-zero entry.
  j = 0;
  while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
    __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
        __kmp_avail_proc;
    j++;
  }

  if (__kmp_dflt_team_nth == 0) {
#ifdef KMP_DFLT_NTH_CORES
    // Default #threads = #cores
    __kmp_dflt_team_nth = __kmp_ncores;
    KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
                  "__kmp_ncores (%d)\n",
                  __kmp_dflt_team_nth));
#else
    // Default #threads = #available OS procs
    __kmp_dflt_team_nth = __kmp_avail_proc;
    KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
                  "__kmp_avail_proc(%d)\n",
                  __kmp_dflt_team_nth));
#endif /* KMP_DFLT_NTH_CORES */
  }

  // Clamp the default team size to the supported range.
  if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
    __kmp_dflt_team_nth = KMP_MIN_NTH;
  }
  if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
    __kmp_dflt_team_nth = __kmp_sys_max_nth;
  }

  // There's no harm in continuing if the following check fails,
  // but it indicates an error in the previous logic.
  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);

  if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
    // Run through the __kmp_threads array and set the num threads icv for each
    // root thread that is currently registered with the RTL (which has not
    // already explicitly set its nthreads-var with a call to
    // omp_set_num_threads()).
    for (i = 0; i < __kmp_threads_capacity; i++) {
      kmp_info_t *thread = __kmp_threads[i];
      if (thread == NULL)
        continue;
      // A non-zero nproc ICV is left untouched.
      if (thread->th.th_current_task->td_icvs.nproc != 0)
        continue;

      set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
    }
  }
  KA_TRACE(
      20,
      ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
       __kmp_dflt_team_nth));

#ifdef KMP_ADJUST_BLOCKTIME
  /* Adjust blocktime to zero if necessary  now that __kmp_avail_proc is set */
  // Only done when __kmp_env_blocktime is not set; __kmp_zero_bt is raised
  // when more threads exist (__kmp_nth) than available processors.
  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
    KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
    if (__kmp_nth > __kmp_avail_proc) {
      __kmp_zero_bt = TRUE;
    }
  }
#endif /* KMP_ADJUST_BLOCKTIME */

  /* we have finished middle initialization */
  TCW_SYNC_4(__kmp_init_middle, TRUE);

  KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
}
7069 
7070 void __kmp_middle_initialize(void) {
7071   if (__kmp_init_middle) {
7072     return;
7073   }
7074   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7075   if (__kmp_init_middle) {
7076     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7077     return;
7078   }
7079   __kmp_do_middle_initialize();
7080   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7081 }
7082 
// Performs the "parallel" stage of runtime initialization under
// __kmp_initz_lock: runs middle initialization if needed, resumes from a hard
// pause, records the x86 FP control state (x86/x86_64 only), installs signal
// handlers (Unix with KMP_HANDLE_SIGNALS), initializes suspend support,
// resolves the default dynamic mode, optionally prints version information,
// and finally sets __kmp_init_parallel.
// Returns immediately if __kmp_init_parallel is already set, checked both
// before and after taking the lock.
void __kmp_parallel_initialize(void) {
  int gtid = __kmp_entry_gtid(); // this might be a new root

  /* synchronize parallel initialization (for sibling) */
  if (TCR_4(__kmp_init_parallel))
    return;
  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
  if (TCR_4(__kmp_init_parallel)) {
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
    return;
  }

  /* TODO reinitialization after we have already shut down */
  if (TCR_4(__kmp_global.g.g_done)) {
    KA_TRACE(
        10,
        ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
    __kmp_infinite_loop();
  }

  /* jc: The lock __kmp_initz_lock is already held, so calling
     __kmp_middle_initialize would cause a deadlock.  So we call
     __kmp_do_middle_initialize directly. */
  if (!__kmp_init_middle) {
    __kmp_do_middle_initialize();
  }
  __kmp_resume_if_hard_paused();

  /* begin initialization */
  KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
  KMP_ASSERT(KMP_UBER_GTID(gtid));

#if KMP_ARCH_X86 || KMP_ARCH_X86_64
  // Save the FP control regs.
  // Worker threads will set theirs to these values at thread startup.
  __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
  __kmp_store_mxcsr(&__kmp_init_mxcsr);
  __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

#if KMP_OS_UNIX
#if KMP_HANDLE_SIGNALS
  /*  must be after __kmp_serial_initialize  */
  __kmp_install_signals(TRUE);
#endif
#endif

  __kmp_suspend_initialize();

  // Replace the placeholder dynamic mode with a concrete one: load balancing
  // when USE_LOAD_BALANCE is defined, thread-limit otherwise.
#if defined(USE_LOAD_BALANCE)
  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
    __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
  }
#else
  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
    __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
  }
#endif

  if (__kmp_version) {
    __kmp_print_version_2();
  }

  /* we have finished parallel initialization */
  TCW_SYNC_4(__kmp_init_parallel, TRUE);

  KMP_MB();
  KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));

  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
}
7154 
7155 void __kmp_hidden_helper_initialize() {
7156   if (TCR_4(__kmp_init_hidden_helper))
7157     return;
7158 
7159   // __kmp_parallel_initialize is required before we initialize hidden helper
7160   if (!TCR_4(__kmp_init_parallel))
7161     __kmp_parallel_initialize();
7162 
7163   // Double check. Note that this double check should not be placed before
7164   // __kmp_parallel_initialize as it will cause dead lock.
7165   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7166   if (TCR_4(__kmp_init_hidden_helper)) {
7167     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7168     return;
7169   }
7170 
7171   // Set the count of hidden helper tasks to be executed to zero
7172   KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7173 
7174   // Set the global variable indicating that we're initializing hidden helper
7175   // team/threads
7176   TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7177 
7178   // Platform independent initialization
7179   __kmp_do_initialize_hidden_helper_threads();
7180 
7181   // Wait here for the finish of initialization of hidden helper teams
7182   __kmp_hidden_helper_threads_initz_wait();
7183 
7184   // We have finished hidden helper initialization
7185   TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7186 
7187   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7188 }
7189 
7190 /* ------------------------------------------------------------------------ */
7191 
7192 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7193                                    kmp_team_t *team) {
7194   kmp_disp_t *dispatch;
7195 
7196   KMP_MB();
7197 
7198   /* none of the threads have encountered any constructs, yet. */
7199   this_thr->th.th_local.this_construct = 0;
7200 #if KMP_CACHE_MANAGE
7201   KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7202 #endif /* KMP_CACHE_MANAGE */
7203   dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7204   KMP_DEBUG_ASSERT(dispatch);
7205   KMP_DEBUG_ASSERT(team->t.t_dispatch);
7206   // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7207   // this_thr->th.th_info.ds.ds_tid ] );
7208 
7209   dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7210   dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7211   if (__kmp_env_consistency_check)
7212     __kmp_push_parallel(gtid, team->t.t_ident);
7213 
7214   KMP_MB(); /* Flush all pending memory write invalidates.  */
7215 }
7216 
7217 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7218                                   kmp_team_t *team) {
7219   if (__kmp_env_consistency_check)
7220     __kmp_pop_parallel(gtid, team->t.t_ident);
7221 
7222   __kmp_finish_implicit_task(this_thr);
7223 }
7224 
// Runs the current team's microtask (team->t.t_pkfn) on the thread identified
// by gtid and returns the value produced by __kmp_invoke_microtask().
// The call is bracketed by __kmp_run_before_invoked_task() /
// __kmp_run_after_invoked_task(), and, depending on build options, by ITT
// stack notifications, an OMPT implicit-task begin callback, and stats timers.
int __kmp_invoke_task_func(int gtid) {
  int rc;
  int tid = __kmp_tid_from_gtid(gtid);
  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team = this_thr->th.th_team;

  __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
#if USE_ITT_BUILD
  if (__itt_stack_caller_create_ptr) {
    __kmp_itt_stack_callee_enter(
        (__itt_caller)
            team->t.t_stack_id); // inform ittnotify about entering user's code
  }
#endif /* USE_ITT_BUILD */
#if INCLUDE_SSC_MARKS
  SSC_MARK_INVOKING();
#endif

#if OMPT_SUPPORT
  // `dummy` is only a write target for the exit frame pointer when OMPT is
  // not enabled; its value is never read in this function.
  void *dummy;
  void **exit_frame_p;
  ompt_data_t *my_task_data;
  ompt_data_t *my_parallel_data;
  int ompt_team_size;

  if (ompt_enabled.enabled) {
    exit_frame_p = &(
        team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_frame.ptr);
  } else {
    exit_frame_p = &dummy;
  }

  my_task_data =
      &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
  my_parallel_data = &(team->t.ompt_team_info.parallel_data);
  // Report the beginning of the implicit task and record the thread number in
  // the current task info.
  if (ompt_enabled.ompt_callback_implicit_task) {
    ompt_team_size = team->t.t_nproc;
    ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
        ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
        __kmp_tid_from_gtid(gtid), ompt_task_implicit);
    OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
  }
#endif

#if KMP_STATS_ENABLED
  // Choose the timer by the state the thread was in on entry, then mark the
  // thread as running an implicit task.
  stats_state_e previous_state = KMP_GET_THREAD_STATE();
  if (previous_state == stats_state_e::TEAMS_REGION) {
    KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
  } else {
    KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
  }
  KMP_SET_THREAD_STATE(IMPLICIT_TASK);
#endif

  rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
                              tid, (int)team->t.t_argc, (void **)team->t.t_argv
#if OMPT_SUPPORT
                              ,
                              exit_frame_p
#endif
                              );
#if OMPT_SUPPORT
  // Clear the exit frame after the microtask returns and flag the parallel
  // region as a team region.
  *exit_frame_p = NULL;
   this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
#endif

#if KMP_STATS_ENABLED
  // The thread state is restored only when the entry state was TEAMS_REGION.
  if (previous_state == stats_state_e::TEAMS_REGION) {
    KMP_SET_THREAD_STATE(previous_state);
  }
  KMP_POP_PARTITIONED_TIMER();
#endif

#if USE_ITT_BUILD
  if (__itt_stack_caller_create_ptr) {
    __kmp_itt_stack_callee_leave(
        (__itt_caller)
            team->t.t_stack_id); // inform ittnotify about leaving user's code
  }
#endif /* USE_ITT_BUILD */
  __kmp_run_after_invoked_task(gtid, tid, this_thr, team);

  return rc;
}
7309 
7310 void __kmp_teams_master(int gtid) {
7311   // This routine is called by all master threads in teams construct
7312   kmp_info_t *thr = __kmp_threads[gtid];
7313   kmp_team_t *team = thr->th.th_team;
7314   ident_t *loc = team->t.t_ident;
7315   thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7316   KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7317   KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7318   KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7319                 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7320 
7321   // This thread is a new CG root.  Set up the proper variables.
7322   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7323   tmp->cg_root = thr; // Make thr the CG root
7324   // Init to thread limit that was stored when league masters were forked
7325   tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7326   tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7327   KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7328                  " cg_nthreads to 1\n",
7329                  thr, tmp));
7330   tmp->up = thr->th.th_cg_roots;
7331   thr->th.th_cg_roots = tmp;
7332 
7333 // Launch league of teams now, but not let workers execute
7334 // (they hang on fork barrier until next parallel)
7335 #if INCLUDE_SSC_MARKS
7336   SSC_MARK_FORKING();
7337 #endif
7338   __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7339                   (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7340                   VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7341 #if INCLUDE_SSC_MARKS
7342   SSC_MARK_JOINING();
7343 #endif
7344   // If the team size was reduced from the limit, set it to the new size
7345   if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7346     thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7347   // AC: last parameter "1" eliminates join barrier which won't work because
7348   // worker threads are in a fork barrier waiting for more parallel regions
7349   __kmp_join_call(loc, gtid
7350 #if OMPT_SUPPORT
7351                   ,
7352                   fork_context_intel
7353 #endif
7354                   ,
7355                   1);
7356 }
7357 
7358 int __kmp_invoke_teams_master(int gtid) {
7359   kmp_info_t *this_thr = __kmp_threads[gtid];
7360   kmp_team_t *team = this_thr->th.th_team;
7361 #if KMP_DEBUG
7362   if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7363     KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7364                      (void *)__kmp_teams_master);
7365 #endif
7366   __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7367 #if OMPT_SUPPORT
7368   int tid = __kmp_tid_from_gtid(gtid);
7369   ompt_data_t *task_data =
7370       &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7371   ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7372   if (ompt_enabled.ompt_callback_implicit_task) {
7373     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7374         ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7375         ompt_task_initial);
7376     OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7377   }
7378 #endif
7379   __kmp_teams_master(gtid);
7380 #if OMPT_SUPPORT
7381   this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7382 #endif
7383   __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7384   return 1;
7385 }
7386 
7387 /* this sets the requested number of threads for the next parallel region
7388    encountered by this team. since this should be enclosed in the forkjoin
7389    critical section it should avoid race conditions with asymmetrical nested
7390    parallelism */
7391 
7392 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7393   kmp_info_t *thr = __kmp_threads[gtid];
7394 
7395   if (num_threads > 0)
7396     thr->th.th_set_nproc = num_threads;
7397 }
7398 
7399 /* this sets the requested number of teams for the teams region and/or
7400    the number of threads for the next parallel region encountered  */
7401 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7402                           int num_threads) {
7403   kmp_info_t *thr = __kmp_threads[gtid];
7404   KMP_DEBUG_ASSERT(num_teams >= 0);
7405   KMP_DEBUG_ASSERT(num_threads >= 0);
7406 
7407   if (num_teams == 0)
7408     num_teams = 1; // default number of teams is 1.
7409   if (num_teams > __kmp_teams_max_nth) { // if too many teams requested?
7410     if (!__kmp_reserve_warn) {
7411       __kmp_reserve_warn = 1;
7412       __kmp_msg(kmp_ms_warning,
7413                 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7414                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7415     }
7416     num_teams = __kmp_teams_max_nth;
7417   }
7418   // Set number of teams (number of threads in the outer "parallel" of the
7419   // teams)
7420   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7421 
7422   // Remember the number of threads for inner parallel regions
7423   if (!TCR_4(__kmp_init_middle))
7424     __kmp_middle_initialize(); // get internal globals calculated
7425   KMP_DEBUG_ASSERT(__kmp_avail_proc);
7426   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7427   if (num_threads == 0) {
7428     num_threads = __kmp_avail_proc / num_teams;
7429     // adjust num_threads w/o warning as it is not user setting
7430     // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7431     // no thread_limit clause specified -  do not change thread-limit-var ICV
7432     if (num_threads > __kmp_dflt_team_nth) {
7433       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7434     }
7435     if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7436       num_threads = thr->th.th_current_task->td_icvs.thread_limit;
7437     } // prevent team size to exceed thread-limit-var
7438     if (num_teams * num_threads > __kmp_teams_max_nth) {
7439       num_threads = __kmp_teams_max_nth / num_teams;
7440     }
7441   } else {
7442     // This thread will be the master of the league masters
7443     // Store new thread limit; old limit is saved in th_cg_roots list
7444     thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7445     // num_threads = min(num_threads, nthreads-var)
7446     if (num_threads > __kmp_dflt_team_nth) {
7447       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7448     }
7449     if (num_teams * num_threads > __kmp_teams_max_nth) {
7450       int new_threads = __kmp_teams_max_nth / num_teams;
7451       if (!__kmp_reserve_warn) { // user asked for too many threads
7452         __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7453         __kmp_msg(kmp_ms_warning,
7454                   KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7455                   KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7456       }
7457       num_threads = new_threads;
7458     }
7459   }
7460   thr->th.th_teams_size.nth = num_threads;
7461 }
7462 
7463 // Set the proc_bind var to use in the following parallel region.
7464 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7465   kmp_info_t *thr = __kmp_threads[gtid];
7466   thr->th.th_set_proc_bind = proc_bind;
7467 }
7468 
7469 /* Launch the worker threads into the microtask. */
7470 
7471 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7472   kmp_info_t *this_thr = __kmp_threads[gtid];
7473 
7474 #ifdef KMP_DEBUG
7475   int f;
7476 #endif /* KMP_DEBUG */
7477 
7478   KMP_DEBUG_ASSERT(team);
7479   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7480   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7481   KMP_MB(); /* Flush all pending memory write invalidates.  */
7482 
7483   team->t.t_construct = 0; /* no single directives seen yet */
7484   team->t.t_ordered.dt.t_value =
7485       0; /* thread 0 enters the ordered section first */
7486 
7487   /* Reset the identifiers on the dispatch buffer */
7488   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7489   if (team->t.t_max_nproc > 1) {
7490     int i;
7491     for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7492       team->t.t_disp_buffer[i].buffer_index = i;
7493       team->t.t_disp_buffer[i].doacross_buf_idx = i;
7494     }
7495   } else {
7496     team->t.t_disp_buffer[0].buffer_index = 0;
7497     team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7498   }
7499 
7500   KMP_MB(); /* Flush all pending memory write invalidates.  */
7501   KMP_ASSERT(this_thr->th.th_team == team);
7502 
7503 #ifdef KMP_DEBUG
7504   for (f = 0; f < team->t.t_nproc; f++) {
7505     KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7506                      team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7507   }
7508 #endif /* KMP_DEBUG */
7509 
7510   /* release the worker threads so they may begin working */
7511   __kmp_fork_barrier(gtid, 0);
7512 }
7513 
// Waits at the join barrier for the team of the thread identified by gtid.
// In debug builds the thread's recorded team size is first checked against
// team->t.t_nproc, dumping runtime structures on mismatch. With OMPT enabled,
// and only if the thread is in state ompt_state_wait_barrier_implicit after
// the barrier, the state is moved to ompt_state_overhead and the matching
// "scope end" callbacks are issued. `id` is not used.
void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
  kmp_info_t *this_thr = __kmp_threads[gtid];

  KMP_DEBUG_ASSERT(team);
  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
  KMP_ASSERT(KMP_MASTER_GTID(gtid));
  KMP_MB(); /* Flush all pending memory write invalidates.  */

/* Join barrier after fork */

#ifdef KMP_DEBUG
  // Print diagnostics before the assertion below fires so the mismatch can be
  // inspected.
  if (__kmp_threads[gtid] &&
      __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
    __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
                 __kmp_threads[gtid]);
    __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
                 "team->t.t_nproc=%d\n",
                 gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
                 team->t.t_nproc);
    __kmp_print_structure();
  }
  KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
                   __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
#endif /* KMP_DEBUG */

  __kmp_join_barrier(gtid); /* wait for everyone */
#if OMPT_SUPPORT
  if (ompt_enabled.enabled &&
      this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
    int ds_tid = this_thr->th.th_info.ds.ds_tid;
    ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
#if OMPT_OPTIONAL
    // The return address is supplied only for the master tid and only when
    // one of the two sync-region callbacks is registered; otherwise NULL.
    void *codeptr = NULL;
    if (KMP_MASTER_TID(ds_tid) &&
        (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
         ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
      codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;

    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
          codeptr);
    }
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
          codeptr);
    }
#endif
    // The implicit-task end callback is issued here for non-master tids only.
    if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
    }
  }
#endif

  KMP_MB(); /* Flush all pending memory write invalidates.  */
  KMP_ASSERT(this_thr->th.th_team == team);
}
7574 
7575 /* ------------------------------------------------------------------------ */
7576 
7577 #ifdef USE_LOAD_BALANCE
7578 
7579 // Return the worker threads actively spinning in the hot team, if we
7580 // are at the outermost level of parallelism.  Otherwise, return 0.
7581 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7582   int i;
7583   int retval;
7584   kmp_team_t *hot_team;
7585 
7586   if (root->r.r_active) {
7587     return 0;
7588   }
7589   hot_team = root->r.r_hot_team;
7590   if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7591     return hot_team->t.t_nproc - 1; // Don't count master thread
7592   }
7593 
7594   // Skip the master thread - it is accounted for elsewhere.
7595   retval = 0;
7596   for (i = 1; i < hot_team->t.t_nproc; i++) {
7597     if (hot_team->t.t_threads[i]->th.th_active) {
7598       retval++;
7599     }
7600   }
7601   return retval;
7602 }
7603 
7604 // Perform an automatic adjustment to the number of
7605 // threads used by the next parallel region.
7606 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7607   int retval;
7608   int pool_active;
7609   int hot_team_active;
7610   int team_curr_active;
7611   int system_active;
7612 
7613   KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7614                 set_nproc));
7615   KMP_DEBUG_ASSERT(root);
7616   KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7617                        ->th.th_current_task->td_icvs.dynamic == TRUE);
7618   KMP_DEBUG_ASSERT(set_nproc > 1);
7619 
7620   if (set_nproc == 1) {
7621     KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7622     return 1;
7623   }
7624 
7625   // Threads that are active in the thread pool, active in the hot team for this
7626   // particular root (if we are at the outer par level), and the currently
7627   // executing thread (to become the master) are available to add to the new
7628   // team, but are currently contributing to the system load, and must be
7629   // accounted for.
7630   pool_active = __kmp_thread_pool_active_nth;
7631   hot_team_active = __kmp_active_hot_team_nproc(root);
7632   team_curr_active = pool_active + hot_team_active + 1;
7633 
7634   // Check the system load.
7635   system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7636   KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7637                 "hot team active = %d\n",
7638                 system_active, pool_active, hot_team_active));
7639 
7640   if (system_active < 0) {
7641     // There was an error reading the necessary info from /proc, so use the
7642     // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7643     // = dynamic_thread_limit, we shouldn't wind up getting back here.
7644     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7645     KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7646 
7647     // Make this call behave like the thread limit algorithm.
7648     retval = __kmp_avail_proc - __kmp_nth +
7649              (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7650     if (retval > set_nproc) {
7651       retval = set_nproc;
7652     }
7653     if (retval < KMP_MIN_NTH) {
7654       retval = KMP_MIN_NTH;
7655     }
7656 
7657     KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7658                   retval));
7659     return retval;
7660   }
7661 
7662   // There is a slight delay in the load balance algorithm in detecting new
7663   // running procs. The real system load at this instant should be at least as
7664   // large as the #active omp thread that are available to add to the team.
7665   if (system_active < team_curr_active) {
7666     system_active = team_curr_active;
7667   }
7668   retval = __kmp_avail_proc - system_active + team_curr_active;
7669   if (retval > set_nproc) {
7670     retval = set_nproc;
7671   }
7672   if (retval < KMP_MIN_NTH) {
7673     retval = KMP_MIN_NTH;
7674   }
7675 
7676   KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7677   return retval;
7678 } // __kmp_load_balance_nproc()
7679 
7680 #endif /* USE_LOAD_BALANCE */
7681 
7682 /* ------------------------------------------------------------------------ */
7683 
7684 /* NOTE: this is called with the __kmp_init_lock held */
// Tears down runtime state in the reverse order of initialization (parallel,
// then middle, then serial), clearing the corresponding init flag after each
// stage, and then releases the global tables and settings storage: root
// descriptors, the combined __kmp_threads/__kmp_root block, user locks, the
// cpuinfo file name, nested nth / proc-bind lists, the affinity format string,
// the message catalog, and (build dependent) hierarchical schedules and stats.
void __kmp_cleanup(void) {
  int f;

  KA_TRACE(10, ("__kmp_cleanup: enter\n"));

  if (TCR_4(__kmp_init_parallel)) {
#if KMP_HANDLE_SIGNALS
    __kmp_remove_signals();
#endif
    TCW_4(__kmp_init_parallel, FALSE);
  }

  if (TCR_4(__kmp_init_middle)) {
#if KMP_AFFINITY_SUPPORTED
    __kmp_affinity_uninitialize();
#endif /* KMP_AFFINITY_SUPPORTED */
    __kmp_cleanup_hierarchy();
    TCW_4(__kmp_init_middle, FALSE);
  }

  KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));

  if (__kmp_init_serial) {
    __kmp_runtime_destroy();
    __kmp_init_serial = FALSE;
  }

  __kmp_cleanup_threadprivate_caches();

  // Free each root descriptor individually; the __kmp_root array itself is
  // part of the __kmp_threads allocation released below.
  for (f = 0; f < __kmp_threads_capacity; f++) {
    if (__kmp_root[f] != NULL) {
      __kmp_free(__kmp_root[f]);
      __kmp_root[f] = NULL;
    }
  }
  __kmp_free(__kmp_threads);
  // __kmp_threads and __kmp_root were allocated at once, as single block, so
  // there is no need in freeing __kmp_root.
  __kmp_threads = NULL;
  __kmp_root = NULL;
  __kmp_threads_capacity = 0;

#if KMP_USE_DYNAMIC_LOCK
  __kmp_cleanup_indirect_user_locks();
#else
  __kmp_cleanup_user_locks();
#endif

#if KMP_AFFINITY_SUPPORTED
  KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
  __kmp_cpuinfo_file = NULL;
#endif /* KMP_AFFINITY_SUPPORTED */

#if KMP_USE_ADAPTIVE_LOCKS
#if KMP_DEBUG_ADAPTIVE_LOCKS
  __kmp_print_speculative_stats();
#endif
#endif
  // Release the nested settings lists and reset their bookkeeping fields.
  KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
  __kmp_nested_nth.nth = NULL;
  __kmp_nested_nth.size = 0;
  __kmp_nested_nth.used = 0;
  KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
  __kmp_nested_proc_bind.bind_types = NULL;
  __kmp_nested_proc_bind.size = 0;
  __kmp_nested_proc_bind.used = 0;
  if (__kmp_affinity_format) {
    KMP_INTERNAL_FREE(__kmp_affinity_format);
    __kmp_affinity_format = NULL;
  }

  __kmp_i18n_catclose();

#if KMP_USE_HIER_SCHED
  __kmp_hier_scheds.deallocate();
#endif

#if KMP_STATS_ENABLED
  __kmp_stats_fini();
#endif

  KA_TRACE(10, ("__kmp_cleanup: exit\n"));
}
7768 
7769 /* ------------------------------------------------------------------------ */
7770 
7771 int __kmp_ignore_mppbeg(void) {
7772   char *env;
7773 
7774   if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7775     if (__kmp_str_match_false(env))
7776       return FALSE;
7777   }
7778   // By default __kmpc_begin() is no-op.
7779   return TRUE;
7780 }
7781 
7782 int __kmp_ignore_mppend(void) {
7783   char *env;
7784 
7785   if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7786     if (__kmp_str_match_false(env))
7787       return FALSE;
7788   }
7789   // By default __kmpc_end() is no-op.
7790   return TRUE;
7791 }
7792 
7793 void __kmp_internal_begin(void) {
7794   int gtid;
7795   kmp_root_t *root;
7796 
7797   /* this is a very important step as it will register new sibling threads
7798      and assign these new uber threads a new gtid */
7799   gtid = __kmp_entry_gtid();
7800   root = __kmp_threads[gtid]->th.th_root;
7801   KMP_ASSERT(KMP_UBER_GTID(gtid));
7802 
7803   if (root->r.r_begin)
7804     return;
7805   __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7806   if (root->r.r_begin) {
7807     __kmp_release_lock(&root->r.r_begin_lock, gtid);
7808     return;
7809   }
7810 
7811   root->r.r_begin = TRUE;
7812 
7813   __kmp_release_lock(&root->r.r_begin_lock, gtid);
7814 }
7815 
7816 /* ------------------------------------------------------------------------ */
7817 
7818 void __kmp_user_set_library(enum library_type arg) {
7819   int gtid;
7820   kmp_root_t *root;
7821   kmp_info_t *thread;
7822 
7823   /* first, make sure we are initialized so we can get our gtid */
7824 
7825   gtid = __kmp_entry_gtid();
7826   thread = __kmp_threads[gtid];
7827 
7828   root = thread->th.th_root;
7829 
7830   KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7831                 library_serial));
7832   if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7833                                   thread */
7834     KMP_WARNING(SetLibraryIncorrectCall);
7835     return;
7836   }
7837 
7838   switch (arg) {
7839   case library_serial:
7840     thread->th.th_set_nproc = 0;
7841     set__nproc(thread, 1);
7842     break;
7843   case library_turnaround:
7844     thread->th.th_set_nproc = 0;
7845     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7846                                            : __kmp_dflt_team_nth_ub);
7847     break;
7848   case library_throughput:
7849     thread->th.th_set_nproc = 0;
7850     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7851                                            : __kmp_dflt_team_nth_ub);
7852     break;
7853   default:
7854     KMP_FATAL(UnknownLibraryType, arg);
7855   }
7856 
7857   __kmp_aux_set_library(arg);
7858 }
7859 
7860 void __kmp_aux_set_stacksize(size_t arg) {
7861   if (!__kmp_init_serial)
7862     __kmp_serial_initialize();
7863 
7864 #if KMP_OS_DARWIN
7865   if (arg & (0x1000 - 1)) {
7866     arg &= ~(0x1000 - 1);
7867     if (arg + 0x1000) /* check for overflow if we round up */
7868       arg += 0x1000;
7869   }
7870 #endif
7871   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7872 
7873   /* only change the default stacksize before the first parallel region */
7874   if (!TCR_4(__kmp_init_parallel)) {
7875     size_t value = arg; /* argument is in bytes */
7876 
7877     if (value < __kmp_sys_min_stksize)
7878       value = __kmp_sys_min_stksize;
7879     else if (value > KMP_MAX_STKSIZE)
7880       value = KMP_MAX_STKSIZE;
7881 
7882     __kmp_stksize = value;
7883 
7884     __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
7885   }
7886 
7887   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7888 }
7889 
7890 /* set the behaviour of the runtime library */
7891 /* TODO this can cause some odd behaviour with sibling parallelism... */
7892 void __kmp_aux_set_library(enum library_type arg) {
7893   __kmp_library = arg;
7894 
7895   switch (__kmp_library) {
7896   case library_serial: {
7897     KMP_INFORM(LibraryIsSerial);
7898   } break;
7899   case library_turnaround:
7900     if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
7901       __kmp_use_yield = 2; // only yield when oversubscribed
7902     break;
7903   case library_throughput:
7904     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
7905       __kmp_dflt_blocktime = 200;
7906     break;
7907   default:
7908     KMP_FATAL(UnknownLibraryType, arg);
7909   }
7910 }
7911 
/* Getting team information common for all team API */
// Returns NULL if not in teams construct
//
// Used by __kmp_aux_get_team_num() and __kmp_aux_get_num_teams() to locate
// the team they report on for the calling thread.
//
// teams_serialized (out): always written. It is reset to 0 first; when the
// thread has th_teams_microtask set it receives t_serialized of the thread's
// current team and, if the walk below runs, whatever part of a team's
// serialization count was left unconsumed when the walk stopped.
//
// Return value: NULL when th_teams_microtask is not set; otherwise the team
// reached by walking t_parent links from the current team until the tracked
// nesting level is no longer above th_teams_level + 1.
static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
  kmp_info_t *thr = __kmp_entry_thread();
  teams_serialized = 0;
  if (thr->th.th_teams_microtask) {
    kmp_team_t *team = thr->th.th_team;
    int tlevel = thr->th.th_teams_level; // the level of the teams construct
    int ii = team->t.t_level; // nesting level tracked while walking up
    teams_serialized = team->t.t_serialized;
    int level = tlevel + 1; // level at which the walk stops
    KMP_DEBUG_ASSERT(ii >= tlevel);
    while (ii > level) {
      // Each serialized nesting recorded on this team accounts for one
      // level; consume them one by one (empty loop body is intentional).
      for (teams_serialized = team->t.t_serialized;
           (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
      }
      // All serialized levels of a serialized team were consumed: step to the
      // parent without charging an extra level.
      if (team->t.t_serialized && (!teams_serialized)) {
        team = team->t.t_parent;
        continue;
      }
      // Otherwise moving to the parent itself costs one level.
      if (ii > level) {
        team = team->t.t_parent;
        ii--;
      }
    }
    return team;
  }
  return NULL;
}
7941 
7942 int __kmp_aux_get_team_num() {
7943   int serialized;
7944   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7945   if (team) {
7946     if (serialized > 1) {
7947       return 0; // teams region is serialized ( 1 team of 1 thread ).
7948     } else {
7949       return team->t.t_master_tid;
7950     }
7951   }
7952   return 0;
7953 }
7954 
7955 int __kmp_aux_get_num_teams() {
7956   int serialized;
7957   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7958   if (team) {
7959     if (serialized > 1) {
7960       return 1;
7961     } else {
7962       return team->t.t_parent->t.t_nproc;
7963     }
7964   }
7965   return 1;
7966 }
7967 
7968 /* ------------------------------------------------------------------------ */
7969 
7970 /*
7971  * Affinity Format Parser
7972  *
7973  * Field is in form of: %[[[0].]size]type
7974  * % and type are required (%% means print a literal '%')
7975  * type is either single char or long name surrounded by {},
7976  * e.g., N or {num_threads}
7977  * 0 => leading zeros
7978  * . => right justified when size is specified
7979  * by default output is left justified
7980  * size is the *minimum* field length
7981  * All other characters are printed as is
7982  *
7983  * Available field types:
7984  * L {thread_level}      - omp_get_level()
7985  * n {thread_num}        - omp_get_thread_num()
7986  * h {host}              - name of host machine
7987  * P {process_id}        - process id (integer)
7988  * T {thread_identifier} - native thread identifier (integer)
7989  * N {num_threads}       - omp_get_num_threads()
7990  * A {ancestor_tnum}     - omp_get_ancestor_thread_num(omp_get_level()-1)
7991  * a {thread_affinity}   - comma separated list of integers or integer ranges
7992  *                         (values of affinity mask)
7993  *
7994  * Implementation-specific field types can be added
7995  * If a type is unknown, print "undefined"
7996 */
7997 
// Describes one affinity-format field type: its single-character name, its
// long name (the text written between braces in a format string), and the
// printf-style conversion character used to print its value. A table of
// these entries represents every valid keyword field type.
typedef struct kmp_affinity_format_field_t {
  char short_name; // single-character name, e.g., L -> nesting level
  const char *long_name; // long name, e.g., nesting_level -> nesting level
  char field_format; // conversion character appended to the generated format
  // ('d' for integer values, 's' for string values)
} kmp_affinity_format_field_t;
8007 
// All recognized affinity-format field types: {short name, long name,
// conversion character}. The parser scans this table in order and stops at
// the first entry whose name matches. The thread_affinity entry exists only
// when KMP_AFFINITY_SUPPORTED is set.
static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
#if KMP_AFFINITY_SUPPORTED
    {'A', "thread_affinity", 's'},
#endif
    {'t', "team_num", 'd'},
    {'T', "num_teams", 'd'},
    {'L', "nesting_level", 'd'},
    {'n', "thread_num", 'd'},
    {'N', "num_threads", 'd'},
    {'a', "ancestor_tnum", 'd'},
    {'H', "host", 's'},
    {'P', "process_id", 'd'},
    {'i', "native_thread_id", 'd'}};
8021 
8022 // Return the number of characters it takes to hold field
8023 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8024                                             const char **ptr,
8025                                             kmp_str_buf_t *field_buffer) {
8026   int rc, format_index, field_value;
8027   const char *width_left, *width_right;
8028   bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8029   static const int FORMAT_SIZE = 20;
8030   char format[FORMAT_SIZE] = {0};
8031   char absolute_short_name = 0;
8032 
8033   KMP_DEBUG_ASSERT(gtid >= 0);
8034   KMP_DEBUG_ASSERT(th);
8035   KMP_DEBUG_ASSERT(**ptr == '%');
8036   KMP_DEBUG_ASSERT(field_buffer);
8037 
8038   __kmp_str_buf_clear(field_buffer);
8039 
8040   // Skip the initial %
8041   (*ptr)++;
8042 
8043   // Check for %% first
8044   if (**ptr == '%') {
8045     __kmp_str_buf_cat(field_buffer, "%", 1);
8046     (*ptr)++; // skip over the second %
8047     return 1;
8048   }
8049 
8050   // Parse field modifiers if they are present
8051   pad_zeros = false;
8052   if (**ptr == '0') {
8053     pad_zeros = true;
8054     (*ptr)++; // skip over 0
8055   }
8056   right_justify = false;
8057   if (**ptr == '.') {
8058     right_justify = true;
8059     (*ptr)++; // skip over .
8060   }
8061   // Parse width of field: [width_left, width_right)
8062   width_left = width_right = NULL;
8063   if (**ptr >= '0' && **ptr <= '9') {
8064     width_left = *ptr;
8065     SKIP_DIGITS(*ptr);
8066     width_right = *ptr;
8067   }
8068 
8069   // Create the format for KMP_SNPRINTF based on flags parsed above
8070   format_index = 0;
8071   format[format_index++] = '%';
8072   if (!right_justify)
8073     format[format_index++] = '-';
8074   if (pad_zeros)
8075     format[format_index++] = '0';
8076   if (width_left && width_right) {
8077     int i = 0;
8078     // Only allow 8 digit number widths.
8079     // This also prevents overflowing format variable
8080     while (i < 8 && width_left < width_right) {
8081       format[format_index++] = *width_left;
8082       width_left++;
8083       i++;
8084     }
8085   }
8086 
8087   // Parse a name (long or short)
8088   // Canonicalize the name into absolute_short_name
8089   found_valid_name = false;
8090   parse_long_name = (**ptr == '{');
8091   if (parse_long_name)
8092     (*ptr)++; // skip initial left brace
8093   for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8094                              sizeof(__kmp_affinity_format_table[0]);
8095        ++i) {
8096     char short_name = __kmp_affinity_format_table[i].short_name;
8097     const char *long_name = __kmp_affinity_format_table[i].long_name;
8098     char field_format = __kmp_affinity_format_table[i].field_format;
8099     if (parse_long_name) {
8100       size_t length = KMP_STRLEN(long_name);
8101       if (strncmp(*ptr, long_name, length) == 0) {
8102         found_valid_name = true;
8103         (*ptr) += length; // skip the long name
8104       }
8105     } else if (**ptr == short_name) {
8106       found_valid_name = true;
8107       (*ptr)++; // skip the short name
8108     }
8109     if (found_valid_name) {
8110       format[format_index++] = field_format;
8111       format[format_index++] = '\0';
8112       absolute_short_name = short_name;
8113       break;
8114     }
8115   }
8116   if (parse_long_name) {
8117     if (**ptr != '}') {
8118       absolute_short_name = 0;
8119     } else {
8120       (*ptr)++; // skip over the right brace
8121     }
8122   }
8123 
8124   // Attempt to fill the buffer with the requested
8125   // value using snprintf within __kmp_str_buf_print()
8126   switch (absolute_short_name) {
8127   case 't':
8128     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8129     break;
8130   case 'T':
8131     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8132     break;
8133   case 'L':
8134     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8135     break;
8136   case 'n':
8137     rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8138     break;
8139   case 'H': {
8140     static const int BUFFER_SIZE = 256;
8141     char buf[BUFFER_SIZE];
8142     __kmp_expand_host_name(buf, BUFFER_SIZE);
8143     rc = __kmp_str_buf_print(field_buffer, format, buf);
8144   } break;
8145   case 'P':
8146     rc = __kmp_str_buf_print(field_buffer, format, getpid());
8147     break;
8148   case 'i':
8149     rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8150     break;
8151   case 'N':
8152     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8153     break;
8154   case 'a':
8155     field_value =
8156         __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8157     rc = __kmp_str_buf_print(field_buffer, format, field_value);
8158     break;
8159 #if KMP_AFFINITY_SUPPORTED
8160   case 'A': {
8161     kmp_str_buf_t buf;
8162     __kmp_str_buf_init(&buf);
8163     __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8164     rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8165     __kmp_str_buf_free(&buf);
8166   } break;
8167 #endif
8168   default:
8169     // According to spec, If an implementation does not have info for field
8170     // type, then "undefined" is printed
8171     rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8172     // Skip the field
8173     if (parse_long_name) {
8174       SKIP_TOKEN(*ptr);
8175       if (**ptr == '}')
8176         (*ptr)++;
8177     } else {
8178       (*ptr)++;
8179     }
8180   }
8181 
8182   KMP_ASSERT(format_index <= FORMAT_SIZE);
8183   return rc;
8184 }
8185 
8186 /*
8187  * Return number of characters needed to hold the affinity string
8188  * (not including null byte character)
8189  * The resultant string is printed to buffer, which the caller can then
8190  * handle afterwards
8191 */
8192 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8193                                   kmp_str_buf_t *buffer) {
8194   const char *parse_ptr;
8195   size_t retval;
8196   const kmp_info_t *th;
8197   kmp_str_buf_t field;
8198 
8199   KMP_DEBUG_ASSERT(buffer);
8200   KMP_DEBUG_ASSERT(gtid >= 0);
8201 
8202   __kmp_str_buf_init(&field);
8203   __kmp_str_buf_clear(buffer);
8204 
8205   th = __kmp_threads[gtid];
8206   retval = 0;
8207 
8208   // If format is NULL or zero-length string, then we use
8209   // affinity-format-var ICV
8210   parse_ptr = format;
8211   if (parse_ptr == NULL || *parse_ptr == '\0') {
8212     parse_ptr = __kmp_affinity_format;
8213   }
8214   KMP_DEBUG_ASSERT(parse_ptr);
8215 
8216   while (*parse_ptr != '\0') {
8217     // Parse a field
8218     if (*parse_ptr == '%') {
8219       // Put field in the buffer
8220       int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8221       __kmp_str_buf_catbuf(buffer, &field);
8222       retval += rc;
8223     } else {
8224       // Put literal character in buffer
8225       __kmp_str_buf_cat(buffer, parse_ptr, 1);
8226       retval++;
8227       parse_ptr++;
8228     }
8229   }
8230   __kmp_str_buf_free(&field);
8231   return retval;
8232 }
8233 
8234 // Displays the affinity string to stdout
8235 void __kmp_aux_display_affinity(int gtid, const char *format) {
8236   kmp_str_buf_t buf;
8237   __kmp_str_buf_init(&buf);
8238   __kmp_aux_capture_affinity(gtid, format, &buf);
8239   __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8240   __kmp_str_buf_free(&buf);
8241 }
8242 
8243 /* ------------------------------------------------------------------------ */
8244 
8245 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8246   int blocktime = arg; /* argument is in milliseconds */
8247 #if KMP_USE_MONITOR
8248   int bt_intervals;
8249 #endif
8250   kmp_int8 bt_set;
8251 
8252   __kmp_save_internal_controls(thread);
8253 
8254   /* Normalize and set blocktime for the teams */
8255   if (blocktime < KMP_MIN_BLOCKTIME)
8256     blocktime = KMP_MIN_BLOCKTIME;
8257   else if (blocktime > KMP_MAX_BLOCKTIME)
8258     blocktime = KMP_MAX_BLOCKTIME;
8259 
8260   set__blocktime_team(thread->th.th_team, tid, blocktime);
8261   set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8262 
8263 #if KMP_USE_MONITOR
8264   /* Calculate and set blocktime intervals for the teams */
8265   bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8266 
8267   set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8268   set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8269 #endif
8270 
8271   /* Set whether blocktime has been set to "TRUE" */
8272   bt_set = TRUE;
8273 
8274   set__bt_set_team(thread->th.th_team, tid, bt_set);
8275   set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8276 #if KMP_USE_MONITOR
8277   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8278                 "bt_intervals=%d, monitor_updates=%d\n",
8279                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8280                 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8281                 __kmp_monitor_wakeups));
8282 #else
8283   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8284                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8285                 thread->th.th_team->t.t_id, tid, blocktime));
8286 #endif
8287 }
8288 
8289 void __kmp_aux_set_defaults(char const *str, size_t len) {
8290   if (!__kmp_init_serial) {
8291     __kmp_serial_initialize();
8292   }
8293   __kmp_env_initialize(str);
8294 
8295   if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8296     __kmp_env_print();
8297   }
8298 } // __kmp_aux_set_defaults
8299 
8300 /* ------------------------------------------------------------------------ */
8301 /* internal fast reduction routines */
8302 
// Chooses the reduction method for a reduction executed by thread global_tid.
//
// loc         - source location; its flags tell whether the compiler
//               generated the atomic variant (KMP_IDENT_ATOMIC_REDUCE)
// global_tid  - global thread id, used to look up the team size
// num_vars    - number of reduction variables (used by the 32-bit tuning)
// reduce_size - size of the reduction data (used by the 32-bit Darwin tuning)
// reduce_data / reduce_func - when both are non-NULL the tree variant is
//               considered available
// lck         - critical-section name for the critical variant
//
// Returns the packed method: empty_reduce_block for a team of one thread,
// otherwise critical_reduce_block unless the per-architecture/OS tuning below
// or a forced method (__kmp_force_reduction_method) selects atomic or tree.
PACKED_REDUCTION_METHOD_T
__kmp_determine_reduction_method(
    ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
    void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
    kmp_critical_name *lck) {

  // Default reduction method: critical construct ( lck != NULL, like in current
  // PAROPT )
  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
  // can be selected by RTL
  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
  // can be selected by RTL
  // Finally, it's up to OpenMP RTL to make a decision on which method to select
  // among generated by PAROPT.

  PACKED_REDUCTION_METHOD_T retval;

  int team_size;

  KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )

// Helper predicates local to this function (undefined again before return).
#define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
  ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
#define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))

  retval = critical_reduce_block;

  // another choice of getting a team size (with 1 dynamic dereference) is
  // slower
  team_size = __kmp_get_team_num_threads(global_tid);
  if (team_size == 1) {

    // A single thread needs no synchronization at all.
    retval = empty_reduce_block;

  } else {

    int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;

#if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 ||                   \
    KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64

#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
    KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD

    // 64-bit tuning: teams up to the cutoff prefer atomic (when generated),
    // larger teams prefer the tree method with the reduction barrier.
    int teamsize_cutoff = 4;

#if KMP_MIC_SUPPORTED
    if (__kmp_mic_type != non_mic) {
      teamsize_cutoff = 8;
    }
#endif
    int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
    if (tree_available) {
      if (team_size <= teamsize_cutoff) {
        if (atomic_available) {
          retval = atomic_reduce_block;
        }
      } else {
        retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
      }
    } else if (atomic_available) {
      retval = atomic_reduce_block;
    }
#else
#error "Unknown or unsupported OS"
#endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
       // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD

// NOTE(review): KMP_ARCH_AARCH below is not one of the architecture macros
// used elsewhere in this function (cf. KMP_ARCH_AARCH64) -- confirm it is
// intended.
#elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS

#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD

    // basic tuning

    if (atomic_available) {
      if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
        retval = atomic_reduce_block;
      }
    } // otherwise: use critical section

#elif KMP_OS_DARWIN

    int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
    if (atomic_available && (num_vars <= 3)) {
      retval = atomic_reduce_block;
    } else if (tree_available) {
      // Tree method only for mid-sized reduction data.
      if ((reduce_size > (9 * sizeof(kmp_real64))) &&
          (reduce_size < (2000 * sizeof(kmp_real64)))) {
        retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
      }
    } // otherwise: use critical section

#else
#error "Unknown or unsupported OS"
#endif

#else
#error "Unknown or unsupported architecture"
#endif
  }

  // KMP_FORCE_REDUCTION

  // If the team is serialized (team_size == 1), ignore the forced reduction
  // method and stay with the unsynchronized method (empty_reduce_block)
  if (__kmp_force_reduction_method != reduction_method_not_defined &&
      team_size != 1) {

    PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;

    int atomic_available, tree_available;

    // A forced method that the compiler did not generate falls back to the
    // critical method with a warning.
    switch ((forced_retval = __kmp_force_reduction_method)) {
    case critical_reduce_block:
      KMP_ASSERT(lck); // lck should be != 0
      break;

    case atomic_reduce_block:
      atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
      if (!atomic_available) {
        KMP_WARNING(RedMethodNotSupported, "atomic");
        forced_retval = critical_reduce_block;
      }
      break;

    case tree_reduce_block:
      tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
      if (!tree_available) {
        KMP_WARNING(RedMethodNotSupported, "tree");
        forced_retval = critical_reduce_block;
      } else {
#if KMP_FAST_REDUCTION_BARRIER
        forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
#endif
      }
      break;

    default:
      KMP_ASSERT(0); // "unsupported method specified"
    }

    retval = forced_retval;
  }

  KA_TRACE(10, ("reduction method selected=%08x\n", retval));

#undef FAST_REDUCTION_TREE_METHOD_GENERATED
#undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED

  return (retval);
}
8454 // this function is for testing set/get/determine reduce method
8455 kmp_int32 __kmp_get_reduce_method(void) {
8456   return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8457 }
8458 
8459 // Soft pause sets up threads to ignore blocktime and just go to sleep.
8460 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
8461 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8462 
// Hard pause shuts down the runtime completely.  Resume happens naturally when
// OpenMP is used subsequently.
void __kmp_hard_pause() {
  // Publish the paused state first, then run the thread-end shutdown path.
  __kmp_pause_status = kmp_hard_paused;
  // NOTE(review): -1 is presumably "no specific gtid requested" -- confirm
  // against __kmp_internal_end_thread().
  __kmp_internal_end_thread(-1);
}
8469 
// Soft resume sets __kmp_pause_status, and wakes up all threads.
// Does nothing unless the runtime is currently soft paused. Otherwise the
// status is reset to kmp_not_paused and every registered thread in slots
// 1 .. __kmp_threads_capacity-1 is visited (slot 0 is not).
void __kmp_resume_if_soft_paused() {
  if (__kmp_pause_status == kmp_soft_paused) {
    __kmp_pause_status = kmp_not_paused;

    for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
      kmp_info_t *thread = __kmp_threads[gtid];
      if (thread) { // Wake it if sleeping
        // Flag over the thread's fork/join barrier "go" word, used to test
        // for and resume a sleeping thread.
        kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
                         thread);
        if (fl.is_sleeping())
          fl.resume(gtid);
        else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
          __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
        } else { // thread holds the lock and may sleep soon
          // Busy-wait: retry both checks until one of them succeeds.
          do { // until either the thread sleeps, or we can get the lock
            if (fl.is_sleeping()) {
              fl.resume(gtid);
              break;
            } else if (__kmp_try_suspend_mx(thread)) {
              __kmp_unlock_suspend_mx(thread);
              break;
            }
          } while (1);
        }
      }
    }
  }
}
8499 
8500 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
8501 // TODO: add warning messages
8502 int __kmp_pause_resource(kmp_pause_status_t level) {
8503   if (level == kmp_not_paused) { // requesting resume
8504     if (__kmp_pause_status == kmp_not_paused) {
8505       // error message about runtime not being paused, so can't resume
8506       return 1;
8507     } else {
8508       KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
8509                        __kmp_pause_status == kmp_hard_paused);
8510       __kmp_pause_status = kmp_not_paused;
8511       return 0;
8512     }
8513   } else if (level == kmp_soft_paused) { // requesting soft pause
8514     if (__kmp_pause_status != kmp_not_paused) {
8515       // error message about already being paused
8516       return 1;
8517     } else {
8518       __kmp_soft_pause();
8519       return 0;
8520     }
8521   } else if (level == kmp_hard_paused) { // requesting hard pause
8522     if (__kmp_pause_status != kmp_not_paused) {
8523       // error message about already being paused
8524       return 1;
8525     } else {
8526       __kmp_hard_pause();
8527       return 0;
8528     }
8529   } else {
8530     // error message about invalid level
8531     return 1;
8532   }
8533 }
8534 
8535 void __kmp_omp_display_env(int verbose) {
8536   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8537   if (__kmp_init_serial == 0)
8538     __kmp_do_serial_initialize();
8539   __kmp_display_env_impl(!verbose, verbose);
8540   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8541 }
8542 
// Globals and functions for hidden helper task
// Address of the __kmp_threads slot of the hidden helper root; set by
// __kmp_hidden_helper_threads_initz_routine().
kmp_info_t **__kmp_hidden_helper_threads;
// Thread registered as the root of the hidden helper team; set by
// __kmp_hidden_helper_threads_initz_routine().
kmp_info_t *__kmp_hidden_helper_main_thread;
// Requested size of the hidden helper team (default 8); also the count the
// helper threads wait for at their start-up rendezvous.
kmp_int32 __kmp_hidden_helper_threads_num = 8;
// NOTE(review): presumably the number of hidden helper tasks not yet
// executed -- not referenced in this part of the file, confirm at the users.
std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
// NOTE(review): presumably the on/off switch for hidden helper support
// (defaults to TRUE) -- confirm at the users.
kmp_int32 __kmp_enable_hidden_helper = TRUE;
8549 
namespace {
// Number of hidden helper threads that have entered the wrapper function;
// reset to 0 before the hidden helper team is forked.
std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;

// Function run by every thread of the hidden helper team (it is the routine
// handed to __kmpc_fork_call()). gtid points to the executing thread's
// global thread id; the remaining arguments are unused.
void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
  // This is an explicit synchronization on all hidden helper threads in case
  // that when a regular thread pushes a hidden helper task to one hidden
  // helper thread, the thread has not been awaken once since they're released
  // by the main thread after creating the team.
  KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
  // Busy-wait until all __kmp_hidden_helper_threads_num threads have arrived.
  while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
         __kmp_hidden_helper_threads_num)
    ;

  // If main thread, then wait for signal
  if (__kmpc_master(nullptr, *gtid)) {
    // First, unset the initial state and release the initial thread
    TCW_4(__kmp_init_hidden_helper_threads, FALSE);
    __kmp_hidden_helper_initz_release();
    __kmp_hidden_helper_main_thread_wait();
    // Now wake up all worker threads
    // (one signal per thread other than the main one)
    for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
      __kmp_hidden_helper_worker_thread_signal();
    }
  }
}
} // namespace
8576 
// Sets up and runs the hidden helper team: registers a new root for the
// calling thread, records it in the hidden helper globals, forks a team of
// __kmp_hidden_helper_threads_num threads running
// __kmp_hidden_helper_wrapper_fn, and, once the fork call returns, clears
// __kmp_init_hidden_helper and calls the deinitialization release routine.
void __kmp_hidden_helper_threads_initz_routine() {
  // Create a new root for hidden helper team/threads
  const int gtid = __kmp_register_root(TRUE);
  __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
  __kmp_hidden_helper_threads = &__kmp_threads[gtid];
  // Request the team size for the fork below
  __kmp_hidden_helper_main_thread->th.th_set_nproc =
      __kmp_hidden_helper_threads_num;

  // Reset the arrival counter the wrapper function spins on
  KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);

  __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);

  // Set the initialization flag to FALSE
  TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);

  __kmp_hidden_helper_threads_deinitz_release();
}
8594