1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
29 #endif
30 
31 #if OMPT_SUPPORT
32 #include "ompt-specific.h"
33 #endif
34 
35 /* these are temporary issues to be dealt with */
36 #define KMP_USE_PRCTL 0
37 
38 #if KMP_OS_WINDOWS
39 #include <process.h>
40 #endif
41 
42 #include "tsan_annotations.h"
43 
44 #if KMP_OS_WINDOWS
// Windows does not need these include files as it doesn't use shared memory
46 #else
47 #include <sys/mman.h>
48 #include <sys/stat.h>
49 #include <fcntl.h>
50 #define SHM_SIZE 1024
51 #endif
52 
53 #if defined(KMP_GOMP_COMPAT)
54 char const __kmp_version_alt_comp[] =
55     KMP_VERSION_PREFIX "alternative compiler support: yes";
56 #endif /* defined(KMP_GOMP_COMPAT) */
57 
58 char const __kmp_version_omp_api[] =
59     KMP_VERSION_PREFIX "API version: 5.0 (201611)";
60 
61 #ifdef KMP_DEBUG
62 char const __kmp_version_lock[] =
63     KMP_VERSION_PREFIX "lock type: run time selectable";
64 #endif /* KMP_DEBUG */
65 
66 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
67 
68 /* ------------------------------------------------------------------------ */
69 
70 #if KMP_USE_MONITOR
71 kmp_info_t __kmp_monitor;
72 #endif
73 
74 /* Forward declarations */
75 
76 void __kmp_cleanup(void);
77 
78 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
79                                   int gtid);
80 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
81                                   kmp_internal_control_t *new_icvs,
82                                   ident_t *loc);
83 #if KMP_AFFINITY_SUPPORTED
84 static void __kmp_partition_places(kmp_team_t *team,
85                                    int update_master_only = 0);
86 #endif
87 static void __kmp_do_serial_initialize(void);
88 void __kmp_fork_barrier(int gtid, int tid);
89 void __kmp_join_barrier(int gtid);
90 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
91                           kmp_internal_control_t *new_icvs, ident_t *loc);
92 
93 #ifdef USE_LOAD_BALANCE
94 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
95 #endif
96 
97 static int __kmp_expand_threads(int nNeed);
98 #if KMP_OS_WINDOWS
99 static int __kmp_unregister_root_other_thread(int gtid);
100 #endif
101 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
102 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
103 
104 /* Calculate the identifier of the current thread */
/* A fast (and somewhat portable) way to get a unique identifier for the
   executing thread. Returns KMP_GTID_DNE if we haven't been assigned a
   gtid. */
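/* Lookup strategy, in order of preference (mirroring the branches below):
   1. __kmp_gtid_mode >= 3: read the gtid from a thread-local variable (TDATA).
   2. __kmp_gtid_mode >= 2: read it from keyed (pthread_getspecific-style) TLS.
   3. Otherwise: scan __kmp_threads[] and match the address of a local variable
      against each registered thread's recorded stack range. */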
107 int __kmp_get_global_thread_id() {
108   int i;
109   kmp_info_t **other_threads;
110   size_t stack_data;
111   char *stack_addr;
112   size_t stack_size;
113   char *stack_base;
114 
115   KA_TRACE(
116       1000,
117       ("*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
118        __kmp_nth, __kmp_all_nth));
119 
  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior
     to a parallel region, this returns KMP_GTID_DNE to force the caller to run
     serial initialization. KMP_GTID_DNE must be handled at all call sites, or
     else __kmp_init_gtid must be guaranteed, for this to work. */
124 
125   if (!TCR_4(__kmp_init_gtid))
126     return KMP_GTID_DNE;
127 
128 #ifdef KMP_TDATA_GTID
129   if (TCR_4(__kmp_gtid_mode) >= 3) {
130     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
131     return __kmp_gtid;
132   }
133 #endif
134   if (TCR_4(__kmp_gtid_mode) >= 2) {
135     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
136     return __kmp_gtid_get_specific();
137   }
138   KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
139 
140   stack_addr = (char *)&stack_data;
141   other_threads = __kmp_threads;
142 
143   /* ATT: The code below is a source of potential bugs due to unsynchronized
144      access to __kmp_threads array. For example:
145      1. Current thread loads other_threads[i] to thr and checks it, it is
146         non-NULL.
147      2. Current thread is suspended by OS.
148      3. Another thread unregisters and finishes (debug versions of free()
149         may fill memory with something like 0xEF).
150      4. Current thread is resumed.
151      5. Current thread reads junk from *thr.
152      TODO: Fix it.  --ln  */
153 
154   for (i = 0; i < __kmp_threads_capacity; i++) {
155 
156     kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
157     if (!thr)
158       continue;
159 
160     stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
161     stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
162 
163     /* stack grows down -- search through all of the active threads */
164 
165     if (stack_addr <= stack_base) {
166       size_t stack_diff = stack_base - stack_addr;
167 
168       if (stack_diff <= stack_size) {
169         /* The only way we can be closer than the allocated */
170         /* stack size is if we are running on this thread. */
171         KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
172         return i;
173       }
174     }
175   }
176 
177   /* get specific to try and determine our gtid */
178   KA_TRACE(1000,
179            ("*** __kmp_get_global_thread_id: internal alg. failed to find "
180             "thread, using TLS\n"));
181   i = __kmp_gtid_get_specific();
182 
183   /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
184 
  /* if we haven't been assigned a gtid, then return the error code */
186   if (i < 0)
187     return i;
188 
189   /* dynamically updated stack window for uber threads to avoid get_specific
190      call */
191   if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
192     KMP_FATAL(StackOverflow, i);
193   }
194 
195   stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
196   if (stack_addr > stack_base) {
197     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
198     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
199             other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
200                 stack_base);
201   } else {
202     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
203             stack_base - stack_addr);
204   }
205 
206   /* Reprint stack bounds for ubermaster since they have been refined */
207   if (__kmp_storage_map) {
208     char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
209     char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
210     __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
211                                  other_threads[i]->th.th_info.ds.ds_stacksize,
212                                  "th_%d stack (refinement)", i);
213   }
214   return i;
215 }
216 
217 int __kmp_get_global_thread_id_reg() {
218   int gtid;
219 
220   if (!__kmp_init_serial) {
221     gtid = KMP_GTID_DNE;
222   } else
223 #ifdef KMP_TDATA_GTID
224       if (TCR_4(__kmp_gtid_mode) >= 3) {
225     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
226     gtid = __kmp_gtid;
227   } else
228 #endif
229       if (TCR_4(__kmp_gtid_mode) >= 2) {
230     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
231     gtid = __kmp_gtid_get_specific();
232   } else {
233     KA_TRACE(1000,
234              ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
235     gtid = __kmp_get_global_thread_id();
236   }
237 
238   /* we must be a new uber master sibling thread */
239   if (gtid == KMP_GTID_DNE) {
240     KA_TRACE(10,
241              ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
242               "Registering a new gtid.\n"));
243     __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
244     if (!__kmp_init_serial) {
245       __kmp_do_serial_initialize();
246       gtid = __kmp_gtid_get_specific();
247     } else {
248       gtid = __kmp_register_root(FALSE);
249     }
250     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
251     /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
252   }
253 
254   KMP_DEBUG_ASSERT(gtid >= 0);
255 
256   return gtid;
257 }
258 
259 /* caller must hold forkjoin_lock */
260 void __kmp_check_stack_overlap(kmp_info_t *th) {
261   int f;
262   char *stack_beg = NULL;
263   char *stack_end = NULL;
264   int gtid;
265 
266   KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
267   if (__kmp_storage_map) {
268     stack_end = (char *)th->th.th_info.ds.ds_stackbase;
269     stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
270 
271     gtid = __kmp_gtid_from_thread(th);
272 
273     if (gtid == KMP_GTID_MONITOR) {
274       __kmp_print_storage_map_gtid(
275           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
276           "th_%s stack (%s)", "mon",
277           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
278     } else {
279       __kmp_print_storage_map_gtid(
280           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
281           "th_%d stack (%s)", gtid,
282           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
283     }
284   }
285 
286   /* No point in checking ubermaster threads since they use refinement and
287    * cannot overlap */
288   gtid = __kmp_gtid_from_thread(th);
289   if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
290     KA_TRACE(10,
291              ("__kmp_check_stack_overlap: performing extensive checking\n"));
292     if (stack_beg == NULL) {
293       stack_end = (char *)th->th.th_info.ds.ds_stackbase;
294       stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
295     }
296 
297     for (f = 0; f < __kmp_threads_capacity; f++) {
298       kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
299 
300       if (f_th && f_th != th) {
301         char *other_stack_end =
302             (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
303         char *other_stack_beg =
304             other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
305         if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
306             (stack_end > other_stack_beg && stack_end < other_stack_end)) {
307 
308           /* Print the other stack values before the abort */
309           if (__kmp_storage_map)
310             __kmp_print_storage_map_gtid(
311                 -1, other_stack_beg, other_stack_end,
312                 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
313                 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
314 
315           __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
316                       __kmp_msg_null);
317         }
318       }
319     }
320   }
321   KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
322 }
323 
324 /* ------------------------------------------------------------------------ */
325 
326 void __kmp_infinite_loop(void) {
327   static int done = FALSE;
328 
329   while (!done) {
330     KMP_YIELD(TRUE);
331   }
332 }
333 
334 #define MAX_MESSAGE 512
335 
336 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
337                                   char const *format, ...) {
338   char buffer[MAX_MESSAGE];
339   va_list ap;
340 
341   va_start(ap, format);
342   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
343                p2, (unsigned long)size, format);
344   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
345   __kmp_vprintf(kmp_err, buffer, ap);
346 #if KMP_PRINT_DATA_PLACEMENT
347   int node;
348   if (gtid >= 0) {
349     if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
350       if (__kmp_storage_map_verbose) {
351         node = __kmp_get_host_node(p1);
352         if (node < 0) /* doesn't work, so don't try this next time */
353           __kmp_storage_map_verbose = FALSE;
354         else {
355           char *last;
356           int lastNode;
357           int localProc = __kmp_get_cpu_from_gtid(gtid);
358 
359           const int page_size = KMP_GET_PAGE_SIZE();
360 
361           p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
362           p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
363           if (localProc >= 0)
364             __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
365                                  localProc >> 1);
366           else
367             __kmp_printf_no_lock("  GTID %d\n", gtid);
368 #if KMP_USE_PRCTL
369           /* The more elaborate format is disabled for now because of the prctl
370            * hanging bug. */
371           do {
372             last = p1;
373             lastNode = node;
374             /* This loop collates adjacent pages with the same host node. */
375             do {
              p1 = (char *)p1 + page_size;
377             } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
378             __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
379                                  lastNode);
380           } while (p1 <= p2);
381 #else
382           __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
383                                (char *)p1 + (page_size - 1),
384                                __kmp_get_host_node(p1));
385           if (p1 < p2) {
386             __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
387                                  (char *)p2 + (page_size - 1),
388                                  __kmp_get_host_node(p2));
389           }
390 #endif
391         }
392       }
393     } else
394       __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
395   }
396 #endif /* KMP_PRINT_DATA_PLACEMENT */
397   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
398 }
399 
400 void __kmp_warn(char const *format, ...) {
401   char buffer[MAX_MESSAGE];
402   va_list ap;
403 
404   if (__kmp_generate_warnings == kmp_warnings_off) {
405     return;
406   }
407 
408   va_start(ap, format);
409 
410   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
411   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
412   __kmp_vprintf(kmp_err, buffer, ap);
413   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
414 
415   va_end(ap);
416 }
417 
418 void __kmp_abort_process() {
419   // Later threads may stall here, but that's ok because abort() will kill them.
420   __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
421 
422   if (__kmp_debug_buf) {
423     __kmp_dump_debug_buffer();
424   }
425 
426   if (KMP_OS_WINDOWS) {
427     // Let other threads know of abnormal termination and prevent deadlock
428     // if abort happened during library initialization or shutdown
429     __kmp_global.g.g_abort = SIGABRT;
430 
    /* On Windows* OS, by default abort() causes a pop-up error box, which
       stalls nightly testing. Unfortunately, we cannot reliably suppress
       pop-up error boxes. _set_abort_behavior() works well, but this function
       is not available in VS7 (this is not a problem for the DLL, but it is a
       problem for the static OpenMP RTL). SetErrorMode (and so, the timelimit
       utility) does not help, at least in some versions of the MS C RTL.

       The following sequence seems to be the only way to simulate abort() and
       avoid the pop-up error box. */
440     raise(SIGABRT);
441     _exit(3); // Just in case, if signal ignored, exit anyway.
442   } else {
443     __kmp_unregister_library();
444     abort();
445   }
446 
447   __kmp_infinite_loop();
448   __kmp_release_bootstrap_lock(&__kmp_exit_lock);
449 
450 } // __kmp_abort_process
451 
452 void __kmp_abort_thread(void) {
  // TODO: Eliminate the g_abort global variable and this function.
  // In case of abort, just call abort(); it will kill all the threads.
455   __kmp_infinite_loop();
456 } // __kmp_abort_thread
457 
458 /* Print out the storage map for the major kmp_info_t thread data structures
459    that are allocated together. */
460 
461 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
462   __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
463                                gtid);
464 
465   __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
466                                sizeof(kmp_desc_t), "th_%d.th_info", gtid);
467 
468   __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
469                                sizeof(kmp_local_t), "th_%d.th_local", gtid);
470 
471   __kmp_print_storage_map_gtid(
472       gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
473       sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
474 
475   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
476                                &thr->th.th_bar[bs_plain_barrier + 1],
477                                sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
478                                gtid);
479 
480   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
481                                &thr->th.th_bar[bs_forkjoin_barrier + 1],
482                                sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
483                                gtid);
484 
485 #if KMP_FAST_REDUCTION_BARRIER
486   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
487                                &thr->th.th_bar[bs_reduction_barrier + 1],
488                                sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
489                                gtid);
490 #endif // KMP_FAST_REDUCTION_BARRIER
491 }
492 
493 /* Print out the storage map for the major kmp_team_t team data structures
494    that are allocated together. */
495 
496 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
497                                          int team_id, int num_thr) {
498   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
499   __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
500                                header, team_id);
501 
502   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
503                                &team->t.t_bar[bs_last_barrier],
504                                sizeof(kmp_balign_team_t) * bs_last_barrier,
505                                "%s_%d.t_bar", header, team_id);
506 
507   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
508                                &team->t.t_bar[bs_plain_barrier + 1],
509                                sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
510                                header, team_id);
511 
512   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
513                                &team->t.t_bar[bs_forkjoin_barrier + 1],
514                                sizeof(kmp_balign_team_t),
515                                "%s_%d.t_bar[forkjoin]", header, team_id);
516 
517 #if KMP_FAST_REDUCTION_BARRIER
518   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
519                                &team->t.t_bar[bs_reduction_barrier + 1],
520                                sizeof(kmp_balign_team_t),
521                                "%s_%d.t_bar[reduction]", header, team_id);
522 #endif // KMP_FAST_REDUCTION_BARRIER
523 
524   __kmp_print_storage_map_gtid(
525       -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
526       sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
527 
528   __kmp_print_storage_map_gtid(
529       -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
530       sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
531 
532   __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
533                                &team->t.t_disp_buffer[num_disp_buff],
534                                sizeof(dispatch_shared_info_t) * num_disp_buff,
535                                "%s_%d.t_disp_buffer", header, team_id);
536 }
537 
538 static void __kmp_init_allocator() { __kmp_init_memkind(); }
539 static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
540 
541 /* ------------------------------------------------------------------------ */
542 
543 #if KMP_DYNAMIC_LIB
544 #if KMP_OS_WINDOWS
545 
546 static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
547   // TODO: Change to __kmp_break_bootstrap_lock().
548   __kmp_init_bootstrap_lock(lck); // make the lock released
549 }
550 
551 static void __kmp_reset_locks_on_process_detach(int gtid_req) {
552   int i;
553   int thread_count;
554 
  // PROCESS_DETACH is expected to be called by a thread that executes
  // ProcessExit() or FreeLibrary(). The OS terminates the other threads
  // (except the one calling ProcessExit or FreeLibrary), so it might seem
  // safe to access __kmp_threads[] without taking the forkjoin_lock. In fact,
  // however, some threads can still be alive here, although they are about to
  // be terminated. The entries in the array with ds_thread==0 are the most
  // suspect, so it may actually not be safe to access __kmp_threads[].
562 
563   // TODO: does it make sense to check __kmp_roots[] ?
564 
  // Check that there are no other live threads registered with the OpenMP
  // library.
567   while (1) {
568     thread_count = 0;
569     for (i = 0; i < __kmp_threads_capacity; ++i) {
570       if (!__kmp_threads)
571         continue;
572       kmp_info_t *th = __kmp_threads[i];
573       if (th == NULL)
574         continue;
575       int gtid = th->th.th_info.ds.ds_gtid;
576       if (gtid == gtid_req)
577         continue;
578       if (gtid < 0)
579         continue;
580       DWORD exit_val;
581       int alive = __kmp_is_thread_alive(th, &exit_val);
582       if (alive) {
583         ++thread_count;
584       }
585     }
586     if (thread_count == 0)
587       break; // success
588   }
589 
590   // Assume that I'm alone. Now it might be safe to check and reset locks.
591   // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
592   __kmp_reset_lock(&__kmp_forkjoin_lock);
593 #ifdef KMP_DEBUG
594   __kmp_reset_lock(&__kmp_stdio_lock);
595 #endif // KMP_DEBUG
596 }
597 
598 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
599   //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
600 
601   switch (fdwReason) {
602 
603   case DLL_PROCESS_ATTACH:
604     KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
605 
606     return TRUE;
607 
608   case DLL_PROCESS_DETACH:
609     KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
610 
611     if (lpReserved != NULL) {
612       // lpReserved is used for telling the difference:
613       //   lpReserved == NULL when FreeLibrary() was called,
614       //   lpReserved != NULL when the process terminates.
615       // When FreeLibrary() is called, worker threads remain alive. So they will
616       // release the forkjoin lock by themselves. When the process terminates,
617       // worker threads disappear triggering the problem of unreleased forkjoin
618       // lock as described below.
619 
620       // A worker thread can take the forkjoin lock. The problem comes up if
621       // that worker thread becomes dead before it releases the forkjoin lock.
622       // The forkjoin lock remains taken, while the thread executing
623       // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try
624       // to take the forkjoin lock and will always fail, so that the application
625       // will never finish [normally]. This scenario is possible if
      // __kmpc_end() has not been executed. This is not just a corner case;
      // there are common causes:
      // - the main function was compiled by an alternative compiler;
      // - the main function was compiled by icl but without /Qopenmp
      //   (application with plugins);
      // - the application terminates by calling C exit(), Fortran CALL EXIT(),
      //   or Fortran STOP;
      // - a live foreign thread prevented __kmpc_end from doing cleanup.
634       //
635       // This is a hack to work around the problem.
636       // TODO: !!! figure out something better.
637       __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
638     }
639 
640     __kmp_internal_end_library(__kmp_gtid_get_specific());
641 
642     return TRUE;
643 
644   case DLL_THREAD_ATTACH:
645     KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
646 
    /* if we want to register new siblings all the time, call
     * __kmp_get_gtid() here; */
649     return TRUE;
650 
651   case DLL_THREAD_DETACH:
652     KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
653 
654     __kmp_internal_end_thread(__kmp_gtid_get_specific());
655     return TRUE;
656   }
657 
658   return TRUE;
659 }
660 
661 #endif /* KMP_OS_WINDOWS */
662 #endif /* KMP_DYNAMIC_LIB */
663 
664 /* __kmp_parallel_deo -- Wait until it's our turn. */
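/* When BUILD_PARALLEL_ORDERED is defined, threads take their turns in tid
   order: deo spins until the team's t_ordered.dt.t_value equals this thread's
   tid, and dxo (below) advances it to (tid + 1) % nproc, handing the turn to
   the next thread in the team. */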
665 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
666   int gtid = *gtid_ref;
667 #ifdef BUILD_PARALLEL_ORDERED
668   kmp_team_t *team = __kmp_team_from_gtid(gtid);
669 #endif /* BUILD_PARALLEL_ORDERED */
670 
671   if (__kmp_env_consistency_check) {
672     if (__kmp_threads[gtid]->th.th_root->r.r_active)
673 #if KMP_USE_DYNAMIC_LOCK
674       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
675 #else
676       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
677 #endif
678   }
679 #ifdef BUILD_PARALLEL_ORDERED
680   if (!team->t.t_serialized) {
681     KMP_MB();
682     KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
683              NULL);
684     KMP_MB();
685   }
686 #endif /* BUILD_PARALLEL_ORDERED */
687 }
688 
689 /* __kmp_parallel_dxo -- Signal the next task. */
690 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
691   int gtid = *gtid_ref;
692 #ifdef BUILD_PARALLEL_ORDERED
693   int tid = __kmp_tid_from_gtid(gtid);
694   kmp_team_t *team = __kmp_team_from_gtid(gtid);
695 #endif /* BUILD_PARALLEL_ORDERED */
696 
697   if (__kmp_env_consistency_check) {
698     if (__kmp_threads[gtid]->th.th_root->r.r_active)
699       __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
700   }
701 #ifdef BUILD_PARALLEL_ORDERED
702   if (!team->t.t_serialized) {
703     KMP_MB(); /* Flush all pending memory write invalidates.  */
704 
705     /* use the tid of the next thread in this team */
706     /* TODO replace with general release procedure */
707     team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
708 
709     KMP_MB(); /* Flush all pending memory write invalidates.  */
710   }
711 #endif /* BUILD_PARALLEL_ORDERED */
712 }
713 
714 /* ------------------------------------------------------------------------ */
715 /* The BARRIER for a SINGLE process section is always explicit   */
716 
717 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
718   int status;
719   kmp_info_t *th;
720   kmp_team_t *team;
721 
722   if (!TCR_4(__kmp_init_parallel))
723     __kmp_parallel_initialize();
724   __kmp_resume_if_soft_paused();
725 
726   th = __kmp_threads[gtid];
727   team = th->th.th_team;
728   status = 0;
729 
730   th->th.th_ident = id_ref;
731 
732   if (team->t.t_serialized) {
733     status = 1;
734   } else {
735     kmp_int32 old_this = th->th.th_local.this_construct;
736 
737     ++th->th.th_local.this_construct;
738     /* try to set team count to thread count--success means thread got the
739        single block */
740     /* TODO: Should this be acquire or release? */
741     if (team->t.t_construct == old_this) {
742       status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
743                                               th->th.th_local.this_construct);
744     }
745 #if USE_ITT_BUILD
746     if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
747         KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
748         team->t.t_active_level ==
749             1) { // Only report metadata by master of active team at level 1
750       __kmp_itt_metadata_single(id_ref);
751     }
752 #endif /* USE_ITT_BUILD */
753   }
754 
755   if (__kmp_env_consistency_check) {
756     if (status && push_ws) {
757       __kmp_push_workshare(gtid, ct_psingle, id_ref);
758     } else {
759       __kmp_check_workshare(gtid, ct_psingle, id_ref);
760     }
761   }
762 #if USE_ITT_BUILD
763   if (status) {
764     __kmp_itt_single_start(gtid);
765   }
766 #endif /* USE_ITT_BUILD */
767   return status;
768 }
769 
770 void __kmp_exit_single(int gtid) {
771 #if USE_ITT_BUILD
772   __kmp_itt_single_end(gtid);
773 #endif /* USE_ITT_BUILD */
774   if (__kmp_env_consistency_check)
775     __kmp_pop_workshare(gtid, ct_psingle, NULL);
776 }
777 
/* Determine whether we can go parallel or must use a serialized parallel
 * region, and how many threads we can use.
 * set_nthreads is the number of threads requested for the team.
 * Returns 1 if we should serialize or only use one thread,
 * otherwise the number of threads to use.
 * The forkjoin lock is held by the caller. */
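/* The request is clamped in stages: first by the dynamic adjustment mode when
   dyn-var is set (load balance, thread limit, or random), then by
   KMP_DEVICE_THREAD_LIMIT (__kmp_max_nth), then by OMP_THREAD_LIMIT (the
   contention-group limit), and finally by the capacity of the __kmp_threads[]
   array, which is expanded if possible. */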
784 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
785                                  int master_tid, int set_nthreads,
786                                  int enter_teams) {
787   int capacity;
788   int new_nthreads;
789   KMP_DEBUG_ASSERT(__kmp_init_serial);
790   KMP_DEBUG_ASSERT(root && parent_team);
791   kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
792 
793   // If dyn-var is set, dynamically adjust the number of desired threads,
794   // according to the method specified by dynamic_mode.
795   new_nthreads = set_nthreads;
796   if (!get__dynamic_2(parent_team, master_tid)) {
797     ;
798   }
799 #ifdef USE_LOAD_BALANCE
800   else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
801     new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
802     if (new_nthreads == 1) {
803       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
804                     "reservation to 1 thread\n",
805                     master_tid));
806       return 1;
807     }
808     if (new_nthreads < set_nthreads) {
809       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
810                     "reservation to %d threads\n",
811                     master_tid, new_nthreads));
812     }
813   }
814 #endif /* USE_LOAD_BALANCE */
815   else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
816     new_nthreads = __kmp_avail_proc - __kmp_nth +
817                    (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
818     if (new_nthreads <= 1) {
819       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
820                     "reservation to 1 thread\n",
821                     master_tid));
822       return 1;
823     }
824     if (new_nthreads < set_nthreads) {
825       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
826                     "reservation to %d threads\n",
827                     master_tid, new_nthreads));
828     } else {
829       new_nthreads = set_nthreads;
830     }
831   } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
832     if (set_nthreads > 2) {
833       new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
834       new_nthreads = (new_nthreads % set_nthreads) + 1;
835       if (new_nthreads == 1) {
836         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
837                       "reservation to 1 thread\n",
838                       master_tid));
839         return 1;
840       }
841       if (new_nthreads < set_nthreads) {
842         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
843                       "reservation to %d threads\n",
844                       master_tid, new_nthreads));
845       }
846     }
847   } else {
848     KMP_ASSERT(0);
849   }
850 
851   // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
852   if (__kmp_nth + new_nthreads -
853           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
854       __kmp_max_nth) {
855     int tl_nthreads = __kmp_max_nth - __kmp_nth +
856                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
857     if (tl_nthreads <= 0) {
858       tl_nthreads = 1;
859     }
860 
861     // If dyn-var is false, emit a 1-time warning.
862     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
863       __kmp_reserve_warn = 1;
864       __kmp_msg(kmp_ms_warning,
865                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
866                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
867     }
868     if (tl_nthreads == 1) {
869       KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
870                     "reduced reservation to 1 thread\n",
871                     master_tid));
872       return 1;
873     }
874     KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
875                   "reservation to %d threads\n",
876                   master_tid, tl_nthreads));
877     new_nthreads = tl_nthreads;
878   }
879 
880   // Respect OMP_THREAD_LIMIT
881   int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
882   int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
883   if (cg_nthreads + new_nthreads -
884           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
885       max_cg_threads) {
886     int tl_nthreads = max_cg_threads - cg_nthreads +
887                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
888     if (tl_nthreads <= 0) {
889       tl_nthreads = 1;
890     }
891 
892     // If dyn-var is false, emit a 1-time warning.
893     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
894       __kmp_reserve_warn = 1;
895       __kmp_msg(kmp_ms_warning,
896                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
897                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
898     }
899     if (tl_nthreads == 1) {
900       KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
901                     "reduced reservation to 1 thread\n",
902                     master_tid));
903       return 1;
904     }
905     KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
906                   "reservation to %d threads\n",
907                   master_tid, tl_nthreads));
908     new_nthreads = tl_nthreads;
909   }
910 
911   // Check if the threads array is large enough, or needs expanding.
912   // See comment in __kmp_register_root() about the adjustment if
913   // __kmp_threads[0] == NULL.
914   capacity = __kmp_threads_capacity;
915   if (TCR_PTR(__kmp_threads[0]) == NULL) {
916     --capacity;
917   }
918   if (__kmp_nth + new_nthreads -
919           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
920       capacity) {
921     // Expand the threads array.
922     int slotsRequired = __kmp_nth + new_nthreads -
923                         (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
924                         capacity;
925     int slotsAdded = __kmp_expand_threads(slotsRequired);
926     if (slotsAdded < slotsRequired) {
927       // The threads array was not expanded enough.
928       new_nthreads -= (slotsRequired - slotsAdded);
929       KMP_ASSERT(new_nthreads >= 1);
930 
931       // If dyn-var is false, emit a 1-time warning.
932       if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
933         __kmp_reserve_warn = 1;
934         if (__kmp_tp_cached) {
935           __kmp_msg(kmp_ms_warning,
936                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
937                     KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
938                     KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
939         } else {
940           __kmp_msg(kmp_ms_warning,
941                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
942                     KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
943         }
944       }
945     }
946   }
947 
948 #ifdef KMP_DEBUG
949   if (new_nthreads == 1) {
950     KC_TRACE(10,
951              ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
952               "dead roots and rechecking; requested %d threads\n",
953               __kmp_get_gtid(), set_nthreads));
954   } else {
955     KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
956                   " %d threads\n",
957                   __kmp_get_gtid(), new_nthreads, set_nthreads));
958   }
959 #endif // KMP_DEBUG
960   return new_nthreads;
961 }
962 
/* Allocate threads from the thread pool and assign them to the new team. We
   are assured that there are enough threads available, because we checked on
   that earlier within the forkjoin critical section. */
966 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
967                                     kmp_info_t *master_th, int master_gtid) {
968   int i;
969   int use_hot_team;
970 
971   KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
972   KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
973   KMP_MB();
974 
975   /* first, let's setup the master thread */
976   master_th->th.th_info.ds.ds_tid = 0;
977   master_th->th.th_team = team;
978   master_th->th.th_team_nproc = team->t.t_nproc;
979   master_th->th.th_team_master = master_th;
980   master_th->th.th_team_serialized = FALSE;
981   master_th->th.th_dispatch = &team->t.t_dispatch[0];
982 
983 /* make sure we are not the optimized hot team */
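/* A "hot" team is kept alive between parallel regions so that its worker
   threads do not have to be re-allocated and re-initialized on every fork; if
   this team is already the hot team for its level, the workers are already in
   place. */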
984 #if KMP_NESTED_HOT_TEAMS
985   use_hot_team = 0;
986   kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
987   if (hot_teams) { // hot teams array is not allocated if
988     // KMP_HOT_TEAMS_MAX_LEVEL=0
989     int level = team->t.t_active_level - 1; // index in array of hot teams
990     if (master_th->th.th_teams_microtask) { // are we inside the teams?
991       if (master_th->th.th_teams_size.nteams > 1) {
992         ++level; // level was not increased in teams construct for
993         // team_of_masters
994       }
995       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
996           master_th->th.th_teams_level == team->t.t_level) {
997         ++level; // level was not increased in teams construct for
998         // team_of_workers before the parallel
999       } // team->t.t_level will be increased inside parallel
1000     }
1001     if (level < __kmp_hot_teams_max_level) {
1002       if (hot_teams[level].hot_team) {
1003         // hot team has already been allocated for given level
1004         KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1005         use_hot_team = 1; // the team is ready to use
1006       } else {
1007         use_hot_team = 0; // AC: threads are not allocated yet
1008         hot_teams[level].hot_team = team; // remember new hot team
1009         hot_teams[level].hot_team_nth = team->t.t_nproc;
1010       }
1011     } else {
1012       use_hot_team = 0;
1013     }
1014   }
1015 #else
1016   use_hot_team = team == root->r.r_hot_team;
1017 #endif
1018   if (!use_hot_team) {
1019 
1020     /* install the master thread */
1021     team->t.t_threads[0] = master_th;
1022     __kmp_initialize_info(master_th, team, 0, master_gtid);
1023 
1024     /* now, install the worker threads */
1025     for (i = 1; i < team->t.t_nproc; i++) {
1026 
1027       /* fork or reallocate a new thread and install it in team */
1028       kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1029       team->t.t_threads[i] = thr;
1030       KMP_DEBUG_ASSERT(thr);
1031       KMP_DEBUG_ASSERT(thr->th.th_team == team);
1032       /* align team and thread arrived states */
1033       KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1034                     "T#%d(%d:%d) join =%llu, plain=%llu\n",
1035                     __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1036                     __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1037                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1038                     team->t.t_bar[bs_plain_barrier].b_arrived));
1039       thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1040       thr->th.th_teams_level = master_th->th.th_teams_level;
1041       thr->th.th_teams_size = master_th->th.th_teams_size;
1042       { // Initialize threads' barrier data.
1043         int b;
1044         kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1045         for (b = 0; b < bs_last_barrier; ++b) {
1046           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1047           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1048 #if USE_DEBUGGER
1049           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1050 #endif
1051         }
1052       }
1053     }
1054 
1055 #if KMP_AFFINITY_SUPPORTED
1056     __kmp_partition_places(team);
1057 #endif
1058   }
1059 
1060   if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1061     for (i = 0; i < team->t.t_nproc; i++) {
1062       kmp_info_t *thr = team->t.t_threads[i];
1063       if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1064           thr->th.th_prev_level != team->t.t_level) {
1065         team->t.t_display_affinity = 1;
1066         break;
1067       }
1068     }
1069   }
1070 
1071   KMP_MB();
1072 }
1073 
1074 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
// Propagate any changes to the floating point control registers out to the
// team. We try to avoid unnecessary writes to the relevant cache line in the
// team structure, so we don't make changes unless they are needed.
1078 inline static void propagateFPControl(kmp_team_t *team) {
1079   if (__kmp_inherit_fp_control) {
1080     kmp_int16 x87_fpu_control_word;
1081     kmp_uint32 mxcsr;
1082 
1083     // Get master values of FPU control flags (both X87 and vector)
1084     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1085     __kmp_store_mxcsr(&mxcsr);
1086     mxcsr &= KMP_X86_MXCSR_MASK;
1087 
1088     // There is no point looking at t_fp_control_saved here.
1089     // If it is TRUE, we still have to update the values if they are different
1090     // from those we now have. If it is FALSE we didn't save anything yet, but
1091     // our objective is the same. We have to ensure that the values in the team
1092     // are the same as those we have.
1093     // So, this code achieves what we need whether or not t_fp_control_saved is
1094     // true. By checking whether the value needs updating we avoid unnecessary
1095     // writes that would put the cache-line into a written state, causing all
1096     // threads in the team to have to read it again.
1097     KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1098     KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1099     // Although we don't use this value, other code in the runtime wants to know
1100     // whether it should restore them. So we must ensure it is correct.
1101     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1102   } else {
1103     // Similarly here. Don't write to this cache-line in the team structure
1104     // unless we have to.
1105     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1106   }
1107 }
1108 
1109 // Do the opposite, setting the hardware registers to the updated values from
1110 // the team.
1111 inline static void updateHWFPControl(kmp_team_t *team) {
1112   if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
    // Only reset the fp control regs if they have been changed in the team by
    // the parallel region that we are exiting.
1115     kmp_int16 x87_fpu_control_word;
1116     kmp_uint32 mxcsr;
1117     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1118     __kmp_store_mxcsr(&mxcsr);
1119     mxcsr &= KMP_X86_MXCSR_MASK;
1120 
1121     if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1122       __kmp_clear_x87_fpu_status_word();
1123       __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1124     }
1125 
1126     if (team->t.t_mxcsr != mxcsr) {
1127       __kmp_load_mxcsr(&team->t.t_mxcsr);
1128     }
1129   }
1130 }
1131 #else
1132 #define propagateFPControl(x) ((void)0)
1133 #define updateHWFPControl(x) ((void)0)
1134 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
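// These two functions work as a pair: propagateFPControl() records the
// master's x87 control word and MXCSR in the team when a region is forked
// (see its use below), and updateHWFPControl() reloads the hardware registers
// from the team whenever they differ, so every thread runs with the same FP
// settings. On other architectures both collapse to no-ops via the macros
// above.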
1135 
1136 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1137                                      int realloc); // forward declaration
1138 
/* Run a parallel region that has been serialized, so it runs only in a team
   consisting of the single master thread. */
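/* Two cases are handled below: if this thread is not yet executing inside its
   serial team, a serial team is reused or freshly allocated, installed, and
   its ICVs set up; if it already is, the region is simply a deeper nesting
   level, so the nesting counters (t_serialized, t_level) are bumped and a new
   dispatch buffer is pushed. */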
1141 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1142   kmp_info_t *this_thr;
1143   kmp_team_t *serial_team;
1144 
1145   KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1146 
1147   /* Skip all this code for autopar serialized loops since it results in
1148      unacceptable overhead */
1149   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1150     return;
1151 
1152   if (!TCR_4(__kmp_init_parallel))
1153     __kmp_parallel_initialize();
1154   __kmp_resume_if_soft_paused();
1155 
1156   this_thr = __kmp_threads[global_tid];
1157   serial_team = this_thr->th.th_serial_team;
1158 
1159   /* utilize the serialized team held by this thread */
1160   KMP_DEBUG_ASSERT(serial_team);
1161   KMP_MB();
1162 
1163   if (__kmp_tasking_mode != tskm_immediate_exec) {
1164     KMP_DEBUG_ASSERT(
1165         this_thr->th.th_task_team ==
1166         this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1167     KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1168                      NULL);
1169     KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1170                   "team %p, new task_team = NULL\n",
1171                   global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1172     this_thr->th.th_task_team = NULL;
1173   }
1174 
1175   kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1176   if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1177     proc_bind = proc_bind_false;
1178   } else if (proc_bind == proc_bind_default) {
1179     // No proc_bind clause was specified, so use the current value
1180     // of proc-bind-var for this parallel region.
1181     proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1182   }
1183   // Reset for next parallel region
1184   this_thr->th.th_set_proc_bind = proc_bind_default;
1185 
1186 #if OMPT_SUPPORT
1187   ompt_data_t ompt_parallel_data = ompt_data_none;
1188   ompt_data_t *implicit_task_data;
1189   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1190   if (ompt_enabled.enabled &&
1191       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1192 
1193     ompt_task_info_t *parent_task_info;
1194     parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1195 
1196     parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1197     if (ompt_enabled.ompt_callback_parallel_begin) {
1198       int team_size = 1;
1199 
1200       ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1201           &(parent_task_info->task_data), &(parent_task_info->frame),
1202           &ompt_parallel_data, team_size,
1203           ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1204     }
1205   }
1206 #endif // OMPT_SUPPORT
1207 
1208   if (this_thr->th.th_team != serial_team) {
1209     // Nested level will be an index in the nested nthreads array
1210     int level = this_thr->th.th_team->t.t_level;
1211 
1212     if (serial_team->t.t_serialized) {
      /* this serial team was already used
         TODO increase performance by making these locks more specific */
1215       kmp_team_t *new_team;
1216 
1217       __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1218 
1219       new_team =
1220           __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1221 #if OMPT_SUPPORT
1222                               ompt_parallel_data,
1223 #endif
1224                               proc_bind, &this_thr->th.th_current_task->td_icvs,
1225                               0 USE_NESTED_HOT_ARG(NULL));
1226       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1227       KMP_ASSERT(new_team);
1228 
1229       /* setup new serialized team and install it */
1230       new_team->t.t_threads[0] = this_thr;
1231       new_team->t.t_parent = this_thr->th.th_team;
1232       serial_team = new_team;
1233       this_thr->th.th_serial_team = serial_team;
1234 
1235       KF_TRACE(
1236           10,
1237           ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1238            global_tid, serial_team));
1239 
1240       /* TODO the above breaks the requirement that if we run out of resources,
1241          then we can still guarantee that serialized teams are ok, since we may
1242          need to allocate a new one */
1243     } else {
1244       KF_TRACE(
1245           10,
1246           ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1247            global_tid, serial_team));
1248     }
1249 
1250     /* we have to initialize this serial team */
1251     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1252     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1253     KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1254     serial_team->t.t_ident = loc;
1255     serial_team->t.t_serialized = 1;
1256     serial_team->t.t_nproc = 1;
1257     serial_team->t.t_parent = this_thr->th.th_team;
1258     serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1259     this_thr->th.th_team = serial_team;
1260     serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1261 
    KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1263                   this_thr->th.th_current_task));
1264     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1265     this_thr->th.th_current_task->td_flags.executing = 0;
1266 
1267     __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1268 
1269     /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1270        implicit task for each serialized task represented by
1271        team->t.t_serialized? */
1272     copy_icvs(&this_thr->th.th_current_task->td_icvs,
1273               &this_thr->th.th_current_task->td_parent->td_icvs);
1274 
1275     // Thread value exists in the nested nthreads array for the next nested
1276     // level
1277     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1278       this_thr->th.th_current_task->td_icvs.nproc =
1279           __kmp_nested_nth.nth[level + 1];
1280     }
1281 
1282     if (__kmp_nested_proc_bind.used &&
1283         (level + 1 < __kmp_nested_proc_bind.used)) {
1284       this_thr->th.th_current_task->td_icvs.proc_bind =
1285           __kmp_nested_proc_bind.bind_types[level + 1];
1286     }
1287 
1288 #if USE_DEBUGGER
1289     serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1290 #endif
1291     this_thr->th.th_info.ds.ds_tid = 0;
1292 
1293     /* set thread cache values */
1294     this_thr->th.th_team_nproc = 1;
1295     this_thr->th.th_team_master = this_thr;
1296     this_thr->th.th_team_serialized = 1;
1297 
1298     serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1299     serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1300     serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1301 
1302     propagateFPControl(serial_team);
1303 
1304     /* check if we need to allocate dispatch buffers stack */
1305     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1306     if (!serial_team->t.t_dispatch->th_disp_buffer) {
1307       serial_team->t.t_dispatch->th_disp_buffer =
1308           (dispatch_private_info_t *)__kmp_allocate(
1309               sizeof(dispatch_private_info_t));
1310     }
1311     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1312 
1313     KMP_MB();
1314 
1315   } else {
1316     /* this serialized team is already being used,
1317      * that's fine, just add another nested level */
1318     KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1319     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1320     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1321     ++serial_team->t.t_serialized;
1322     this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1323 
1324     // Nested level will be an index in the nested nthreads array
1325     int level = this_thr->th.th_team->t.t_level;
1326     // Thread value exists in the nested nthreads array for the next nested
1327     // level
1328     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1329       this_thr->th.th_current_task->td_icvs.nproc =
1330           __kmp_nested_nth.nth[level + 1];
1331     }
1332     serial_team->t.t_level++;
1333     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1334                   "of serial team %p to %d\n",
1335                   global_tid, serial_team, serial_team->t.t_level));
1336 
1337     /* allocate/push dispatch buffers stack */
1338     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1339     {
1340       dispatch_private_info_t *disp_buffer =
1341           (dispatch_private_info_t *)__kmp_allocate(
1342               sizeof(dispatch_private_info_t));
1343       disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1344       serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1345     }
1346     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1347 
1348     KMP_MB();
1349   }
1350   KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1351 
1352   // Perform the display affinity functionality for
1353   // serialized parallel regions
1354   if (__kmp_display_affinity) {
1355     if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1356         this_thr->th.th_prev_num_threads != 1) {
1357       // NULL means use the affinity-format-var ICV
1358       __kmp_aux_display_affinity(global_tid, NULL);
1359       this_thr->th.th_prev_level = serial_team->t.t_level;
1360       this_thr->th.th_prev_num_threads = 1;
1361     }
1362   }
1363 
1364   if (__kmp_env_consistency_check)
1365     __kmp_push_parallel(global_tid, NULL);
1366 #if OMPT_SUPPORT
1367   serial_team->t.ompt_team_info.master_return_address = codeptr;
1368   if (ompt_enabled.enabled &&
1369       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
        OMPT_GET_FRAME_ADDRESS(0);
1371 
1372     ompt_lw_taskteam_t lw_taskteam;
1373     __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1374                             &ompt_parallel_data, codeptr);
1375 
1376     __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
    // don't use lw_taskteam after linking; its contents were swapped
1378 
1379     /* OMPT implicit task begin */
1380     implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
1381     if (ompt_enabled.ompt_callback_implicit_task) {
1382       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1383           ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
          OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
          ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1385       OMPT_CUR_TASK_INFO(this_thr)
1386           ->thread_num = __kmp_tid_from_gtid(global_tid);
1387     }
1388 
1389     /* OMPT state */
1390     this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
        OMPT_GET_FRAME_ADDRESS(0);
1392   }
1393 #endif
1394 }
1395 
1396 /* most of the work for a fork */
1397 /* return true if we really went parallel, false if serialized */
1398 int __kmp_fork_call(ident_t *loc, int gtid,
1399                     enum fork_context_e call_context, // Intel, GNU, ...
1400                     kmp_int32 argc, microtask_t microtask, launch_t invoker,
1401                     kmp_va_list ap) {
1402   void **argv;
1403   int i;
1404   int master_tid;
1405   int master_this_cons;
1406   kmp_team_t *team;
1407   kmp_team_t *parent_team;
1408   kmp_info_t *master_th;
1409   kmp_root_t *root;
1410   int nthreads;
1411   int master_active;
1412   int master_set_numthreads;
1413   int level;
1414   int active_level;
1415   int teams_level;
1416 #if KMP_NESTED_HOT_TEAMS
1417   kmp_hot_team_ptr_t **p_hot_teams;
1418 #endif
1419   { // KMP_TIME_BLOCK
1420     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1421     KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1422 
1423     KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1424     if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1425       /* Some systems prefer the stack for the root thread(s) to start with */
1426       /* some gap from the parent stack to prevent false sharing. */
1427       void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1428       /* These 2 lines below are so this does not get optimized out */
1429       if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1430         __kmp_stkpadding += (short)((kmp_int64)dummy);
1431     }
1432 
1433     /* initialize if needed */
1434     KMP_DEBUG_ASSERT(
1435         __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1436     if (!TCR_4(__kmp_init_parallel))
1437       __kmp_parallel_initialize();
1438     __kmp_resume_if_soft_paused();
1439 
1440     /* setup current data */
1441     master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1442     // shutdown
1443     parent_team = master_th->th.th_team;
1444     master_tid = master_th->th.th_info.ds.ds_tid;
1445     master_this_cons = master_th->th.th_local.this_construct;
1446     root = master_th->th.th_root;
1447     master_active = root->r.r_active;
1448     master_set_numthreads = master_th->th.th_set_nproc;
1449 
1450 #if OMPT_SUPPORT
1451     ompt_data_t ompt_parallel_data = ompt_data_none;
1452     ompt_data_t *parent_task_data;
1453     ompt_frame_t *ompt_frame;
1454     ompt_data_t *implicit_task_data;
1455     void *return_address = NULL;
1456 
1457     if (ompt_enabled.enabled) {
1458       __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1459                                     NULL, NULL);
1460       return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1461     }
1462 #endif
1463 
1464     // Nested level will be an index in the nested nthreads array
1465     level = parent_team->t.t_level;
    // used to launch non-serial teams even if nesting is not allowed
1467     active_level = parent_team->t.t_active_level;
1468     // needed to check nesting inside the teams
1469     teams_level = master_th->th.th_teams_level;
1470 #if KMP_NESTED_HOT_TEAMS
1471     p_hot_teams = &master_th->th.th_hot_teams;
1472     if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1473       *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1474           sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1475       (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1476       // it is either actual or not needed (when active_level > 0)
1477       (*p_hot_teams)[0].hot_team_nth = 1;
1478     }
1479 #endif
1480 
1481 #if OMPT_SUPPORT
1482     if (ompt_enabled.enabled) {
1483       if (ompt_enabled.ompt_callback_parallel_begin) {
1484         int team_size = master_set_numthreads
1485                             ? master_set_numthreads
1486                             : get__nproc_2(parent_team, master_tid);
1487         int flags = OMPT_INVOKER(call_context) |
1488                     ((microtask == (microtask_t)__kmp_teams_master)
1489                          ? ompt_parallel_league
1490                          : ompt_parallel_team);
1491         ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1492             parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1493             return_address);
1494       }
1495       master_th->th.ompt_thread_info.state = ompt_state_overhead;
1496     }
1497 #endif
1498 
1499     master_th->th.th_ident = loc;
1500 
1501     if (master_th->th.th_teams_microtask && ap &&
1502         microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1503       // AC: This is start of parallel that is nested inside teams construct.
1504       // The team is actual (hot), all workers are ready at the fork barrier.
1505       // No lock needed to initialize the team a bit, then free workers.
1506       parent_team->t.t_ident = loc;
1507       __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1508       parent_team->t.t_argc = argc;
1509       argv = (void **)parent_team->t.t_argv;
1510       for (i = argc - 1; i >= 0; --i)
1511         *argv++ = va_arg(kmp_va_deref(ap), void *);
      // Increment our nested depth level, but do not increase serialization
1513       if (parent_team == master_th->th.th_serial_team) {
1514         // AC: we are in serialized parallel
1515         __kmpc_serialized_parallel(loc, gtid);
1516         KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1517 
1518         if (call_context == fork_context_gnu) {
1519           // AC: need to decrement t_serialized for enquiry functions to work
1520           // correctly, will restore at join time
1521           parent_team->t.t_serialized--;
1522           return TRUE;
1523         }
1524 
1525 #if OMPT_SUPPORT
1526         void *dummy;
1527         void **exit_frame_p;
1528 
1529         ompt_lw_taskteam_t lw_taskteam;
1530 
1531         if (ompt_enabled.enabled) {
1532           __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1533                                   &ompt_parallel_data, return_address);
1534           exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1535 
1536           __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
          // don't use lw_taskteam after linking. content was swapped
1538 
1539           /* OMPT implicit task begin */
1540           implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1541           if (ompt_enabled.ompt_callback_implicit_task) {
1542             OMPT_CUR_TASK_INFO(master_th)
1543                 ->thread_num = __kmp_tid_from_gtid(gtid);
1544             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1545                 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1546                 implicit_task_data, 1,
1547                 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1548           }
1549 
1550           /* OMPT state */
1551           master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1552         } else {
1553           exit_frame_p = &dummy;
1554         }
1555 #endif
1556         // AC: need to decrement t_serialized for enquiry functions to work
1557         // correctly, will restore at join time
1558         parent_team->t.t_serialized--;
1559 
1560         {
1561           KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1562           KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1563           __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1564 #if OMPT_SUPPORT
1565                                  ,
1566                                  exit_frame_p
1567 #endif
1568                                  );
1569         }
1570 
1571 #if OMPT_SUPPORT
1572         if (ompt_enabled.enabled) {
1573           *exit_frame_p = NULL;
1574           OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1575           if (ompt_enabled.ompt_callback_implicit_task) {
1576             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1577                 ompt_scope_end, NULL, implicit_task_data, 1,
1578                 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1579           }
1580           ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1581           __ompt_lw_taskteam_unlink(master_th);
1582           if (ompt_enabled.ompt_callback_parallel_end) {
1583             ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1584                 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1585                 OMPT_INVOKER(call_context) | ompt_parallel_team,
1586                 return_address);
1587           }
1588           master_th->th.ompt_thread_info.state = ompt_state_overhead;
1589         }
1590 #endif
1591         return TRUE;
1592       }
1593 
1594       parent_team->t.t_pkfn = microtask;
1595       parent_team->t.t_invoke = invoker;
1596       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1597       parent_team->t.t_active_level++;
1598       parent_team->t.t_level++;
1599       parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1600 
1601 #if OMPT_SUPPORT
1602       if (ompt_enabled.enabled) {
1603         ompt_lw_taskteam_t lw_taskteam;
1604         __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1605                                 &ompt_parallel_data, return_address);
1606         __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1607       }
1608 #endif
1609 
1610       /* Change number of threads in the team if requested */
      if (master_set_numthreads) { // The parallel has a num_threads clause
1612         if (master_set_numthreads < master_th->th.th_teams_size.nth) {
          // AC: can only reduce number of threads dynamically, can't increase
1614           kmp_info_t **other_threads = parent_team->t.t_threads;
1615           parent_team->t.t_nproc = master_set_numthreads;
1616           for (i = 0; i < master_set_numthreads; ++i) {
1617             other_threads[i]->th.th_team_nproc = master_set_numthreads;
1618           }
1619           // Keep extra threads hot in the team for possible next parallels
1620         }
1621         master_th->th.th_set_nproc = 0;
1622       }
1623 
1624 #if USE_DEBUGGER
1625       if (__kmp_debugging) { // Let debugger override number of threads.
1626         int nth = __kmp_omp_num_threads(loc);
1627         if (nth > 0) { // 0 means debugger doesn't want to change num threads
1628           master_set_numthreads = nth;
1629         }
1630       }
1631 #endif
1632 
1633 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1634       if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1635            KMP_ITT_DEBUG) &&
1636           __kmp_forkjoin_frames_mode == 3 &&
1637           parent_team->t.t_active_level == 1 // only report frames at level 1
1638           && master_th->th.th_teams_size.nteams == 1) {
1639         kmp_uint64 tmp_time = __itt_get_timestamp();
1640         master_th->th.th_frame_time = tmp_time;
1641         parent_team->t.t_region_time = tmp_time;
1642       }
1643       if (__itt_stack_caller_create_ptr) {
1644         // create new stack stitching id before entering fork barrier
1645         parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1646       }
1647 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1648 
1649       KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1650                     "master_th=%p, gtid=%d\n",
1651                     root, parent_team, master_th, gtid));
1652       __kmp_internal_fork(loc, gtid, parent_team);
1653       KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1654                     "master_th=%p, gtid=%d\n",
1655                     root, parent_team, master_th, gtid));
1656 
1657       if (call_context == fork_context_gnu)
1658         return TRUE;
1659 
1660       /* Invoke microtask for MASTER thread */
1661       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1662                     parent_team->t.t_id, parent_team->t.t_pkfn));
1663 
1664       if (!parent_team->t.t_invoke(gtid)) {
1665         KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
1666       }
1667       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1668                     parent_team->t.t_id, parent_team->t.t_pkfn));
1669       KMP_MB(); /* Flush all pending memory write invalidates.  */
1670 
1671       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1672 
1673       return TRUE;
1674     } // Parallel closely nested in teams construct
1675 
1676 #if KMP_DEBUG
1677     if (__kmp_tasking_mode != tskm_immediate_exec) {
1678       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1679                        parent_team->t.t_task_team[master_th->th.th_task_state]);
1680     }
1681 #endif
1682 
1683     if (parent_team->t.t_active_level >=
1684         master_th->th.th_current_task->td_icvs.max_active_levels) {
1685       nthreads = 1;
1686     } else {
1687       int enter_teams = ((ap == NULL && active_level == 0) ||
1688                          (ap && teams_level > 0 && teams_level == level));
1689       nthreads =
1690           master_set_numthreads
1691               ? master_set_numthreads
1692               : get__nproc_2(
1693                     parent_team,
1694                     master_tid); // TODO: get nproc directly from current task
1695 
      // Check if we need to take the forkjoin lock (no need for a serialized
      // parallel outside of a teams construct). This code was moved here from
      // __kmp_reserve_threads() to speed up nested serialized parallels.
1699       if (nthreads > 1) {
1700         if ((get__max_active_levels(master_th) == 1 &&
1701              (root->r.r_in_parallel && !enter_teams)) ||
1702             (__kmp_library == library_serial)) {
1703           KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1704                         " threads\n",
1705                         gtid, nthreads));
1706           nthreads = 1;
1707         }
1708       }
1709       if (nthreads > 1) {
1710         /* determine how many new threads we can use */
1711         __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
        /* AC: If we execute teams from a parallel region (on the host), then
           the teams should be created, but each can have only 1 thread if
           nesting is disabled. If teams is called from a serial region, then
           teams and their threads should be created regardless of the nesting
           setting. */
1716         nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
1717                                          nthreads, enter_teams);
1718         if (nthreads == 1) {
          // Free the lock for single-thread execution here; for multi-thread
          // execution it will be freed later, after the team of threads has
          // been created and initialized
1722           __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1723         }
1724       }
1725     }
1726     KMP_DEBUG_ASSERT(nthreads > 0);
1727 
1728     // If we temporarily changed the set number of threads then restore it now
1729     master_th->th.th_set_nproc = 0;
1730 
1731     /* create a serialized parallel region? */
1732     if (nthreads == 1) {
1733 /* josh todo: hypothetical question: what do we do for OS X*? */
1734 #if KMP_OS_LINUX &&                                                            \
1735     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1736       void *args[argc];
1737 #else
1738       void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1739 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1740           KMP_ARCH_AARCH64) */
1741 
1742       KA_TRACE(20,
1743                ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1744 
1745       __kmpc_serialized_parallel(loc, gtid);
1746 
1747       if (call_context == fork_context_intel) {
1748         /* TODO this sucks, use the compiler itself to pass args! :) */
1749         master_th->th.th_serial_team->t.t_ident = loc;
1750         if (!ap) {
1751           // revert change made in __kmpc_serialized_parallel()
1752           master_th->th.th_serial_team->t.t_level--;
1753 // Get args from parent team for teams construct
1754 
1755 #if OMPT_SUPPORT
1756           void *dummy;
1757           void **exit_frame_p;
1758           ompt_task_info_t *task_info;
1759 
1760           ompt_lw_taskteam_t lw_taskteam;
1761 
1762           if (ompt_enabled.enabled) {
1763             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1764                                     &ompt_parallel_data, return_address);
1765 
1766             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
            // don't use lw_taskteam after linking. content was swapped
1768 
1769             task_info = OMPT_CUR_TASK_INFO(master_th);
1770             exit_frame_p = &(task_info->frame.exit_frame.ptr);
1771             if (ompt_enabled.ompt_callback_implicit_task) {
1772               OMPT_CUR_TASK_INFO(master_th)
1773                   ->thread_num = __kmp_tid_from_gtid(gtid);
1774               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1775                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1776                   &(task_info->task_data), 1,
1777                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1778                   ompt_task_implicit);
1779             }
1780 
1781             /* OMPT state */
1782             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1783           } else {
1784             exit_frame_p = &dummy;
1785           }
1786 #endif
1787 
1788           {
1789             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1790             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1791             __kmp_invoke_microtask(microtask, gtid, 0, argc,
1792                                    parent_team->t.t_argv
1793 #if OMPT_SUPPORT
1794                                    ,
1795                                    exit_frame_p
1796 #endif
1797                                    );
1798           }
1799 
1800 #if OMPT_SUPPORT
1801           if (ompt_enabled.enabled) {
1802             *exit_frame_p = NULL;
1803             if (ompt_enabled.ompt_callback_implicit_task) {
1804               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1805                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1806                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1807                   ompt_task_implicit);
1808             }
1809             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1810             __ompt_lw_taskteam_unlink(master_th);
1811             if (ompt_enabled.ompt_callback_parallel_end) {
1812               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1813                   &ompt_parallel_data, parent_task_data,
1814                   OMPT_INVOKER(call_context) | ompt_parallel_team,
1815                   return_address);
1816             }
1817             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1818           }
1819 #endif
1820         } else if (microtask == (microtask_t)__kmp_teams_master) {
1821           KMP_DEBUG_ASSERT(master_th->th.th_team ==
1822                            master_th->th.th_serial_team);
1823           team = master_th->th.th_team;
1824           // team->t.t_pkfn = microtask;
1825           team->t.t_invoke = invoker;
1826           __kmp_alloc_argv_entries(argc, team, TRUE);
1827           team->t.t_argc = argc;
1828           argv = (void **)team->t.t_argv;
1829           if (ap) {
1830             for (i = argc - 1; i >= 0; --i)
1831               *argv++ = va_arg(kmp_va_deref(ap), void *);
1832           } else {
1833             for (i = 0; i < argc; ++i)
1834               // Get args from parent team for teams construct
1835               argv[i] = parent_team->t.t_argv[i];
1836           }
1837           // AC: revert change made in __kmpc_serialized_parallel()
1838           //     because initial code in teams should have level=0
1839           team->t.t_level--;
1840           // AC: call special invoker for outer "parallel" of teams construct
1841           invoker(gtid);
1842 #if OMPT_SUPPORT
1843           if (ompt_enabled.enabled) {
1844             ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1845             if (ompt_enabled.ompt_callback_implicit_task) {
1846               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1847                   ompt_scope_end, NULL, &(task_info->task_data), 0,
1848                   OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1849             }
1850             if (ompt_enabled.ompt_callback_parallel_end) {
1851               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1852                   &ompt_parallel_data, parent_task_data,
1853                   OMPT_INVOKER(call_context) | ompt_parallel_league,
1854                   return_address);
1855             }
1856             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1857           }
1858 #endif
1859         } else {
1860           argv = args;
1861           for (i = argc - 1; i >= 0; --i)
1862             *argv++ = va_arg(kmp_va_deref(ap), void *);
1863           KMP_MB();
1864 
1865 #if OMPT_SUPPORT
1866           void *dummy;
1867           void **exit_frame_p;
1868           ompt_task_info_t *task_info;
1869 
1870           ompt_lw_taskteam_t lw_taskteam;
1871 
1872           if (ompt_enabled.enabled) {
1873             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1874                                     &ompt_parallel_data, return_address);
1875             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
            // don't use lw_taskteam after linking. content was swapped
1877             task_info = OMPT_CUR_TASK_INFO(master_th);
1878             exit_frame_p = &(task_info->frame.exit_frame.ptr);
1879 
1880             /* OMPT implicit task begin */
1881             implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1882             if (ompt_enabled.ompt_callback_implicit_task) {
1883               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1884                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1885                   implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1886                   ompt_task_implicit);
1887               OMPT_CUR_TASK_INFO(master_th)
1888                   ->thread_num = __kmp_tid_from_gtid(gtid);
1889             }
1890 
1891             /* OMPT state */
1892             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1893           } else {
1894             exit_frame_p = &dummy;
1895           }
1896 #endif
1897 
1898           {
1899             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1900             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1901             __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1902 #if OMPT_SUPPORT
1903                                    ,
1904                                    exit_frame_p
1905 #endif
1906                                    );
1907           }
1908 
1909 #if OMPT_SUPPORT
1910           if (ompt_enabled.enabled) {
1911             *exit_frame_p = NULL;
1912             if (ompt_enabled.ompt_callback_implicit_task) {
1913               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1914                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1915                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1916                   ompt_task_implicit);
1917             }
1918 
1919             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1920             __ompt_lw_taskteam_unlink(master_th);
1921             if (ompt_enabled.ompt_callback_parallel_end) {
1922               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1923                   &ompt_parallel_data, parent_task_data,
1924                   OMPT_INVOKER(call_context) | ompt_parallel_team,
1925                   return_address);
1926             }
1927             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1928           }
1929 #endif
1930         }
1931       } else if (call_context == fork_context_gnu) {
1932 #if OMPT_SUPPORT
1933         ompt_lw_taskteam_t lwt;
1934         __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1935                                 return_address);
1936 
1937         lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1938         __ompt_lw_taskteam_link(&lwt, master_th, 1);
// don't use lw_taskteam after linking. content was swapped
1940 #endif
1941 
1942         // we were called from GNU native code
1943         KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1944         return FALSE;
1945       } else {
1946         KMP_ASSERT2(call_context < fork_context_last,
1947                     "__kmp_fork_call: unknown fork_context parameter");
1948       }
1949 
1950       KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1951       KMP_MB();
1952       return FALSE;
1953     } // if (nthreads == 1)
1954 
    // GEH: only modify the executing flag in the case when not serialized;
    //      the serialized case is handled in __kmpc_serialized_parallel
1957     KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1958                   "curtask=%p, curtask_max_aclevel=%d\n",
1959                   parent_team->t.t_active_level, master_th,
1960                   master_th->th.th_current_task,
1961                   master_th->th.th_current_task->td_icvs.max_active_levels));
1962     // TODO: GEH - cannot do this assertion because root thread not set up as
1963     // executing
1964     // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1965     master_th->th.th_current_task->td_flags.executing = 0;
1966 
1967     if (!master_th->th.th_teams_microtask || level > teams_level) {
1968       /* Increment our nested depth level */
1969       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1970     }
1971 
1972     // See if we need to make a copy of the ICVs.
1973     int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1974     if ((level + 1 < __kmp_nested_nth.used) &&
1975         (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1976       nthreads_icv = __kmp_nested_nth.nth[level + 1];
1977     } else {
1978       nthreads_icv = 0; // don't update
1979     }
1980 
1981     // Figure out the proc_bind_policy for the new team.
1982     kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1983     kmp_proc_bind_t proc_bind_icv =
1984         proc_bind_default; // proc_bind_default means don't update
1985     if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1986       proc_bind = proc_bind_false;
1987     } else {
1988       if (proc_bind == proc_bind_default) {
1989         // No proc_bind clause specified; use current proc-bind-var for this
1990         // parallel region
1991         proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1992       }
1993       /* else: The proc_bind policy was specified explicitly on parallel clause.
1994          This overrides proc-bind-var for this parallel region, but does not
1995          change proc-bind-var. */
1996       // Figure the value of proc-bind-var for the child threads.
1997       if ((level + 1 < __kmp_nested_proc_bind.used) &&
1998           (__kmp_nested_proc_bind.bind_types[level + 1] !=
1999            master_th->th.th_current_task->td_icvs.proc_bind)) {
2000         proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2001       }
2002     }
2003 
2004     // Reset for next parallel region
2005     master_th->th.th_set_proc_bind = proc_bind_default;
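    // Illustrative sketch (comment only), assuming the environment sets
    // OMP_PROC_BIND="spread,close" so __kmp_nested_proc_bind holds
    // {spread, close}:
    //
    //   #pragma omp parallel                   // no clause: proc_bind comes
    //   { /* ... */ }                          // from proc-bind-var (spread);
    //                                          // workers inherit close
    //
    //   #pragma omp parallel proc_bind(master) // clause: arrives via
    //   { /* ... */ }                          // th_set_proc_bind, overrides
    //                                          // this region only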
2006 
2007     if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
2008       kmp_internal_control_t new_icvs;
2009       copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2010       new_icvs.next = NULL;
2011       if (nthreads_icv > 0) {
2012         new_icvs.nproc = nthreads_icv;
2013       }
2014       if (proc_bind_icv != proc_bind_default) {
2015         new_icvs.proc_bind = proc_bind_icv;
2016       }
2017 
2018       /* allocate a new parallel team */
2019       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2020       team = __kmp_allocate_team(root, nthreads, nthreads,
2021 #if OMPT_SUPPORT
2022                                  ompt_parallel_data,
2023 #endif
2024                                  proc_bind, &new_icvs,
2025                                  argc USE_NESTED_HOT_ARG(master_th));
2026     } else {
2027       /* allocate a new parallel team */
2028       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2029       team = __kmp_allocate_team(root, nthreads, nthreads,
2030 #if OMPT_SUPPORT
2031                                  ompt_parallel_data,
2032 #endif
2033                                  proc_bind,
2034                                  &master_th->th.th_current_task->td_icvs,
2035                                  argc USE_NESTED_HOT_ARG(master_th));
2036     }
2037     KF_TRACE(
2038         10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2039 
2040     /* setup the new team */
2041     KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2042     KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2043     KMP_CHECK_UPDATE(team->t.t_ident, loc);
2044     KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2045     KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2046 #if OMPT_SUPPORT
2047     KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2048                           return_address);
2049 #endif
2050     KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2051     // TODO: parent_team->t.t_level == INT_MAX ???
2052     if (!master_th->th.th_teams_microtask || level > teams_level) {
2053       int new_level = parent_team->t.t_level + 1;
2054       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2055       new_level = parent_team->t.t_active_level + 1;
2056       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2057     } else {
2058       // AC: Do not increase parallel level at start of the teams construct
2059       int new_level = parent_team->t.t_level;
2060       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2061       new_level = parent_team->t.t_active_level;
2062       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2063     }
2064     kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2065     // set master's schedule as new run-time schedule
2066     KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2067 
2068     KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2069     KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2070 
2071     // Update the floating point rounding in the team if required.
2072     propagateFPControl(team);
2073 
2074     if (__kmp_tasking_mode != tskm_immediate_exec) {
      // Set master's task team to team's task team. Unless this is a hot
      // team, it should be NULL.
2077       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2078                        parent_team->t.t_task_team[master_th->th.th_task_state]);
2079       KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
2080                     "%p, new task_team %p / team %p\n",
2081                     __kmp_gtid_from_thread(master_th),
2082                     master_th->th.th_task_team, parent_team,
2083                     team->t.t_task_team[master_th->th.th_task_state], team));
2084 
2085       if (active_level || master_th->th.th_task_team) {
2086         // Take a memo of master's task_state
2087         KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2088         if (master_th->th.th_task_state_top >=
2089             master_th->th.th_task_state_stack_sz) { // increase size
2090           kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2091           kmp_uint8 *old_stack, *new_stack;
2092           kmp_uint32 i;
2093           new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2094           for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2095             new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2096           }
2097           for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2098                ++i) { // zero-init rest of stack
2099             new_stack[i] = 0;
2100           }
2101           old_stack = master_th->th.th_task_state_memo_stack;
2102           master_th->th.th_task_state_memo_stack = new_stack;
2103           master_th->th.th_task_state_stack_sz = new_size;
2104           __kmp_free(old_stack);
2105         }
2106         // Store master's task_state on stack
2107         master_th->th
2108             .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2109             master_th->th.th_task_state;
2110         master_th->th.th_task_state_top++;
2111 #if KMP_NESTED_HOT_TEAMS
2112         if (master_th->th.th_hot_teams &&
2113             active_level < __kmp_hot_teams_max_level &&
2114             team == master_th->th.th_hot_teams[active_level].hot_team) {
2115           // Restore master's nested state if nested hot team
2116           master_th->th.th_task_state =
2117               master_th->th
2118                   .th_task_state_memo_stack[master_th->th.th_task_state_top];
2119         } else {
2120 #endif
2121           master_th->th.th_task_state = 0;
2122 #if KMP_NESTED_HOT_TEAMS
2123         }
2124 #endif
2125       }
2126 #if !KMP_NESTED_HOT_TEAMS
2127       KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2128                        (team == root->r.r_hot_team));
2129 #endif
2130     }
2131 
2132     KA_TRACE(
2133         20,
2134         ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2135          gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2136          team->t.t_nproc));
2137     KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2138                      (team->t.t_master_tid == 0 &&
2139                       (team->t.t_parent == root->r.r_root_team ||
2140                        team->t.t_parent->t.t_serialized)));
2141     KMP_MB();
2142 
2143     /* now, setup the arguments */
2144     argv = (void **)team->t.t_argv;
2145     if (ap) {
2146       for (i = argc - 1; i >= 0; --i) {
2147         void *new_argv = va_arg(kmp_va_deref(ap), void *);
2148         KMP_CHECK_UPDATE(*argv, new_argv);
2149         argv++;
2150       }
2151     } else {
2152       for (i = 0; i < argc; ++i) {
2153         // Get args from parent team for teams construct
2154         KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2155       }
2156     }
2157 
2158     /* now actually fork the threads */
2159     KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2160     if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2161       root->r.r_active = TRUE;
2162 
2163     __kmp_fork_team_threads(root, team, master_th, gtid);
2164     __kmp_setup_icv_copy(team, nthreads,
2165                          &master_th->th.th_current_task->td_icvs, loc);
2166 
2167 #if OMPT_SUPPORT
2168     master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2169 #endif
2170 
2171     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2172 
2173 #if USE_ITT_BUILD
2174     if (team->t.t_active_level == 1 // only report frames at level 1
2175         && !master_th->th.th_teams_microtask) { // not in teams construct
2176 #if USE_ITT_NOTIFY
2177       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2178           (__kmp_forkjoin_frames_mode == 3 ||
2179            __kmp_forkjoin_frames_mode == 1)) {
2180         kmp_uint64 tmp_time = 0;
2181         if (__itt_get_timestamp_ptr)
2182           tmp_time = __itt_get_timestamp();
2183         // Internal fork - report frame begin
2184         master_th->th.th_frame_time = tmp_time;
2185         if (__kmp_forkjoin_frames_mode == 3)
2186           team->t.t_region_time = tmp_time;
2187       } else
2188 // only one notification scheme (either "submit" or "forking/joined", not both)
2189 #endif /* USE_ITT_NOTIFY */
2190           if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2191               __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2192         // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2193         __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2194       }
2195     }
2196 #endif /* USE_ITT_BUILD */
2197 
2198     /* now go on and do the work */
2199     KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2200     KMP_MB();
2201     KF_TRACE(10,
2202              ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2203               root, team, master_th, gtid));
2204 
2205 #if USE_ITT_BUILD
2206     if (__itt_stack_caller_create_ptr) {
2207       team->t.t_stack_id =
2208           __kmp_itt_stack_caller_create(); // create new stack stitching id
2209       // before entering fork barrier
2210     }
2211 #endif /* USE_ITT_BUILD */
2212 
    // AC: skip __kmp_internal_fork at teams construct, let only the master
    // threads execute
2215     if (ap) {
2216       __kmp_internal_fork(loc, gtid, team);
2217       KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2218                     "master_th=%p, gtid=%d\n",
2219                     root, team, master_th, gtid));
2220     }
2221 
2222     if (call_context == fork_context_gnu) {
2223       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2224       return TRUE;
2225     }
2226 
2227     /* Invoke microtask for MASTER thread */
2228     KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2229                   team->t.t_id, team->t.t_pkfn));
2230   } // END of timer KMP_fork_call block
2231 
2232 #if KMP_STATS_ENABLED
2233   // If beginning a teams construct, then change thread state
2234   stats_state_e previous_state = KMP_GET_THREAD_STATE();
2235   if (!ap) {
2236     KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2237   }
2238 #endif
2239 
2240   if (!team->t.t_invoke(gtid)) {
2241     KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
2242   }
2243 
2244 #if KMP_STATS_ENABLED
2245   // If was beginning of a teams construct, then reset thread state
2246   if (!ap) {
2247     KMP_SET_THREAD_STATE(previous_state);
2248   }
2249 #endif
2250 
2251   KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2252                 team->t.t_id, team->t.t_pkfn));
2253   KMP_MB(); /* Flush all pending memory write invalidates.  */
2254 
2255   KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2256 
2257 #if OMPT_SUPPORT
2258   if (ompt_enabled.enabled) {
2259     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2260   }
2261 #endif
2262 
2263   return TRUE;
2264 }
2265 
2266 #if OMPT_SUPPORT
2267 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2268                                             kmp_team_t *team) {
2269   // restore state outside the region
2270   thread->th.ompt_thread_info.state =
2271       ((team->t.t_serialized) ? ompt_state_work_serial
2272                               : ompt_state_work_parallel);
2273 }
2274 
2275 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2276                                    kmp_team_t *team, ompt_data_t *parallel_data,
2277                                    int flags, void *codeptr) {
2278   ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2279   if (ompt_enabled.ompt_callback_parallel_end) {
2280     ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2281         parallel_data, &(task_info->task_data), flags, codeptr);
2282   }
2283 
2284   task_info->frame.enter_frame = ompt_data_none;
2285   __kmp_join_restore_state(thread, team);
2286 }
2287 #endif
2288 
2289 void __kmp_join_call(ident_t *loc, int gtid
2290 #if OMPT_SUPPORT
2291                      ,
2292                      enum fork_context_e fork_context
2293 #endif
2294                      ,
2295                      int exit_teams) {
2296   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2297   kmp_team_t *team;
2298   kmp_team_t *parent_team;
2299   kmp_info_t *master_th;
2300   kmp_root_t *root;
2301   int master_active;
2302 
2303   KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2304 
2305   /* setup current data */
2306   master_th = __kmp_threads[gtid];
2307   root = master_th->th.th_root;
2308   team = master_th->th.th_team;
2309   parent_team = team->t.t_parent;
2310 
2311   master_th->th.th_ident = loc;
2312 
2313 #if OMPT_SUPPORT
2314   void *team_microtask = (void *)team->t.t_pkfn;
2315   // For GOMP interface with serialized parallel, need the
2316   // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task
2317   // and end-parallel events.
2318   if (ompt_enabled.enabled &&
2319       !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2320     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2321   }
2322 #endif
2323 
2324 #if KMP_DEBUG
2325   if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2326     KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2327                   "th_task_team = %p\n",
2328                   __kmp_gtid_from_thread(master_th), team,
2329                   team->t.t_task_team[master_th->th.th_task_state],
2330                   master_th->th.th_task_team));
2331     KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2332                      team->t.t_task_team[master_th->th.th_task_state]);
2333   }
2334 #endif
2335 
2336   if (team->t.t_serialized) {
2337     if (master_th->th.th_teams_microtask) {
2338       // We are in teams construct
2339       int level = team->t.t_level;
2340       int tlevel = master_th->th.th_teams_level;
2341       if (level == tlevel) {
2342         // AC: we haven't incremented it earlier at start of teams construct,
2343         //     so do it here - at the end of teams construct
2344         team->t.t_level++;
2345       } else if (level == tlevel + 1) {
2346         // AC: we are exiting parallel inside teams, need to increment
2347         // serialization in order to restore it in the next call to
2348         // __kmpc_end_serialized_parallel
2349         team->t.t_serialized++;
2350       }
2351     }
2352     __kmpc_end_serialized_parallel(loc, gtid);
2353 
2354 #if OMPT_SUPPORT
2355     if (ompt_enabled.enabled) {
2356       __kmp_join_restore_state(master_th, parent_team);
2357     }
2358 #endif
2359 
2360     return;
2361   }
2362 
2363   master_active = team->t.t_master_active;
2364 
2365   if (!exit_teams) {
2366     // AC: No barrier for internal teams at exit from teams construct.
2367     //     But there is barrier for external team (league).
2368     __kmp_internal_join(loc, gtid, team);
2369   } else {
2370     master_th->th.th_task_state =
2371         0; // AC: no tasking in teams (out of any parallel)
2372   }
2373 
2374   KMP_MB();
2375 
2376 #if OMPT_SUPPORT
2377   ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2378   void *codeptr = team->t.ompt_team_info.master_return_address;
2379 #endif
2380 
2381 #if USE_ITT_BUILD
2382   if (__itt_stack_caller_create_ptr) {
2383     // destroy the stack stitching id after join barrier
2384     __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2385   }
2386   // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2387   if (team->t.t_active_level == 1 &&
2388       (!master_th->th.th_teams_microtask || /* not in teams construct */
2389        master_th->th.th_teams_size.nteams == 1)) {
2390     master_th->th.th_ident = loc;
2391     // only one notification scheme (either "submit" or "forking/joined", not
2392     // both)
2393     if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2394         __kmp_forkjoin_frames_mode == 3)
2395       __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2396                              master_th->th.th_frame_time, 0, loc,
2397                              master_th->th.th_team_nproc, 1);
2398     else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2399              !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2400       __kmp_itt_region_joined(gtid);
2401   } // active_level == 1
2402 #endif /* USE_ITT_BUILD */
2403 
2404   if (master_th->th.th_teams_microtask && !exit_teams &&
2405       team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2406       team->t.t_level == master_th->th.th_teams_level + 1) {
// AC: We need to leave the team structure intact at the end of a parallel
// inside the teams construct, so that the same (hot) team works at the next
// parallel; only adjust the nesting levels.
2410 #if OMPT_SUPPORT
2411     ompt_data_t ompt_parallel_data = ompt_data_none;
2412     if (ompt_enabled.enabled) {
2413       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2414       if (ompt_enabled.ompt_callback_implicit_task) {
2415         int ompt_team_size = team->t.t_nproc;
2416         ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2417             ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2418             OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2419       }
2420       task_info->frame.exit_frame = ompt_data_none;
2421       task_info->task_data = ompt_data_none;
2422       ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2423       __ompt_lw_taskteam_unlink(master_th);
2424     }
2425 #endif
2426     /* Decrement our nested depth level */
2427     team->t.t_level--;
2428     team->t.t_active_level--;
2429     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2430 
2431     // Restore number of threads in the team if needed. This code relies on
2432     // the proper adjustment of th_teams_size.nth after the fork in
2433     // __kmp_teams_master on each teams master in the case that
2434     // __kmp_reserve_threads reduced it.
2435     if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2436       int old_num = master_th->th.th_team_nproc;
2437       int new_num = master_th->th.th_teams_size.nth;
2438       kmp_info_t **other_threads = team->t.t_threads;
2439       team->t.t_nproc = new_num;
2440       for (int i = 0; i < old_num; ++i) {
2441         other_threads[i]->th.th_team_nproc = new_num;
2442       }
2443       // Adjust states of non-used threads of the team
2444       for (int i = old_num; i < new_num; ++i) {
2445         // Re-initialize thread's barrier data.
2446         KMP_DEBUG_ASSERT(other_threads[i]);
2447         kmp_balign_t *balign = other_threads[i]->th.th_bar;
2448         for (int b = 0; b < bs_last_barrier; ++b) {
2449           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2450           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2451 #if USE_DEBUGGER
2452           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2453 #endif
2454         }
2455         if (__kmp_tasking_mode != tskm_immediate_exec) {
2456           // Synchronize thread's task state
2457           other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2458         }
2459       }
2460     }
2461 
2462 #if OMPT_SUPPORT
2463     if (ompt_enabled.enabled) {
2464       __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2465                       OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2466     }
2467 #endif
2468 
2469     return;
2470   }
2471 
2472   /* do cleanup and restore the parent team */
2473   master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2474   master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2475 
2476   master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2477 
2478   /* jc: The following lock has instructions with REL and ACQ semantics,
2479      separating the parallel user code called in this parallel region
2480      from the serial user code called after this function returns. */
2481   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2482 
2483   if (!master_th->th.th_teams_microtask ||
2484       team->t.t_level > master_th->th.th_teams_level) {
2485     /* Decrement our nested depth level */
2486     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2487   }
2488   KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2489 
2490 #if OMPT_SUPPORT
2491   if (ompt_enabled.enabled) {
2492     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2493     if (ompt_enabled.ompt_callback_implicit_task) {
2494       int flags = (team_microtask == (void *)__kmp_teams_master)
2495                       ? ompt_task_initial
2496                       : ompt_task_implicit;
2497       int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2498       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2499           ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2500           OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2501     }
2502     task_info->frame.exit_frame = ompt_data_none;
2503     task_info->task_data = ompt_data_none;
2504   }
2505 #endif
2506 
2507   KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2508                 master_th, team));
2509   __kmp_pop_current_task_from_thread(master_th);
2510 
2511 #if KMP_AFFINITY_SUPPORTED
2512   // Restore master thread's partition.
2513   master_th->th.th_first_place = team->t.t_first_place;
2514   master_th->th.th_last_place = team->t.t_last_place;
2515 #endif // KMP_AFFINITY_SUPPORTED
2516   master_th->th.th_def_allocator = team->t.t_def_allocator;
2517 
2518   updateHWFPControl(team);
2519 
2520   if (root->r.r_active != master_active)
2521     root->r.r_active = master_active;
2522 
2523   __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2524                             master_th)); // this will free worker threads
2525 
2526   /* this race was fun to find. make sure the following is in the critical
2527      region otherwise assertions may fail occasionally since the old team may be
2528      reallocated and the hierarchy appears inconsistent. it is actually safe to
2529      run and won't cause any bugs, but will cause those assertion failures. it's
2530      only one deref&assign so might as well put this in the critical region */
2531   master_th->th.th_team = parent_team;
2532   master_th->th.th_team_nproc = parent_team->t.t_nproc;
2533   master_th->th.th_team_master = parent_team->t.t_threads[0];
2534   master_th->th.th_team_serialized = parent_team->t.t_serialized;
2535 
2536   /* restore serialized team, if need be */
2537   if (parent_team->t.t_serialized &&
2538       parent_team != master_th->th.th_serial_team &&
2539       parent_team != root->r.r_root_team) {
2540     __kmp_free_team(root,
2541                     master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2542     master_th->th.th_serial_team = parent_team;
2543   }
2544 
2545   if (__kmp_tasking_mode != tskm_immediate_exec) {
2546     if (master_th->th.th_task_state_top >
2547         0) { // Restore task state from memo stack
2548       KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2549       // Remember master's state if we re-use this nested hot team
2550       master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2551           master_th->th.th_task_state;
2552       --master_th->th.th_task_state_top; // pop
2553       // Now restore state at this level
2554       master_th->th.th_task_state =
2555           master_th->th
2556               .th_task_state_memo_stack[master_th->th.th_task_state_top];
2557     }
2558     // Copy the task team from the parent team to the master thread
2559     master_th->th.th_task_team =
2560         parent_team->t.t_task_team[master_th->th.th_task_state];
2561     KA_TRACE(20,
2562              ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2563               __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2564               parent_team));
2565   }
2566 
2567   // TODO: GEH - cannot do this assertion because root thread not set up as
2568   // executing
2569   // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2570   master_th->th.th_current_task->td_flags.executing = 1;
2571 
2572   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2573 
2574 #if OMPT_SUPPORT
2575   int flags =
2576       OMPT_INVOKER(fork_context) |
2577       ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2578                                                       : ompt_parallel_team);
2579   if (ompt_enabled.enabled) {
2580     __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2581                     codeptr);
2582   }
2583 #endif
2584 
2585   KMP_MB();
2586   KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2587 }
2588 
2589 /* Check whether we should push an internal control record onto the
2590    serial team stack.  If so, do it.  */
2591 void __kmp_save_internal_controls(kmp_info_t *thread) {
2592 
2593   if (thread->th.th_team != thread->th.th_serial_team) {
2594     return;
2595   }
2596   if (thread->th.th_team->t.t_serialized > 1) {
2597     int push = 0;
2598 
2599     if (thread->th.th_team->t.t_control_stack_top == NULL) {
2600       push = 1;
2601     } else {
2602       if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2603           thread->th.th_team->t.t_serialized) {
2604         push = 1;
2605       }
2606     }
2607     if (push) { /* push a record on the serial team's stack */
2608       kmp_internal_control_t *control =
2609           (kmp_internal_control_t *)__kmp_allocate(
2610               sizeof(kmp_internal_control_t));
2611 
2612       copy_icvs(control, &thread->th.th_current_task->td_icvs);
2613 
2614       control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2615 
2616       control->next = thread->th.th_team->t.t_control_stack_top;
2617       thread->th.th_team->t.t_control_stack_top = control;
2618     }
2619   }
2620 }
2621 
2622 /* Changes set_nproc */
2623 void __kmp_set_num_threads(int new_nth, int gtid) {
2624   kmp_info_t *thread;
2625   kmp_root_t *root;
2626 
2627   KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2628   KMP_DEBUG_ASSERT(__kmp_init_serial);
2629 
2630   if (new_nth < 1)
2631     new_nth = 1;
2632   else if (new_nth > __kmp_max_nth)
2633     new_nth = __kmp_max_nth;
2634 
2635   KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2636   thread = __kmp_threads[gtid];
2637   if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2638     return; // nothing to do
2639 
2640   __kmp_save_internal_controls(thread);
2641 
2642   set__nproc(thread, new_nth);
2643 
2644   // If this omp_set_num_threads() call will cause the hot team size to be
2645   // reduced (in the absence of a num_threads clause), then reduce it now,
2646   // rather than waiting for the next parallel region.
2647   root = thread->th.th_root;
2648   if (__kmp_init_parallel && (!root->r.r_active) &&
2649       (root->r.r_hot_team->t.t_nproc > new_nth)
2650 #if KMP_NESTED_HOT_TEAMS
2651       && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2652 #endif
2653       ) {
2654     kmp_team_t *hot_team = root->r.r_hot_team;
2655     int f;
2656 
2657     __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2658 
2659     // Release the extra threads we don't need any more.
2660     for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2661       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2662       if (__kmp_tasking_mode != tskm_immediate_exec) {
        // When decreasing team size, threads no longer in the team should
        // unreference the task team.
2665         hot_team->t.t_threads[f]->th.th_task_team = NULL;
2666       }
2667       __kmp_free_thread(hot_team->t.t_threads[f]);
2668       hot_team->t.t_threads[f] = NULL;
2669     }
2670     hot_team->t.t_nproc = new_nth;
2671 #if KMP_NESTED_HOT_TEAMS
2672     if (thread->th.th_hot_teams) {
2673       KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2674       thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2675     }
2676 #endif
2677 
2678     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2679 
2680     // Update the t_nproc field in the threads that are still active.
2681     for (f = 0; f < new_nth; f++) {
2682       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2683       hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2684     }
2685     // Special flag: the hot team size was changed by omp_set_num_threads()
2686     hot_team->t.t_size_changed = -1;
2687   }
2688 }
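/* Illustrative use (assuming the usual entry points, which live outside this
   file): omp_set_num_threads(n) forwards here with the caller's gtid. Besides
   updating the nproc ICV, the code above may immediately shrink the hot team,
   e.g. calling omp_set_num_threads(2) while an 8-thread hot team is idle
   releases six workers now instead of at the next parallel region. */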
2689 
2690 /* Changes max_active_levels */
2691 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2692   kmp_info_t *thread;
2693 
2694   KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2695                 "%d = (%d)\n",
2696                 gtid, max_active_levels));
2697   KMP_DEBUG_ASSERT(__kmp_init_serial);
2698 
2699   // validate max_active_levels
2700   if (max_active_levels < 0) {
2701     KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2702     // We ignore this call if the user has specified a negative value.
2703     // The current setting won't be changed. The last valid setting will be
2704     // used. A warning will be issued (if warnings are allowed as controlled by
2705     // the KMP_WARNINGS env var).
2706     KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2707                   "max_active_levels for thread %d = (%d)\n",
2708                   gtid, max_active_levels));
2709     return;
2710   }
2711   if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2712     // it's OK, the max_active_levels is within the valid range: [ 0;
2713     // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2714     // We allow a zero value. (implementation defined behavior)
2715   } else {
2716     KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2717                 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2718     max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2719     // Current upper limit is MAX_INT. (implementation defined behavior)
2720     // If the input exceeds the upper limit, we correct the input to be the
2721     // upper limit. (implementation defined behavior)
2722     // In practice, as long as the limit is MAX_INT, the flow should never get here.
2723   }
2724   KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2725                 "max_active_levels for thread %d = (%d)\n",
2726                 gtid, max_active_levels));
2727 
2728   thread = __kmp_threads[gtid];
2729 
2730   __kmp_save_internal_controls(thread);
2731 
2732   set__max_active_levels(thread, max_active_levels);
2733 }
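/* Illustrative behavior of the validation above (user-level names assumed):
   omp_set_max_active_levels(-3) is ignored with a warning and the last valid
   setting remains in effect; a value above KMP_MAX_ACTIVE_LEVELS_LIMIT is
   clamped to that limit; 0 is accepted (implementation-defined, as noted). */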
2734 
2735 /* Gets max_active_levels */
2736 int __kmp_get_max_active_levels(int gtid) {
2737   kmp_info_t *thread;
2738 
2739   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2740   KMP_DEBUG_ASSERT(__kmp_init_serial);
2741 
2742   thread = __kmp_threads[gtid];
2743   KMP_DEBUG_ASSERT(thread->th.th_current_task);
2744   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2745                 "curtask_maxaclevel=%d\n",
2746                 gtid, thread->th.th_current_task,
2747                 thread->th.th_current_task->td_icvs.max_active_levels));
2748   return thread->th.th_current_task->td_icvs.max_active_levels;
2749 }
2750 
2751 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2752 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2753 
2754 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2755 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2756   kmp_info_t *thread;
2757   kmp_sched_t orig_kind;
2758   //    kmp_team_t *team;
2759 
2760   KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2761                 gtid, (int)kind, chunk));
2762   KMP_DEBUG_ASSERT(__kmp_init_serial);
2763 
2764   // Check if the kind parameter is valid, correct if needed.
2765   // Valid parameters should fit in one of two intervals - standard or extended:
2766   //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2767   // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
2768   orig_kind = kind;
2769   kind = __kmp_sched_without_mods(kind);
2770 
2771   if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2772       (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2773     // TODO: Hint needs attention in case we change the default schedule.
2774     __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2775               KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2776               __kmp_msg_null);
2777     kind = kmp_sched_default;
2778     chunk = 0; // ignore chunk value in case of bad kind
2779   }
2780 
2781   thread = __kmp_threads[gtid];
2782 
2783   __kmp_save_internal_controls(thread);
2784 
2785   if (kind < kmp_sched_upper_std) {
2786     if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2787       // distinguish static chunked vs. unchunked: chunk should be invalid to
2788       // indicate an unchunked schedule (which is the default)
2789       thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2790     } else {
2791       thread->th.th_current_task->td_icvs.sched.r_sched_type =
2792           __kmp_sch_map[kind - kmp_sched_lower - 1];
2793     }
2794   } else {
2795     //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2796     //    kmp_sched_lower - 2 ];
2797     thread->th.th_current_task->td_icvs.sched.r_sched_type =
2798         __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2799                       kmp_sched_lower - 2];
2800   }
2801   __kmp_sched_apply_mods_intkind(
2802       orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2803   if (kind == kmp_sched_auto || chunk < 1) {
2804     // ignore parameter chunk for schedule auto
2805     thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2806   } else {
2807     thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2808   }
2809 }
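/* Illustrative mapping (simplified; the omp_set_schedule() entry point lives
   outside this file): a call like omp_set_schedule(omp_sched_dynamic, 4)
   arrives here as (kmp_sched_dynamic, 4) and stores kmp_sch_dynamic_chunked
   with chunk 4 via __kmp_sch_map; an out-of-range kind falls back to
   "static, no chunk", and auto schedules or chunk values < 1 store
   KMP_DEFAULT_CHUNK instead of the given chunk. */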
2810 
2811 /* Gets def_sched_var ICV values */
2812 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2813   kmp_info_t *thread;
2814   enum sched_type th_type;
2815 
2816   KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2817   KMP_DEBUG_ASSERT(__kmp_init_serial);
2818 
2819   thread = __kmp_threads[gtid];
2820 
2821   th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2822   switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2823   case kmp_sch_static:
2824   case kmp_sch_static_greedy:
2825   case kmp_sch_static_balanced:
2826     *kind = kmp_sched_static;
2827     __kmp_sched_apply_mods_stdkind(kind, th_type);
2828     *chunk = 0; // chunk was not set; signal that fact with a zero value
2829     return;
2830   case kmp_sch_static_chunked:
2831     *kind = kmp_sched_static;
2832     break;
2833   case kmp_sch_dynamic_chunked:
2834     *kind = kmp_sched_dynamic;
2835     break;
2836   case kmp_sch_guided_chunked:
2837   case kmp_sch_guided_iterative_chunked:
2838   case kmp_sch_guided_analytical_chunked:
2839     *kind = kmp_sched_guided;
2840     break;
2841   case kmp_sch_auto:
2842     *kind = kmp_sched_auto;
2843     break;
2844   case kmp_sch_trapezoidal:
2845     *kind = kmp_sched_trapezoidal;
2846     break;
2847 #if KMP_STATIC_STEAL_ENABLED
2848   case kmp_sch_static_steal:
2849     *kind = kmp_sched_static_steal;
2850     break;
2851 #endif
2852   default:
2853     KMP_FATAL(UnknownSchedulingType, th_type);
2854   }
2855 
2856   __kmp_sched_apply_mods_stdkind(kind, th_type);
2857   *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2858 }
2859 
2860 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2861 
2862   int ii, dd;
2863   kmp_team_t *team;
2864   kmp_info_t *thr;
2865 
2866   KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2867   KMP_DEBUG_ASSERT(__kmp_init_serial);
2868 
2869   // validate level
2870   if (level == 0)
2871     return 0;
2872   if (level < 0)
2873     return -1;
2874   thr = __kmp_threads[gtid];
2875   team = thr->th.th_team;
2876   ii = team->t.t_level;
2877   if (level > ii)
2878     return -1;
2879 
2880   if (thr->th.th_teams_microtask) {
2881     // AC: we are in a teams region where multiple nested teams share a level
2882     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2883     if (level <=
2884         tlevel) { // otherwise usual algorithm works (will not touch the teams)
2885       KMP_DEBUG_ASSERT(ii >= tlevel);
2886       // AC: since we need to step over the teams league, artificially
2887       // increase ii
2888       if (ii == tlevel) {
2889         ii += 2; // three teams have same level
2890       } else {
2891         ii++; // two teams have same level
2892       }
2893     }
2894   }
2895 
2896   if (ii == level)
2897     return __kmp_tid_from_gtid(gtid);
2898 
2899   dd = team->t.t_serialized;
2900   level++;
2901   while (ii > level) {
2902     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2903     }
2904     if ((team->t.t_serialized) && (!dd)) {
2905       team = team->t.t_parent;
2906       continue;
2907     }
2908     if (ii > level) {
2909       team = team->t.t_parent;
2910       dd = team->t.t_serialized;
2911       ii--;
2912     }
2913   }
2914 
2915   return (dd > 1) ? (0) : (team->t.t_master_tid);
2916 }
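/* Sketch of the walk above: starting from the current team at nesting depth
   t_level, the loop first peels off serialized layers (t_serialized) and then
   climbs t_parent links until the requested level is reached. Level 0 always
   answers 0, an invalid or too-deep level answers -1, and the teams-construct
   adjustment compensates for the extra league/team that share one t_level. */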
2917 
2918 int __kmp_get_team_size(int gtid, int level) {
2919 
2920   int ii, dd;
2921   kmp_team_t *team;
2922   kmp_info_t *thr;
2923 
2924   KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2925   KMP_DEBUG_ASSERT(__kmp_init_serial);
2926 
2927   // validate level
2928   if (level == 0)
2929     return 1;
2930   if (level < 0)
2931     return -1;
2932   thr = __kmp_threads[gtid];
2933   team = thr->th.th_team;
2934   ii = team->t.t_level;
2935   if (level > ii)
2936     return -1;
2937 
2938   if (thr->th.th_teams_microtask) {
2939     // AC: we are in a teams region where multiple nested teams share a level
2940     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2941     if (level <=
2942         tlevel) { // otherwise usual algorithm works (will not touch the teams)
2943       KMP_DEBUG_ASSERT(ii >= tlevel);
2944       // AC: since we need to step over the teams league, artificially
2945       // increase ii
2946       if (ii == tlevel) {
2947         ii += 2; // three teams have same level
2948       } else {
2949         ii++; // two teams have same level
2950       }
2951     }
2952   }
2953 
2954   while (ii > level) {
2955     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2956     }
2957     if (team->t.t_serialized && (!dd)) {
2958       team = team->t.t_parent;
2959       continue;
2960     }
2961     if (ii > level) {
2962       team = team->t.t_parent;
2963       ii--;
2964     }
2965   }
2966 
2967   return team->t.t_nproc;
2968 }
2969 
2970 kmp_r_sched_t __kmp_get_schedule_global() {
2971   // This routine exists because the pairs (__kmp_sched, __kmp_chunk) and
2972   // (__kmp_static, __kmp_guided) may be changed independently by
2973   // kmp_set_defaults, so the up-to-date schedule can be obtained here.
2974 
2975   kmp_r_sched_t r_sched;
2976 
2977   // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
2978   // __kmp_guided. __kmp_sched should keep original value, so that user can set
2979   // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
2980   // different roots (even in OMP 2.5)
2981   enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
2982   enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
2983   if (s == kmp_sch_static) {
2984     // replace STATIC with more detailed schedule (balanced or greedy)
2985     r_sched.r_sched_type = __kmp_static;
2986   } else if (s == kmp_sch_guided_chunked) {
2987     // replace GUIDED with more detailed schedule (iterative or analytical)
2988     r_sched.r_sched_type = __kmp_guided;
2989   } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2990     r_sched.r_sched_type = __kmp_sched;
2991   }
2992   SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
2993 
2994   if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
2995     // __kmp_chunk may be wrong here (if it was not ever set)
2996     r_sched.chunk = KMP_DEFAULT_CHUNK;
2997   } else {
2998     r_sched.chunk = __kmp_chunk;
2999   }
3000 
3001   return r_sched;
3002 }
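/* Illustrative results (assuming typical global defaults): if __kmp_sched is
   plain static with no chunk set, the returned pair is {__kmp_static, i.e. the
   balanced or greedy variant, KMP_DEFAULT_CHUNK}; for a "guided,7" style
   setting it is {__kmp_guided, 7}. Monotonic/nonmonotonic modifier bits on
   __kmp_sched are carried over to the result unchanged. */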
3003 
3004 /* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
3005    at least argc *t_argv entries for the requested team. */
3006 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3007 
3008   KMP_DEBUG_ASSERT(team);
3009   if (!realloc || argc > team->t.t_max_argc) {
3010 
3011     KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3012                    "current entries=%d\n",
3013                    team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3014     /* if previously allocated heap space for args, free them */
3015     if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3016       __kmp_free((void *)team->t.t_argv);
3017 
3018     if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3019       /* use unused space in the cache line for arguments */
3020       team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3021       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3022                      "argv entries\n",
3023                      team->t.t_id, team->t.t_max_argc));
3024       team->t.t_argv = &team->t.t_inline_argv[0];
3025       if (__kmp_storage_map) {
3026         __kmp_print_storage_map_gtid(
3027             -1, &team->t.t_inline_argv[0],
3028             &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3029             (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3030             team->t.t_id);
3031       }
3032     } else {
3033       /* allocate space for arguments in the heap */
3034       team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3035                                ? KMP_MIN_MALLOC_ARGV_ENTRIES
3036                                : 2 * argc;
3037       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3038                      "argv entries\n",
3039                      team->t.t_id, team->t.t_max_argc));
3040       team->t.t_argv =
3041           (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3042       if (__kmp_storage_map) {
3043         __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3044                                      &team->t.t_argv[team->t.t_max_argc],
3045                                      sizeof(void *) * team->t.t_max_argc,
3046                                      "team_%d.t_argv", team->t.t_id);
3047       }
3048     }
3049   }
3050 }
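/* Sizing illustration (numbers follow directly from the branches above): an
   argc that fits in KMP_INLINE_ARGV_ENTRIES reuses the inline storage in the
   team structure; otherwise the heap buffer gets
   max(KMP_MIN_MALLOC_ARGV_ENTRIES, 2 * argc) entries, so a slowly growing argc
   does not force a reallocation on every parallel region. */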
3051 
3052 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3053   int i;
3054   int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3055   team->t.t_threads =
3056       (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3057   team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3058       sizeof(dispatch_shared_info_t) * num_disp_buff);
3059   team->t.t_dispatch =
3060       (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3061   team->t.t_implicit_task_taskdata =
3062       (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3063   team->t.t_max_nproc = max_nth;
3064 
3065   /* setup dispatch buffers */
3066   for (i = 0; i < num_disp_buff; ++i) {
3067     team->t.t_disp_buffer[i].buffer_index = i;
3068     team->t.t_disp_buffer[i].doacross_buf_idx = i;
3069   }
3070 }
3071 
3072 static void __kmp_free_team_arrays(kmp_team_t *team) {
3073   /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3074   int i;
3075   for (i = 0; i < team->t.t_max_nproc; ++i) {
3076     if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3077       __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3078       team->t.t_dispatch[i].th_disp_buffer = NULL;
3079     }
3080   }
3081 #if KMP_USE_HIER_SCHED
3082   __kmp_dispatch_free_hierarchies(team);
3083 #endif
3084   __kmp_free(team->t.t_threads);
3085   __kmp_free(team->t.t_disp_buffer);
3086   __kmp_free(team->t.t_dispatch);
3087   __kmp_free(team->t.t_implicit_task_taskdata);
3088   team->t.t_threads = NULL;
3089   team->t.t_disp_buffer = NULL;
3090   team->t.t_dispatch = NULL;
3091   team->t.t_implicit_task_taskdata = NULL;
3092 }
3093 
3094 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3095   kmp_info_t **oldThreads = team->t.t_threads;
3096 
3097   __kmp_free(team->t.t_disp_buffer);
3098   __kmp_free(team->t.t_dispatch);
3099   __kmp_free(team->t.t_implicit_task_taskdata);
3100   __kmp_allocate_team_arrays(team, max_nth);
3101 
3102   KMP_MEMCPY(team->t.t_threads, oldThreads,
3103              team->t.t_nproc * sizeof(kmp_info_t *));
3104 
3105   __kmp_free(oldThreads);
3106 }
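/* Note on the reallocation above: only the t_threads pointers (t_nproc of
   them) survive the resize; the dispatch buffers, per-thread dispatch slots
   and implicit task data are allocated fresh by __kmp_allocate_team_arrays and
   are presumably reinitialized by the caller before the team runs again. */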
3107 
3108 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3109 
3110   kmp_r_sched_t r_sched =
3111       __kmp_get_schedule_global(); // get current state of scheduling globals
3112 
3113   KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3114 
3115   kmp_internal_control_t g_icvs = {
3116     0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3117     (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3118     // adjustment of threads (per thread)
3119     (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3120     // whether blocktime is explicitly set
3121     __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3122 #if KMP_USE_MONITOR
3123     __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3124 // intervals
3125 #endif
3126     __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3127     // next parallel region (per thread)
3128     // (use a max ub on value if __kmp_parallel_initialize not called yet)
3129     __kmp_cg_max_nth, // int thread_limit;
3130     __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3131     // for max_active_levels
3132     r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3133     // {sched,chunk} pair
3134     __kmp_nested_proc_bind.bind_types[0],
3135     __kmp_default_device,
3136     NULL // struct kmp_internal_control *next;
3137   };
3138 
3139   return g_icvs;
3140 }
3141 
3142 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3143 
3144   kmp_internal_control_t gx_icvs;
3145   gx_icvs.serial_nesting_level =
3146       0; // probably =team->t.t_serialized, as in __kmp_save_internal_controls
3147   copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3148   gx_icvs.next = NULL;
3149 
3150   return gx_icvs;
3151 }
3152 
3153 static void __kmp_initialize_root(kmp_root_t *root) {
3154   int f;
3155   kmp_team_t *root_team;
3156   kmp_team_t *hot_team;
3157   int hot_team_max_nth;
3158   kmp_r_sched_t r_sched =
3159       __kmp_get_schedule_global(); // get current state of scheduling globals
3160   kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3161   KMP_DEBUG_ASSERT(root);
3162   KMP_ASSERT(!root->r.r_begin);
3163 
3164   /* setup the root state structure */
3165   __kmp_init_lock(&root->r.r_begin_lock);
3166   root->r.r_begin = FALSE;
3167   root->r.r_active = FALSE;
3168   root->r.r_in_parallel = 0;
3169   root->r.r_blocktime = __kmp_dflt_blocktime;
3170 
3171   /* setup the root team for this task */
3172   /* allocate the root team structure */
3173   KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3174 
3175   root_team =
3176       __kmp_allocate_team(root,
3177                           1, // new_nproc
3178                           1, // max_nproc
3179 #if OMPT_SUPPORT
3180                           ompt_data_none, // root parallel id
3181 #endif
3182                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3183                           0 // argc
3184                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3185                           );
3186 #if USE_DEBUGGER
3187   // Non-NULL value should be assigned to make the debugger display the root
3188   // team.
3189   TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3190 #endif
3191 
3192   KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3193 
3194   root->r.r_root_team = root_team;
3195   root_team->t.t_control_stack_top = NULL;
3196 
3197   /* initialize root team */
3198   root_team->t.t_threads[0] = NULL;
3199   root_team->t.t_nproc = 1;
3200   root_team->t.t_serialized = 1;
3201   // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3202   root_team->t.t_sched.sched = r_sched.sched;
3203   KA_TRACE(
3204       20,
3205       ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3206        root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3207 
3208   /* setup the  hot team for this task */
3209   /* allocate the hot team structure */
3210   KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3211 
3212   hot_team =
3213       __kmp_allocate_team(root,
3214                           1, // new_nproc
3215                           __kmp_dflt_team_nth_ub * 2, // max_nproc
3216 #if OMPT_SUPPORT
3217                           ompt_data_none, // root parallel id
3218 #endif
3219                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3220                           0 // argc
3221                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3222                           );
3223   KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3224 
3225   root->r.r_hot_team = hot_team;
3226   root_team->t.t_control_stack_top = NULL;
3227 
3228   /* first-time initialization */
3229   hot_team->t.t_parent = root_team;
3230 
3231   /* initialize hot team */
3232   hot_team_max_nth = hot_team->t.t_max_nproc;
3233   for (f = 0; f < hot_team_max_nth; ++f) {
3234     hot_team->t.t_threads[f] = NULL;
3235   }
3236   hot_team->t.t_nproc = 1;
3237   // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3238   hot_team->t.t_sched.sched = r_sched.sched;
3239   hot_team->t.t_size_changed = 0;
3240 }
3241 
3242 #ifdef KMP_DEBUG
3243 
3244 typedef struct kmp_team_list_item {
3245   kmp_team_p const *entry;
3246   struct kmp_team_list_item *next;
3247 } kmp_team_list_item_t;
3248 typedef kmp_team_list_item_t *kmp_team_list_t;
3249 
3250 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3251     kmp_team_list_t list, // List of teams.
3252     kmp_team_p const *team // Team to add.
3253     ) {
3254 
3255   // List must terminate with item where both entry and next are NULL.
3256   // Team is added to the list only once.
3257   // List is sorted in ascending order by team id.
3258   // Team id is *not* a key.
3259 
3260   kmp_team_list_t l;
3261 
3262   KMP_DEBUG_ASSERT(list != NULL);
3263   if (team == NULL) {
3264     return;
3265   }
3266 
3267   __kmp_print_structure_team_accum(list, team->t.t_parent);
3268   __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3269 
3270   // Search list for the team.
3271   l = list;
3272   while (l->next != NULL && l->entry != team) {
3273     l = l->next;
3274   }
3275   if (l->next != NULL) {
3276     return; // Team has been added before, exit.
3277   }
3278 
3279   // Team is not found. Search list again for insertion point.
3280   l = list;
3281   while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3282     l = l->next;
3283   }
3284 
3285   // Insert team.
3286   {
3287     kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3288         sizeof(kmp_team_list_item_t));
3289     *item = *l;
3290     l->entry = team;
3291     l->next = item;
3292   }
3293 }
3294 
3295 static void __kmp_print_structure_team(char const *title,
3296                                        kmp_team_p const *team) {
3298   __kmp_printf("%s", title);
3299   if (team != NULL) {
3300     __kmp_printf("%2x %p\n", team->t.t_id, team);
3301   } else {
3302     __kmp_printf(" - (nil)\n");
3303   }
3304 }
3305 
3306 static void __kmp_print_structure_thread(char const *title,
3307                                          kmp_info_p const *thread) {
3308   __kmp_printf("%s", title);
3309   if (thread != NULL) {
3310     __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3311   } else {
3312     __kmp_printf(" - (nil)\n");
3313   }
3314 }
3315 
3316 void __kmp_print_structure(void) {
3317 
3318   kmp_team_list_t list;
3319 
3320   // Initialize list of teams.
3321   list =
3322       (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3323   list->entry = NULL;
3324   list->next = NULL;
3325 
3326   __kmp_printf("\n------------------------------\nGlobal Thread "
3327                "Table\n------------------------------\n");
3328   {
3329     int gtid;
3330     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3331       __kmp_printf("%2d", gtid);
3332       if (__kmp_threads != NULL) {
3333         __kmp_printf(" %p", __kmp_threads[gtid]);
3334       }
3335       if (__kmp_root != NULL) {
3336         __kmp_printf(" %p", __kmp_root[gtid]);
3337       }
3338       __kmp_printf("\n");
3339     }
3340   }
3341 
3342   // Print out __kmp_threads array.
3343   __kmp_printf("\n------------------------------\nThreads\n--------------------"
3344                "----------\n");
3345   if (__kmp_threads != NULL) {
3346     int gtid;
3347     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3348       kmp_info_t const *thread = __kmp_threads[gtid];
3349       if (thread != NULL) {
3350         __kmp_printf("GTID %2d %p:\n", gtid, thread);
3351         __kmp_printf("    Our Root:        %p\n", thread->th.th_root);
3352         __kmp_print_structure_team("    Our Team:     ", thread->th.th_team);
3353         __kmp_print_structure_team("    Serial Team:  ",
3354                                    thread->th.th_serial_team);
3355         __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
3356         __kmp_print_structure_thread("    Master:       ",
3357                                      thread->th.th_team_master);
3358         __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
3359         __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
3360         __kmp_printf("    Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3361         __kmp_print_structure_thread("    Next in pool: ",
3362                                      thread->th.th_next_pool);
3363         __kmp_printf("\n");
3364         __kmp_print_structure_team_accum(list, thread->th.th_team);
3365         __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3366       }
3367     }
3368   } else {
3369     __kmp_printf("Threads array is not allocated.\n");
3370   }
3371 
3372   // Print out __kmp_root array.
3373   __kmp_printf("\n------------------------------\nUbers\n----------------------"
3374                "--------\n");
3375   if (__kmp_root != NULL) {
3376     int gtid;
3377     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3378       kmp_root_t const *root = __kmp_root[gtid];
3379       if (root != NULL) {
3380         __kmp_printf("GTID %2d %p:\n", gtid, root);
3381         __kmp_print_structure_team("    Root Team:    ", root->r.r_root_team);
3382         __kmp_print_structure_team("    Hot Team:     ", root->r.r_hot_team);
3383         __kmp_print_structure_thread("    Uber Thread:  ",
3384                                      root->r.r_uber_thread);
3385         __kmp_printf("    Active?:      %2d\n", root->r.r_active);
3386         __kmp_printf("    In Parallel:  %2d\n",
3387                      KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3388         __kmp_printf("\n");
3389         __kmp_print_structure_team_accum(list, root->r.r_root_team);
3390         __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3391       }
3392     }
3393   } else {
3394     __kmp_printf("Ubers array is not allocated.\n");
3395   }
3396 
3397   __kmp_printf("\n------------------------------\nTeams\n----------------------"
3398                "--------\n");
3399   while (list->next != NULL) {
3400     kmp_team_p const *team = list->entry;
3401     int i;
3402     __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3403     __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
3404     __kmp_printf("    Master TID:       %2d\n", team->t.t_master_tid);
3405     __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
3406     __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
3407     __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
3408     for (i = 0; i < team->t.t_nproc; ++i) {
3409       __kmp_printf("    Thread %2d:      ", i);
3410       __kmp_print_structure_thread("", team->t.t_threads[i]);
3411     }
3412     __kmp_print_structure_team("    Next in pool:     ", team->t.t_next_pool);
3413     __kmp_printf("\n");
3414     list = list->next;
3415   }
3416 
3417   // Print out __kmp_thread_pool and __kmp_team_pool.
3418   __kmp_printf("\n------------------------------\nPools\n----------------------"
3419                "--------\n");
3420   __kmp_print_structure_thread("Thread pool:          ",
3421                                CCAST(kmp_info_t *, __kmp_thread_pool));
3422   __kmp_print_structure_team("Team pool:            ",
3423                              CCAST(kmp_team_t *, __kmp_team_pool));
3424   __kmp_printf("\n");
3425 
3426   // Free team list.
3427   while (list != NULL) {
3428     kmp_team_list_item_t *item = list;
3429     list = list->next;
3430     KMP_INTERNAL_FREE(item);
3431   }
3432 }
3433 
3434 #endif
3435 
3436 //---------------------------------------------------------------------------
3437 //  Stuff for per-thread fast random number generator
3438 //  Table of primes
3439 static const unsigned __kmp_primes[] = {
3440     0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3441     0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3442     0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3443     0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3444     0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3445     0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3446     0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3447     0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3448     0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3449     0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3450     0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3451 
3452 //---------------------------------------------------------------------------
3453 //  __kmp_get_random: Get a random number using a linear congruential method.
3454 unsigned short __kmp_get_random(kmp_info_t *thread) {
3455   unsigned x = thread->th.th_x;
3456   unsigned short r = (unsigned short)(x >> 16);
3457 
3458   thread->th.th_x = x * thread->th.th_a + 1;
3459 
3460   KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3461                 thread->th.th_info.ds.ds_tid, r));
3462 
3463   return r;
3464 }
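/* The recurrence above is a 32-bit linear congruential generator,
   x_{n+1} = a * x_n + 1 (mod 2^32), with the per-thread multiplier 'a' chosen
   from __kmp_primes in __kmp_init_random below; only the upper 16 bits of the
   state are returned because the low-order bits of a power-of-two-modulus LCG
   have short periods. */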
3465 //--------------------------------------------------------
3466 // __kmp_init_random: Initialize a random number generator
3467 void __kmp_init_random(kmp_info_t *thread) {
3468   unsigned seed = thread->th.th_info.ds.ds_tid;
3469 
3470   thread->th.th_a =
3471       __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3472   thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3473   KA_TRACE(30,
3474            ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3475 }
3476 
3477 #if KMP_OS_WINDOWS
3478 /* reclaim array entries for root threads that are already dead, returns number
3479  * reclaimed */
3480 static int __kmp_reclaim_dead_roots(void) {
3481   int i, r = 0;
3482 
3483   for (i = 0; i < __kmp_threads_capacity; ++i) {
3484     if (KMP_UBER_GTID(i) &&
3485         !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3486         !__kmp_root[i]
3487              ->r.r_active) { // AC: reclaim only roots died in non-active state
3488       r += __kmp_unregister_root_other_thread(i);
3489     }
3490   }
3491   return r;
3492 }
3493 #endif
3494 
3495 /* This function attempts to create free entries in __kmp_threads and
3496    __kmp_root, and returns the number of free entries generated.
3497 
3498    For Windows* OS static library, the first mechanism used is to reclaim array
3499    entries for root threads that are already dead.
3500 
3501    On all platforms, expansion is attempted on the arrays __kmp_threads and
3502    __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3503    capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3504    threadprivate cache array has been created. Synchronization with
3505    __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3506 
3507    After any dead root reclamation, if the clipping value allows array expansion
3508    to result in the generation of a total of nNeed free slots, the function does
3509    that expansion. If not, nothing is done beyond the possible initial root
3510    thread reclamation.
3511 
3512    If any argument is negative, the behavior is undefined. */
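/* Worked example (illustrative numbers only): with __kmp_threads_capacity at
   64 and nNeed == 70, minimumRequiredCapacity is 134; the doubling loop below
   tries 128, then 256, and stops at 256 (the first value >= 134, clipped to
   __kmp_sys_max_nth), so 192 new entries are reported as added. */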
3513 static int __kmp_expand_threads(int nNeed) {
3514   int added = 0;
3515   int minimumRequiredCapacity;
3516   int newCapacity;
3517   kmp_info_t **newThreads;
3518   kmp_root_t **newRoot;
3519 
3520 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3521 // resizing __kmp_threads does not need additional protection if foreign
3522 // threads are present
3523 
3524 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3525   /* only for Windows static library */
3526   /* reclaim array entries for root threads that are already dead */
3527   added = __kmp_reclaim_dead_roots();
3528 
3529   if (nNeed) {
3530     nNeed -= added;
3531     if (nNeed < 0)
3532       nNeed = 0;
3533   }
3534 #endif
3535   if (nNeed <= 0)
3536     return added;
3537 
3538   // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3539   // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3540   // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3541   // > __kmp_max_nth in one of two ways:
3542   //
3543   // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
3544   //    may not be reused by another thread, so we may need to increase
3545   //    __kmp_threads_capacity to __kmp_max_nth + 1.
3546   //
3547   // 2) New foreign root(s) are encountered.  We always register new foreign
3548   //    roots. This may cause a smaller # of threads to be allocated at
3549   //    subsequent parallel regions, but the worker threads hang around (and
3550   //    eventually go to sleep) and need slots in the __kmp_threads[] array.
3551   //
3552   // Anyway, that is the reason for moving the check to see if
3553   // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3554   // instead of having it performed here. -BB
3555 
3556   KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3557 
3558   /* compute expansion headroom to check if we can expand */
3559   if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3560     /* possible expansion too small -- give up */
3561     return added;
3562   }
3563   minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3564 
3565   newCapacity = __kmp_threads_capacity;
3566   do {
3567     newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3568                                                           : __kmp_sys_max_nth;
3569   } while (newCapacity < minimumRequiredCapacity);
3570   newThreads = (kmp_info_t **)__kmp_allocate(
3571       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3572   newRoot =
3573       (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3574   KMP_MEMCPY(newThreads, __kmp_threads,
3575              __kmp_threads_capacity * sizeof(kmp_info_t *));
3576   KMP_MEMCPY(newRoot, __kmp_root,
3577              __kmp_threads_capacity * sizeof(kmp_root_t *));
3578 
3579   kmp_info_t **temp_threads = __kmp_threads;
3580   *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3581   *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3582   __kmp_free(temp_threads);
3583   added += newCapacity - __kmp_threads_capacity;
3584   *(volatile int *)&__kmp_threads_capacity = newCapacity;
3585 
3586   if (newCapacity > __kmp_tp_capacity) {
3587     __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3588     if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3589       __kmp_threadprivate_resize_cache(newCapacity);
3590     } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3591       *(volatile int *)&__kmp_tp_capacity = newCapacity;
3592     }
3593     __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3594   }
3595 
3596   return added;
3597 }
3598 
3599 /* Register the current thread as a root thread and obtain our gtid. We must
3600    have the __kmp_initz_lock held at this point. Argument TRUE only if are the
3601    thread that calls from __kmp_do_serial_initialize() */
3602 int __kmp_register_root(int initial_thread) {
3603   kmp_info_t *root_thread;
3604   kmp_root_t *root;
3605   int gtid;
3606   int capacity;
3607   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3608   KA_TRACE(20, ("__kmp_register_root: entered\n"));
3609   KMP_MB();
3610 
3611   /* 2007-03-02:
3612      If the initial thread did not invoke the OpenMP RTL yet, and this thread
3613      is not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity"
3614      condition does not work as expected -- it may return false (meaning there
3615      is at least one empty slot in the __kmp_threads array), while the only
3616      free slot may be #0, which is reserved for the initial thread and so
3617      cannot be used for this one. The following code works around this bug.
3618 
3619      However, the right solution seems to be not to reserve slot #0 for the
3620      initial thread, because:
3621      (1) there is no magic in slot #0, and
3622      (2) we cannot detect the initial thread reliably (the first thread that
3623         performs serial initialization may not be the real initial thread).
3624   */
3625   capacity = __kmp_threads_capacity;
3626   if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3627     --capacity;
3628   }
3629 
3630   /* see if there are too many threads */
3631   if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3632     if (__kmp_tp_cached) {
3633       __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3634                   KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3635                   KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3636     } else {
3637       __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3638                   __kmp_msg_null);
3639     }
3640   }
3641 
3642   /* find an available thread slot */
3643   /* Don't reassign the zero slot, since it must only be used by the initial
3644      thread */
3645   for (gtid = (initial_thread ? 0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL;
3646        gtid++)
3647     ;
3648   KA_TRACE(1,
3649            ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3650   KMP_ASSERT(gtid < __kmp_threads_capacity);
3651 
3652   /* update global accounting */
3653   __kmp_all_nth++;
3654   TCW_4(__kmp_nth, __kmp_nth + 1);
3655 
3656   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3657   // numbers of procs, and method #2 (keyed API call) for higher numbers.
3658   if (__kmp_adjust_gtid_mode) {
3659     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3660       if (TCR_4(__kmp_gtid_mode) != 2) {
3661         TCW_4(__kmp_gtid_mode, 2);
3662       }
3663     } else {
3664       if (TCR_4(__kmp_gtid_mode) != 1) {
3665         TCW_4(__kmp_gtid_mode, 1);
3666       }
3667     }
3668   }
3669 
3670 #ifdef KMP_ADJUST_BLOCKTIME
3671   /* Adjust blocktime to zero if necessary            */
3672   /* Middle initialization might not have occurred yet */
3673   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3674     if (__kmp_nth > __kmp_avail_proc) {
3675       __kmp_zero_bt = TRUE;
3676     }
3677   }
3678 #endif /* KMP_ADJUST_BLOCKTIME */
3679 
3680   /* setup this new hierarchy */
3681   if (!(root = __kmp_root[gtid])) {
3682     root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3683     KMP_DEBUG_ASSERT(!root->r.r_root_team);
3684   }
3685 
3686 #if KMP_STATS_ENABLED
3687   // Initialize stats as soon as possible (right after gtid assignment).
3688   __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3689   __kmp_stats_thread_ptr->startLife();
3690   KMP_SET_THREAD_STATE(SERIAL_REGION);
3691   KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3692 #endif
3693   __kmp_initialize_root(root);
3694 
3695   /* setup new root thread structure */
3696   if (root->r.r_uber_thread) {
3697     root_thread = root->r.r_uber_thread;
3698   } else {
3699     root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3700     if (__kmp_storage_map) {
3701       __kmp_print_thread_storage_map(root_thread, gtid);
3702     }
3703     root_thread->th.th_info.ds.ds_gtid = gtid;
3704 #if OMPT_SUPPORT
3705     root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3706 #endif
3707     root_thread->th.th_root = root;
3708     if (__kmp_env_consistency_check) {
3709       root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3710     }
3711 #if USE_FAST_MEMORY
3712     __kmp_initialize_fast_memory(root_thread);
3713 #endif /* USE_FAST_MEMORY */
3714 
3715 #if KMP_USE_BGET
3716     KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3717     __kmp_initialize_bget(root_thread);
3718 #endif
3719     __kmp_init_random(root_thread); // Initialize random number generator
3720   }
3721 
3722   /* setup the serial team held in reserve by the root thread */
3723   if (!root_thread->th.th_serial_team) {
3724     kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3725     KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3726     root_thread->th.th_serial_team = __kmp_allocate_team(
3727         root, 1, 1,
3728 #if OMPT_SUPPORT
3729         ompt_data_none, // root parallel id
3730 #endif
3731         proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3732   }
3733   KMP_ASSERT(root_thread->th.th_serial_team);
3734   KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3735                 root_thread->th.th_serial_team));
3736 
3737   /* drop root_thread into place */
3738   TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3739 
3740   root->r.r_root_team->t.t_threads[0] = root_thread;
3741   root->r.r_hot_team->t.t_threads[0] = root_thread;
3742   root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3743   // AC: the team is created in reserve, not for execution (unused for now).
3744   root_thread->th.th_serial_team->t.t_serialized = 0;
3745   root->r.r_uber_thread = root_thread;
3746 
3747   /* initialize the thread, get it ready to go */
3748   __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3749   TCW_4(__kmp_init_gtid, TRUE);
3750 
3751   /* prepare the master thread for get_gtid() */
3752   __kmp_gtid_set_specific(gtid);
3753 
3754 #if USE_ITT_BUILD
3755   __kmp_itt_thread_name(gtid);
3756 #endif /* USE_ITT_BUILD */
3757 
3758 #ifdef KMP_TDATA_GTID
3759   __kmp_gtid = gtid;
3760 #endif
3761   __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3762   KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3763 
3764   KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3765                 "plain=%u\n",
3766                 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3767                 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3768                 KMP_INIT_BARRIER_STATE));
3769   { // Initialize barrier data.
3770     int b;
3771     for (b = 0; b < bs_last_barrier; ++b) {
3772       root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3773 #if USE_DEBUGGER
3774       root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3775 #endif
3776     }
3777   }
3778   KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3779                    KMP_INIT_BARRIER_STATE);
3780 
3781 #if KMP_AFFINITY_SUPPORTED
3782   root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3783   root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3784   root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3785   root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3786   if (TCR_4(__kmp_init_middle)) {
3787     __kmp_affinity_set_init_mask(gtid, TRUE);
3788   }
3789 #endif /* KMP_AFFINITY_SUPPORTED */
3790   root_thread->th.th_def_allocator = __kmp_def_allocator;
3791   root_thread->th.th_prev_level = 0;
3792   root_thread->th.th_prev_num_threads = 1;
3793 
3794   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3795   tmp->cg_root = root_thread;
3796   tmp->cg_thread_limit = __kmp_cg_max_nth;
3797   tmp->cg_nthreads = 1;
3798   KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3799                  " cg_nthreads init to 1\n",
3800                  root_thread, tmp));
3801   tmp->up = NULL;
3802   root_thread->th.th_cg_roots = tmp;
3803 
3804   __kmp_root_counter++;
3805 
3806 #if OMPT_SUPPORT
3807   if (!initial_thread && ompt_enabled.enabled) {
3808 
3809     kmp_info_t *root_thread = ompt_get_thread();
3810 
3811     ompt_set_thread_state(root_thread, ompt_state_overhead);
3812 
3813     if (ompt_enabled.ompt_callback_thread_begin) {
3814       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3815           ompt_thread_initial, __ompt_get_thread_data_internal());
3816     }
3817     ompt_data_t *task_data;
3818     ompt_data_t *parallel_data;
3819     __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
3820     if (ompt_enabled.ompt_callback_implicit_task) {
3821       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3822           ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3823     }
3824 
3825     ompt_set_thread_state(root_thread, ompt_state_work_serial);
3826   }
3827 #endif
3828 
3829   KMP_MB();
3830   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3831 
3832   return gtid;
3833 }
3834 
3835 #if KMP_NESTED_HOT_TEAMS
3836 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3837                                 const int max_level) {
3838   int i, n, nth;
3839   kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3840   if (!hot_teams || !hot_teams[level].hot_team) {
3841     return 0;
3842   }
3843   KMP_DEBUG_ASSERT(level < max_level);
3844   kmp_team_t *team = hot_teams[level].hot_team;
3845   nth = hot_teams[level].hot_team_nth;
3846   n = nth - 1; // master is not freed
3847   if (level < max_level - 1) {
3848     for (i = 0; i < nth; ++i) {
3849       kmp_info_t *th = team->t.t_threads[i];
3850       n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3851       if (i > 0 && th->th.th_hot_teams) {
3852         __kmp_free(th->th.th_hot_teams);
3853         th->th.th_hot_teams = NULL;
3854       }
3855     }
3856   }
3857   __kmp_free_team(root, team, NULL);
3858   return n;
3859 }
3860 #endif
3861 
3862 // Resets a root thread and clears its root and hot teams.
3863 // Returns the number of __kmp_threads entries directly and indirectly freed.
3864 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3865   kmp_team_t *root_team = root->r.r_root_team;
3866   kmp_team_t *hot_team = root->r.r_hot_team;
3867   int n = hot_team->t.t_nproc;
3868   int i;
3869 
3870   KMP_DEBUG_ASSERT(!root->r.r_active);
3871 
3872   root->r.r_root_team = NULL;
3873   root->r.r_hot_team = NULL;
3874   // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3875   // before call to __kmp_free_team().
3876   __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3877 #if KMP_NESTED_HOT_TEAMS
3878   if (__kmp_hot_teams_max_level >
3879       0) { // need to free nested hot teams and their threads if any
3880     for (i = 0; i < hot_team->t.t_nproc; ++i) {
3881       kmp_info_t *th = hot_team->t.t_threads[i];
3882       if (__kmp_hot_teams_max_level > 1) {
3883         n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3884       }
3885       if (th->th.th_hot_teams) {
3886         __kmp_free(th->th.th_hot_teams);
3887         th->th.th_hot_teams = NULL;
3888       }
3889     }
3890   }
3891 #endif
3892   __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3893 
3894   // Before we can reap the thread, we need to make certain that all other
3895   // threads in the teams that had this root as ancestor have stopped trying to
3896   // steal tasks.
3897   if (__kmp_tasking_mode != tskm_immediate_exec) {
3898     __kmp_wait_to_unref_task_teams();
3899   }
3900 
3901 #if KMP_OS_WINDOWS
3902   /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3903   KA_TRACE(
3904       10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3905            "\n",
3906            (LPVOID) & (root->r.r_uber_thread->th),
3907            root->r.r_uber_thread->th.th_info.ds.ds_thread));
3908   __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3909 #endif /* KMP_OS_WINDOWS */
3910 
3911 #if OMPT_SUPPORT
3912   ompt_data_t *task_data;
3913   ompt_data_t *parallel_data;
3914   __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
3915   if (ompt_enabled.ompt_callback_implicit_task) {
3916     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3917         ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
3918   }
3919   if (ompt_enabled.ompt_callback_thread_end) {
3920     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3921         &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3922   }
3923 #endif
3924 
3925   TCW_4(__kmp_nth,
3926         __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3927   i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
3928   KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
3929                  " to %d\n",
3930                  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
3931                  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
3932   if (i == 1) {
3933     // need to free contention group structure
3934     KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
3935                      root->r.r_uber_thread->th.th_cg_roots->cg_root);
3936     KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
3937     __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
3938     root->r.r_uber_thread->th.th_cg_roots = NULL;
3939   }
3940   __kmp_reap_thread(root->r.r_uber_thread, 1);
3941 
3942   // We cannot put the root thread into __kmp_thread_pool, so we have to reap
3943   // it instead of freeing it.
3944   root->r.r_uber_thread = NULL;
3945   /* mark root as no longer in use */
3946   root->r.r_begin = FALSE;
3947 
3948   return n;
3949 }
3950 
3951 void __kmp_unregister_root_current_thread(int gtid) {
3952   KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
3953   /* this lock should be ok, since unregister_root_current_thread is never
3954      called during an abort, only during a normal close. furthermore, if you
3955      have the forkjoin lock, you should never try to get the initz lock */
3956   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3957   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
3958     KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
3959                   "exiting T#%d\n",
3960                   gtid));
3961     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3962     return;
3963   }
3964   kmp_root_t *root = __kmp_root[gtid];
3965 
3966   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3967   KMP_ASSERT(KMP_UBER_GTID(gtid));
3968   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3969   KMP_ASSERT(root->r.r_active == FALSE);
3970 
3971   KMP_MB();
3972 
3973   kmp_info_t *thread = __kmp_threads[gtid];
3974   kmp_team_t *team = thread->th.th_team;
3975   kmp_task_team_t *task_team = thread->th.th_task_team;
3976 
3977   // we need to wait for the proxy tasks before finishing the thread
3978   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
3979 #if OMPT_SUPPORT
3980     // the runtime is shutting down so we won't report any events
3981     thread->th.ompt_thread_info.state = ompt_state_undefined;
3982 #endif
3983     __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
3984   }
3985 
3986   __kmp_reset_root(gtid, root);
3987 
3988   KMP_MB();
3989   KC_TRACE(10,
3990            ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
3991 
3992   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3993 }
3994 
3995 #if KMP_OS_WINDOWS
3996 /* __kmp_forkjoin_lock must be already held
3997    Unregisters a root thread that is not the current thread.  Returns the number
3998    of __kmp_threads entries freed as a result. */
3999 static int __kmp_unregister_root_other_thread(int gtid) {
4000   kmp_root_t *root = __kmp_root[gtid];
4001   int r;
4002 
4003   KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4004   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4005   KMP_ASSERT(KMP_UBER_GTID(gtid));
4006   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4007   KMP_ASSERT(root->r.r_active == FALSE);
4008 
4009   r = __kmp_reset_root(gtid, root);
4010   KC_TRACE(10,
4011            ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4012   return r;
4013 }
4014 #endif
4015 
4016 #if KMP_DEBUG
4017 void __kmp_task_info() {
4018 
4019   kmp_int32 gtid = __kmp_entry_gtid();
4020   kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4021   kmp_info_t *this_thr = __kmp_threads[gtid];
4022   kmp_team_t *steam = this_thr->th.th_serial_team;
4023   kmp_team_t *team = this_thr->th.th_team;
4024 
4025   __kmp_printf(
4026       "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4027       "ptask=%p\n",
4028       gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4029       team->t.t_implicit_task_taskdata[tid].td_parent);
4030 }
4031 #endif // KMP_DEBUG
4032 
4033 /* TODO optimize with one big memclr, take out what isn't needed, split
4034    responsibility to workers as much as possible, and delay initialization of
4035    features as much as possible  */
4036 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4037                                   int tid, int gtid) {
4038   /* this_thr->th.th_info.ds.ds_gtid is setup in
4039      kmp_allocate_thread/create_worker.
4040      this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4041   kmp_info_t *master = team->t.t_threads[0];
4042   KMP_DEBUG_ASSERT(this_thr != NULL);
4043   KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4044   KMP_DEBUG_ASSERT(team);
4045   KMP_DEBUG_ASSERT(team->t.t_threads);
4046   KMP_DEBUG_ASSERT(team->t.t_dispatch);
4047   KMP_DEBUG_ASSERT(master);
4048   KMP_DEBUG_ASSERT(master->th.th_root);
4049 
4050   KMP_MB();
4051 
4052   TCW_SYNC_PTR(this_thr->th.th_team, team);
4053 
4054   this_thr->th.th_info.ds.ds_tid = tid;
4055   this_thr->th.th_set_nproc = 0;
4056   if (__kmp_tasking_mode != tskm_immediate_exec)
4057     // When tasking is possible, threads are not safe to reap until they are
4058     // done tasking; this will be set when tasking code is exited in wait
4059     this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4060   else // no tasking --> always safe to reap
4061     this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4062   this_thr->th.th_set_proc_bind = proc_bind_default;
4063 #if KMP_AFFINITY_SUPPORTED
4064   this_thr->th.th_new_place = this_thr->th.th_current_place;
4065 #endif
4066   this_thr->th.th_root = master->th.th_root;
4067 
4068   /* setup the thread's cache of the team structure */
4069   this_thr->th.th_team_nproc = team->t.t_nproc;
4070   this_thr->th.th_team_master = master;
4071   this_thr->th.th_team_serialized = team->t.t_serialized;
4072   TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4073 
4074   KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4075 
4076   KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4077                 tid, gtid, this_thr, this_thr->th.th_current_task));
4078 
4079   __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4080                            team, tid, TRUE);
4081 
4082   KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4083                 tid, gtid, this_thr, this_thr->th.th_current_task));
4084   // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4085   // __kmp_initialize_team()?
4086 
4087   /* TODO no worksharing in speculative threads */
4088   this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4089 
4090   this_thr->th.th_local.this_construct = 0;
4091 
4092   if (!this_thr->th.th_pri_common) {
4093     this_thr->th.th_pri_common =
4094         (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4095     if (__kmp_storage_map) {
4096       __kmp_print_storage_map_gtid(
4097           gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4098           sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4099     }
4100     this_thr->th.th_pri_head = NULL;
4101   }
4102 
4103   if (this_thr != master && // Master's CG root is initialized elsewhere
4104       this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4105     // Make new thread's CG root same as master's
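    // Background note: a contention group (CG) is, roughly, the set of threads
    // that share a thread-limit ICV; each initial thread (including each team
    // master of a teams construct) roots its own group. The cg_root nodes below
    // reference-count the group's threads so a node can be freed when the last
    // thread leaves it.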
4106     KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4107     kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4108     if (tmp) {
4109       // worker changes CG, need to check if old CG should be freed
4110       int i = tmp->cg_nthreads--;
4111       KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4112                      " on node %p of thread %p to %d\n",
4113                      this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4114       if (i == 1) {
4115         __kmp_free(tmp); // last thread left CG --> free it
4116       }
4117     }
4118     this_thr->th.th_cg_roots = master->th.th_cg_roots;
4119     // Increment new thread's CG root's counter to add the new thread
4120     this_thr->th.th_cg_roots->cg_nthreads++;
4121     KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4122                    " node %p of thread %p to %d\n",
4123                    this_thr, this_thr->th.th_cg_roots,
4124                    this_thr->th.th_cg_roots->cg_root,
4125                    this_thr->th.th_cg_roots->cg_nthreads));
4126     this_thr->th.th_current_task->td_icvs.thread_limit =
4127         this_thr->th.th_cg_roots->cg_thread_limit;
4128   }
4129 
4130   /* Initialize dynamic dispatch */
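  /* Note: __kmp_dispatch_num_buffers private-info buffers are kept per thread
     so that consecutive dynamically scheduled loops can be in flight without
     every thread having to be finished with the previous buffer first; a single
     buffer suffices when the team can never have more than one thread. */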
4131   {
4132     volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4133     // Use team max_nproc since this will never change for the team.
4134     size_t disp_size =
4135         sizeof(dispatch_private_info_t) *
4136         (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4137     KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4138                   team->t.t_max_nproc));
4139     KMP_ASSERT(dispatch);
4140     KMP_DEBUG_ASSERT(team->t.t_dispatch);
4141     KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4142 
4143     dispatch->th_disp_index = 0;
4144     dispatch->th_doacross_buf_idx = 0;
4145     if (!dispatch->th_disp_buffer) {
4146       dispatch->th_disp_buffer =
4147           (dispatch_private_info_t *)__kmp_allocate(disp_size);
4148 
4149       if (__kmp_storage_map) {
4150         __kmp_print_storage_map_gtid(
4151             gtid, &dispatch->th_disp_buffer[0],
4152             &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4153                                           ? 1
4154                                           : __kmp_dispatch_num_buffers],
4155             disp_size, "th_%d.th_dispatch.th_disp_buffer "
4156                        "(team_%d.t_dispatch[%d].th_disp_buffer)",
4157             gtid, team->t.t_id, gtid);
4158       }
4159     } else {
4160       memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4161     }
4162 
4163     dispatch->th_dispatch_pr_current = 0;
4164     dispatch->th_dispatch_sh_current = 0;
4165 
4166     dispatch->th_deo_fcn = 0; /* ORDERED     */
4167     dispatch->th_dxo_fcn = 0; /* END ORDERED */
4168   }
4169 
4170   this_thr->th.th_next_pool = NULL;
4171 
4172   if (!this_thr->th.th_task_state_memo_stack) {
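    // The task-state memo stack saves/restores th_task_state across nested
    // parallel regions (see the nested hot-team handling in __kmp_allocate_team
    // below); start with a small stack of 4 entries.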
4173     size_t i;
4174     this_thr->th.th_task_state_memo_stack =
4175         (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4176     this_thr->th.th_task_state_top = 0;
4177     this_thr->th.th_task_state_stack_sz = 4;
4178     for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4179          ++i) // zero init the stack
4180       this_thr->th.th_task_state_memo_stack[i] = 0;
4181   }
4182 
4183   KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4184   KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4185 
4186   KMP_MB();
4187 }
4188 
4189 /* Allocate a new thread for the requesting team. This is only called from
4190    within a forkjoin critical section. We first try to get an available
4191    thread from the thread pool; if none is available, we fork a new one,
4192    assuming a new thread can be created. This should be assured, as the
4193    caller is expected to check capacity first. */
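/* Illustrative only: __kmp_allocate_thread is typically reached when a
   construct such as
       #pragma omp parallel num_threads(8)
   requests more workers than are currently alive or parked in the pool. */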
4194 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4195                                   int new_tid) {
4196   kmp_team_t *serial_team;
4197   kmp_info_t *new_thr;
4198   int new_gtid;
4199 
4200   KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4201   KMP_DEBUG_ASSERT(root && team);
4202 #if !KMP_NESTED_HOT_TEAMS
4203   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4204 #endif
4205   KMP_MB();
4206 
4207   /* first, try to get one from the thread pool */
4208   if (__kmp_thread_pool) {
4209     new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4210     __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4211     if (new_thr == __kmp_thread_pool_insert_pt) {
4212       __kmp_thread_pool_insert_pt = NULL;
4213     }
4214     TCW_4(new_thr->th.th_in_pool, FALSE);
4215     __kmp_suspend_initialize_thread(new_thr);
4216     __kmp_lock_suspend_mx(new_thr);
4217     if (new_thr->th.th_active_in_pool == TRUE) {
4218       KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4219       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4220       new_thr->th.th_active_in_pool = FALSE;
4221     }
4222     __kmp_unlock_suspend_mx(new_thr);
4223 
4224     KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4225                   __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4226     KMP_ASSERT(!new_thr->th.th_team);
4227     KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4228 
4229     /* setup the thread structure */
4230     __kmp_initialize_info(new_thr, team, new_tid,
4231                           new_thr->th.th_info.ds.ds_gtid);
4232     KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4233 
4234     TCW_4(__kmp_nth, __kmp_nth + 1);
4235 
4236     new_thr->th.th_task_state = 0;
4237     new_thr->th.th_task_state_top = 0;
4238     new_thr->th.th_task_state_stack_sz = 4;
4239 
4240 #ifdef KMP_ADJUST_BLOCKTIME
4241     /* Adjust blocktime back to zero if necessary */
4242     /* Middle initialization might not have occurred yet */
4243     if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4244       if (__kmp_nth > __kmp_avail_proc) {
4245         __kmp_zero_bt = TRUE;
4246       }
4247     }
4248 #endif /* KMP_ADJUST_BLOCKTIME */
4249 
4250 #if KMP_DEBUG
4251     // If the thread entered the pool via __kmp_free_thread, wait_flag should
4252     // not be KMP_BARRIER_PARENT_FLAG.
4253     int b;
4254     kmp_balign_t *balign = new_thr->th.th_bar;
4255     for (b = 0; b < bs_last_barrier; ++b)
4256       KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4257 #endif
4258 
4259     KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4260                   __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4261 
4262     KMP_MB();
4263     return new_thr;
4264   }
4265 
4266   /* no, we'll fork a new one */
4267   KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4268   KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4269 
4270 #if KMP_USE_MONITOR
4271   // If this is the first worker thread the RTL is creating, then also
4272   // launch the monitor thread.  We try to do this as early as possible.
4273   if (!TCR_4(__kmp_init_monitor)) {
4274     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4275     if (!TCR_4(__kmp_init_monitor)) {
4276       KF_TRACE(10, ("before __kmp_create_monitor\n"));
4277       TCW_4(__kmp_init_monitor, 1);
4278       __kmp_create_monitor(&__kmp_monitor);
4279       KF_TRACE(10, ("after __kmp_create_monitor\n"));
4280 #if KMP_OS_WINDOWS
4281       // AC: wait until the monitor has started. This is a fix for CQ232808.
4282       // If the library is loaded/unloaded in a loop with small (parallel)
4283       // work in between, there is a high probability that the monitor thread
4284       // starts only after the library has shut down. At shutdown it is too
4285       // late to cope with the problem, because when the master is in DllMain
4286       // (process detach) the monitor has no chance to start (it is blocked),
4287       // and the master has no way to inform the monitor that the library is
4288       // gone, because all the memory the monitor could access is about to be
4289       // released/reset.
4290       while (TCR_4(__kmp_init_monitor) < 2) {
4291         KMP_YIELD(TRUE);
4292       }
4293       KF_TRACE(10, ("after monitor thread has started\n"));
4294 #endif
4295     }
4296     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4297   }
4298 #endif
4299 
4300   KMP_MB();
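  // Find the lowest free gtid slot; the scan starts at 1 because slot 0 belongs
  // to the first registered (initial) root thread.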
4301   for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
4302     KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4303   }
4304 
4305   /* allocate space for it. */
4306   new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4307 
4308   TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4309 
4310 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4311   // Suppress race-condition detection on synchronization flags in debug mode;
4312   // this helps to analyze library internals by eliminating false positives.
4313   __itt_suppress_mark_range(
4314       __itt_suppress_range, __itt_suppress_threading_errors,
4315       &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4316   __itt_suppress_mark_range(
4317       __itt_suppress_range, __itt_suppress_threading_errors,
4318       &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4319 #if KMP_OS_WINDOWS
4320   __itt_suppress_mark_range(
4321       __itt_suppress_range, __itt_suppress_threading_errors,
4322       &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4323 #else
4324   __itt_suppress_mark_range(__itt_suppress_range,
4325                             __itt_suppress_threading_errors,
4326                             &new_thr->th.th_suspend_init_count,
4327                             sizeof(new_thr->th.th_suspend_init_count));
4328 #endif
4329   // TODO: check if we need to also suppress b_arrived flags
4330   __itt_suppress_mark_range(__itt_suppress_range,
4331                             __itt_suppress_threading_errors,
4332                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4333                             sizeof(new_thr->th.th_bar[0].bb.b_go));
4334   __itt_suppress_mark_range(__itt_suppress_range,
4335                             __itt_suppress_threading_errors,
4336                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4337                             sizeof(new_thr->th.th_bar[1].bb.b_go));
4338   __itt_suppress_mark_range(__itt_suppress_range,
4339                             __itt_suppress_threading_errors,
4340                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4341                             sizeof(new_thr->th.th_bar[2].bb.b_go));
4342 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4343   if (__kmp_storage_map) {
4344     __kmp_print_thread_storage_map(new_thr, new_gtid);
4345   }
4346 
4347   // add the reserve serialized team, initialized from the team's master thread
4348   {
4349     kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4350     KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4351     new_thr->th.th_serial_team = serial_team =
4352         (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4353 #if OMPT_SUPPORT
4354                                           ompt_data_none, // root parallel id
4355 #endif
4356                                           proc_bind_default, &r_icvs,
4357                                           0 USE_NESTED_HOT_ARG(NULL));
4358   }
4359   KMP_ASSERT(serial_team);
4360   serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not
4361   // for execution (it is unused for now).
4362   serial_team->t.t_threads[0] = new_thr;
4363   KF_TRACE(10,
4364            ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4365             new_thr));
4366 
4367   /* setup the thread structures */
4368   __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4369 
4370 #if USE_FAST_MEMORY
4371   __kmp_initialize_fast_memory(new_thr);
4372 #endif /* USE_FAST_MEMORY */
4373 
4374 #if KMP_USE_BGET
4375   KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4376   __kmp_initialize_bget(new_thr);
4377 #endif
4378 
4379   __kmp_init_random(new_thr); // Initialize random number generator
4380 
4381   /* Initialize these only once when thread is grabbed for a team allocation */
4382   KA_TRACE(20,
4383            ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4384             __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4385 
4386   int b;
4387   kmp_balign_t *balign = new_thr->th.th_bar;
4388   for (b = 0; b < bs_last_barrier; ++b) {
4389     balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4390     balign[b].bb.team = NULL;
4391     balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4392     balign[b].bb.use_oncore_barrier = 0;
4393   }
4394 
4395   new_thr->th.th_spin_here = FALSE;
4396   new_thr->th.th_next_waiting = 0;
4397 #if KMP_OS_UNIX
4398   new_thr->th.th_blocking = false;
4399 #endif
4400 
4401 #if KMP_AFFINITY_SUPPORTED
4402   new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4403   new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4404   new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4405   new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4406 #endif
4407   new_thr->th.th_def_allocator = __kmp_def_allocator;
4408   new_thr->th.th_prev_level = 0;
4409   new_thr->th.th_prev_num_threads = 1;
4410 
4411   TCW_4(new_thr->th.th_in_pool, FALSE);
4412   new_thr->th.th_active_in_pool = FALSE;
4413   TCW_4(new_thr->th.th_active, TRUE);
4414 
4415   /* adjust the global counters */
4416   __kmp_all_nth++;
4417   __kmp_nth++;
4418 
4419   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4420   // numbers of procs, and method #2 (keyed API call) for higher numbers.
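  // (Presumably the stack-address search of method #1 scales poorly as the
  // thread count grows, hence the switch to the keyed/TLS lookup once
  // __kmp_all_nth reaches __kmp_tls_gtid_min.)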
4421   if (__kmp_adjust_gtid_mode) {
4422     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4423       if (TCR_4(__kmp_gtid_mode) != 2) {
4424         TCW_4(__kmp_gtid_mode, 2);
4425       }
4426     } else {
4427       if (TCR_4(__kmp_gtid_mode) != 1) {
4428         TCW_4(__kmp_gtid_mode, 1);
4429       }
4430     }
4431   }
4432 
4433 #ifdef KMP_ADJUST_BLOCKTIME
4434   /* Adjust blocktime back to zero if necessary       */
4435   /* Middle initialization might not have occurred yet */
4436   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4437     if (__kmp_nth > __kmp_avail_proc) {
4438       __kmp_zero_bt = TRUE;
4439     }
4440   }
4441 #endif /* KMP_ADJUST_BLOCKTIME */
4442 
4443   /* actually fork it and create the new worker thread */
4444   KF_TRACE(
4445       10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4446   __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4447   KF_TRACE(10,
4448            ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4449 
4450   KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4451                 new_gtid));
4452   KMP_MB();
4453   return new_thr;
4454 }
4455 
4456 /* Reinitialize team for reuse.
4457    The hot team code calls this routine at every fork barrier, so the EPCC
4458    barrier tests are extremely sensitive to changes in it, especially writes
4459    to the team struct, which cause a cache invalidation in all threads.
4460    IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4461 static void __kmp_reinitialize_team(kmp_team_t *team,
4462                                     kmp_internal_control_t *new_icvs,
4463                                     ident_t *loc) {
4464   KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4465                 team->t.t_threads[0], team));
4466   KMP_DEBUG_ASSERT(team && new_icvs);
4467   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4468   KMP_CHECK_UPDATE(team->t.t_ident, loc);
4469 
4470   KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4471   // Copy ICVs to the master thread's implicit taskdata
4472   __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4473   copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4474 
4475   KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4476                 team->t.t_threads[0], team));
4477 }
4478 
4479 /* Initialize the team data structure.
4480    This assumes the t_threads and t_max_nproc are already set.
4481    Also, we don't touch the arguments */
4482 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4483                                   kmp_internal_control_t *new_icvs,
4484                                   ident_t *loc) {
4485   KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4486 
4487   /* verify */
4488   KMP_DEBUG_ASSERT(team);
4489   KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4490   KMP_DEBUG_ASSERT(team->t.t_threads);
4491   KMP_MB();
4492 
4493   team->t.t_master_tid = 0; /* not needed */
4494   /* team->t.t_master_bar;        not needed */
4495   team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4496   team->t.t_nproc = new_nproc;
4497 
4498   /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
4499   team->t.t_next_pool = NULL;
4500   /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4501    * up hot team */
4502 
4503   TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4504   team->t.t_invoke = NULL; /* not needed */
4505 
4506   // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
4507   team->t.t_sched.sched = new_icvs->sched.sched;
4508 
4509 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4510   team->t.t_fp_control_saved = FALSE; /* not needed */
4511   team->t.t_x87_fpu_control_word = 0; /* not needed */
4512   team->t.t_mxcsr = 0; /* not needed */
4513 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4514 
4515   team->t.t_construct = 0;
4516 
4517   team->t.t_ordered.dt.t_value = 0;
4518   team->t.t_master_active = FALSE;
4519 
4520 #ifdef KMP_DEBUG
4521   team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4522 #endif
4523 #if KMP_OS_WINDOWS
4524   team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4525 #endif
4526 
4527   team->t.t_control_stack_top = NULL;
4528 
4529   __kmp_reinitialize_team(team, new_icvs, loc);
4530 
4531   KMP_MB();
4532   KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4533 }
4534 
4535 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
4536 /* Sets the full mask for the thread, saving the old mask into old_mask; makes no changes to affinity structures. */
4537 static void
4538 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4539   if (KMP_AFFINITY_CAPABLE()) {
4540     int status;
4541     if (old_mask != NULL) {
4542       status = __kmp_get_system_affinity(old_mask, TRUE);
4543       int error = errno;
4544       if (status != 0) {
4545         __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4546                     __kmp_msg_null);
4547       }
4548     }
4549     __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4550   }
4551 }
4552 #endif
4553 
4554 #if KMP_AFFINITY_SUPPORTED
4555 
4556 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4557 // It calculates the worker + master thread's partition based upon the parent
4558 // thread's partition, and binds each worker thread to a place in its partition.
4559 // The master thread's partition should already include its current binding.
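// Illustrative example (assuming 8 places and a 4-thread team): under
// proc_bind(spread) each thread gets a sub-partition of roughly 2 places and is
// bound to the first place of its sub-partition, while under proc_bind(close)
// the workers keep the master's full partition and are bound to consecutive
// places following the master's place.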
4560 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4561   // Copy the master thread's place partition to the team struct
4562   kmp_info_t *master_th = team->t.t_threads[0];
4563   KMP_DEBUG_ASSERT(master_th != NULL);
4564   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4565   int first_place = master_th->th.th_first_place;
4566   int last_place = master_th->th.th_last_place;
4567   int masters_place = master_th->th.th_current_place;
4568   team->t.t_first_place = first_place;
4569   team->t.t_last_place = last_place;
4570 
4571   KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4572                 "bound to place %d partition = [%d,%d]\n",
4573                 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4574                 team->t.t_id, masters_place, first_place, last_place));
4575 
4576   switch (proc_bind) {
4577 
4578   case proc_bind_default:
4579     // serial teams might have the proc_bind policy set to proc_bind_default. It
4580     // doesn't matter, as we don't rebind master thread for any proc_bind policy
4581     KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4582     break;
4583 
4584   case proc_bind_master: {
4585     int f;
4586     int n_th = team->t.t_nproc;
4587     for (f = 1; f < n_th; f++) {
4588       kmp_info_t *th = team->t.t_threads[f];
4589       KMP_DEBUG_ASSERT(th != NULL);
4590       th->th.th_first_place = first_place;
4591       th->th.th_last_place = last_place;
4592       th->th.th_new_place = masters_place;
4593       if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4594           team->t.t_display_affinity != 1) {
4595         team->t.t_display_affinity = 1;
4596       }
4597 
4598       KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
4599                      "partition = [%d,%d]\n",
4600                      __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4601                      f, masters_place, first_place, last_place));
4602     }
4603   } break;
4604 
4605   case proc_bind_close: {
4606     int f;
4607     int n_th = team->t.t_nproc;
4608     int n_places;
4609     if (first_place <= last_place) {
4610       n_places = last_place - first_place + 1;
4611     } else {
4612       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4613     }
4614     if (n_th <= n_places) {
4615       int place = masters_place;
4616       for (f = 1; f < n_th; f++) {
4617         kmp_info_t *th = team->t.t_threads[f];
4618         KMP_DEBUG_ASSERT(th != NULL);
4619 
4620         if (place == last_place) {
4621           place = first_place;
4622         } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4623           place = 0;
4624         } else {
4625           place++;
4626         }
4627         th->th.th_first_place = first_place;
4628         th->th.th_last_place = last_place;
4629         th->th.th_new_place = place;
4630         if (__kmp_display_affinity && place != th->th.th_current_place &&
4631             team->t.t_display_affinity != 1) {
4632           team->t.t_display_affinity = 1;
4633         }
4634 
4635         KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4636                        "partition = [%d,%d]\n",
4637                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4638                        team->t.t_id, f, place, first_place, last_place));
4639       }
4640     } else {
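      // More threads than places: give every place S threads, then spread the
      // rem leftover threads by assigning one extra thread to every gap-th
      // place.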
4641       int S, rem, gap, s_count;
4642       S = n_th / n_places;
4643       s_count = 0;
4644       rem = n_th - (S * n_places);
4645       gap = rem > 0 ? n_places / rem : n_places;
4646       int place = masters_place;
4647       int gap_ct = gap;
4648       for (f = 0; f < n_th; f++) {
4649         kmp_info_t *th = team->t.t_threads[f];
4650         KMP_DEBUG_ASSERT(th != NULL);
4651 
4652         th->th.th_first_place = first_place;
4653         th->th.th_last_place = last_place;
4654         th->th.th_new_place = place;
4655         if (__kmp_display_affinity && place != th->th.th_current_place &&
4656             team->t.t_display_affinity != 1) {
4657           team->t.t_display_affinity = 1;
4658         }
4659         s_count++;
4660 
4661         if ((s_count == S) && rem && (gap_ct == gap)) {
4662           // do nothing, add an extra thread to place on next iteration
4663         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4664           // we added an extra thread to this place; move to next place
4665           if (place == last_place) {
4666             place = first_place;
4667           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4668             place = 0;
4669           } else {
4670             place++;
4671           }
4672           s_count = 0;
4673           gap_ct = 1;
4674           rem--;
4675         } else if (s_count == S) { // place full; don't add extra
4676           if (place == last_place) {
4677             place = first_place;
4678           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4679             place = 0;
4680           } else {
4681             place++;
4682           }
4683           gap_ct++;
4684           s_count = 0;
4685         }
4686 
4687         KA_TRACE(100,
4688                  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4689                   "partition = [%d,%d]\n",
4690                   __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4691                   th->th.th_new_place, first_place, last_place));
4692       }
4693       KMP_DEBUG_ASSERT(place == masters_place);
4694     }
4695   } break;
4696 
4697   case proc_bind_spread: {
4698     int f;
4699     int n_th = team->t.t_nproc;
4700     int n_places;
4701     int thidx;
4702     if (first_place <= last_place) {
4703       n_places = last_place - first_place + 1;
4704     } else {
4705       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4706     }
4707     if (n_th <= n_places) {
4708       int place = -1;
4709 
4710       if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4711         int S = n_places / n_th;
4712         int s_count, rem, gap, gap_ct;
4713 
4714         place = masters_place;
4715         rem = n_places - n_th * S;
4716         gap = rem ? n_th / rem : 1;
4717         gap_ct = gap;
4718         thidx = n_th;
4719         if (update_master_only == 1)
4720           thidx = 1;
4721         for (f = 0; f < thidx; f++) {
4722           kmp_info_t *th = team->t.t_threads[f];
4723           KMP_DEBUG_ASSERT(th != NULL);
4724 
4725           th->th.th_first_place = place;
4726           th->th.th_new_place = place;
4727           if (__kmp_display_affinity && place != th->th.th_current_place &&
4728               team->t.t_display_affinity != 1) {
4729             team->t.t_display_affinity = 1;
4730           }
4731           s_count = 1;
4732           while (s_count < S) {
4733             if (place == last_place) {
4734               place = first_place;
4735             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4736               place = 0;
4737             } else {
4738               place++;
4739             }
4740             s_count++;
4741           }
4742           if (rem && (gap_ct == gap)) {
4743             if (place == last_place) {
4744               place = first_place;
4745             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4746               place = 0;
4747             } else {
4748               place++;
4749             }
4750             rem--;
4751             gap_ct = 0;
4752           }
4753           th->th.th_last_place = place;
4754           gap_ct++;
4755 
4756           if (place == last_place) {
4757             place = first_place;
4758           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4759             place = 0;
4760           } else {
4761             place++;
4762           }
4763 
4764           KA_TRACE(100,
4765                    ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4766                     "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4767                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4768                     f, th->th.th_new_place, th->th.th_first_place,
4769                     th->th.th_last_place, __kmp_affinity_num_masks));
4770         }
4771       } else {
4772         /* The space of available computation places is uniform, so create
4773            T partitions of roughly P/T places each and put each thread into
4774            the first place of its partition. */
4775         double current = static_cast<double>(masters_place);
4776         double spacing =
4777             (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
4778         int first, last;
4779         kmp_info_t *th;
4780 
4781         thidx = n_th + 1;
4782         if (update_master_only == 1)
4783           thidx = 1;
4784         for (f = 0; f < thidx; f++) {
4785           first = static_cast<int>(current);
4786           last = static_cast<int>(current + spacing) - 1;
4787           KMP_DEBUG_ASSERT(last >= first);
4788           if (first >= n_places) {
4789             if (masters_place) {
4790               first -= n_places;
4791               last -= n_places;
4792               if (first == (masters_place + 1)) {
4793                 KMP_DEBUG_ASSERT(f == n_th);
4794                 first--;
4795               }
4796               if (last == masters_place) {
4797                 KMP_DEBUG_ASSERT(f == (n_th - 1));
4798                 last--;
4799               }
4800             } else {
4801               KMP_DEBUG_ASSERT(f == n_th);
4802               first = 0;
4803               last = 0;
4804             }
4805           }
4806           if (last >= n_places) {
4807             last = (n_places - 1);
4808           }
4809           place = first;
4810           current += spacing;
4811           if (f < n_th) {
4812             KMP_DEBUG_ASSERT(0 <= first);
4813             KMP_DEBUG_ASSERT(n_places > first);
4814             KMP_DEBUG_ASSERT(0 <= last);
4815             KMP_DEBUG_ASSERT(n_places > last);
4816             KMP_DEBUG_ASSERT(last_place >= first_place);
4817             th = team->t.t_threads[f];
4818             KMP_DEBUG_ASSERT(th);
4819             th->th.th_first_place = first;
4820             th->th.th_new_place = place;
4821             th->th.th_last_place = last;
4822             if (__kmp_display_affinity && place != th->th.th_current_place &&
4823                 team->t.t_display_affinity != 1) {
4824               team->t.t_display_affinity = 1;
4825             }
4826             KA_TRACE(100,
4827                      ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4828                       "partition = [%d,%d], spacing = %.4f\n",
4829                       __kmp_gtid_from_thread(team->t.t_threads[f]),
4830                       team->t.t_id, f, th->th.th_new_place,
4831                       th->th.th_first_place, th->th.th_last_place, spacing));
4832           }
4833         }
4834       }
4835       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4836     } else {
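      // Oversubscribed spread: same even-distribution scheme as proc_bind_close
      // above, except each thread's partition is narrowed to the single place
      // it is bound to.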
4837       int S, rem, gap, s_count;
4838       S = n_th / n_places;
4839       s_count = 0;
4840       rem = n_th - (S * n_places);
4841       gap = rem > 0 ? n_places / rem : n_places;
4842       int place = masters_place;
4843       int gap_ct = gap;
4844       thidx = n_th;
4845       if (update_master_only == 1)
4846         thidx = 1;
4847       for (f = 0; f < thidx; f++) {
4848         kmp_info_t *th = team->t.t_threads[f];
4849         KMP_DEBUG_ASSERT(th != NULL);
4850 
4851         th->th.th_first_place = place;
4852         th->th.th_last_place = place;
4853         th->th.th_new_place = place;
4854         if (__kmp_display_affinity && place != th->th.th_current_place &&
4855             team->t.t_display_affinity != 1) {
4856           team->t.t_display_affinity = 1;
4857         }
4858         s_count++;
4859 
4860         if ((s_count == S) && rem && (gap_ct == gap)) {
4861           // do nothing, add an extra thread to place on next iteration
4862         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4863           // we added an extra thread to this place; move on to next place
4864           if (place == last_place) {
4865             place = first_place;
4866           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4867             place = 0;
4868           } else {
4869             place++;
4870           }
4871           s_count = 0;
4872           gap_ct = 1;
4873           rem--;
4874         } else if (s_count == S) { // place is full; don't add extra thread
4875           if (place == last_place) {
4876             place = first_place;
4877           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4878             place = 0;
4879           } else {
4880             place++;
4881           }
4882           gap_ct++;
4883           s_count = 0;
4884         }
4885 
4886         KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4887                        "partition = [%d,%d]\n",
4888                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4889                        team->t.t_id, f, th->th.th_new_place,
4890                        th->th.th_first_place, th->th.th_last_place));
4891       }
4892       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4893     }
4894   } break;
4895 
4896   default:
4897     break;
4898   }
4899 
4900   KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4901 }
4902 
4903 #endif // KMP_AFFINITY_SUPPORTED
4904 
4905 /* allocate a new team data structure to use.  take one off of the free pool if
4906    available */
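/* The strategy, in order of preference:
     1) reuse the root's (or nested level's) hot team,
     2) take a suitable team from __kmp_team_pool,
     3) allocate and initialize a brand new team. */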
4907 kmp_team_t *
4908 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4909 #if OMPT_SUPPORT
4910                     ompt_data_t ompt_parallel_data,
4911 #endif
4912                     kmp_proc_bind_t new_proc_bind,
4913                     kmp_internal_control_t *new_icvs,
4914                     int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4915   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4916   int f;
4917   kmp_team_t *team;
4918   int use_hot_team = !root->r.r_active;
4919   int level = 0;
4920 
4921   KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4922   KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4923   KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4924   KMP_MB();
4925 
4926 #if KMP_NESTED_HOT_TEAMS
4927   kmp_hot_team_ptr_t *hot_teams;
4928   if (master) {
4929     team = master->th.th_team;
4930     level = team->t.t_active_level;
4931     if (master->th.th_teams_microtask) { // in teams construct?
4932       if (master->th.th_teams_size.nteams > 1 &&
4933           ( // #teams > 1
4934               team->t.t_pkfn ==
4935                   (microtask_t)__kmp_teams_master || // inner fork of the teams
4936               master->th.th_teams_level <
4937                   team->t.t_level)) { // or nested parallel inside the teams
4938         ++level; // not increment if #teams==1, or for outer fork of the teams;
4939         // increment otherwise
4940       }
4941     }
4942     hot_teams = master->th.th_hot_teams;
4943     if (level < __kmp_hot_teams_max_level && hot_teams &&
4944         hot_teams[level].hot_team) {
4945       // hot team has already been allocated for given level
4946       use_hot_team = 1;
4947     } else {
4948       use_hot_team = 0;
4949     }
4950   } else {
4951     // check we won't access uninitialized hot_teams, just in case
4952     KMP_DEBUG_ASSERT(new_nproc == 1);
4953   }
4954 #endif
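  // A "hot" team is the team kept alive at a root (or nested hot-team level)
  // between parallel regions so its threads and data structures can be reused.
  // Illustrative user code (not part of this file):
  //   #pragma omp parallel num_threads(4)
  //   { /* work */ } // first region builds the hot team
  //   #pragma omp parallel num_threads(4)
  //   { /* work */ } // second region reuses it via the fast path below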
4955   // Optimization to use a "hot" team
4956   if (use_hot_team && new_nproc > 1) {
4957     KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
4958 #if KMP_NESTED_HOT_TEAMS
4959     team = hot_teams[level].hot_team;
4960 #else
4961     team = root->r.r_hot_team;
4962 #endif
4963 #if KMP_DEBUG
4964     if (__kmp_tasking_mode != tskm_immediate_exec) {
4965       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
4966                     "task_team[1] = %p before reinit\n",
4967                     team->t.t_task_team[0], team->t.t_task_team[1]));
4968     }
4969 #endif
4970 
4971     // Has the number of threads changed?
4972     /* Let's assume the most common case is that the number of threads is
4973        unchanged, and put that case first. */
4974     if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4975       KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
4976       // This case can mean that omp_set_num_threads() was called and the hot
4977       // team size was already reduced, so we check the special flag
4978       if (team->t.t_size_changed == -1) {
4979         team->t.t_size_changed = 1;
4980       } else {
4981         KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
4982       }
4983 
4984       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4985       kmp_r_sched_t new_sched = new_icvs->sched;
4986       // set master's schedule as new run-time schedule
4987       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
4988 
4989       __kmp_reinitialize_team(team, new_icvs,
4990                               root->r.r_uber_thread->th.th_ident);
4991 
4992       KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
4993                     team->t.t_threads[0], team));
4994       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
4995 
4996 #if KMP_AFFINITY_SUPPORTED
4997       if ((team->t.t_size_changed == 0) &&
4998           (team->t.t_proc_bind == new_proc_bind)) {
4999         if (new_proc_bind == proc_bind_spread) {
5000           __kmp_partition_places(
5001               team, 1); // add flag to update only master for spread
5002         }
5003         KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5004                        "proc_bind = %d, partition = [%d,%d]\n",
5005                        team->t.t_id, new_proc_bind, team->t.t_first_place,
5006                        team->t.t_last_place));
5007       } else {
5008         KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5009         __kmp_partition_places(team);
5010       }
5011 #else
5012       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5013 #endif /* KMP_AFFINITY_SUPPORTED */
5014     } else if (team->t.t_nproc > new_nproc) {
5015       KA_TRACE(20,
5016                ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5017                 new_nproc));
5018 
5019       team->t.t_size_changed = 1;
5020 #if KMP_NESTED_HOT_TEAMS
5021       if (__kmp_hot_teams_mode == 0) {
5022         // AC: the saved number of threads should match the team's value in this
5023         // mode; it can be bigger in mode 1, when the hot team has threads in reserve
5024         KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5025         hot_teams[level].hot_team_nth = new_nproc;
5026 #endif // KMP_NESTED_HOT_TEAMS
5027         /* release the extra threads we don't need any more */
5028         for (f = new_nproc; f < team->t.t_nproc; f++) {
5029           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5030           if (__kmp_tasking_mode != tskm_immediate_exec) {
5031             // When decreasing team size, threads no longer in the team should
5032             // unref task team.
5033             team->t.t_threads[f]->th.th_task_team = NULL;
5034           }
5035           __kmp_free_thread(team->t.t_threads[f]);
5036           team->t.t_threads[f] = NULL;
5037         }
5038 #if KMP_NESTED_HOT_TEAMS
5039       } // (__kmp_hot_teams_mode == 0)
5040       else {
5041         // When keeping extra threads in team, switch threads to wait on own
5042         // b_go flag
5043         for (f = new_nproc; f < team->t.t_nproc; ++f) {
5044           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5045           kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5046           for (int b = 0; b < bs_last_barrier; ++b) {
5047             if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5048               balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5049             }
5050             KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5051           }
5052         }
5053       }
5054 #endif // KMP_NESTED_HOT_TEAMS
5055       team->t.t_nproc = new_nproc;
5056       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5057       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5058       __kmp_reinitialize_team(team, new_icvs,
5059                               root->r.r_uber_thread->th.th_ident);
5060 
5061       // Update remaining threads
5062       for (f = 0; f < new_nproc; ++f) {
5063         team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5064       }
5065 
5066       // restore the current task state of the master thread: should be the
5067       // implicit task
5068       KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5069                     team->t.t_threads[0], team));
5070 
5071       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5072 
5073 #ifdef KMP_DEBUG
5074       for (f = 0; f < team->t.t_nproc; f++) {
5075         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5076                          team->t.t_threads[f]->th.th_team_nproc ==
5077                              team->t.t_nproc);
5078       }
5079 #endif
5080 
5081       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5082 #if KMP_AFFINITY_SUPPORTED
5083       __kmp_partition_places(team);
5084 #endif
5085     } else { // team->t.t_nproc < new_nproc
5086 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5087       kmp_affin_mask_t *old_mask;
5088       if (KMP_AFFINITY_CAPABLE()) {
5089         KMP_CPU_ALLOC(old_mask);
5090       }
5091 #endif
5092 
5093       KA_TRACE(20,
5094                ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5095                 new_nproc));
5096 
5097       team->t.t_size_changed = 1;
5098 
5099 #if KMP_NESTED_HOT_TEAMS
5100       int avail_threads = hot_teams[level].hot_team_nth;
5101       if (new_nproc < avail_threads)
5102         avail_threads = new_nproc;
5103       kmp_info_t **other_threads = team->t.t_threads;
5104       for (f = team->t.t_nproc; f < avail_threads; ++f) {
5105         // Adjust barrier data of reserved threads (if any) of the team
5106         // Other data will be set in __kmp_initialize_info() below.
5107         int b;
5108         kmp_balign_t *balign = other_threads[f]->th.th_bar;
5109         for (b = 0; b < bs_last_barrier; ++b) {
5110           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5111           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5112 #if USE_DEBUGGER
5113           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5114 #endif
5115         }
5116       }
5117       if (hot_teams[level].hot_team_nth >= new_nproc) {
5118         // we have all needed threads in reserve, no need to allocate any
5119         // this is only possible in mode 1; there cannot be reserved threads in mode 0
5120         KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5121         team->t.t_nproc = new_nproc; // just get reserved threads involved
5122       } else {
5123         // we may have some threads in reserve, but not enough
5124         team->t.t_nproc =
5125             hot_teams[level]
5126                 .hot_team_nth; // get reserved threads involved if any
5127         hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5128 #endif // KMP_NESTED_HOT_TEAMS
5129         if (team->t.t_max_nproc < new_nproc) {
5130           /* reallocate larger arrays */
5131           __kmp_reallocate_team_arrays(team, new_nproc);
5132           __kmp_reinitialize_team(team, new_icvs, NULL);
5133         }
5134 
5135 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5136         /* Temporarily set the full mask for the master thread before creating
5137            the workers. Workers inherit their affinity from the master, so if
5138            many workers are created quickly on a single core, they don't get a
5139            chance to set their own affinity for a long time. */
5140         __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5141 #endif
5142 
5143         /* allocate new threads for the hot team */
5144         for (f = team->t.t_nproc; f < new_nproc; f++) {
5145           kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5146           KMP_DEBUG_ASSERT(new_worker);
5147           team->t.t_threads[f] = new_worker;
5148 
5149           KA_TRACE(20,
5150                    ("__kmp_allocate_team: team %d init T#%d(%d:%d) arrived: "
5151                     "join=%llu, plain=%llu\n",
5152                     team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5153                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5154                     team->t.t_bar[bs_plain_barrier].b_arrived));
5155 
5156           { // Initialize barrier data for new threads.
5157             int b;
5158             kmp_balign_t *balign = new_worker->th.th_bar;
5159             for (b = 0; b < bs_last_barrier; ++b) {
5160               balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5161               KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5162                                KMP_BARRIER_PARENT_FLAG);
5163 #if USE_DEBUGGER
5164               balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5165 #endif
5166             }
5167           }
5168         }
5169 
5170 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5171         if (KMP_AFFINITY_CAPABLE()) {
5172           /* Restore initial master thread's affinity mask */
5173           __kmp_set_system_affinity(old_mask, TRUE);
5174           KMP_CPU_FREE(old_mask);
5175         }
5176 #endif
5177 #if KMP_NESTED_HOT_TEAMS
5178       } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5179 #endif // KMP_NESTED_HOT_TEAMS
5180       /* make sure everyone is synchronized */
5181       int old_nproc = team->t.t_nproc; // save old value and use to update only
5182       // new threads below
5183       __kmp_initialize_team(team, new_nproc, new_icvs,
5184                             root->r.r_uber_thread->th.th_ident);
5185 
5186       /* reinitialize the threads */
5187       KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5188       for (f = 0; f < team->t.t_nproc; ++f)
5189         __kmp_initialize_info(team->t.t_threads[f], team, f,
5190                               __kmp_gtid_from_tid(f, team));
5191 
5192       if (level) { // set th_task_state for new threads in nested hot team
5193         // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5194         // only need to set the th_task_state for the new threads. th_task_state
5195         // for master thread will not be accurate until after this in
5196         // __kmp_fork_call(), so we look to the master's memo_stack to get the
5197         // correct value.
5198         for (f = old_nproc; f < team->t.t_nproc; ++f)
5199           team->t.t_threads[f]->th.th_task_state =
5200               team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5201       } else { // set th_task_state for new threads in non-nested hot team
5202         kmp_uint8 old_state =
5203             team->t.t_threads[0]->th.th_task_state; // copy master's state
5204         for (f = old_nproc; f < team->t.t_nproc; ++f)
5205           team->t.t_threads[f]->th.th_task_state = old_state;
5206       }
5207 
5208 #ifdef KMP_DEBUG
5209       for (f = 0; f < team->t.t_nproc; ++f) {
5210         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5211                          team->t.t_threads[f]->th.th_team_nproc ==
5212                              team->t.t_nproc);
5213       }
5214 #endif
5215 
5216       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5217 #if KMP_AFFINITY_SUPPORTED
5218       __kmp_partition_places(team);
5219 #endif
5220     } // Check changes in number of threads
5221 
5222     kmp_info_t *master = team->t.t_threads[0];
5223     if (master->th.th_teams_microtask) {
5224       for (f = 1; f < new_nproc; ++f) {
5225         // propagate teams construct specific info to workers
5226         kmp_info_t *thr = team->t.t_threads[f];
5227         thr->th.th_teams_microtask = master->th.th_teams_microtask;
5228         thr->th.th_teams_level = master->th.th_teams_level;
5229         thr->th.th_teams_size = master->th.th_teams_size;
5230       }
5231     }
5232 #if KMP_NESTED_HOT_TEAMS
5233     if (level) {
5234       // Sync barrier state for nested hot teams, not needed for outermost hot
5235       // team.
5236       for (f = 1; f < new_nproc; ++f) {
5237         kmp_info_t *thr = team->t.t_threads[f];
5238         int b;
5239         kmp_balign_t *balign = thr->th.th_bar;
5240         for (b = 0; b < bs_last_barrier; ++b) {
5241           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5242           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5243 #if USE_DEBUGGER
5244           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5245 #endif
5246         }
5247       }
5248     }
5249 #endif // KMP_NESTED_HOT_TEAMS
5250 
5251     /* reallocate space for arguments if necessary */
5252     __kmp_alloc_argv_entries(argc, team, TRUE);
5253     KMP_CHECK_UPDATE(team->t.t_argc, argc);
5254     // The hot team re-uses the previous task team,
5255     // if untouched during the previous release->gather phase.
5256 
5257     KF_TRACE(10, (" hot_team = %p\n", team));
5258 
5259 #if KMP_DEBUG
5260     if (__kmp_tasking_mode != tskm_immediate_exec) {
5261       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5262                     "task_team[1] = %p after reinit\n",
5263                     team->t.t_task_team[0], team->t.t_task_team[1]));
5264     }
5265 #endif
5266 
5267 #if OMPT_SUPPORT
5268     __ompt_team_assign_id(team, ompt_parallel_data);
5269 #endif
5270 
5271     KMP_MB();
5272 
5273     return team;
5274   }
5275 
5276   /* next, let's try to take one from the team pool */
5277   KMP_MB();
5278   for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5279     /* TODO: consider resizing undersized teams instead of reaping them, now
5280        that we have a resizing mechanism */
5281     if (team->t.t_max_nproc >= max_nproc) {
5282       /* take this team from the team pool */
5283       __kmp_team_pool = team->t.t_next_pool;
5284 
5285       /* setup the team for fresh use */
5286       __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5287 
5288       KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5289                     "task_team[1] %p to NULL\n",
5290                     &team->t.t_task_team[0], &team->t.t_task_team[1]));
5291       team->t.t_task_team[0] = NULL;
5292       team->t.t_task_team[1] = NULL;
5293 
5294       /* reallocate space for arguments if necessary */
5295       __kmp_alloc_argv_entries(argc, team, TRUE);
5296       KMP_CHECK_UPDATE(team->t.t_argc, argc);
5297 
5298       KA_TRACE(
5299           20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5300                team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5301       { // Initialize barrier data.
5302         int b;
5303         for (b = 0; b < bs_last_barrier; ++b) {
5304           team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5305 #if USE_DEBUGGER
5306           team->t.t_bar[b].b_master_arrived = 0;
5307           team->t.t_bar[b].b_team_arrived = 0;
5308 #endif
5309         }
5310       }
5311 
5312       team->t.t_proc_bind = new_proc_bind;
5313 
5314       KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5315                     team->t.t_id));
5316 
5317 #if OMPT_SUPPORT
5318       __ompt_team_assign_id(team, ompt_parallel_data);
5319 #endif
5320 
5321       KMP_MB();
5322 
5323       return team;
5324     }
5325 
5326     /* reap the team if it is too small, then loop back and check the next one */
5327     // not sure if this is wise, but it will be redone during the hot-teams
5328     // rewrite.
5329     /* TODO: Use technique to find the right size hot-team, don't reap them */
5330     team = __kmp_reap_team(team);
5331     __kmp_team_pool = team;
5332   }
5333 
5334   /* nothing available in the pool -- make a new team */
5335   KMP_MB();
5336   team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5337 
5338   /* and set it up */
5339   team->t.t_max_nproc = max_nproc;
5340   /* NOTE: for some reason, allocating one big buffer and dividing it up
5341      seems to really hurt performance on the P4, so let's not use that approach */
5342   __kmp_allocate_team_arrays(team, max_nproc);
5343 
5344   KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5345   __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5346 
5347   KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5348                 "%p to NULL\n",
5349                 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5350   team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5351   // memory, no need to duplicate
5352   team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5353   // memory, no need to duplicate
5354 
5355   if (__kmp_storage_map) {
5356     __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5357   }
5358 
5359   /* allocate space for arguments */
5360   __kmp_alloc_argv_entries(argc, team, FALSE);
5361   team->t.t_argc = argc;
5362 
5363   KA_TRACE(20,
5364            ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5365             team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5366   { // Initialize barrier data.
5367     int b;
5368     for (b = 0; b < bs_last_barrier; ++b) {
5369       team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5370 #if USE_DEBUGGER
5371       team->t.t_bar[b].b_master_arrived = 0;
5372       team->t.t_bar[b].b_team_arrived = 0;
5373 #endif
5374     }
5375   }
5376 
5377   team->t.t_proc_bind = new_proc_bind;
5378 
5379 #if OMPT_SUPPORT
5380   __ompt_team_assign_id(team, ompt_parallel_data);
5381   team->t.ompt_serialized_team_info = NULL;
5382 #endif
5383 
5384   KMP_MB();
5385 
5386   KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5387                 team->t.t_id));
5388 
5389   return team;
5390 }
5391 
5392 /* TODO implement hot-teams at all levels */
5393 /* TODO implement lazy thread release on demand (disband request) */
5394 
5395 /* free the team.  return it to the team pool.  release all the threads
5396  * associated with it */
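/* Worker threads of non-hot teams are not destroyed here; __kmp_free_thread
   returns them to __kmp_thread_pool for later reuse. */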
5397 void __kmp_free_team(kmp_root_t *root,
5398                      kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5399   int f;
5400   KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5401                 team->t.t_id));
5402 
5403   /* verify state */
5404   KMP_DEBUG_ASSERT(root);
5405   KMP_DEBUG_ASSERT(team);
5406   KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5407   KMP_DEBUG_ASSERT(team->t.t_threads);
5408 
5409   int use_hot_team = team == root->r.r_hot_team;
5410 #if KMP_NESTED_HOT_TEAMS
5411   int level;
5412   kmp_hot_team_ptr_t *hot_teams;
5413   if (master) {
5414     level = team->t.t_active_level - 1;
5415     if (master->th.th_teams_microtask) { // in teams construct?
5416       if (master->th.th_teams_size.nteams > 1) {
5417         ++level; // level was not increased in teams construct for
5418         // team_of_masters
5419       }
5420       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5421           master->th.th_teams_level == team->t.t_level) {
5422         ++level; // level was not increased in teams construct for
5423         // team_of_workers before the parallel
5424       } // team->t.t_level will be increased inside parallel
5425     }
5426     hot_teams = master->th.th_hot_teams;
5427     if (level < __kmp_hot_teams_max_level) {
5428       KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5429       use_hot_team = 1;
5430     }
5431   }
5432 #endif // KMP_NESTED_HOT_TEAMS
5433 
5434   /* team is done working */
5435   TCW_SYNC_PTR(team->t.t_pkfn,
5436                NULL); // Important for Debugging Support Library.
5437 #if KMP_OS_WINDOWS
5438   team->t.t_copyin_counter = 0; // init counter for possible reuse
5439 #endif
5440   // Do not reset pointer to parent team to NULL for hot teams.
5441 
  /* if this is not a hot team, release its threads */
5443   if (!use_hot_team) {
5444     if (__kmp_tasking_mode != tskm_immediate_exec) {
5445       // Wait for threads to reach reapable state
5446       for (f = 1; f < team->t.t_nproc; ++f) {
5447         KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5448         kmp_info_t *th = team->t.t_threads[f];
5449         volatile kmp_uint32 *state = &th->th.th_reap_state;
5450         while (*state != KMP_SAFE_TO_REAP) {
5451 #if KMP_OS_WINDOWS
5452           // On Windows a thread can be killed at any time, check this
5453           DWORD ecode;
5454           if (!__kmp_is_thread_alive(th, &ecode)) {
5455             *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5456             break;
5457           }
5458 #endif
5459           // first check if thread is sleeping
5460           kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5461           if (fl.is_sleeping())
5462             fl.resume(__kmp_gtid_from_thread(th));
5463           KMP_CPU_PAUSE();
5464         }
5465       }
5466 
5467       // Delete task teams
5468       int tt_idx;
5469       for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5470         kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5471         if (task_team != NULL) {
5472           for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5473             KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5474             team->t.t_threads[f]->th.th_task_team = NULL;
5475           }
5476           KA_TRACE(
5477               20,
5478               ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5479                __kmp_get_gtid(), task_team, team->t.t_id));
5480 #if KMP_NESTED_HOT_TEAMS
5481           __kmp_free_task_team(master, task_team);
5482 #endif
5483           team->t.t_task_team[tt_idx] = NULL;
5484         }
5485       }
5486     }
5487 
5488     // Reset pointer to parent team only for non-hot teams.
5489     team->t.t_parent = NULL;
5490     team->t.t_level = 0;
5491     team->t.t_active_level = 0;
5492 
5493     /* free the worker threads */
5494     for (f = 1; f < team->t.t_nproc; ++f) {
5495       KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5496       __kmp_free_thread(team->t.t_threads[f]);
5497       team->t.t_threads[f] = NULL;
5498     }
5499 
5500     /* put the team back in the team pool */
5501     /* TODO limit size of team pool, call reap_team if pool too large */
5502     team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5503     __kmp_team_pool = (volatile kmp_team_t *)team;
5504   } else { // Check if team was created for the masters in a teams construct
5505     // See if first worker is a CG root
5506     KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5507                      team->t.t_threads[1]->th.th_cg_roots);
5508     if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5509       // Clean up the CG root nodes on workers so that this team can be re-used
5510       for (f = 1; f < team->t.t_nproc; ++f) {
5511         kmp_info_t *thr = team->t.t_threads[f];
5512         KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5513                          thr->th.th_cg_roots->cg_root == thr);
5514         // Pop current CG root off list
5515         kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5516         thr->th.th_cg_roots = tmp->up;
5517         KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5518                        " up to node %p. cg_nthreads was %d\n",
5519                        thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5520         int i = tmp->cg_nthreads--;
5521         if (i == 1) {
5522           __kmp_free(tmp); // free CG if we are the last thread in it
5523         }
5524         // Restore current task's thread_limit from CG root
5525         if (thr->th.th_cg_roots)
5526           thr->th.th_current_task->td_icvs.thread_limit =
5527               thr->th.th_cg_roots->cg_thread_limit;
5528       }
5529     }
5530   }
5531 
5532   KMP_MB();
5533 }
5534 
5535 /* reap the team.  destroy it, reclaim all its resources and free its memory */
5536 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5537   kmp_team_t *next_pool = team->t.t_next_pool;
5538 
5539   KMP_DEBUG_ASSERT(team);
5540   KMP_DEBUG_ASSERT(team->t.t_dispatch);
5541   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5542   KMP_DEBUG_ASSERT(team->t.t_threads);
5543   KMP_DEBUG_ASSERT(team->t.t_argv);
5544 
5545   /* TODO clean the threads that are a part of this? */
5546 
5547   /* free stuff */
5548   __kmp_free_team_arrays(team);
5549   if (team->t.t_argv != &team->t.t_inline_argv[0])
5550     __kmp_free((void *)team->t.t_argv);
5551   __kmp_free(team);
5552 
5553   KMP_MB();
5554   return next_pool;
5555 }
5556 
5557 // Free the thread.  Don't reap it, just place it on the pool of available
5558 // threads.
5559 //
5560 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5561 // binding for the affinity mechanism to be useful.
5562 //
5563 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5564 // However, we want to avoid a potential performance problem by always
5565 // scanning through the list to find the correct point at which to insert
5566 // the thread (potential N**2 behavior).  To do this we keep track of the
5567 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5568 // With single-level parallelism, threads will always be added to the tail
5569 // of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
5570 // parallelism, all bets are off and we may need to scan through the entire
5571 // free list.
5572 //
5573 // This change also has a potentially large performance benefit, for some
5574 // applications.  Previously, as threads were freed from the hot team, they
5575 // would be placed back on the free list in inverse order.  If the hot team
// grew back to its original size, then the freed threads would be placed
// back on the hot team in reverse order.  This could cause bad cache
// locality problems in programs where the size of the hot team regularly
// grew and shrank.
5580 //
5581 // Now, for single-level parallelism, the OMP tid is always == gtid.
5582 void __kmp_free_thread(kmp_info_t *this_th) {
5583   int gtid;
5584   kmp_info_t **scan;
5585 
5586   KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5587                 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5588 
5589   KMP_DEBUG_ASSERT(this_th);
5590 
  // When moving a thread to the pool, switch it to wait on its own b_go flag
  // and clear its team pointer (NULL team).
5593   int b;
5594   kmp_balign_t *balign = this_th->th.th_bar;
5595   for (b = 0; b < bs_last_barrier; ++b) {
5596     if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5597       balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5598     balign[b].bb.team = NULL;
5599     balign[b].bb.leaf_kids = 0;
5600   }
5601   this_th->th.th_task_state = 0;
5602   this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5603 
5604   /* put thread back on the free pool */
5605   TCW_PTR(this_th->th.th_team, NULL);
5606   TCW_PTR(this_th->th.th_root, NULL);
5607   TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5608 
5609   while (this_th->th.th_cg_roots) {
5610     this_th->th.th_cg_roots->cg_nthreads--;
5611     KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5612                    " %p of thread  %p to %d\n",
5613                    this_th, this_th->th.th_cg_roots,
5614                    this_th->th.th_cg_roots->cg_root,
5615                    this_th->th.th_cg_roots->cg_nthreads));
5616     kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5617     if (tmp->cg_root == this_th) { // Thread is a cg_root
5618       KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5619       KA_TRACE(
5620           5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5621       this_th->th.th_cg_roots = tmp->up;
5622       __kmp_free(tmp);
5623     } else { // Worker thread
5624       if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5625         __kmp_free(tmp);
5626       }
5627       this_th->th.th_cg_roots = NULL;
5628       break;
5629     }
5630   }
5631 
  /* If the implicit task assigned to this thread can be used by other threads,
   * then multiple threads may share the task data and try to free it in
   * __kmp_reap_thread at exit. This duplicate use of the task data is more
   * likely when the hot team is disabled, but it can occur even when the hot
   * team is enabled. */
5637   __kmp_free_implicit_task(this_th);
5638   this_th->th.th_current_task = NULL;
5639 
5640   // If the __kmp_thread_pool_insert_pt is already past the new insert
5641   // point, then we need to re-scan the entire list.
5642   gtid = this_th->th.th_info.ds.ds_gtid;
5643   if (__kmp_thread_pool_insert_pt != NULL) {
5644     KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5645     if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5646       __kmp_thread_pool_insert_pt = NULL;
5647     }
5648   }
5649 
5650   // Scan down the list to find the place to insert the thread.
5651   // scan is the address of a link in the list, possibly the address of
5652   // __kmp_thread_pool itself.
5653   //
5654   // In the absence of nested parallelism, the for loop will have 0 iterations.
5655   if (__kmp_thread_pool_insert_pt != NULL) {
5656     scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5657   } else {
5658     scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5659   }
5660   for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5661        scan = &((*scan)->th.th_next_pool))
5662     ;
5663 
5664   // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5665   // to its address.
5666   TCW_PTR(this_th->th.th_next_pool, *scan);
5667   __kmp_thread_pool_insert_pt = *scan = this_th;
5668   KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5669                    (this_th->th.th_info.ds.ds_gtid <
5670                     this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5671   TCW_4(this_th->th.th_in_pool, TRUE);
5672   __kmp_suspend_initialize_thread(this_th);
5673   __kmp_lock_suspend_mx(this_th);
5674   if (this_th->th.th_active == TRUE) {
5675     KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5676     this_th->th.th_active_in_pool = TRUE;
5677   }
5678 #if KMP_DEBUG
5679   else {
5680     KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5681   }
5682 #endif
5683   __kmp_unlock_suspend_mx(this_th);
5684 
5685   TCW_4(__kmp_nth, __kmp_nth - 1);
5686 
5687 #ifdef KMP_ADJUST_BLOCKTIME
5688   /* Adjust blocktime back to user setting or default if necessary */
5689   /* Middle initialization might never have occurred                */
5690   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5691     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5692     if (__kmp_nth <= __kmp_avail_proc) {
5693       __kmp_zero_bt = FALSE;
5694     }
5695   }
5696 #endif /* KMP_ADJUST_BLOCKTIME */
5697 
5698   KMP_MB();
5699 }
5700 
5701 /* ------------------------------------------------------------------------ */
5702 
5703 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5704   int gtid = this_thr->th.th_info.ds.ds_gtid;
5705   /*    void                 *stack_data;*/
5706   kmp_team_t **volatile pteam;
5707 
5708   KMP_MB();
5709   KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5710 
5711   if (__kmp_env_consistency_check) {
5712     this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5713   }
5714 
5715 #if OMPT_SUPPORT
5716   ompt_data_t *thread_data;
5717   if (ompt_enabled.enabled) {
5718     thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5719     *thread_data = ompt_data_none;
5720 
5721     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5722     this_thr->th.ompt_thread_info.wait_id = 0;
5723     this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5724     this_thr->th.ompt_thread_info.parallel_flags = 0;
5725     if (ompt_enabled.ompt_callback_thread_begin) {
5726       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5727           ompt_thread_worker, thread_data);
5728     }
5729     this_thr->th.ompt_thread_info.state = ompt_state_idle;
5730   }
5731 #endif
5732 
5733   /* This is the place where threads wait for work */
5734   while (!TCR_4(__kmp_global.g.g_done)) {
5735     KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5736     KMP_MB();
5737 
5738     /* wait for work to do */
5739     KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5740 
5741     /* No tid yet since not part of a team */
5742     __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5743 
5744 #if OMPT_SUPPORT
5745     if (ompt_enabled.enabled) {
5746       this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5747     }
5748 #endif
5749 
5750     pteam = &this_thr->th.th_team;
5751 
5752     /* have we been allocated? */
5753     if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5754       /* we were just woken up, so run our new task */
5755       if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5756         int rc;
5757         KA_TRACE(20,
5758                  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5759                   gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5760                   (*pteam)->t.t_pkfn));
5761 
5762         updateHWFPControl(*pteam);
5763 
5764 #if OMPT_SUPPORT
5765         if (ompt_enabled.enabled) {
5766           this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5767         }
5768 #endif
5769 
5770         rc = (*pteam)->t.t_invoke(gtid);
5771         KMP_ASSERT(rc);
5772 
5773         KMP_MB();
5774         KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5775                       gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5776                       (*pteam)->t.t_pkfn));
5777       }
5778 #if OMPT_SUPPORT
5779       if (ompt_enabled.enabled) {
5780         /* no frame set while outside task */
5781         __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
5782 
5783         this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5784       }
5785 #endif
5786       /* join barrier after parallel region */
5787       __kmp_join_barrier(gtid);
5788     }
5789   }
5790   TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5791 
5792 #if OMPT_SUPPORT
5793   if (ompt_enabled.ompt_callback_thread_end) {
5794     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5795   }
5796 #endif
5797 
5798   this_thr->th.th_task_team = NULL;
5799   /* run the destructors for the threadprivate data for this thread */
5800   __kmp_common_destroy_gtid(gtid);
5801 
5802   KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5803   KMP_MB();
5804   return this_thr;
5805 }
5806 
5807 /* ------------------------------------------------------------------------ */
5808 
5809 void __kmp_internal_end_dest(void *specific_gtid) {
5810   // Make sure no significant bits are lost
5811   int gtid;
5812   __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
5813 
5814   KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
  /* NOTE: the gtid is stored as gtid+1 in thread-local storage because 0 is
   * reserved for the nothing-stored case */
5817 
5818   __kmp_internal_end_thread(gtid);
5819 }
5820 
5821 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5822 
5823 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5824   __kmp_internal_end_atexit();
5825 }
5826 
5827 #endif
5828 
5829 /* [Windows] josh: when the atexit handler is called, there may still be more
5830    than one thread alive */
5831 void __kmp_internal_end_atexit(void) {
5832   KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5833   /* [Windows]
     josh: ideally, we want to completely shut down the library in this atexit
5835      handler, but stat code that depends on thread specific data for gtid fails
5836      because that data becomes unavailable at some point during the shutdown, so
5837      we call __kmp_internal_end_thread instead. We should eventually remove the
5838      dependency on __kmp_get_specific_gtid in the stat code and use
     __kmp_internal_end_library to cleanly shut down the library.
5840 
5841      // TODO: Can some of this comment about GVS be removed?
5842      I suspect that the offending stat code is executed when the calling thread
5843      tries to clean up a dead root thread's data structures, resulting in GVS
5844      code trying to close the GVS structures for that thread, but since the stat
5845      code uses __kmp_get_specific_gtid to get the gtid with the assumption that
     the calling thread is cleaning up itself instead of another thread, it gets
     confused. This happens because allowing a thread to unregister and clean up
5848      another thread is a recent modification for addressing an issue.
5849      Based on the current design (20050722), a thread may end up
5850      trying to unregister another thread only if thread death does not trigger
5851      the calling of __kmp_internal_end_thread.  For Linux* OS, there is the
5852      thread specific data destructor function to detect thread death. For
5853      Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5854      is nothing.  Thus, the workaround is applicable only for Windows static
5855      stat library. */
5856   __kmp_internal_end_library(-1);
5857 #if KMP_OS_WINDOWS
5858   __kmp_close_console();
5859 #endif
5860 }
5861 
5862 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5863   // It is assumed __kmp_forkjoin_lock is acquired.
5864 
5865   int gtid;
5866 
5867   KMP_DEBUG_ASSERT(thread != NULL);
5868 
5869   gtid = thread->th.th_info.ds.ds_gtid;
5870 
5871   if (!is_root) {
5872     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5873       /* Assume the threads are at the fork barrier here */
5874       KA_TRACE(
5875           20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5876                gtid));
5877       /* Need release fence here to prevent seg faults for tree forkjoin barrier
5878        * (GEH) */
5879       ANNOTATE_HAPPENS_BEFORE(thread);
5880       kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
5881                          thread);
5882       __kmp_release_64(&flag);
5883     }
5884 
5885     // Terminate OS thread.
5886     __kmp_reap_worker(thread);
5887 
5888     // The thread was killed asynchronously.  If it was actively
5889     // spinning in the thread pool, decrement the global count.
5890     //
5891     // There is a small timing hole here - if the worker thread was just waking
    // up after sleeping in the pool, had reset its th_active_in_pool flag but
5893     // not decremented the global counter __kmp_thread_pool_active_nth yet, then
5894     // the global counter might not get updated.
5895     //
5896     // Currently, this can only happen as the library is unloaded,
5897     // so there are no harmful side effects.
5898     if (thread->th.th_active_in_pool) {
5899       thread->th.th_active_in_pool = FALSE;
5900       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
5901       KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
5902     }
5903   }
5904 
5905   __kmp_free_implicit_task(thread);
5906 
5907 // Free the fast memory for tasking
5908 #if USE_FAST_MEMORY
5909   __kmp_free_fast_memory(thread);
5910 #endif /* USE_FAST_MEMORY */
5911 
5912   __kmp_suspend_uninitialize_thread(thread);
5913 
5914   KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5915   TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5916 
5917   --__kmp_all_nth;
// __kmp_nth was decremented when the thread was added to the pool.
5919 
5920 #ifdef KMP_ADJUST_BLOCKTIME
5921   /* Adjust blocktime back to user setting or default if necessary */
5922   /* Middle initialization might never have occurred                */
5923   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5924     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5925     if (__kmp_nth <= __kmp_avail_proc) {
5926       __kmp_zero_bt = FALSE;
5927     }
5928   }
5929 #endif /* KMP_ADJUST_BLOCKTIME */
5930 
5931   /* free the memory being used */
5932   if (__kmp_env_consistency_check) {
5933     if (thread->th.th_cons) {
5934       __kmp_free_cons_stack(thread->th.th_cons);
5935       thread->th.th_cons = NULL;
5936     }
5937   }
5938 
5939   if (thread->th.th_pri_common != NULL) {
5940     __kmp_free(thread->th.th_pri_common);
5941     thread->th.th_pri_common = NULL;
5942   }
5943 
5944   if (thread->th.th_task_state_memo_stack != NULL) {
5945     __kmp_free(thread->th.th_task_state_memo_stack);
5946     thread->th.th_task_state_memo_stack = NULL;
5947   }
5948 
5949 #if KMP_USE_BGET
5950   if (thread->th.th_local.bget_data != NULL) {
5951     __kmp_finalize_bget(thread);
5952   }
5953 #endif
5954 
5955 #if KMP_AFFINITY_SUPPORTED
5956   if (thread->th.th_affin_mask != NULL) {
5957     KMP_CPU_FREE(thread->th.th_affin_mask);
5958     thread->th.th_affin_mask = NULL;
5959   }
5960 #endif /* KMP_AFFINITY_SUPPORTED */
5961 
5962 #if KMP_USE_HIER_SCHED
5963   if (thread->th.th_hier_bar_data != NULL) {
5964     __kmp_free(thread->th.th_hier_bar_data);
5965     thread->th.th_hier_bar_data = NULL;
5966   }
5967 #endif
5968 
5969   __kmp_reap_team(thread->th.th_serial_team);
5970   thread->th.th_serial_team = NULL;
5971   __kmp_free(thread);
5972 
5973   KMP_MB();
5974 
5975 } // __kmp_reap_thread
5976 
5977 static void __kmp_internal_end(void) {
5978   int i;
5979 
5980   /* First, unregister the library */
5981   __kmp_unregister_library();
5982 
5983 #if KMP_OS_WINDOWS
5984   /* In Win static library, we can't tell when a root actually dies, so we
5985      reclaim the data structures for any root threads that have died but not
5986      unregistered themselves, in order to shut down cleanly.
5987      In Win dynamic library we also can't tell when a thread dies.  */
5988   __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
5989 // dead roots
5990 #endif
5991 
5992   for (i = 0; i < __kmp_threads_capacity; i++)
5993     if (__kmp_root[i])
5994       if (__kmp_root[i]->r.r_active)
5995         break;
5996   KMP_MB(); /* Flush all pending memory write invalidates.  */
5997   TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
5998 
5999   if (i < __kmp_threads_capacity) {
6000 #if KMP_USE_MONITOR
6001     // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6002     KMP_MB(); /* Flush all pending memory write invalidates.  */
6003 
6004     // Need to check that monitor was initialized before reaping it. If we are
    // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6006     // __kmp_monitor will appear to contain valid data, but it is only valid in
6007     // the parent process, not the child.
6008     // New behavior (201008): instead of keying off of the flag
6009     // __kmp_init_parallel, the monitor thread creation is keyed off
6010     // of the new flag __kmp_init_monitor.
6011     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6012     if (TCR_4(__kmp_init_monitor)) {
6013       __kmp_reap_monitor(&__kmp_monitor);
6014       TCW_4(__kmp_init_monitor, 0);
6015     }
6016     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6017     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6018 #endif // KMP_USE_MONITOR
6019   } else {
6020 /* TODO move this to cleanup code */
6021 #ifdef KMP_DEBUG
6022     /* make sure that everything has properly ended */
6023     for (i = 0; i < __kmp_threads_capacity; i++) {
6024       if (__kmp_root[i]) {
6025         //                    KMP_ASSERT( ! KMP_UBER_GTID( i ) );         // AC:
6026         //                    there can be uber threads alive here
6027         KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6028       }
6029     }
6030 #endif
6031 
6032     KMP_MB();
6033 
6034     // Reap the worker threads.
6035     // This is valid for now, but be careful if threads are reaped sooner.
    while (__kmp_thread_pool != NULL) { // Loop thru all threads in the pool.
6037       // Get the next thread from the pool.
6038       kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6039       __kmp_thread_pool = thread->th.th_next_pool;
6040       // Reap it.
6041       KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6042       thread->th.th_next_pool = NULL;
6043       thread->th.th_in_pool = FALSE;
6044       __kmp_reap_thread(thread, 0);
6045     }
6046     __kmp_thread_pool_insert_pt = NULL;
6047 
6048     // Reap teams.
6049     while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
6050       // Get the next team from the pool.
6051       kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6052       __kmp_team_pool = team->t.t_next_pool;
6053       // Reap it.
6054       team->t.t_next_pool = NULL;
6055       __kmp_reap_team(team);
6056     }
6057 
6058     __kmp_reap_task_teams();
6059 
6060 #if KMP_OS_UNIX
6061     // Threads that are not reaped should not access any resources since they
6062     // are going to be deallocated soon, so the shutdown sequence should wait
6063     // until all threads either exit the final spin-waiting loop or begin
6064     // sleeping after the given blocktime.
6065     for (i = 0; i < __kmp_threads_capacity; i++) {
6066       kmp_info_t *thr = __kmp_threads[i];
6067       while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6068         KMP_CPU_PAUSE();
6069     }
6070 #endif
6071 
6072     for (i = 0; i < __kmp_threads_capacity; ++i) {
6073       // TBD: Add some checking...
6074       // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6075     }
6076 
6077     /* Make sure all threadprivate destructors get run by joining with all
6078        worker threads before resetting this flag */
6079     TCW_SYNC_4(__kmp_init_common, FALSE);
6080 
6081     KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6082     KMP_MB();
6083 
6084 #if KMP_USE_MONITOR
6085     // See note above: One of the possible fixes for CQ138434 / CQ140126
6086     //
6087     // FIXME: push both code fragments down and CSE them?
6088     // push them into __kmp_cleanup() ?
6089     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6090     if (TCR_4(__kmp_init_monitor)) {
6091       __kmp_reap_monitor(&__kmp_monitor);
6092       TCW_4(__kmp_init_monitor, 0);
6093     }
6094     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6095     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6096 #endif
6097   } /* else !__kmp_global.t_active */
6098   TCW_4(__kmp_init_gtid, FALSE);
6099   KMP_MB(); /* Flush all pending memory write invalidates.  */
6100 
6101   __kmp_cleanup();
6102 #if OMPT_SUPPORT
6103   ompt_fini();
6104 #endif
6105 }
6106 
6107 void __kmp_internal_end_library(int gtid_req) {
6108   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6109   /* this shouldn't be a race condition because __kmp_internal_end() is the
6110      only place to clear __kmp_serial_init */
6111   /* we'll check this later too, after we get the lock */
6112   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6113   // redundant, because the next check will work in any case.
6114   if (__kmp_global.g.g_abort) {
6115     KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6116     /* TODO abort? */
6117     return;
6118   }
6119   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6120     KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6121     return;
6122   }
6123 
6124   KMP_MB(); /* Flush all pending memory write invalidates.  */
6125   /* find out who we are and what we should do */
6126   {
6127     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6128     KA_TRACE(
6129         10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req));
6130     if (gtid == KMP_GTID_SHUTDOWN) {
6131       KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6132                     "already shutdown\n"));
6133       return;
6134     } else if (gtid == KMP_GTID_MONITOR) {
6135       KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6136                     "registered, or system shutdown\n"));
6137       return;
6138     } else if (gtid == KMP_GTID_DNE) {
6139       KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6140                     "shutdown\n"));
      /* we don't know who we are, but we may still shut down the library */
6142     } else if (KMP_UBER_GTID(gtid)) {
6143       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6144       if (__kmp_root[gtid]->r.r_active) {
6145         __kmp_global.g.g_abort = -1;
6146         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6147         __kmp_unregister_library();
6148         KA_TRACE(10,
6149                  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6150                   gtid));
6151         return;
6152       } else {
6153         KA_TRACE(
6154             10,
6155             ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6156         __kmp_unregister_root_current_thread(gtid);
6157       }
6158     } else {
6159 /* worker threads may call this function through the atexit handler, if they
6160  * call exit() */
6161 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6162    TODO: do a thorough shutdown instead */
6163 #ifdef DUMP_DEBUG_ON_EXIT
6164       if (__kmp_debug_buf)
6165         __kmp_dump_debug_buffer();
6166 #endif
      // An unregister-library call was added here for the Linux shared-memory
      // (shm) path; without it, stale files accumulate in /dev/shm.
      // Clean up the shared memory file before exiting.
6170       __kmp_unregister_library();
6171       return;
6172     }
6173   }
6174   /* synchronize the termination process */
6175   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6176 
6177   /* have we already finished */
6178   if (__kmp_global.g.g_abort) {
6179     KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6180     /* TODO abort? */
6181     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6182     return;
6183   }
6184   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6185     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6186     return;
6187   }
6188 
6189   /* We need this lock to enforce mutex between this reading of
6190      __kmp_threads_capacity and the writing by __kmp_register_root.
6191      Alternatively, we can use a counter of roots that is atomically updated by
6192      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6193      __kmp_internal_end_*.  */
6194   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6195 
6196   /* now we can safely conduct the actual termination */
6197   __kmp_internal_end();
6198 
6199   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6200   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6201 
6202   KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6203 
6204 #ifdef DUMP_DEBUG_ON_EXIT
6205   if (__kmp_debug_buf)
6206     __kmp_dump_debug_buffer();
6207 #endif
6208 
6209 #if KMP_OS_WINDOWS
6210   __kmp_close_console();
6211 #endif
6212 
6213   __kmp_fini_allocator();
6214 
6215 } // __kmp_internal_end_library
6216 
6217 void __kmp_internal_end_thread(int gtid_req) {
6218   int i;
6219 
6220   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6221   /* this shouldn't be a race condition because __kmp_internal_end() is the
6222    * only place to clear __kmp_serial_init */
6223   /* we'll check this later too, after we get the lock */
6224   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6225   // redundant, because the next check will work in any case.
6226   if (__kmp_global.g.g_abort) {
6227     KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6228     /* TODO abort? */
6229     return;
6230   }
6231   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6232     KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6233     return;
6234   }
6235 
6236   KMP_MB(); /* Flush all pending memory write invalidates.  */
6237 
6238   /* find out who we are and what we should do */
6239   {
6240     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6241     KA_TRACE(10,
6242              ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req));
6243     if (gtid == KMP_GTID_SHUTDOWN) {
6244       KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6245                     "already shutdown\n"));
6246       return;
6247     } else if (gtid == KMP_GTID_MONITOR) {
6248       KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6249                     "registered, or system shutdown\n"));
6250       return;
6251     } else if (gtid == KMP_GTID_DNE) {
6252       KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6253                     "shutdown\n"));
6254       return;
6255       /* we don't know who we are */
6256     } else if (KMP_UBER_GTID(gtid)) {
6257       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6258       if (__kmp_root[gtid]->r.r_active) {
6259         __kmp_global.g.g_abort = -1;
6260         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6261         KA_TRACE(10,
6262                  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6263                   gtid));
6264         return;
6265       } else {
6266         KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6267                       gtid));
6268         __kmp_unregister_root_current_thread(gtid);
6269       }
6270     } else {
6271       /* just a worker thread, let's leave */
6272       KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6273 
6274       if (gtid >= 0) {
6275         __kmp_threads[gtid]->th.th_task_team = NULL;
6276       }
6277 
6278       KA_TRACE(10,
6279                ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6280                 gtid));
6281       return;
6282     }
6283   }
6284 #if KMP_DYNAMIC_LIB
6285   if (__kmp_pause_status != kmp_hard_paused)
  // AC: let's not shut down the dynamic library at the exit of an uber thread;
  // it is better to shut down later in the library destructor.
6288   {
6289     KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6290     return;
6291   }
6292 #endif
6293   /* synchronize the termination process */
6294   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6295 
6296   /* have we already finished */
6297   if (__kmp_global.g.g_abort) {
6298     KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6299     /* TODO abort? */
6300     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6301     return;
6302   }
6303   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6304     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6305     return;
6306   }
6307 
6308   /* We need this lock to enforce mutex between this reading of
6309      __kmp_threads_capacity and the writing by __kmp_register_root.
6310      Alternatively, we can use a counter of roots that is atomically updated by
6311      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6312      __kmp_internal_end_*.  */
6313 
6314   /* should we finish the run-time?  are all siblings done? */
6315   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6316 
6317   for (i = 0; i < __kmp_threads_capacity; ++i) {
6318     if (KMP_UBER_GTID(i)) {
6319       KA_TRACE(
6320           10,
6321           ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6322       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6323       __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6324       return;
6325     }
6326   }
6327 
6328   /* now we can safely conduct the actual termination */
6329 
6330   __kmp_internal_end();
6331 
6332   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6333   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6334 
6335   KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6336 
6337 #ifdef DUMP_DEBUG_ON_EXIT
6338   if (__kmp_debug_buf)
6339     __kmp_dump_debug_buffer();
6340 #endif
6341 } // __kmp_internal_end_thread
6342 
6343 // -----------------------------------------------------------------------------
6344 // Library registration stuff.
6345 
6346 static long __kmp_registration_flag = 0;
6347 // Random value used to indicate library initialization.
6348 static char *__kmp_registration_str = NULL;
6349 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6350 
6351 static inline char *__kmp_reg_status_name() {
6352 /* On RHEL 3u5 if linked statically, getpid() returns different values in
6353    each thread. If registration and unregistration go in different threads
6354    (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env
   env var cannot be found, because the name will contain a different pid.
6356 // macOS* complains about name being too long with additional getuid()
6357 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6358   return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6359                           (int)getuid());
6360 #else
6361   return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6362 #endif
} // __kmp_reg_status_name
6364 
6365 void __kmp_register_library_startup(void) {
6366 
6367   char *name = __kmp_reg_status_name(); // Name of the environment variable.
6368   int done = 0;
6369   union {
6370     double dtime;
6371     long ltime;
6372   } time;
6373 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6374   __kmp_initialize_system_tick();
6375 #endif
6376   __kmp_read_system_time(&time.dtime);
6377   __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6378   __kmp_registration_str =
6379       __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6380                        __kmp_registration_flag, KMP_LIBRARY_FILE);
6381 
6382   KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6383                 __kmp_registration_str));
6384 
6385   while (!done) {
6386 
6387     char *value = NULL; // Actual value of the environment variable.
6388 
6389 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6390     char *shm_name = __kmp_str_format("/%s", name);
6391     int shm_preexist = 0;
6392     char *data1;
6393     int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6394     if ((fd1 == -1) && (errno == EEXIST)) {
6395       // file didn't open because it already exists.
6396       // try opening existing file
6397       fd1 = shm_open(shm_name, O_RDWR, 0666);
6398       if (fd1 == -1) { // file didn't open
6399         // error out here
6400         __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0),
6401                     __kmp_msg_null);
6402       } else {
6403         // able to open existing file
6404         shm_preexist = 1;
6405       }
    } else if (fd1 == -1) {
      // SHM didn't open due to an error other than "already exists";
      // error out here.
6409       __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno),
6410                   __kmp_msg_null);
6411     }
6412     if (shm_preexist == 0) {
      // we created the SHM; now set its size
      if (ftruncate(fd1, SHM_SIZE) == -1) {
        // an error occurred while setting the size
6416         __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"),
6417                     KMP_ERR(errno), __kmp_msg_null);
6418       }
6419     }
6420     data1 =
6421         (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
6422     if (data1 == MAP_FAILED) {
6423       // failed to map shared memory
6424       __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno),
6425                   __kmp_msg_null);
6426     }
    if (shm_preexist == 0) { // we created the SHM; write the value into it
6428       KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6429     }
6430     // Read value from either what we just wrote or existing file.
6431     value = __kmp_str_format("%s", data1); // read value from SHM
6432     munmap(data1, SHM_SIZE);
6433     close(fd1);
6434 #else // Windows and unix with static library
    // Set environment variable; do not overwrite it if it already exists.
6436     __kmp_env_set(name, __kmp_registration_str, 0);
6437     // read value to see if it got set
6438     value = __kmp_env_get(name);
6439 #endif
6440 
6441     if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6442       done = 1; // Ok, environment variable set successfully, exit the loop.
6443     } else {
6444       // Oops. Write failed. Another copy of OpenMP RTL is in memory.
      // Check whether it is alive or dead.
6446       int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6447       char *tail = value;
6448       char *flag_addr_str = NULL;
6449       char *flag_val_str = NULL;
6450       char const *file_name = NULL;
6451       __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6452       __kmp_str_split(tail, '-', &flag_val_str, &tail);
6453       file_name = tail;
6454       if (tail != NULL) {
6455         long *flag_addr = 0;
6456         long flag_val = 0;
6457         KMP_SSCANF(flag_addr_str, "%p", RCAST(void**, &flag_addr));
6458         KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6459         if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6460           // First, check whether environment-encoded address is mapped into
6461           // addr space.
6462           // If so, dereference it to see if it still has the right value.
6463           if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6464             neighbor = 1;
6465           } else {
6466             // If not, then we know the other copy of the library is no longer
6467             // running.
6468             neighbor = 2;
6469           }
6470         }
6471       }
6472       switch (neighbor) {
6473       case 0: // Cannot parse environment variable -- neighbor status unknown.
        // Assume it is an incompatible format from a future version of the
        // library, and assume the other library is alive.
6476         // WARN( ... ); // TODO: Issue a warning.
6477         file_name = "unknown library";
6478         KMP_FALLTHROUGH();
      // Attention! Falling through to the next case is intentional.
6480       case 1: { // Neighbor is alive.
6481         // Check it is allowed.
6482         char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6483         if (!__kmp_str_match_true(duplicate_ok)) {
6484           // That's not allowed. Issue fatal error.
6485           __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6486                       KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6487         }
6488         KMP_INTERNAL_FREE(duplicate_ok);
6489         __kmp_duplicate_library_ok = 1;
6490         done = 1; // Exit the loop.
6491       } break;
6492       case 2: { // Neighbor is dead.
6493 
6494 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6495         // close shared memory.
6496         shm_unlink(shm_name); // this removes file in /dev/shm
6497 #else
6498         // Clear the variable and try to register library again.
6499         __kmp_env_unset(name);
6500 #endif
6501       } break;
6502       default: { KMP_DEBUG_ASSERT(0); } break;
6503       }
6504     }
6505     KMP_INTERNAL_FREE((void *)value);
6506 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6507     KMP_INTERNAL_FREE((void *)shm_name);
6508 #endif
6509   } // while
6510   KMP_INTERNAL_FREE((void *)name);
6511 
6512 } // func __kmp_register_library_startup
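
// Illustrative sketch (compiled out): the duplicate-library detection in
// __kmp_register_library_startup boils down to publishing the address and
// current value of a per-process flag (plus the library file name), and, when
// a stale value is found, checking whether that address is still mapped and
// still holds the same value. encode(), neighbor_alive() and is_mapped are
// hypothetical stand-ins; the runtime itself uses __kmp_str_format,
// KMP_SSCANF and __kmp_is_address_mapped.
#if 0
#include <cstdio>

static long my_flag; // per-process marker, like __kmp_registration_flag

// Producer side: roughly what __kmp_registration_str looks like.
static void encode(char *buf, size_t len, const char *file) {
  snprintf(buf, len, "%p-%lx-%s", (void *)&my_flag, (unsigned long)my_flag,
           file);
}

// Consumer side: is the process that wrote 'buf' still alive?
static bool neighbor_alive(const char *buf, bool (*is_mapped)(void *)) {
  void *addr = nullptr;
  unsigned long val = 0;
  if (sscanf(buf, "%p-%lx", &addr, &val) != 2 || addr == nullptr)
    return true; // unparsable: assume a future format and an alive neighbor
  // Alive only if the flag is still mapped and holds the published value.
  return is_mapped(addr) && *(volatile unsigned long *)addr == val;
}
#endif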
6513 
6514 void __kmp_unregister_library(void) {
6515 
6516   char *name = __kmp_reg_status_name();
6517   char *value = NULL;
6518 
6519 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6520   char *shm_name = __kmp_str_format("/%s", name);
6521   int fd1 = shm_open(shm_name, O_RDONLY, 0666);
6522   if (fd1 == -1) {
6523     // file did not open. return.
6524     return;
6525   }
6526   char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6527   if (data1 != MAP_FAILED) {
6528     value = __kmp_str_format("%s", data1); // read value from SHM
6529     munmap(data1, SHM_SIZE);
6530   }
6531   close(fd1);
6532 #else
6533   value = __kmp_env_get(name);
6534 #endif
6535 
6536   KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6537   KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6538   if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6539 //  Ok, this is our variable. Delete it.
6540 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6541     shm_unlink(shm_name); // this removes file in /dev/shm
6542 #else
6543     __kmp_env_unset(name);
6544 #endif
6545   }
6546 
6547 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6548   KMP_INTERNAL_FREE(shm_name);
6549 #endif
6550 
6551   KMP_INTERNAL_FREE(__kmp_registration_str);
6552   KMP_INTERNAL_FREE(value);
6553   KMP_INTERNAL_FREE(name);
6554 
6555   __kmp_registration_flag = 0;
6556   __kmp_registration_str = NULL;
6557 
6558 } // __kmp_unregister_library
6559 
6560 // End of Library registration stuff.
6561 // -----------------------------------------------------------------------------
6562 
6563 #if KMP_MIC_SUPPORTED
6564 
6565 static void __kmp_check_mic_type() {
6566   kmp_cpuid_t cpuid_state = {0};
6567   kmp_cpuid_t *cs_p = &cpuid_state;
6568   __kmp_x86_cpuid(1, 0, cs_p);
6569   // We don't support mic1 at the moment
6570   if ((cs_p->eax & 0xff0) == 0xB10) {
6571     __kmp_mic_type = mic2;
6572   } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6573     __kmp_mic_type = mic3;
6574   } else {
6575     __kmp_mic_type = non_mic;
6576   }
6577 }
6578 
6579 #endif /* KMP_MIC_SUPPORTED */
6580 
6581 #if KMP_HAVE_UMWAIT
6582 static void __kmp_user_level_mwait_init() {
6583   struct kmp_cpuid buf;
6584   __kmp_x86_cpuid(7, 0, &buf);
6585   __kmp_umwait_enabled = ((buf.ecx >> 5) & 1) && __kmp_user_level_mwait;
6586   KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
6587                 __kmp_umwait_enabled));
6588 }
6589 #elif KMP_HAVE_MWAIT
6590 #ifndef AT_INTELPHIUSERMWAIT
6591 // Spurious, non-existent value that should always fail to return anything.
// Will be replaced with the correct value once it is known.
6593 #define AT_INTELPHIUSERMWAIT 10000
6594 #endif
6595 // getauxval() function is available in RHEL7 and SLES12. If a system with an
6596 // earlier OS is used to build the RTL, we'll use the following internal
6597 // function when the entry is not found.
6598 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
6599 unsigned long getauxval(unsigned long) { return 0; }
6600 
6601 static void __kmp_user_level_mwait_init() {
  // When getauxval() and the correct value of AT_INTELPHIUSERMWAIT are
  // available, use them to determine whether user-level mwait is enabled.
  // Otherwise, forcibly set __kmp_mwait_enabled=TRUE on Intel MIC if the
  // environment variable KMP_USER_LEVEL_MWAIT was set to TRUE.
6606   if (__kmp_mic_type == mic3) {
6607     unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
6608     if ((res & 0x1) || __kmp_user_level_mwait) {
6609       __kmp_mwait_enabled = TRUE;
6610       if (__kmp_user_level_mwait) {
6611         KMP_INFORM(EnvMwaitWarn);
6612       }
6613     } else {
6614       __kmp_mwait_enabled = FALSE;
6615     }
6616   }
6617   KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
6618                 "__kmp_mwait_enabled = %d\n",
6619                 __kmp_mic_type, __kmp_mwait_enabled));
6620 }
6621 #endif /* KMP_HAVE_UMWAIT */
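
// Illustrative sketch (compiled out): the KMP_HAVE_UMWAIT variant of
// __kmp_user_level_mwait_init above keys off CPUID leaf 7, subleaf 0,
// ECX bit 5 (WAITPKG). A standalone check using GCC/Clang's <cpuid.h> could
// look like this; cpu_has_waitpkg is a hypothetical helper and
// __get_cpuid_count is assumed to be available on the build host.
#if 0
#include <cpuid.h>

static bool cpu_has_waitpkg() {
  unsigned eax = 0, ebx = 0, ecx = 0, edx = 0;
  // Leaf 7 / subleaf 0 reports the structured extended feature flags.
  if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
    return false; // leaf 7 not supported on this CPU
  return (ecx >> 5) & 1; // ECX bit 5 == WAITPKG (umwait/tpause support)
}
#endif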
6622 
6623 static void __kmp_do_serial_initialize(void) {
6624   int i, gtid;
6625   size_t size;
6626 
6627   KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6628 
6629   KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6630   KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6631   KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6632   KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6633   KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6634 
6635 #if OMPT_SUPPORT
6636   ompt_pre_init();
6637 #endif
6638 
6639   __kmp_validate_locks();
6640 
6641   /* Initialize internal memory allocator */
6642   __kmp_init_allocator();
6643 
6644   /* Register the library startup via an environment variable and check to see
6645      whether another copy of the library is already registered. */
6646 
6647   __kmp_register_library_startup();
6648 
6649   /* TODO reinitialization of library */
6650   if (TCR_4(__kmp_global.g.g_done)) {
6651     KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6652   }
6653 
6654   __kmp_global.g.g_abort = 0;
6655   TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6656 
6657 /* initialize the locks */
6658 #if KMP_USE_ADAPTIVE_LOCKS
6659 #if KMP_DEBUG_ADAPTIVE_LOCKS
6660   __kmp_init_speculative_stats();
6661 #endif
6662 #endif
6663 #if KMP_STATS_ENABLED
6664   __kmp_stats_init();
6665 #endif
6666   __kmp_init_lock(&__kmp_global_lock);
6667   __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6668   __kmp_init_lock(&__kmp_debug_lock);
6669   __kmp_init_atomic_lock(&__kmp_atomic_lock);
6670   __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6671   __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6672   __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6673   __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6674   __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6675   __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6676   __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6677   __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6678   __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6679   __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6680   __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6681   __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6682   __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6683   __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6684 #if KMP_USE_MONITOR
6685   __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6686 #endif
6687   __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6688 
6689   /* conduct initialization and initial setup of configuration */
6690 
6691   __kmp_runtime_initialize();
6692 
6693 #if KMP_MIC_SUPPORTED
6694   __kmp_check_mic_type();
6695 #endif
6696 
6697 // Some global variable initialization moved here from kmp_env_initialize()
6698 #ifdef KMP_DEBUG
6699   kmp_diag = 0;
6700 #endif
6701   __kmp_abort_delay = 0;
6702 
6703   // From __kmp_init_dflt_team_nth()
6704   /* assume the entire machine will be used */
6705   __kmp_dflt_team_nth_ub = __kmp_xproc;
6706   if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6707     __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6708   }
6709   if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6710     __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6711   }
6712   __kmp_max_nth = __kmp_sys_max_nth;
6713   __kmp_cg_max_nth = __kmp_sys_max_nth;
6714   __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6715   if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6716     __kmp_teams_max_nth = __kmp_sys_max_nth;
6717   }
6718 
6719   // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6720   // part
6721   __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6722 #if KMP_USE_MONITOR
6723   __kmp_monitor_wakeups =
6724       KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6725   __kmp_bt_intervals =
6726       KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6727 #endif
6728   // From "KMP_LIBRARY" part of __kmp_env_initialize()
6729   __kmp_library = library_throughput;
6730   // From KMP_SCHEDULE initialization
6731   __kmp_static = kmp_sch_static_balanced;
6732 // AC: do not use analytical here, because it is non-monotonous
6733 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6734 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6735 // need to repeat assignment
6736 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6737 // bit control and barrier method control parts
6738 #if KMP_FAST_REDUCTION_BARRIER
6739 #define kmp_reduction_barrier_gather_bb ((int)1)
6740 #define kmp_reduction_barrier_release_bb ((int)1)
6741 #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6742 #define kmp_reduction_barrier_release_pat bp_hyper_bar
6743 #endif // KMP_FAST_REDUCTION_BARRIER
6744   for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6745     __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6746     __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6747     __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6748     __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6749 #if KMP_FAST_REDUCTION_BARRIER
6750     if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
6751       // lin_64 ): hyper,1
6752       __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6753       __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6754       __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6755       __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6756     }
6757 #endif // KMP_FAST_REDUCTION_BARRIER
6758   }
6759 #if KMP_FAST_REDUCTION_BARRIER
6760 #undef kmp_reduction_barrier_release_pat
6761 #undef kmp_reduction_barrier_gather_pat
6762 #undef kmp_reduction_barrier_release_bb
6763 #undef kmp_reduction_barrier_gather_bb
6764 #endif // KMP_FAST_REDUCTION_BARRIER
6765 #if KMP_MIC_SUPPORTED
6766   if (__kmp_mic_type == mic2) { // KNC
6767     // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6768     __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6769     __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6770         1; // forkjoin release
6771     __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6772     __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6773   }
6774 #if KMP_FAST_REDUCTION_BARRIER
6775   if (__kmp_mic_type == mic2) { // KNC
6776     __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6777     __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6778   }
6779 #endif // KMP_FAST_REDUCTION_BARRIER
6780 #endif // KMP_MIC_SUPPORTED
6781 
6782 // From KMP_CHECKS initialization
6783 #ifdef KMP_DEBUG
6784   __kmp_env_checks = TRUE; /* development versions have the extra checks */
6785 #else
6786   __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6787 #endif
6788 
6789   // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6790   __kmp_foreign_tp = TRUE;
6791 
6792   __kmp_global.g.g_dynamic = FALSE;
6793   __kmp_global.g.g_dynamic_mode = dynamic_default;
6794 
6795   __kmp_env_initialize(NULL);
6796 
6797 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
6798   __kmp_user_level_mwait_init();
6799 #endif
6800 // Print all messages in message catalog for testing purposes.
6801 #ifdef KMP_DEBUG
6802   char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6803   if (__kmp_str_match_true(val)) {
6804     kmp_str_buf_t buffer;
6805     __kmp_str_buf_init(&buffer);
6806     __kmp_i18n_dump_catalog(&buffer);
6807     __kmp_printf("%s", buffer.str);
6808     __kmp_str_buf_free(&buffer);
6809   }
6810   __kmp_env_free(&val);
6811 #endif
6812 
6813   __kmp_threads_capacity =
6814       __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6815   // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6816   __kmp_tp_capacity = __kmp_default_tp_capacity(
6817       __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6818 
6819   // If the library is shut down properly, both pools must be NULL. Just in
6820   // case, set them to NULL -- some memory may leak, but subsequent code will
6821   // work even if pools are not freed.
6822   KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6823   KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6824   KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6825   __kmp_thread_pool = NULL;
6826   __kmp_thread_pool_insert_pt = NULL;
6827   __kmp_team_pool = NULL;
6828 
6829   /* Allocate all of the variable sized records */
6830   /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6831    * expandable */
6832   /* Since allocation is cache-aligned, just add extra padding at the end */
6833   size =
6834       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6835       CACHE_LINE;
6836   __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6837   __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6838                                sizeof(kmp_info_t *) * __kmp_threads_capacity);
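
  /* Layout of the combined allocation (for reference):
   *   [ kmp_info_t *threads[capacity] | kmp_root_t *roots[capacity] | pad ]
   *     ^ __kmp_threads                 ^ __kmp_root
   * Both arrays live in this one block, which is freed via __kmp_threads in
   * __kmp_cleanup(). */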
6839 
6840   /* init thread counts */
  // These asserts fail if the library is reinitializing and something went
  // wrong during termination.
  KMP_DEBUG_ASSERT(__kmp_all_nth == 0);
  KMP_DEBUG_ASSERT(__kmp_nth == 0);
6844   __kmp_all_nth = 0;
6845   __kmp_nth = 0;
6846 
6847   /* setup the uber master thread and hierarchy */
6848   gtid = __kmp_register_root(TRUE);
6849   KA_TRACE(10, ("__kmp_do_serial_initialize  T#%d\n", gtid));
6850   KMP_ASSERT(KMP_UBER_GTID(gtid));
6851   KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6852 
6853   KMP_MB(); /* Flush all pending memory write invalidates.  */
6854 
6855   __kmp_common_initialize();
6856 
6857 #if KMP_OS_UNIX
6858   /* invoke the child fork handler */
6859   __kmp_register_atfork();
6860 #endif
6861 
6862 #if !KMP_DYNAMIC_LIB
6863   {
6864     /* Invoke the exit handler when the program finishes, only for static
6865        library. For dynamic library, we already have _fini and DllMain. */
6866     int rc = atexit(__kmp_internal_end_atexit);
6867     if (rc != 0) {
6868       __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6869                   __kmp_msg_null);
6870     }
6871   }
6872 #endif
6873 
6874 #if KMP_HANDLE_SIGNALS
6875 #if KMP_OS_UNIX
  /* NOTE: make sure that this is called before the user installs their own
     signal handlers so that the user handlers are called first. This way they
     can return false, not call our handler, avoid terminating the library, and
     continue execution where they left off. */
6880   __kmp_install_signals(FALSE);
6881 #endif /* KMP_OS_UNIX */
6882 #if KMP_OS_WINDOWS
6883   __kmp_install_signals(TRUE);
6884 #endif /* KMP_OS_WINDOWS */
6885 #endif
6886 
6887   /* we have finished the serial initialization */
6888   __kmp_init_counter++;
6889 
6890   __kmp_init_serial = TRUE;
6891 
6892   if (__kmp_settings) {
6893     __kmp_env_print();
6894   }
6895 
6896   if (__kmp_display_env || __kmp_display_env_verbose) {
6897     __kmp_env_print_2();
6898   }
6899 
6900 #if OMPT_SUPPORT
6901   ompt_post_init();
6902 #endif
6903 
6904   KMP_MB();
6905 
6906   KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
6907 }
6908 
6909 void __kmp_serial_initialize(void) {
6910   if (__kmp_init_serial) {
6911     return;
6912   }
6913   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6914   if (__kmp_init_serial) {
6915     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6916     return;
6917   }
6918   __kmp_do_serial_initialize();
6919   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6920 }
6921 
6922 static void __kmp_do_middle_initialize(void) {
6923   int i, j;
6924   int prev_dflt_team_nth;
6925 
6926   if (!__kmp_init_serial) {
6927     __kmp_do_serial_initialize();
6928   }
6929 
  KA_TRACE(10, ("__kmp_do_middle_initialize: enter\n"));
6931 
6932   // Save the previous value for the __kmp_dflt_team_nth so that
6933   // we can avoid some reinitialization if it hasn't changed.
6934   prev_dflt_team_nth = __kmp_dflt_team_nth;
6935 
6936 #if KMP_AFFINITY_SUPPORTED
6937   // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6938   // number of cores on the machine.
6939   __kmp_affinity_initialize();
6940 
6941   // Run through the __kmp_threads array and set the affinity mask
6942   // for each root thread that is currently registered with the RTL.
6943   for (i = 0; i < __kmp_threads_capacity; i++) {
6944     if (TCR_PTR(__kmp_threads[i]) != NULL) {
6945       __kmp_affinity_set_init_mask(i, TRUE);
6946     }
6947   }
6948 #endif /* KMP_AFFINITY_SUPPORTED */
6949 
6950   KMP_ASSERT(__kmp_xproc > 0);
6951   if (__kmp_avail_proc == 0) {
6952     __kmp_avail_proc = __kmp_xproc;
6953   }
6954 
6955   // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
6956   // correct them now
6957   j = 0;
6958   while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
6959     __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
6960         __kmp_avail_proc;
6961     j++;
6962   }
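  // For example, with __kmp_avail_proc == 8 the loop above turns
  // OMP_NUM_THREADS=",,2,3" into the list 8,8,2,3 -- only the leading empty
  // slots are filled in.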
6963 
6964   if (__kmp_dflt_team_nth == 0) {
6965 #ifdef KMP_DFLT_NTH_CORES
6966     // Default #threads = #cores
6967     __kmp_dflt_team_nth = __kmp_ncores;
6968     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6969                   "__kmp_ncores (%d)\n",
6970                   __kmp_dflt_team_nth));
6971 #else
6972     // Default #threads = #available OS procs
6973     __kmp_dflt_team_nth = __kmp_avail_proc;
6974     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6975                   "__kmp_avail_proc(%d)\n",
6976                   __kmp_dflt_team_nth));
6977 #endif /* KMP_DFLT_NTH_CORES */
6978   }
6979 
6980   if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
6981     __kmp_dflt_team_nth = KMP_MIN_NTH;
6982   }
6983   if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
6984     __kmp_dflt_team_nth = __kmp_sys_max_nth;
6985   }
6986 
6987   // There's no harm in continuing if the following check fails,
6988   // but it indicates an error in the previous logic.
6989   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
6990 
6991   if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
6992     // Run through the __kmp_threads array and set the num threads icv for each
6993     // root thread that is currently registered with the RTL (which has not
6994     // already explicitly set its nthreads-var with a call to
6995     // omp_set_num_threads()).
6996     for (i = 0; i < __kmp_threads_capacity; i++) {
6997       kmp_info_t *thread = __kmp_threads[i];
6998       if (thread == NULL)
6999         continue;
7000       if (thread->th.th_current_task->td_icvs.nproc != 0)
7001         continue;
7002 
7003       set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7004     }
7005   }
7006   KA_TRACE(
7007       20,
7008       ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7009        __kmp_dflt_team_nth));
7010 
7011 #ifdef KMP_ADJUST_BLOCKTIME
  /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
7013   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7014     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7015     if (__kmp_nth > __kmp_avail_proc) {
7016       __kmp_zero_bt = TRUE;
7017     }
7018   }
7019 #endif /* KMP_ADJUST_BLOCKTIME */
7020 
7021   /* we have finished middle initialization */
7022   TCW_SYNC_4(__kmp_init_middle, TRUE);
7023 
7024   KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7025 }
7026 
7027 void __kmp_middle_initialize(void) {
7028   if (__kmp_init_middle) {
7029     return;
7030   }
7031   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7032   if (__kmp_init_middle) {
7033     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7034     return;
7035   }
7036   __kmp_do_middle_initialize();
7037   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7038 }
7039 
7040 void __kmp_parallel_initialize(void) {
7041   int gtid = __kmp_entry_gtid(); // this might be a new root
7042 
7043   /* synchronize parallel initialization (for sibling) */
7044   if (TCR_4(__kmp_init_parallel))
7045     return;
7046   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7047   if (TCR_4(__kmp_init_parallel)) {
7048     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7049     return;
7050   }
7051 
7052   /* TODO reinitialization after we have already shut down */
7053   if (TCR_4(__kmp_global.g.g_done)) {
7054     KA_TRACE(
7055         10,
7056         ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7057     __kmp_infinite_loop();
7058   }
7059 
  /* jc: The lock __kmp_initz_lock is already held, so calling
     __kmp_middle_initialize (or __kmp_serial_initialize) here would cause a
     deadlock. So we call the __kmp_do_*_initialize routines directly. */
7063   if (!__kmp_init_middle) {
7064     __kmp_do_middle_initialize();
7065   }
7066   __kmp_resume_if_hard_paused();
7067 
7068   /* begin initialization */
7069   KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7070   KMP_ASSERT(KMP_UBER_GTID(gtid));
7071 
7072 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7073   // Save the FP control regs.
7074   // Worker threads will set theirs to these values at thread startup.
7075   __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7076   __kmp_store_mxcsr(&__kmp_init_mxcsr);
7077   __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7078 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7079 
7080 #if KMP_OS_UNIX
7081 #if KMP_HANDLE_SIGNALS
  /* must be after __kmp_serial_initialize */
7083   __kmp_install_signals(TRUE);
7084 #endif
7085 #endif
7086 
7087   __kmp_suspend_initialize();
7088 
7089 #if defined(USE_LOAD_BALANCE)
7090   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7091     __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7092   }
7093 #else
7094   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7095     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7096   }
7097 #endif
7098 
7099   if (__kmp_version) {
7100     __kmp_print_version_2();
7101   }
7102 
7103   /* we have finished parallel initialization */
7104   TCW_SYNC_4(__kmp_init_parallel, TRUE);
7105 
7106   KMP_MB();
7107   KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7108 
7109   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7110 }
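
/* Initialization is staged: __kmp_serial_initialize() runs once per process,
   __kmp_middle_initialize() adds the topology/affinity-dependent defaults, and
   __kmp_parallel_initialize() finishes setup when the first parallel region is
   forked. Each stage takes __kmp_initz_lock and re-checks its "init" flag, so
   concurrent callers are safe. */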
7111 
7112 /* ------------------------------------------------------------------------ */
7113 
7114 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7115                                    kmp_team_t *team) {
7116   kmp_disp_t *dispatch;
7117 
7118   KMP_MB();
7119 
  /* None of the threads have encountered any constructs yet. */
7121   this_thr->th.th_local.this_construct = 0;
7122 #if KMP_CACHE_MANAGE
7123   KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7124 #endif /* KMP_CACHE_MANAGE */
7125   dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7126   KMP_DEBUG_ASSERT(dispatch);
7127   KMP_DEBUG_ASSERT(team->t.t_dispatch);
7128   // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7129   // this_thr->th.th_info.ds.ds_tid ] );
7130 
7131   dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7132   dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7133   if (__kmp_env_consistency_check)
7134     __kmp_push_parallel(gtid, team->t.t_ident);
7135 
7136   KMP_MB(); /* Flush all pending memory write invalidates.  */
7137 }
7138 
7139 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7140                                   kmp_team_t *team) {
7141   if (__kmp_env_consistency_check)
7142     __kmp_pop_parallel(gtid, team->t.t_ident);
7143 
7144   __kmp_finish_implicit_task(this_thr);
7145 }
7146 
7147 int __kmp_invoke_task_func(int gtid) {
7148   int rc;
7149   int tid = __kmp_tid_from_gtid(gtid);
7150   kmp_info_t *this_thr = __kmp_threads[gtid];
7151   kmp_team_t *team = this_thr->th.th_team;
7152 
7153   __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7154 #if USE_ITT_BUILD
7155   if (__itt_stack_caller_create_ptr) {
7156     __kmp_itt_stack_callee_enter(
7157         (__itt_caller)
7158             team->t.t_stack_id); // inform ittnotify about entering user's code
7159   }
7160 #endif /* USE_ITT_BUILD */
7161 #if INCLUDE_SSC_MARKS
7162   SSC_MARK_INVOKING();
7163 #endif
7164 
7165 #if OMPT_SUPPORT
7166   void *dummy;
7167   void **exit_frame_p;
7168   ompt_data_t *my_task_data;
7169   ompt_data_t *my_parallel_data;
7170   int ompt_team_size;
7171 
7172   if (ompt_enabled.enabled) {
7173     exit_frame_p = &(
7174         team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_frame.ptr);
7175   } else {
7176     exit_frame_p = &dummy;
7177   }
7178 
7179   my_task_data =
7180       &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7181   my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7182   if (ompt_enabled.ompt_callback_implicit_task) {
7183     ompt_team_size = team->t.t_nproc;
7184     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7185         ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7186         __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7187     OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7188   }
7189 #endif
7190 
7191 #if KMP_STATS_ENABLED
7192   stats_state_e previous_state = KMP_GET_THREAD_STATE();
7193   if (previous_state == stats_state_e::TEAMS_REGION) {
7194     KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7195   } else {
7196     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7197   }
7198   KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7199 #endif
7200 
7201   rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7202                               tid, (int)team->t.t_argc, (void **)team->t.t_argv
7203 #if OMPT_SUPPORT
7204                               ,
7205                               exit_frame_p
7206 #endif
7207                               );
7208 #if OMPT_SUPPORT
7209   *exit_frame_p = NULL;
  this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7211 #endif
7212 
7213 #if KMP_STATS_ENABLED
7214   if (previous_state == stats_state_e::TEAMS_REGION) {
7215     KMP_SET_THREAD_STATE(previous_state);
7216   }
7217   KMP_POP_PARTITIONED_TIMER();
7218 #endif
7219 
7220 #if USE_ITT_BUILD
7221   if (__itt_stack_caller_create_ptr) {
7222     __kmp_itt_stack_callee_leave(
7223         (__itt_caller)
7224             team->t.t_stack_id); // inform ittnotify about leaving user's code
7225   }
7226 #endif /* USE_ITT_BUILD */
7227   __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7228 
7229   return rc;
7230 }
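
// __kmp_invoke_task_func is the launch routine handed to __kmp_fork_call()
// (see the VOLATILE_CAST(launch_t) use in __kmp_teams_master below); each
// thread in the team runs it to execute the outlined microtask.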
7231 
7232 void __kmp_teams_master(int gtid) {
  // This routine is called by all master threads in the teams construct
7234   kmp_info_t *thr = __kmp_threads[gtid];
7235   kmp_team_t *team = thr->th.th_team;
7236   ident_t *loc = team->t.t_ident;
7237   thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7238   KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7239   KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7240   KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7241                 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7242 
7243   // This thread is a new CG root.  Set up the proper variables.
7244   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7245   tmp->cg_root = thr; // Make thr the CG root
7246   // Init to thread limit that was stored when league masters were forked
7247   tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7248   tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7249   KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7250                  " cg_nthreads to 1\n",
7251                  thr, tmp));
7252   tmp->up = thr->th.th_cg_roots;
7253   thr->th.th_cg_roots = tmp;
7254 
// Launch the league of teams now, but do not let workers execute
// (they hang on the fork barrier until the next parallel region)
7257 #if INCLUDE_SSC_MARKS
7258   SSC_MARK_FORKING();
7259 #endif
7260   __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7261                   (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7262                   VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7263 #if INCLUDE_SSC_MARKS
7264   SSC_MARK_JOINING();
7265 #endif
7266   // If the team size was reduced from the limit, set it to the new size
7267   if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7268     thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7269   // AC: last parameter "1" eliminates join barrier which won't work because
7270   // worker threads are in a fork barrier waiting for more parallel regions
7271   __kmp_join_call(loc, gtid
7272 #if OMPT_SUPPORT
7273                   ,
7274                   fork_context_intel
7275 #endif
7276                   ,
7277                   1);
7278 }
7279 
7280 int __kmp_invoke_teams_master(int gtid) {
7281   kmp_info_t *this_thr = __kmp_threads[gtid];
7282   kmp_team_t *team = this_thr->th.th_team;
7283 #if KMP_DEBUG
7284   if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7285     KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7286                      (void *)__kmp_teams_master);
7287 #endif
7288   __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7289 #if OMPT_SUPPORT
7290   int tid = __kmp_tid_from_gtid(gtid);
7291   ompt_data_t *task_data =
7292       &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7293   ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7294   if (ompt_enabled.ompt_callback_implicit_task) {
7295     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7296         ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7297         ompt_task_initial);
7298     OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7299   }
7300 #endif
7301   __kmp_teams_master(gtid);
7302 #if OMPT_SUPPORT
7303   this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7304 #endif
7305   __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7306   return 1;
7307 }
7308 
/* This sets the requested number of threads for the next parallel region
   encountered by this team. Since this should be enclosed in the forkjoin
   critical section, it should avoid race conditions with asymmetrical nested
   parallelism. */
7313 
7314 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7315   kmp_info_t *thr = __kmp_threads[gtid];
7316 
7317   if (num_threads > 0)
7318     thr->th.th_set_nproc = num_threads;
7319 }
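
// A minimal sketch of how this is typically reached (assuming the usual
// compiler lowering through __kmpc_push_num_threads in kmp_csupport.cpp):
//
//   #pragma omp parallel num_threads(4)
//   // roughly becomes:
//   //   __kmpc_push_num_threads(&loc, gtid, 4);
//   //   __kmpc_fork_call(&loc, ...);
//
// th_set_nproc is a one-shot request that the next fork consumes.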
7320 
7321 /* this sets the requested number of teams for the teams region and/or
7322    the number of threads for the next parallel region encountered  */
7323 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7324                           int num_threads) {
7325   kmp_info_t *thr = __kmp_threads[gtid];
7326   KMP_DEBUG_ASSERT(num_teams >= 0);
7327   KMP_DEBUG_ASSERT(num_threads >= 0);
7328 
7329   if (num_teams == 0)
7330     num_teams = 1; // default number of teams is 1.
  if (num_teams > __kmp_teams_max_nth) { // were too many teams requested?
7332     if (!__kmp_reserve_warn) {
7333       __kmp_reserve_warn = 1;
7334       __kmp_msg(kmp_ms_warning,
7335                 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7336                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7337     }
7338     num_teams = __kmp_teams_max_nth;
7339   }
7340   // Set number of teams (number of threads in the outer "parallel" of the
7341   // teams)
7342   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7343 
7344   // Remember the number of threads for inner parallel regions
7345   if (!TCR_4(__kmp_init_middle))
7346     __kmp_middle_initialize(); // get internal globals calculated
7347   KMP_DEBUG_ASSERT(__kmp_avail_proc);
7348   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7349   if (num_threads == 0) {
7350     num_threads = __kmp_avail_proc / num_teams;
7351     // adjust num_threads w/o warning as it is not user setting
7352     // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7353     // no thread_limit clause specified -  do not change thread-limit-var ICV
7354     if (num_threads > __kmp_dflt_team_nth) {
7355       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7356     }
7357     if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7358       num_threads = thr->th.th_current_task->td_icvs.thread_limit;
    } // prevent team size from exceeding thread-limit-var
7360     if (num_teams * num_threads > __kmp_teams_max_nth) {
7361       num_threads = __kmp_teams_max_nth / num_teams;
7362     }
7363   } else {
7364     // This thread will be the master of the league masters
7365     // Store new thread limit; old limit is saved in th_cg_roots list
7366     thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7367     // num_threads = min(num_threads, nthreads-var)
7368     if (num_threads > __kmp_dflt_team_nth) {
7369       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7370     }
7371     if (num_teams * num_threads > __kmp_teams_max_nth) {
7372       int new_threads = __kmp_teams_max_nth / num_teams;
7373       if (!__kmp_reserve_warn) { // user asked for too many threads
7374         __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7375         __kmp_msg(kmp_ms_warning,
7376                   KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7377                   KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7378       }
7379       num_threads = new_threads;
7380     }
7381   }
7382   thr->th.th_teams_size.nth = num_threads;
7383 }
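
// Worked example (illustrative values): with __kmp_avail_proc = 16,
// nthreads-var = 16, thread-limit-var = 16, __kmp_teams_max_nth = 32 and
// num_teams(4) with no thread_limit clause, num_threads above starts at
// 16 / 4 = 4, survives every clamp, and th_teams_size ends up as
// {nteams = 4, nth = 4}.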
7384 
7385 // Set the proc_bind var to use in the following parallel region.
7386 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7387   kmp_info_t *thr = __kmp_threads[gtid];
7388   thr->th.th_set_proc_bind = proc_bind;
7389 }
7390 
7391 /* Launch the worker threads into the microtask. */
7392 
7393 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7394   kmp_info_t *this_thr = __kmp_threads[gtid];
7395 
7396 #ifdef KMP_DEBUG
7397   int f;
7398 #endif /* KMP_DEBUG */
7399 
7400   KMP_DEBUG_ASSERT(team);
7401   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7402   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7403   KMP_MB(); /* Flush all pending memory write invalidates.  */
7404 
7405   team->t.t_construct = 0; /* no single directives seen yet */
7406   team->t.t_ordered.dt.t_value =
7407       0; /* thread 0 enters the ordered section first */
7408 
7409   /* Reset the identifiers on the dispatch buffer */
7410   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7411   if (team->t.t_max_nproc > 1) {
7412     int i;
7413     for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7414       team->t.t_disp_buffer[i].buffer_index = i;
7415       team->t.t_disp_buffer[i].doacross_buf_idx = i;
7416     }
7417   } else {
7418     team->t.t_disp_buffer[0].buffer_index = 0;
7419     team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7420   }
7421 
7422   KMP_MB(); /* Flush all pending memory write invalidates.  */
7423   KMP_ASSERT(this_thr->th.th_team == team);
7424 
7425 #ifdef KMP_DEBUG
7426   for (f = 0; f < team->t.t_nproc; f++) {
7427     KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7428                      team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7429   }
7430 #endif /* KMP_DEBUG */
7431 
7432   /* release the worker threads so they may begin working */
7433   __kmp_fork_barrier(gtid, 0);
7434 }
7435 
7436 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7437   kmp_info_t *this_thr = __kmp_threads[gtid];
7438 
7439   KMP_DEBUG_ASSERT(team);
7440   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7441   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7442   KMP_MB(); /* Flush all pending memory write invalidates.  */
7443 
7444 /* Join barrier after fork */
7445 
7446 #ifdef KMP_DEBUG
7447   if (__kmp_threads[gtid] &&
7448       __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7449     __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7450                  __kmp_threads[gtid]);
7451     __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7452                  "team->t.t_nproc=%d\n",
7453                  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7454                  team->t.t_nproc);
7455     __kmp_print_structure();
7456   }
7457   KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7458                    __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7459 #endif /* KMP_DEBUG */
7460 
7461   __kmp_join_barrier(gtid); /* wait for everyone */
7462 #if OMPT_SUPPORT
7463   if (ompt_enabled.enabled &&
7464       this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7465     int ds_tid = this_thr->th.th_info.ds.ds_tid;
7466     ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7467     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7468 #if OMPT_OPTIONAL
7469     void *codeptr = NULL;
7470     if (KMP_MASTER_TID(ds_tid) &&
7471         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7472          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7473       codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7474 
7475     if (ompt_enabled.ompt_callback_sync_region_wait) {
7476       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7477           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7478           codeptr);
7479     }
7480     if (ompt_enabled.ompt_callback_sync_region) {
7481       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7482           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7483           codeptr);
7484     }
7485 #endif
7486     if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7487       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7488           ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7489     }
7490   }
7491 #endif
7492 
7493   KMP_MB(); /* Flush all pending memory write invalidates.  */
7494   KMP_ASSERT(this_thr->th.th_team == team);
7495 }
7496 
7497 /* ------------------------------------------------------------------------ */
7498 
7499 #ifdef USE_LOAD_BALANCE
7500 
// Return the number of worker threads actively spinning in the hot team, if
// we are at the outermost level of parallelism.  Otherwise, return 0.
7503 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7504   int i;
7505   int retval;
7506   kmp_team_t *hot_team;
7507 
7508   if (root->r.r_active) {
7509     return 0;
7510   }
7511   hot_team = root->r.r_hot_team;
7512   if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7513     return hot_team->t.t_nproc - 1; // Don't count master thread
7514   }
7515 
7516   // Skip the master thread - it is accounted for elsewhere.
7517   retval = 0;
7518   for (i = 1; i < hot_team->t.t_nproc; i++) {
7519     if (hot_team->t.t_threads[i]->th.th_active) {
7520       retval++;
7521     }
7522   }
7523   return retval;
7524 }
7525 
7526 // Perform an automatic adjustment to the number of
7527 // threads used by the next parallel region.
7528 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7529   int retval;
7530   int pool_active;
7531   int hot_team_active;
7532   int team_curr_active;
7533   int system_active;
7534 
7535   KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7536                 set_nproc));
7537   KMP_DEBUG_ASSERT(root);
7538   KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7539                        ->th.th_current_task->td_icvs.dynamic == TRUE);
7540   KMP_DEBUG_ASSERT(set_nproc > 1);
7541 
7542   if (set_nproc == 1) {
7543     KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7544     return 1;
7545   }
7546 
7547   // Threads that are active in the thread pool, active in the hot team for this
7548   // particular root (if we are at the outer par level), and the currently
7549   // executing thread (to become the master) are available to add to the new
7550   // team, but are currently contributing to the system load, and must be
7551   // accounted for.
7552   pool_active = __kmp_thread_pool_active_nth;
7553   hot_team_active = __kmp_active_hot_team_nproc(root);
7554   team_curr_active = pool_active + hot_team_active + 1;
7555 
7556   // Check the system load.
7557   system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7558   KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7559                 "hot team active = %d\n",
7560                 system_active, pool_active, hot_team_active));
7561 
7562   if (system_active < 0) {
7563     // There was an error reading the necessary info from /proc, so use the
7564     // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7565     // = dynamic_thread_limit, we shouldn't wind up getting back here.
7566     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7567     KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7568 
7569     // Make this call behave like the thread limit algorithm.
7570     retval = __kmp_avail_proc - __kmp_nth +
7571              (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7572     if (retval > set_nproc) {
7573       retval = set_nproc;
7574     }
7575     if (retval < KMP_MIN_NTH) {
7576       retval = KMP_MIN_NTH;
7577     }
7578 
7579     KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7580                   retval));
7581     return retval;
7582   }
7583 
  // There is a slight delay in the load balance algorithm in detecting new
  // running procs. The real system load at this instant should be at least as
  // large as the number of active OMP threads available to add to the team.
7587   if (system_active < team_curr_active) {
7588     system_active = team_curr_active;
7589   }
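  // Illustrative numbers: with __kmp_avail_proc = 8, pool_active = 2 and
  // hot_team_active = 1 (so team_curr_active = 4), a measured system_active
  // of 10 makes the formula below yield 8 - 10 + 4 = 2 threads, which is then
  // clamped to the range [KMP_MIN_NTH, set_nproc].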
7590   retval = __kmp_avail_proc - system_active + team_curr_active;
7591   if (retval > set_nproc) {
7592     retval = set_nproc;
7593   }
7594   if (retval < KMP_MIN_NTH) {
7595     retval = KMP_MIN_NTH;
7596   }
7597 
7598   KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7599   return retval;
7600 } // __kmp_load_balance_nproc()
7601 
7602 #endif /* USE_LOAD_BALANCE */
7603 
7604 /* ------------------------------------------------------------------------ */
7605 
7606 /* NOTE: this is called with the __kmp_init_lock held */
7607 void __kmp_cleanup(void) {
7608   int f;
7609 
7610   KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7611 
7612   if (TCR_4(__kmp_init_parallel)) {
7613 #if KMP_HANDLE_SIGNALS
7614     __kmp_remove_signals();
7615 #endif
7616     TCW_4(__kmp_init_parallel, FALSE);
7617   }
7618 
7619   if (TCR_4(__kmp_init_middle)) {
7620 #if KMP_AFFINITY_SUPPORTED
7621     __kmp_affinity_uninitialize();
7622 #endif /* KMP_AFFINITY_SUPPORTED */
7623     __kmp_cleanup_hierarchy();
7624     TCW_4(__kmp_init_middle, FALSE);
7625   }
7626 
7627   KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7628 
7629   if (__kmp_init_serial) {
7630     __kmp_runtime_destroy();
7631     __kmp_init_serial = FALSE;
7632   }
7633 
7634   __kmp_cleanup_threadprivate_caches();
7635 
7636   for (f = 0; f < __kmp_threads_capacity; f++) {
7637     if (__kmp_root[f] != NULL) {
7638       __kmp_free(__kmp_root[f]);
7639       __kmp_root[f] = NULL;
7640     }
7641   }
7642   __kmp_free(__kmp_threads);
  // __kmp_threads and __kmp_root were allocated at once, as a single block,
  // so there is no need to free __kmp_root separately.
7645   __kmp_threads = NULL;
7646   __kmp_root = NULL;
7647   __kmp_threads_capacity = 0;
7648 
7649 #if KMP_USE_DYNAMIC_LOCK
7650   __kmp_cleanup_indirect_user_locks();
7651 #else
7652   __kmp_cleanup_user_locks();
7653 #endif
7654 
7655 #if KMP_AFFINITY_SUPPORTED
7656   KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7657   __kmp_cpuinfo_file = NULL;
7658 #endif /* KMP_AFFINITY_SUPPORTED */
7659 
7660 #if KMP_USE_ADAPTIVE_LOCKS
7661 #if KMP_DEBUG_ADAPTIVE_LOCKS
7662   __kmp_print_speculative_stats();
7663 #endif
7664 #endif
7665   KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7666   __kmp_nested_nth.nth = NULL;
7667   __kmp_nested_nth.size = 0;
7668   __kmp_nested_nth.used = 0;
7669   KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7670   __kmp_nested_proc_bind.bind_types = NULL;
7671   __kmp_nested_proc_bind.size = 0;
7672   __kmp_nested_proc_bind.used = 0;
7673   if (__kmp_affinity_format) {
7674     KMP_INTERNAL_FREE(__kmp_affinity_format);
7675     __kmp_affinity_format = NULL;
7676   }
7677 
7678   __kmp_i18n_catclose();
7679 
7680 #if KMP_USE_HIER_SCHED
7681   __kmp_hier_scheds.deallocate();
7682 #endif
7683 
7684 #if KMP_STATS_ENABLED
7685   __kmp_stats_fini();
7686 #endif
7687 
7688   KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7689 }
7690 
7691 /* ------------------------------------------------------------------------ */
7692 
7693 int __kmp_ignore_mppbeg(void) {
7694   char *env;
7695 
7696   if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7697     if (__kmp_str_match_false(env))
7698       return FALSE;
7699   }
7700   // By default __kmpc_begin() is no-op.
7701   return TRUE;
7702 }
7703 
7704 int __kmp_ignore_mppend(void) {
7705   char *env;
7706 
7707   if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7708     if (__kmp_str_match_false(env))
7709       return FALSE;
7710   }
7711   // By default __kmpc_end() is no-op.
7712   return TRUE;
7713 }
7714 
7715 void __kmp_internal_begin(void) {
7716   int gtid;
7717   kmp_root_t *root;
7718 
  /* This is a very important step as it will register new sibling threads
     and assign these new uber threads a new gtid. */
7721   gtid = __kmp_entry_gtid();
7722   root = __kmp_threads[gtid]->th.th_root;
7723   KMP_ASSERT(KMP_UBER_GTID(gtid));
7724 
7725   if (root->r.r_begin)
7726     return;
7727   __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7728   if (root->r.r_begin) {
7729     __kmp_release_lock(&root->r.r_begin_lock, gtid);
7730     return;
7731   }
7732 
7733   root->r.r_begin = TRUE;
7734 
7735   __kmp_release_lock(&root->r.r_begin_lock, gtid);
7736 }
7737 
7738 /* ------------------------------------------------------------------------ */
7739 
7740 void __kmp_user_set_library(enum library_type arg) {
7741   int gtid;
7742   kmp_root_t *root;
7743   kmp_info_t *thread;
7744 
7745   /* first, make sure we are initialized so we can get our gtid */
7746 
7747   gtid = __kmp_entry_gtid();
7748   thread = __kmp_threads[gtid];
7749 
7750   root = thread->th.th_root;
7751 
7752   KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7753                 library_serial));
7754   if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7755                                   thread */
7756     KMP_WARNING(SetLibraryIncorrectCall);
7757     return;
7758   }
7759 
7760   switch (arg) {
7761   case library_serial:
7762     thread->th.th_set_nproc = 0;
7763     set__nproc(thread, 1);
7764     break;
7765   case library_turnaround:
7766     thread->th.th_set_nproc = 0;
7767     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7768                                            : __kmp_dflt_team_nth_ub);
7769     break;
7770   case library_throughput:
7771     thread->th.th_set_nproc = 0;
7772     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7773                                            : __kmp_dflt_team_nth_ub);
7774     break;
7775   default:
7776     KMP_FATAL(UnknownLibraryType, arg);
7777   }
7778 
7779   __kmp_aux_set_library(arg);
7780 }
7781 
7782 void __kmp_aux_set_stacksize(size_t arg) {
7783   if (!__kmp_init_serial)
7784     __kmp_serial_initialize();
7785 
7786 #if KMP_OS_DARWIN
7787   if (arg & (0x1000 - 1)) {
7788     arg &= ~(0x1000 - 1);
7789     if (arg + 0x1000) /* check for overflow if we round up */
7790       arg += 0x1000;
7791   }
7792 #endif
7793   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7794 
7795   /* only change the default stacksize before the first parallel region */
7796   if (!TCR_4(__kmp_init_parallel)) {
7797     size_t value = arg; /* argument is in bytes */
7798 
7799     if (value < __kmp_sys_min_stksize)
7800       value = __kmp_sys_min_stksize;
7801     else if (value > KMP_MAX_STKSIZE)
7802       value = KMP_MAX_STKSIZE;
7803 
7804     __kmp_stksize = value;
7805 
7806     __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
7807   }
7808 
7809   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7810 }
7811 
7812 /* set the behaviour of the runtime library */
7813 /* TODO this can cause some odd behaviour with sibling parallelism... */
7814 void __kmp_aux_set_library(enum library_type arg) {
7815   __kmp_library = arg;
7816 
7817   switch (__kmp_library) {
7818   case library_serial: {
7819     KMP_INFORM(LibraryIsSerial);
7820   } break;
7821   case library_turnaround:
7822     if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
7823       __kmp_use_yield = 2; // only yield when oversubscribed
7824     break;
7825   case library_throughput:
7826     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
7827       __kmp_dflt_blocktime = 200;
7828     break;
7829   default:
7830     KMP_FATAL(UnknownLibraryType, arg);
7831   }
7832 }
7833 
/* Get team information common to all teams-construct API routines */
// Returns NULL if not in a teams construct
7836 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
7837   kmp_info_t *thr = __kmp_entry_thread();
7838   teams_serialized = 0;
7839   if (thr->th.th_teams_microtask) {
7840     kmp_team_t *team = thr->th.th_team;
7841     int tlevel = thr->th.th_teams_level; // the level of the teams construct
7842     int ii = team->t.t_level;
7843     teams_serialized = team->t.t_serialized;
7844     int level = tlevel + 1;
7845     KMP_DEBUG_ASSERT(ii >= tlevel);
7846     while (ii > level) {
7847       for (teams_serialized = team->t.t_serialized;
7848            (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
7849       }
7850       if (team->t.t_serialized && (!teams_serialized)) {
7851         team = team->t.t_parent;
7852         continue;
7853       }
7854       if (ii > level) {
7855         team = team->t.t_parent;
7856         ii--;
7857       }
7858     }
7859     return team;
7860   }
7861   return NULL;
7862 }
7863 
7864 int __kmp_aux_get_team_num() {
7865   int serialized;
7866   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7867   if (team) {
7868     if (serialized > 1) {
7869       return 0; // teams region is serialized ( 1 team of 1 thread ).
7870     } else {
7871       return team->t.t_master_tid;
7872     }
7873   }
7874   return 0;
7875 }
7876 
7877 int __kmp_aux_get_num_teams() {
7878   int serialized;
7879   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7880   if (team) {
7881     if (serialized > 1) {
7882       return 1;
7883     } else {
7884       return team->t.t_parent->t.t_nproc;
7885     }
7886   }
7887   return 1;
7888 }
7889 
7890 /* ------------------------------------------------------------------------ */
7891 
7892 /*
7893  * Affinity Format Parser
7894  *
7895  * Field is in form of: %[[[0].]size]type
7896  * % and type are required (%% means print a literal '%')
7897  * type is either single char or long name surrounded by {},
7898  * e.g., N or {num_threads}
7899  * 0 => leading zeros
7900  * . => right justified when size is specified
7901  * by default output is left justified
7902  * size is the *minimum* field length
7903  * All other characters are printed as is
7904  *
 * Available field types (see __kmp_affinity_format_table below):
 * t {team_num}          - omp_get_team_num()
 * T {num_teams}         - omp_get_num_teams()
 * L {nesting_level}     - omp_get_level()
 * n {thread_num}        - omp_get_thread_num()
 * N {num_threads}       - omp_get_num_threads()
 * a {ancestor_tnum}     - omp_get_ancestor_thread_num(omp_get_level()-1)
 * H {host}              - name of host machine
 * P {process_id}        - process id (integer)
 * i {native_thread_id}  - native thread identifier (integer)
 * A {thread_affinity}   - comma separated list of integers or integer ranges
 *                         (values of affinity mask)
7915  *
7916  * Implementation-specific field types can be added
7917  * If a type is unknown, print "undefined"
7918 */
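
/* Example (illustrative): with the format string "host=%H tid=%0.4n", a
   thread whose omp_get_thread_num() is 3 would print something like
   "host=mybox tid=0003"; the long-name form "%{thread_num}" is equivalent to
   "%n". */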
7919 
7920 // Structure holding the short name, long name, and corresponding data type
7921 // for snprintf.  A table of these will represent the entire valid keyword
7922 // field types.
7923 typedef struct kmp_affinity_format_field_t {
7924   char short_name; // from spec e.g., L -> thread level
7925   const char *long_name; // from spec thread_level -> thread level
7926   char field_format; // data type for snprintf (typically 'd' or 's'
7927   // for integer or string)
7928 } kmp_affinity_format_field_t;
7929 
7930 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
7931 #if KMP_AFFINITY_SUPPORTED
7932     {'A', "thread_affinity", 's'},
7933 #endif
7934     {'t', "team_num", 'd'},
7935     {'T', "num_teams", 'd'},
7936     {'L', "nesting_level", 'd'},
7937     {'n', "thread_num", 'd'},
7938     {'N', "num_threads", 'd'},
7939     {'a', "ancestor_tnum", 'd'},
7940     {'H', "host", 's'},
7941     {'P', "process_id", 'd'},
7942     {'i', "native_thread_id", 'd'}};
7943 
// Return the number of characters it takes to hold the field
7945 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
7946                                             const char **ptr,
7947                                             kmp_str_buf_t *field_buffer) {
7948   int rc, format_index, field_value;
7949   const char *width_left, *width_right;
7950   bool pad_zeros, right_justify, parse_long_name, found_valid_name;
7951   static const int FORMAT_SIZE = 20;
7952   char format[FORMAT_SIZE] = {0};
7953   char absolute_short_name = 0;
7954 
7955   KMP_DEBUG_ASSERT(gtid >= 0);
7956   KMP_DEBUG_ASSERT(th);
7957   KMP_DEBUG_ASSERT(**ptr == '%');
7958   KMP_DEBUG_ASSERT(field_buffer);
7959 
7960   __kmp_str_buf_clear(field_buffer);
7961 
7962   // Skip the initial %
7963   (*ptr)++;
7964 
7965   // Check for %% first
7966   if (**ptr == '%') {
7967     __kmp_str_buf_cat(field_buffer, "%", 1);
7968     (*ptr)++; // skip over the second %
7969     return 1;
7970   }
7971 
7972   // Parse field modifiers if they are present
7973   pad_zeros = false;
7974   if (**ptr == '0') {
7975     pad_zeros = true;
7976     (*ptr)++; // skip over 0
7977   }
7978   right_justify = false;
7979   if (**ptr == '.') {
7980     right_justify = true;
7981     (*ptr)++; // skip over .
7982   }
7983   // Parse width of field: [width_left, width_right)
7984   width_left = width_right = NULL;
7985   if (**ptr >= '0' && **ptr <= '9') {
7986     width_left = *ptr;
7987     SKIP_DIGITS(*ptr);
7988     width_right = *ptr;
7989   }
7990 
7991   // Create the format for KMP_SNPRINTF based on flags parsed above
7992   format_index = 0;
7993   format[format_index++] = '%';
7994   if (!right_justify)
7995     format[format_index++] = '-';
7996   if (pad_zeros)
7997     format[format_index++] = '0';
7998   if (width_left && width_right) {
7999     int i = 0;
8000     // Only allow 8 digit number widths.
8001     // This also prevents overflowing format variable
8002     while (i < 8 && width_left < width_right) {
8003       format[format_index++] = *width_left;
8004       width_left++;
8005       i++;
8006     }
8007   }
8008 
8009   // Parse a name (long or short)
8010   // Canonicalize the name into absolute_short_name
8011   found_valid_name = false;
8012   parse_long_name = (**ptr == '{');
8013   if (parse_long_name)
8014     (*ptr)++; // skip initial left brace
8015   for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8016                              sizeof(__kmp_affinity_format_table[0]);
8017        ++i) {
8018     char short_name = __kmp_affinity_format_table[i].short_name;
8019     const char *long_name = __kmp_affinity_format_table[i].long_name;
8020     char field_format = __kmp_affinity_format_table[i].field_format;
8021     if (parse_long_name) {
8022       size_t length = KMP_STRLEN(long_name);
8023       if (strncmp(*ptr, long_name, length) == 0) {
8024         found_valid_name = true;
8025         (*ptr) += length; // skip the long name
8026       }
8027     } else if (**ptr == short_name) {
8028       found_valid_name = true;
8029       (*ptr)++; // skip the short name
8030     }
8031     if (found_valid_name) {
8032       format[format_index++] = field_format;
8033       format[format_index++] = '\0';
8034       absolute_short_name = short_name;
8035       break;
8036     }
8037   }
8038   if (parse_long_name) {
8039     if (**ptr != '}') {
8040       absolute_short_name = 0;
8041     } else {
8042       (*ptr)++; // skip over the right brace
8043     }
8044   }
8045 
8046   // Attempt to fill the buffer with the requested
8047   // value using snprintf within __kmp_str_buf_print()
8048   switch (absolute_short_name) {
8049   case 't':
8050     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8051     break;
8052   case 'T':
8053     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8054     break;
8055   case 'L':
8056     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8057     break;
8058   case 'n':
8059     rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8060     break;
8061   case 'H': {
8062     static const int BUFFER_SIZE = 256;
8063     char buf[BUFFER_SIZE];
8064     __kmp_expand_host_name(buf, BUFFER_SIZE);
8065     rc = __kmp_str_buf_print(field_buffer, format, buf);
8066   } break;
8067   case 'P':
8068     rc = __kmp_str_buf_print(field_buffer, format, getpid());
8069     break;
8070   case 'i':
8071     rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8072     break;
8073   case 'N':
8074     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8075     break;
8076   case 'a':
8077     field_value =
8078         __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8079     rc = __kmp_str_buf_print(field_buffer, format, field_value);
8080     break;
8081 #if KMP_AFFINITY_SUPPORTED
8082   case 'A': {
8083     kmp_str_buf_t buf;
8084     __kmp_str_buf_init(&buf);
8085     __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8086     rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8087     __kmp_str_buf_free(&buf);
8088   } break;
8089 #endif
8090   default:
    // According to the spec, if an implementation does not have info for the
    // field type, then "undefined" is printed
8093     rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8094     // Skip the field
8095     if (parse_long_name) {
8096       SKIP_TOKEN(*ptr);
8097       if (**ptr == '}')
8098         (*ptr)++;
8099     } else {
8100       (*ptr)++;
8101     }
8102   }
8103 
8104   KMP_ASSERT(format_index <= FORMAT_SIZE);
8105   return rc;
8106 }
8107 
8108 /*
8109  * Return number of characters needed to hold the affinity string
8110  * (not including null byte character)
8111  * The resultant string is printed to buffer, which the caller can then
8112  * handle afterwards
8113 */
8114 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8115                                   kmp_str_buf_t *buffer) {
8116   const char *parse_ptr;
8117   size_t retval;
8118   const kmp_info_t *th;
8119   kmp_str_buf_t field;
8120 
8121   KMP_DEBUG_ASSERT(buffer);
8122   KMP_DEBUG_ASSERT(gtid >= 0);
8123 
8124   __kmp_str_buf_init(&field);
8125   __kmp_str_buf_clear(buffer);
8126 
8127   th = __kmp_threads[gtid];
8128   retval = 0;
8129 
8130   // If format is NULL or zero-length string, then we use
8131   // affinity-format-var ICV
8132   parse_ptr = format;
8133   if (parse_ptr == NULL || *parse_ptr == '\0') {
8134     parse_ptr = __kmp_affinity_format;
8135   }
8136   KMP_DEBUG_ASSERT(parse_ptr);
8137 
8138   while (*parse_ptr != '\0') {
8139     // Parse a field
8140     if (*parse_ptr == '%') {
8141       // Put field in the buffer
8142       int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8143       __kmp_str_buf_catbuf(buffer, &field);
8144       retval += rc;
8145     } else {
8146       // Put literal character in buffer
8147       __kmp_str_buf_cat(buffer, parse_ptr, 1);
8148       retval++;
8149       parse_ptr++;
8150     }
8151   }
8152   __kmp_str_buf_free(&field);
8153   return retval;
8154 }
8155 
8156 // Displays the affinity string to stdout
8157 void __kmp_aux_display_affinity(int gtid, const char *format) {
8158   kmp_str_buf_t buf;
8159   __kmp_str_buf_init(&buf);
8160   __kmp_aux_capture_affinity(gtid, format, &buf);
8161   __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8162   __kmp_str_buf_free(&buf);
8163 }
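
// These two routines back the OpenMP 5.0 affinity-display support: the
// omp_capture_affinity() and omp_display_affinity() entry points (and the
// OMP_DISPLAY_AFFINITY machinery) are expected to route into
// __kmp_aux_capture_affinity() and __kmp_aux_display_affinity() respectively.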
8164 
8165 /* ------------------------------------------------------------------------ */
8166 
8167 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8168   int blocktime = arg; /* argument is in milliseconds */
8169 #if KMP_USE_MONITOR
8170   int bt_intervals;
8171 #endif
8172   kmp_int8 bt_set;
8173 
8174   __kmp_save_internal_controls(thread);
8175 
8176   /* Normalize and set blocktime for the teams */
8177   if (blocktime < KMP_MIN_BLOCKTIME)
8178     blocktime = KMP_MIN_BLOCKTIME;
8179   else if (blocktime > KMP_MAX_BLOCKTIME)
8180     blocktime = KMP_MAX_BLOCKTIME;
8181 
8182   set__blocktime_team(thread->th.th_team, tid, blocktime);
8183   set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
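  // For example, a request of -5 ms is normalized up to KMP_MIN_BLOCKTIME
  // above and then applied to both this thread's current team and its serial
  // team.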
8184 
8185 #if KMP_USE_MONITOR
8186   /* Calculate and set blocktime intervals for the teams */
8187   bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8188 
8189   set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8190   set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8191 #endif
8192 
8193   /* Set whether blocktime has been set to "TRUE" */
8194   bt_set = TRUE;
8195 
8196   set__bt_set_team(thread->th.th_team, tid, bt_set);
8197   set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8198 #if KMP_USE_MONITOR
8199   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8200                 "bt_intervals=%d, monitor_updates=%d\n",
8201                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8202                 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8203                 __kmp_monitor_wakeups));
8204 #else
8205   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8206                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8207                 thread->th.th_team->t.t_id, tid, blocktime));
8208 #endif
8209 }
8210 
8211 void __kmp_aux_set_defaults(char const *str, size_t len) {
8212   if (!__kmp_init_serial) {
8213     __kmp_serial_initialize();
8214   }
8215   __kmp_env_initialize(str);
8216 
8217   if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8218     __kmp_env_print();
8219   }
8220 } // __kmp_aux_set_defaults
8221 
8222 /* ------------------------------------------------------------------------ */
8223 /* internal fast reduction routines */
8224 
8225 PACKED_REDUCTION_METHOD_T
8226 __kmp_determine_reduction_method(
8227     ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8228     void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8229     kmp_critical_name *lck) {
8230 
  // Default reduction method: critical construct (lck != NULL, like in current
  // PAROPT).
  // If (reduce_data != NULL && reduce_func != NULL): the tree-reduction method
  // can be selected by the RTL.
  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
  // can be selected by the RTL.
  // Finally, it is up to the OpenMP RTL to decide which method to use among
  // those generated by PAROPT.
8239 
8240   PACKED_REDUCTION_METHOD_T retval;
8241 
8242   int team_size;
8243 
8244   KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8245   KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8246 
8247 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
8248   ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
8249 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8250 
8251   retval = critical_reduce_block;
8252 
  // Another way to get the team size (with 1 dynamic dereference) is slower
8254   team_size = __kmp_get_team_num_threads(global_tid);
8255   if (team_size == 1) {
8256 
8257     retval = empty_reduce_block;
8258 
8259   } else {
8260 
8261     int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8262 
8263 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 ||                   \
8264     KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64
8265 
8266 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
8267     KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8268 
8269     int teamsize_cutoff = 4;
8270 
8271 #if KMP_MIC_SUPPORTED
8272     if (__kmp_mic_type != non_mic) {
8273       teamsize_cutoff = 8;
8274     }
8275 #endif
8276     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8277     if (tree_available) {
8278       if (team_size <= teamsize_cutoff) {
8279         if (atomic_available) {
8280           retval = atomic_reduce_block;
8281         }
8282       } else {
8283         retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8284       }
8285     } else if (atomic_available) {
8286       retval = atomic_reduce_block;
8287     }
8288 #else
8289 #error "Unknown or unsupported OS"
8290 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8291        // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8292 
8293 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8294 
8295 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD
8296 
8297     // basic tuning
8298 
8299     if (atomic_available) {
8300       if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8301         retval = atomic_reduce_block;
8302       }
8303     } // otherwise: use critical section
8304 
8305 #elif KMP_OS_DARWIN
8306 
8307     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8308     if (atomic_available && (num_vars <= 3)) {
8309       retval = atomic_reduce_block;
8310     } else if (tree_available) {
8311       if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8312           (reduce_size < (2000 * sizeof(kmp_real64)))) {
8313         retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8314       }
8315     } // otherwise: use critical section
8316 
8317 #else
8318 #error "Unknown or unsupported OS"
8319 #endif
8320 
8321 #else
8322 #error "Unknown or unsupported architecture"
8323 #endif
8324   }
8325 
8326   // KMP_FORCE_REDUCTION
8327 
8328   // If the team is serialized (team_size == 1), ignore the forced reduction
8329   // method and stay with the unsynchronized method (empty_reduce_block)
8330   if (__kmp_force_reduction_method != reduction_method_not_defined &&
8331       team_size != 1) {
8332 
8333     PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8334 
8335     int atomic_available, tree_available;
8336 
8337     switch ((forced_retval = __kmp_force_reduction_method)) {
8338     case critical_reduce_block:
8339       KMP_ASSERT(lck); // lck should be != 0
8340       break;
8341 
8342     case atomic_reduce_block:
8343       atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8344       if (!atomic_available) {
8345         KMP_WARNING(RedMethodNotSupported, "atomic");
8346         forced_retval = critical_reduce_block;
8347       }
8348       break;
8349 
8350     case tree_reduce_block:
8351       tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8352       if (!tree_available) {
8353         KMP_WARNING(RedMethodNotSupported, "tree");
8354         forced_retval = critical_reduce_block;
8355       } else {
8356 #if KMP_FAST_REDUCTION_BARRIER
8357         forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8358 #endif
8359       }
8360       break;
8361 
8362     default:
8363       KMP_ASSERT(0); // "unsupported method specified"
8364     }
8365 
8366     retval = forced_retval;
8367   }
8368 
8369   KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8370 
8371 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
8372 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8373 
8374   return (retval);
8375 }
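
// Illustrative sketch (excluded from the build): a user-level reduction such
// as the one below is what ultimately exercises the selection above. The
// compiler outlines the loop and emits __kmpc_reduce*/__kmpc_end_reduce*
// calls, whose implementation consults the method chosen here; the method
// actually picked depends on architecture, OS, team size, and the number of
// reduction variables, as encoded above.
#if 0
#include <cstdio>
#include <omp.h>

int main() {
  double sum = 0.0;
  // One scalar reduction variable: small teams on 64-bit targets typically
  // get the atomic method, larger teams the tree method when the fast
  // reduction barrier is available (see the cutoff logic above).
  #pragma omp parallel for reduction(+ : sum)
  for (int i = 1; i <= 1000000; ++i)
    sum += 1.0 / i;
  std::printf("sum = %f\n", sum);
  return 0;
}
#endif
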
// This function exists so tests can query the selected reduction method.
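// The packed value stores the reduction method above the low byte, which
// holds the barrier type when fast reduction barriers are enabled, so
// shifting right by 8 recovers the bare method enumeration value.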
8377 kmp_int32 __kmp_get_reduce_method(void) {
8378   return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8379 }
8380 
8381 // Soft pause sets up threads to ignore blocktime and just go to sleep.
8382 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
8383 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8384 
8385 // Hard pause shuts down the runtime completely.  Resume happens naturally when
8386 // OpenMP is used subsequently.
8387 void __kmp_hard_pause() {
8388   __kmp_pause_status = kmp_hard_paused;
8389   __kmp_internal_end_thread(-1);
8390 }
8391 
// Soft resume resets __kmp_pause_status to kmp_not_paused and wakes up any
// threads that were put to sleep while the runtime was soft-paused.
8393 void __kmp_resume_if_soft_paused() {
8394   if (__kmp_pause_status == kmp_soft_paused) {
8395     __kmp_pause_status = kmp_not_paused;
8396 
8397     for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8398       kmp_info_t *thread = __kmp_threads[gtid];
8399       if (thread) { // Wake it if sleeping
8400         kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
8401                          thread);
8402         if (fl.is_sleeping())
8403           fl.resume(gtid);
8404         else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8405           __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8406         } else { // thread holds the lock and may sleep soon
8407           do { // until either the thread sleeps, or we can get the lock
8408             if (fl.is_sleeping()) {
8409               fl.resume(gtid);
8410               break;
8411             } else if (__kmp_try_suspend_mx(thread)) {
8412               __kmp_unlock_suspend_mx(thread);
8413               break;
8414             }
8415           } while (1);
8416         }
8417       }
8418     }
8419   }
8420 }
8421 
8422 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
8423 // TODO: add warning messages
8424 int __kmp_pause_resource(kmp_pause_status_t level) {
8425   if (level == kmp_not_paused) { // requesting resume
8426     if (__kmp_pause_status == kmp_not_paused) {
8427       // error message about runtime not being paused, so can't resume
8428       return 1;
8429     } else {
8430       KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
8431                        __kmp_pause_status == kmp_hard_paused);
8432       __kmp_pause_status = kmp_not_paused;
8433       return 0;
8434     }
8435   } else if (level == kmp_soft_paused) { // requesting soft pause
8436     if (__kmp_pause_status != kmp_not_paused) {
8437       // error message about already being paused
8438       return 1;
8439     } else {
8440       __kmp_soft_pause();
8441       return 0;
8442     }
8443   } else if (level == kmp_hard_paused) { // requesting hard pause
8444     if (__kmp_pause_status != kmp_not_paused) {
8445       // error message about already being paused
8446       return 1;
8447     } else {
8448       __kmp_hard_pause();
8449       return 0;
8450     }
8451   } else {
8452     // error message about invalid level
8453     return 1;
8454   }
8455 }
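
// Illustrative sketch (excluded from the build): the OpenMP 5.0 routines
// omp_pause_resource() and omp_pause_resource_all() end up here through
// __kmpc_pause_resource(); the return values below mirror the checks above
// (for example, requesting a resume when nothing is paused fails).
#if 0
#include <cstdio>
#include <omp.h>

int main() {
  #pragma omp parallel
  { /* first parallel region initializes the runtime */ }

  // Soft pause: worker threads go to sleep but keep their state.
  if (omp_pause_resource_all(omp_pause_soft) != 0)
    std::printf("soft pause rejected\n");

  // Using OpenMP again resumes a soft pause implicitly
  // (see __kmp_resume_if_soft_paused above).
  #pragma omp parallel
  { /* threads are woken up as needed */ }

  // Hard pause: the runtime shuts down and re-initializes on next use.
  if (omp_pause_resource_all(omp_pause_hard) != 0)
    std::printf("hard pause rejected\n");
  return 0;
}
#endif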
8456 
8458 void __kmp_omp_display_env(int verbose) {
8459   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8460   if (__kmp_init_serial == 0)
8461     __kmp_do_serial_initialize();
8462   __kmp_display_env_impl(!verbose, verbose);
8463   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8464 }
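
// Illustrative sketch (excluded from the build; assumes a POSIX setenv): the
// same report can also be requested declaratively with the standard
// OMP_DISPLAY_ENV environment variable, as long as it is set before the
// runtime initializes; this helper serves programmatic, on-demand requests.
#if 0
#include <cstdlib>
#include <omp.h>

int main() {
  // "TRUE" prints the standard ICVs; "VERBOSE" adds implementation-specific
  // settings. This must happen before the first OpenMP construct runs.
  setenv("OMP_DISPLAY_ENV", "VERBOSE", /*overwrite=*/1);

  #pragma omp parallel
  { /* serial initialization prints the environment report once */ }
  return 0;
}
#endif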
8465