1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
29 #endif
30 
31 #if OMPT_SUPPORT
32 #include "ompt-specific.h"
33 #endif
34 #if OMPD_SUPPORT
35 #include "ompd-specific.h"
36 #endif
37 
38 #if OMP_PROFILING_SUPPORT
39 #include "llvm/Support/TimeProfiler.h"
40 static char *ProfileTraceFile = nullptr;
41 #endif
42 
43 /* these are temporary issues to be dealt with */
44 #define KMP_USE_PRCTL 0
45 
46 #if KMP_OS_WINDOWS
47 #include <process.h>
48 #endif
49 
50 #include "tsan_annotations.h"
51 
52 #if KMP_OS_WINDOWS
// Windows does not need these include files because it does not use shared memory.
54 #else
55 #include <sys/mman.h>
56 #include <sys/stat.h>
57 #include <fcntl.h>
58 #define SHM_SIZE 1024
59 #endif
60 
61 #if defined(KMP_GOMP_COMPAT)
62 char const __kmp_version_alt_comp[] =
63     KMP_VERSION_PREFIX "alternative compiler support: yes";
64 #endif /* defined(KMP_GOMP_COMPAT) */
65 
66 char const __kmp_version_omp_api[] =
67     KMP_VERSION_PREFIX "API version: 5.0 (201611)";
68 
69 #ifdef KMP_DEBUG
70 char const __kmp_version_lock[] =
71     KMP_VERSION_PREFIX "lock type: run time selectable";
72 #endif /* KMP_DEBUG */
73 
74 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
75 
76 /* ------------------------------------------------------------------------ */
77 
78 #if KMP_USE_MONITOR
79 kmp_info_t __kmp_monitor;
80 #endif
81 
82 /* Forward declarations */
83 
84 void __kmp_cleanup(void);
85 
86 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
87                                   int gtid);
88 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
89                                   kmp_internal_control_t *new_icvs,
90                                   ident_t *loc);
91 #if KMP_AFFINITY_SUPPORTED
92 static void __kmp_partition_places(kmp_team_t *team,
93                                    int update_master_only = 0);
94 #endif
95 static void __kmp_do_serial_initialize(void);
96 void __kmp_fork_barrier(int gtid, int tid);
97 void __kmp_join_barrier(int gtid);
98 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
99                           kmp_internal_control_t *new_icvs, ident_t *loc);
100 
101 #ifdef USE_LOAD_BALANCE
102 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
103 #endif
104 
105 static int __kmp_expand_threads(int nNeed);
106 #if KMP_OS_WINDOWS
107 static int __kmp_unregister_root_other_thread(int gtid);
108 #endif
109 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
110 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
111 
112 /* Calculate the identifier of the current thread */
113 /* fast (and somewhat portable) way to get unique identifier of executing
114    thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
115 int __kmp_get_global_thread_id() {
116   int i;
117   kmp_info_t **other_threads;
118   size_t stack_data;
119   char *stack_addr;
120   size_t stack_size;
121   char *stack_base;
122 
123   KA_TRACE(
124       1000,
125       ("*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
126        __kmp_nth, __kmp_all_nth));
127 
  /* JPH - to handle the case where __kmpc_end(0) is called immediately before
     a parallel region, this returns KMP_GTID_DNE to force the caller to run
     serial initialization. KMP_GTID_DNE then has to be handled at all call
     sites, or else __kmp_init_gtid must be guaranteed for this to work. */
132 
133   if (!TCR_4(__kmp_init_gtid))
134     return KMP_GTID_DNE;
135 
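  /* Pick the gtid lookup method based on __kmp_gtid_mode: mode >= 3 reads the
     static TLS variable (TDATA), mode >= 2 queries keyed/dynamic TLS, and
     lower modes fall back to the stack-address search below. */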
136 #ifdef KMP_TDATA_GTID
137   if (TCR_4(__kmp_gtid_mode) >= 3) {
138     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
139     return __kmp_gtid;
140   }
141 #endif
142   if (TCR_4(__kmp_gtid_mode) >= 2) {
143     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
144     return __kmp_gtid_get_specific();
145   }
146   KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
147 
148   stack_addr = (char *)&stack_data;
149   other_threads = __kmp_threads;
150 
151   /* ATT: The code below is a source of potential bugs due to unsynchronized
152      access to __kmp_threads array. For example:
153      1. Current thread loads other_threads[i] to thr and checks it, it is
154         non-NULL.
155      2. Current thread is suspended by OS.
156      3. Another thread unregisters and finishes (debug versions of free()
157         may fill memory with something like 0xEF).
158      4. Current thread is resumed.
159      5. Current thread reads junk from *thr.
160      TODO: Fix it.  --ln  */
161 
162   for (i = 0; i < __kmp_threads_capacity; i++) {
163 
164     kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
165     if (!thr)
166       continue;
167 
168     stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
169     stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
170 
171     /* stack grows down -- search through all of the active threads */
172 
173     if (stack_addr <= stack_base) {
174       size_t stack_diff = stack_base - stack_addr;
175 
176       if (stack_diff <= stack_size) {
177         /* The only way we can be closer than the allocated */
178         /* stack size is if we are running on this thread. */
179         KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
180         return i;
181       }
182     }
183   }
184 
185   /* get specific to try and determine our gtid */
186   KA_TRACE(1000,
187            ("*** __kmp_get_global_thread_id: internal alg. failed to find "
188             "thread, using TLS\n"));
189   i = __kmp_gtid_get_specific();
190 
191   /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
192 
  /* if we haven't been assigned a gtid, return the error code */
194   if (i < 0)
195     return i;
196 
197   /* dynamically updated stack window for uber threads to avoid get_specific
198      call */
199   if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
200     KMP_FATAL(StackOverflow, i);
201   }
202 
203   stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
204   if (stack_addr > stack_base) {
205     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
206     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
207             other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
208                 stack_base);
209   } else {
210     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
211             stack_base - stack_addr);
212   }
213 
214   /* Reprint stack bounds for ubermaster since they have been refined */
215   if (__kmp_storage_map) {
216     char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
217     char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
218     __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
219                                  other_threads[i]->th.th_info.ds.ds_stacksize,
220                                  "th_%d stack (refinement)", i);
221   }
222   return i;
223 }
224 
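/* Like __kmp_get_global_thread_id(), but never returns KMP_GTID_DNE: if the
   calling thread has no gtid yet, perform serial initialization and/or
   register it as a new root under the bootstrap init lock. */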
225 int __kmp_get_global_thread_id_reg() {
226   int gtid;
227 
228   if (!__kmp_init_serial) {
229     gtid = KMP_GTID_DNE;
230   } else
231 #ifdef KMP_TDATA_GTID
232       if (TCR_4(__kmp_gtid_mode) >= 3) {
233     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
234     gtid = __kmp_gtid;
235   } else
236 #endif
237       if (TCR_4(__kmp_gtid_mode) >= 2) {
238     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
239     gtid = __kmp_gtid_get_specific();
240   } else {
241     KA_TRACE(1000,
242              ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
243     gtid = __kmp_get_global_thread_id();
244   }
245 
246   /* we must be a new uber master sibling thread */
247   if (gtid == KMP_GTID_DNE) {
248     KA_TRACE(10,
249              ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
250               "Registering a new gtid.\n"));
251     __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
252     if (!__kmp_init_serial) {
253       __kmp_do_serial_initialize();
254       gtid = __kmp_gtid_get_specific();
255     } else {
256       gtid = __kmp_register_root(FALSE);
257     }
258     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
259     /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
260   }
261 
262   KMP_DEBUG_ASSERT(gtid >= 0);
263 
264   return gtid;
265 }
266 
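/* Check whether the stack of thread th overlaps the stack of any other
   registered thread; if so, optionally print the offending ranges via the
   storage map and abort with a StackOverlap fatal error. Uber threads are
   skipped because their stack bounds are refined dynamically. */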
267 /* caller must hold forkjoin_lock */
268 void __kmp_check_stack_overlap(kmp_info_t *th) {
269   int f;
270   char *stack_beg = NULL;
271   char *stack_end = NULL;
272   int gtid;
273 
274   KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
275   if (__kmp_storage_map) {
276     stack_end = (char *)th->th.th_info.ds.ds_stackbase;
277     stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
278 
279     gtid = __kmp_gtid_from_thread(th);
280 
281     if (gtid == KMP_GTID_MONITOR) {
282       __kmp_print_storage_map_gtid(
283           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
284           "th_%s stack (%s)", "mon",
285           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
286     } else {
287       __kmp_print_storage_map_gtid(
288           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
289           "th_%d stack (%s)", gtid,
290           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
291     }
292   }
293 
294   /* No point in checking ubermaster threads since they use refinement and
295    * cannot overlap */
296   gtid = __kmp_gtid_from_thread(th);
297   if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
298     KA_TRACE(10,
299              ("__kmp_check_stack_overlap: performing extensive checking\n"));
300     if (stack_beg == NULL) {
301       stack_end = (char *)th->th.th_info.ds.ds_stackbase;
302       stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
303     }
304 
305     for (f = 0; f < __kmp_threads_capacity; f++) {
306       kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
307 
308       if (f_th && f_th != th) {
309         char *other_stack_end =
310             (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
311         char *other_stack_beg =
312             other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
313         if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
314             (stack_end > other_stack_beg && stack_end < other_stack_end)) {
315 
316           /* Print the other stack values before the abort */
317           if (__kmp_storage_map)
318             __kmp_print_storage_map_gtid(
319                 -1, other_stack_beg, other_stack_end,
320                 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
321                 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
322 
323           __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
324                       __kmp_msg_null);
325         }
326       }
327     }
328   }
329   KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
330 }
331 
332 /* ------------------------------------------------------------------------ */
333 
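/* Yield forever; used to park a thread on abnormal-termination paths
   (see __kmp_abort_thread()) where returning is not an option. */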
334 void __kmp_infinite_loop(void) {
335   static int done = FALSE;
336 
337   while (!done) {
338     KMP_YIELD(TRUE);
339   }
340 }
341 
342 #define MAX_MESSAGE 512
343 
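/* Print one storage-map line for the address range [p1, p2) of the given size
   to kmp_err, under the stdio bootstrap lock. Callers pass gtid == -1 for
   ranges that are not tied to a particular thread. A typical call, taken from
   __kmp_print_thread_storage_map() below:

     __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t),
                                  "th_%d", gtid);
*/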
344 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
345                                   char const *format, ...) {
346   char buffer[MAX_MESSAGE];
347   va_list ap;
348 
349   va_start(ap, format);
350   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
351                p2, (unsigned long)size, format);
352   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
353   __kmp_vprintf(kmp_err, buffer, ap);
354 #if KMP_PRINT_DATA_PLACEMENT
355   int node;
356   if (gtid >= 0) {
357     if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
358       if (__kmp_storage_map_verbose) {
359         node = __kmp_get_host_node(p1);
360         if (node < 0) /* doesn't work, so don't try this next time */
361           __kmp_storage_map_verbose = FALSE;
362         else {
363           char *last;
364           int lastNode;
365           int localProc = __kmp_get_cpu_from_gtid(gtid);
366 
367           const int page_size = KMP_GET_PAGE_SIZE();
368 
369           p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
370           p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
371           if (localProc >= 0)
372             __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
373                                  localProc >> 1);
374           else
375             __kmp_printf_no_lock("  GTID %d\n", gtid);
376 #if KMP_USE_PRCTL
377           /* The more elaborate format is disabled for now because of the prctl
378            * hanging bug. */
379           do {
380             last = p1;
381             lastNode = node;
382             /* This loop collates adjacent pages with the same host node. */
383             do {
384               (char *)p1 += page_size;
385             } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
386             __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
387                                  lastNode);
388           } while (p1 <= p2);
389 #else
390           __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
391                                (char *)p1 + (page_size - 1),
392                                __kmp_get_host_node(p1));
393           if (p1 < p2) {
394             __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
395                                  (char *)p2 + (page_size - 1),
396                                  __kmp_get_host_node(p2));
397           }
398 #endif
399         }
400       }
401     } else
402       __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
403   }
404 #endif /* KMP_PRINT_DATA_PLACEMENT */
405   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
406 }
407 
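/* printf-style warning routine: writes to kmp_err with an "OMP warning:"
   prefix under the stdio bootstrap lock; a no-op when warnings are disabled
   (kmp_warnings_off). */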
408 void __kmp_warn(char const *format, ...) {
409   char buffer[MAX_MESSAGE];
410   va_list ap;
411 
412   if (__kmp_generate_warnings == kmp_warnings_off) {
413     return;
414   }
415 
416   va_start(ap, format);
417 
418   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
419   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
420   __kmp_vprintf(kmp_err, buffer, ap);
421   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
422 
423   va_end(ap);
424 }
425 
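/* Terminate the whole process after dumping the debug buffer (if enabled).
   On Windows* OS, raise(SIGABRT) followed by _exit(3) is used instead of
   abort() to avoid the pop-up error box; elsewhere the library is
   unregistered and abort() is called. */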
426 void __kmp_abort_process() {
427   // Later threads may stall here, but that's ok because abort() will kill them.
428   __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
429 
430   if (__kmp_debug_buf) {
431     __kmp_dump_debug_buffer();
432   }
433 
434   if (KMP_OS_WINDOWS) {
435     // Let other threads know of abnormal termination and prevent deadlock
436     // if abort happened during library initialization or shutdown
437     __kmp_global.g.g_abort = SIGABRT;
438 
    /* On Windows* OS, abort() by default causes a pop-up error box, which
       stalls nightly testing. Unfortunately, we cannot reliably suppress the
       pop-up error boxes. _set_abort_behavior() works well, but this function
       is not available in VS7 (this is not a problem for the DLL, but it is a
       problem for the static OpenMP RTL). SetErrorMode (and so, the timelimit
       utility) does not help, at least in some versions of the MS C RTL.

       The following sequence seems to be the only way to simulate abort() and
       avoid the pop-up error box. */
448     raise(SIGABRT);
449     _exit(3); // Just in case, if signal ignored, exit anyway.
450   } else {
451     __kmp_unregister_library();
452     abort();
453   }
454 
455   __kmp_infinite_loop();
456   __kmp_release_bootstrap_lock(&__kmp_exit_lock);
457 
458 } // __kmp_abort_process
459 
460 void __kmp_abort_thread(void) {
461   // TODO: Eliminate g_abort global variable and this function.
462   // In case of abort just call abort(), it will kill all the threads.
463   __kmp_infinite_loop();
464 } // __kmp_abort_thread
465 
466 /* Print out the storage map for the major kmp_info_t thread data structures
467    that are allocated together. */
468 
469 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
470   __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
471                                gtid);
472 
473   __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
474                                sizeof(kmp_desc_t), "th_%d.th_info", gtid);
475 
476   __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
477                                sizeof(kmp_local_t), "th_%d.th_local", gtid);
478 
479   __kmp_print_storage_map_gtid(
480       gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
481       sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
482 
483   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
484                                &thr->th.th_bar[bs_plain_barrier + 1],
485                                sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
486                                gtid);
487 
488   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
489                                &thr->th.th_bar[bs_forkjoin_barrier + 1],
490                                sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
491                                gtid);
492 
493 #if KMP_FAST_REDUCTION_BARRIER
494   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
495                                &thr->th.th_bar[bs_reduction_barrier + 1],
496                                sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
497                                gtid);
498 #endif // KMP_FAST_REDUCTION_BARRIER
499 }
500 
501 /* Print out the storage map for the major kmp_team_t team data structures
502    that are allocated together. */
503 
504 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
505                                          int team_id, int num_thr) {
506   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
507   __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
508                                header, team_id);
509 
510   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
511                                &team->t.t_bar[bs_last_barrier],
512                                sizeof(kmp_balign_team_t) * bs_last_barrier,
513                                "%s_%d.t_bar", header, team_id);
514 
515   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
516                                &team->t.t_bar[bs_plain_barrier + 1],
517                                sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
518                                header, team_id);
519 
520   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
521                                &team->t.t_bar[bs_forkjoin_barrier + 1],
522                                sizeof(kmp_balign_team_t),
523                                "%s_%d.t_bar[forkjoin]", header, team_id);
524 
525 #if KMP_FAST_REDUCTION_BARRIER
526   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
527                                &team->t.t_bar[bs_reduction_barrier + 1],
528                                sizeof(kmp_balign_team_t),
529                                "%s_%d.t_bar[reduction]", header, team_id);
530 #endif // KMP_FAST_REDUCTION_BARRIER
531 
532   __kmp_print_storage_map_gtid(
533       -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
534       sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
535 
536   __kmp_print_storage_map_gtid(
537       -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
538       sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
539 
540   __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
541                                &team->t.t_disp_buffer[num_disp_buff],
542                                sizeof(dispatch_shared_info_t) * num_disp_buff,
543                                "%s_%d.t_disp_buffer", header, team_id);
544 }
545 
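// Initialize / finalize the runtime's memory allocator support
// (memkind-based allocators and target memory).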
546 static void __kmp_init_allocator() {
547   __kmp_init_memkind();
548   __kmp_init_target_mem();
549 }
550 static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
551 
552 /* ------------------------------------------------------------------------ */
553 
554 #if KMP_DYNAMIC_LIB
555 #if KMP_OS_WINDOWS
556 
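/* Entry point for the Windows* OS dynamic library build: runs library
   shutdown when the DLL is unloaded via FreeLibrary() and per-thread cleanup
   on DLL_THREAD_DETACH (see the per-case comments below). */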
557 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
558   //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
559 
560   switch (fdwReason) {
561 
562   case DLL_PROCESS_ATTACH:
563     KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
564 
565     return TRUE;
566 
567   case DLL_PROCESS_DETACH:
568     KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
569 
570     // According to Windows* documentation for DllMain entry point:
571     // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
572     //   lpReserved == NULL when FreeLibrary() is called,
573     //   lpReserved != NULL when the process is terminated.
574     // When FreeLibrary() is called, worker threads remain alive. So the
575     // runtime's state is consistent and executing proper shutdown is OK.
576     // When the process is terminated, worker threads have exited or been
577     // forcefully terminated by the OS and only the shutdown thread remains.
578     // This can leave the runtime in an inconsistent state.
579     // Hence, only attempt proper cleanup when FreeLibrary() is called.
580     // Otherwise, rely on OS to reclaim resources.
581     if (lpReserved == NULL)
582       __kmp_internal_end_library(__kmp_gtid_get_specific());
583 
584     return TRUE;
585 
586   case DLL_THREAD_ATTACH:
587     KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
588 
    /* If we wanted to register new sibling threads every time, we would call
     * __kmp_get_gtid() here. */
591     return TRUE;
592 
593   case DLL_THREAD_DETACH:
594     KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
595 
596     __kmp_internal_end_thread(__kmp_gtid_get_specific());
597     return TRUE;
598   }
599 
600   return TRUE;
601 }
602 
603 #endif /* KMP_OS_WINDOWS */
604 #endif /* KMP_DYNAMIC_LIB */
605 
606 /* __kmp_parallel_deo -- Wait until it's our turn. */
607 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
608   int gtid = *gtid_ref;
609 #ifdef BUILD_PARALLEL_ORDERED
610   kmp_team_t *team = __kmp_team_from_gtid(gtid);
611 #endif /* BUILD_PARALLEL_ORDERED */
612 
613   if (__kmp_env_consistency_check) {
614     if (__kmp_threads[gtid]->th.th_root->r.r_active)
615 #if KMP_USE_DYNAMIC_LOCK
616       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
617 #else
618       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
619 #endif
620   }
621 #ifdef BUILD_PARALLEL_ORDERED
622   if (!team->t.t_serialized) {
623     KMP_MB();
624     KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
625              NULL);
626     KMP_MB();
627   }
628 #endif /* BUILD_PARALLEL_ORDERED */
629 }
630 
631 /* __kmp_parallel_dxo -- Signal the next task. */
632 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
633   int gtid = *gtid_ref;
634 #ifdef BUILD_PARALLEL_ORDERED
635   int tid = __kmp_tid_from_gtid(gtid);
636   kmp_team_t *team = __kmp_team_from_gtid(gtid);
637 #endif /* BUILD_PARALLEL_ORDERED */
638 
639   if (__kmp_env_consistency_check) {
640     if (__kmp_threads[gtid]->th.th_root->r.r_active)
641       __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
642   }
643 #ifdef BUILD_PARALLEL_ORDERED
644   if (!team->t.t_serialized) {
645     KMP_MB(); /* Flush all pending memory write invalidates.  */
646 
647     /* use the tid of the next thread in this team */
648     /* TODO replace with general release procedure */
649     team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
650 
651     KMP_MB(); /* Flush all pending memory write invalidates.  */
652   }
653 #endif /* BUILD_PARALLEL_ORDERED */
654 }
655 
656 /* ------------------------------------------------------------------------ */
657 /* The BARRIER for a SINGLE process section is always explicit   */
658 
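/* Returns nonzero if the calling thread should execute the SINGLE block:
   either the team is serialized, or this thread won the atomic race on
   team->t.t_construct. push_ws selects whether a workshare is pushed for
   consistency checking. */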
659 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
660   int status;
661   kmp_info_t *th;
662   kmp_team_t *team;
663 
664   if (!TCR_4(__kmp_init_parallel))
665     __kmp_parallel_initialize();
666   __kmp_resume_if_soft_paused();
667 
668   th = __kmp_threads[gtid];
669   team = th->th.th_team;
670   status = 0;
671 
672   th->th.th_ident = id_ref;
673 
674   if (team->t.t_serialized) {
675     status = 1;
676   } else {
677     kmp_int32 old_this = th->th.th_local.this_construct;
678 
679     ++th->th.th_local.this_construct;
680     /* try to set team count to thread count--success means thread got the
681        single block */
682     /* TODO: Should this be acquire or release? */
683     if (team->t.t_construct == old_this) {
684       status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
685                                               th->th.th_local.this_construct);
686     }
687 #if USE_ITT_BUILD
688     if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
689         KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
690         team->t.t_active_level == 1) {
691       // Only report metadata by primary thread of active team at level 1
692       __kmp_itt_metadata_single(id_ref);
693     }
694 #endif /* USE_ITT_BUILD */
695   }
696 
697   if (__kmp_env_consistency_check) {
698     if (status && push_ws) {
699       __kmp_push_workshare(gtid, ct_psingle, id_ref);
700     } else {
701       __kmp_check_workshare(gtid, ct_psingle, id_ref);
702     }
703   }
704 #if USE_ITT_BUILD
705   if (status) {
706     __kmp_itt_single_start(gtid);
707   }
708 #endif /* USE_ITT_BUILD */
709   return status;
710 }
711 
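/* Mark the end of a SINGLE region: notify ITT tracing and pop the workshare
   used for consistency checking. */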
712 void __kmp_exit_single(int gtid) {
713 #if USE_ITT_BUILD
714   __kmp_itt_single_end(gtid);
715 #endif /* USE_ITT_BUILD */
716   if (__kmp_env_consistency_check)
717     __kmp_pop_workshare(gtid, ct_psingle, NULL);
718 }
719 
/* Determine whether we can go parallel or must use a serialized parallel
 * region, and how many threads we can use.
 * set_nthreads is the number of threads requested for the team.
 * Returns 1 if we should serialize or only use one thread,
 * otherwise the number of threads to use.
 * The forkjoin lock is held by the caller. */
726 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
727                                  int master_tid, int set_nthreads,
728                                  int enter_teams) {
729   int capacity;
730   int new_nthreads;
731   KMP_DEBUG_ASSERT(__kmp_init_serial);
732   KMP_DEBUG_ASSERT(root && parent_team);
733   kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
734 
735   // If dyn-var is set, dynamically adjust the number of desired threads,
736   // according to the method specified by dynamic_mode.
737   new_nthreads = set_nthreads;
738   if (!get__dynamic_2(parent_team, master_tid)) {
739     ;
740   }
741 #ifdef USE_LOAD_BALANCE
742   else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
743     new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
744     if (new_nthreads == 1) {
745       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
746                     "reservation to 1 thread\n",
747                     master_tid));
748       return 1;
749     }
750     if (new_nthreads < set_nthreads) {
751       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
752                     "reservation to %d threads\n",
753                     master_tid, new_nthreads));
754     }
755   }
756 #endif /* USE_LOAD_BALANCE */
757   else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
758     new_nthreads = __kmp_avail_proc - __kmp_nth +
759                    (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
760     if (new_nthreads <= 1) {
761       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
762                     "reservation to 1 thread\n",
763                     master_tid));
764       return 1;
765     }
766     if (new_nthreads < set_nthreads) {
767       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
768                     "reservation to %d threads\n",
769                     master_tid, new_nthreads));
770     } else {
771       new_nthreads = set_nthreads;
772     }
773   } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
774     if (set_nthreads > 2) {
775       new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
776       new_nthreads = (new_nthreads % set_nthreads) + 1;
777       if (new_nthreads == 1) {
778         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
779                       "reservation to 1 thread\n",
780                       master_tid));
781         return 1;
782       }
783       if (new_nthreads < set_nthreads) {
784         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
785                       "reservation to %d threads\n",
786                       master_tid, new_nthreads));
787       }
788     }
789   } else {
790     KMP_ASSERT(0);
791   }
792 
793   // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
794   if (__kmp_nth + new_nthreads -
795           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
796       __kmp_max_nth) {
797     int tl_nthreads = __kmp_max_nth - __kmp_nth +
798                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
799     if (tl_nthreads <= 0) {
800       tl_nthreads = 1;
801     }
802 
803     // If dyn-var is false, emit a 1-time warning.
804     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
805       __kmp_reserve_warn = 1;
806       __kmp_msg(kmp_ms_warning,
807                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
808                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
809     }
810     if (tl_nthreads == 1) {
811       KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
812                     "reduced reservation to 1 thread\n",
813                     master_tid));
814       return 1;
815     }
816     KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
817                   "reservation to %d threads\n",
818                   master_tid, tl_nthreads));
819     new_nthreads = tl_nthreads;
820   }
821 
822   // Respect OMP_THREAD_LIMIT
823   int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
824   int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
825   if (cg_nthreads + new_nthreads -
826           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
827       max_cg_threads) {
828     int tl_nthreads = max_cg_threads - cg_nthreads +
829                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
830     if (tl_nthreads <= 0) {
831       tl_nthreads = 1;
832     }
833 
834     // If dyn-var is false, emit a 1-time warning.
835     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
836       __kmp_reserve_warn = 1;
837       __kmp_msg(kmp_ms_warning,
838                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
839                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
840     }
841     if (tl_nthreads == 1) {
842       KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
843                     "reduced reservation to 1 thread\n",
844                     master_tid));
845       return 1;
846     }
847     KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
848                   "reservation to %d threads\n",
849                   master_tid, tl_nthreads));
850     new_nthreads = tl_nthreads;
851   }
852 
853   // Check if the threads array is large enough, or needs expanding.
854   // See comment in __kmp_register_root() about the adjustment if
855   // __kmp_threads[0] == NULL.
856   capacity = __kmp_threads_capacity;
857   if (TCR_PTR(__kmp_threads[0]) == NULL) {
858     --capacity;
859   }
860   // If it is not for initializing the hidden helper team, we need to take
861   // __kmp_hidden_helper_threads_num out of the capacity because it is included
862   // in __kmp_threads_capacity.
863   if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
864     capacity -= __kmp_hidden_helper_threads_num;
865   }
866   if (__kmp_nth + new_nthreads -
867           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
868       capacity) {
869     // Expand the threads array.
870     int slotsRequired = __kmp_nth + new_nthreads -
871                         (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
872                         capacity;
873     int slotsAdded = __kmp_expand_threads(slotsRequired);
874     if (slotsAdded < slotsRequired) {
875       // The threads array was not expanded enough.
876       new_nthreads -= (slotsRequired - slotsAdded);
877       KMP_ASSERT(new_nthreads >= 1);
878 
879       // If dyn-var is false, emit a 1-time warning.
880       if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
881         __kmp_reserve_warn = 1;
882         if (__kmp_tp_cached) {
883           __kmp_msg(kmp_ms_warning,
884                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
885                     KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
886                     KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
887         } else {
888           __kmp_msg(kmp_ms_warning,
889                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
890                     KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
891         }
892       }
893     }
894   }
895 
896 #ifdef KMP_DEBUG
897   if (new_nthreads == 1) {
898     KC_TRACE(10,
899              ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
900               "dead roots and rechecking; requested %d threads\n",
901               __kmp_get_gtid(), set_nthreads));
902   } else {
903     KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
904                   " %d threads\n",
905                   __kmp_get_gtid(), new_nthreads, set_nthreads));
906   }
907 #endif // KMP_DEBUG
908   return new_nthreads;
909 }
910 
/* Allocate threads from the thread pool and assign them to the new team. We
   are assured that there are enough threads available, because we checked
   earlier within the forkjoin critical section. */
914 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
915                                     kmp_info_t *master_th, int master_gtid) {
916   int i;
917   int use_hot_team;
918 
919   KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
920   KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
921   KMP_MB();
922 
923   /* first, let's setup the primary thread */
924   master_th->th.th_info.ds.ds_tid = 0;
925   master_th->th.th_team = team;
926   master_th->th.th_team_nproc = team->t.t_nproc;
927   master_th->th.th_team_master = master_th;
928   master_th->th.th_team_serialized = FALSE;
929   master_th->th.th_dispatch = &team->t.t_dispatch[0];
930 
931 /* make sure we are not the optimized hot team */
932 #if KMP_NESTED_HOT_TEAMS
933   use_hot_team = 0;
934   kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
935   if (hot_teams) { // hot teams array is not allocated if
936     // KMP_HOT_TEAMS_MAX_LEVEL=0
937     int level = team->t.t_active_level - 1; // index in array of hot teams
938     if (master_th->th.th_teams_microtask) { // are we inside the teams?
939       if (master_th->th.th_teams_size.nteams > 1) {
940         ++level; // level was not increased in teams construct for
941         // team_of_masters
942       }
943       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
944           master_th->th.th_teams_level == team->t.t_level) {
945         ++level; // level was not increased in teams construct for
946         // team_of_workers before the parallel
947       } // team->t.t_level will be increased inside parallel
948     }
949     if (level < __kmp_hot_teams_max_level) {
950       if (hot_teams[level].hot_team) {
951         // hot team has already been allocated for given level
952         KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
953         use_hot_team = 1; // the team is ready to use
954       } else {
955         use_hot_team = 0; // AC: threads are not allocated yet
956         hot_teams[level].hot_team = team; // remember new hot team
957         hot_teams[level].hot_team_nth = team->t.t_nproc;
958       }
959     } else {
960       use_hot_team = 0;
961     }
962   }
963 #else
964   use_hot_team = team == root->r.r_hot_team;
965 #endif
966   if (!use_hot_team) {
967 
968     /* install the primary thread */
969     team->t.t_threads[0] = master_th;
970     __kmp_initialize_info(master_th, team, 0, master_gtid);
971 
972     /* now, install the worker threads */
973     for (i = 1; i < team->t.t_nproc; i++) {
974 
975       /* fork or reallocate a new thread and install it in team */
976       kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
977       team->t.t_threads[i] = thr;
978       KMP_DEBUG_ASSERT(thr);
979       KMP_DEBUG_ASSERT(thr->th.th_team == team);
980       /* align team and thread arrived states */
981       KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
982                     "T#%d(%d:%d) join =%llu, plain=%llu\n",
983                     __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
984                     __kmp_gtid_from_tid(i, team), team->t.t_id, i,
985                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
986                     team->t.t_bar[bs_plain_barrier].b_arrived));
987       thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
988       thr->th.th_teams_level = master_th->th.th_teams_level;
989       thr->th.th_teams_size = master_th->th.th_teams_size;
990       { // Initialize threads' barrier data.
991         int b;
992         kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
993         for (b = 0; b < bs_last_barrier; ++b) {
994           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
995           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
996 #if USE_DEBUGGER
997           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
998 #endif
999         }
1000       }
1001     }
1002 
1003 #if KMP_AFFINITY_SUPPORTED
1004     __kmp_partition_places(team);
1005 #endif
1006   }
1007 
1008   if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1009     for (i = 0; i < team->t.t_nproc; i++) {
1010       kmp_info_t *thr = team->t.t_threads[i];
1011       if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1012           thr->th.th_prev_level != team->t.t_level) {
1013         team->t.t_display_affinity = 1;
1014         break;
1015       }
1016     }
1017   }
1018 
1019   KMP_MB();
1020 }
1021 
1022 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1023 // Propagate any changes to the floating point control registers out to the team
1024 // We try to avoid unnecessary writes to the relevant cache line in the team
1025 // structure, so we don't make changes unless they are needed.
1026 inline static void propagateFPControl(kmp_team_t *team) {
1027   if (__kmp_inherit_fp_control) {
1028     kmp_int16 x87_fpu_control_word;
1029     kmp_uint32 mxcsr;
1030 
1031     // Get primary thread's values of FPU control flags (both X87 and vector)
1032     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1033     __kmp_store_mxcsr(&mxcsr);
1034     mxcsr &= KMP_X86_MXCSR_MASK;
1035 
1036     // There is no point looking at t_fp_control_saved here.
1037     // If it is TRUE, we still have to update the values if they are different
1038     // from those we now have. If it is FALSE we didn't save anything yet, but
1039     // our objective is the same. We have to ensure that the values in the team
1040     // are the same as those we have.
1041     // So, this code achieves what we need whether or not t_fp_control_saved is
1042     // true. By checking whether the value needs updating we avoid unnecessary
1043     // writes that would put the cache-line into a written state, causing all
1044     // threads in the team to have to read it again.
1045     KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1046     KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1047     // Although we don't use this value, other code in the runtime wants to know
1048     // whether it should restore them. So we must ensure it is correct.
1049     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1050   } else {
1051     // Similarly here. Don't write to this cache-line in the team structure
1052     // unless we have to.
1053     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1054   }
1055 }
1056 
1057 // Do the opposite, setting the hardware registers to the updated values from
1058 // the team.
1059 inline static void updateHWFPControl(kmp_team_t *team) {
1060   if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
    // Only reset the fp control regs if they have been changed in the team
    // structure by the parallel region that we are exiting.
1063     kmp_int16 x87_fpu_control_word;
1064     kmp_uint32 mxcsr;
1065     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1066     __kmp_store_mxcsr(&mxcsr);
1067     mxcsr &= KMP_X86_MXCSR_MASK;
1068 
1069     if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1070       __kmp_clear_x87_fpu_status_word();
1071       __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1072     }
1073 
1074     if (team->t.t_mxcsr != mxcsr) {
1075       __kmp_load_mxcsr(&team->t.t_mxcsr);
1076     }
1077   }
1078 }
1079 #else
1080 #define propagateFPControl(x) ((void)0)
1081 #define updateHWFPControl(x) ((void)0)
1082 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1083 
1084 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1085                                      int realloc); // forward declaration
1086 
1087 /* Run a parallel region that has been serialized, so runs only in a team of the
1088    single primary thread. */
1089 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1090   kmp_info_t *this_thr;
1091   kmp_team_t *serial_team;
1092 
1093   KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1094 
1095   /* Skip all this code for autopar serialized loops since it results in
1096      unacceptable overhead */
1097   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1098     return;
1099 
1100   if (!TCR_4(__kmp_init_parallel))
1101     __kmp_parallel_initialize();
1102   __kmp_resume_if_soft_paused();
1103 
1104   this_thr = __kmp_threads[global_tid];
1105   serial_team = this_thr->th.th_serial_team;
1106 
1107   /* utilize the serialized team held by this thread */
1108   KMP_DEBUG_ASSERT(serial_team);
1109   KMP_MB();
1110 
1111   if (__kmp_tasking_mode != tskm_immediate_exec) {
1112     KMP_DEBUG_ASSERT(
1113         this_thr->th.th_task_team ==
1114         this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1115     KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1116                      NULL);
1117     KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1118                   "team %p, new task_team = NULL\n",
1119                   global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1120     this_thr->th.th_task_team = NULL;
1121   }
1122 
1123   kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1124   if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1125     proc_bind = proc_bind_false;
1126   } else if (proc_bind == proc_bind_default) {
1127     // No proc_bind clause was specified, so use the current value
1128     // of proc-bind-var for this parallel region.
1129     proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1130   }
1131   // Reset for next parallel region
1132   this_thr->th.th_set_proc_bind = proc_bind_default;
1133 
1134 #if OMPT_SUPPORT
1135   ompt_data_t ompt_parallel_data = ompt_data_none;
1136   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1137   if (ompt_enabled.enabled &&
1138       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1139 
1140     ompt_task_info_t *parent_task_info;
1141     parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1142 
1143     parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1144     if (ompt_enabled.ompt_callback_parallel_begin) {
1145       int team_size = 1;
1146 
1147       ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1148           &(parent_task_info->task_data), &(parent_task_info->frame),
1149           &ompt_parallel_data, team_size,
1150           ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1151     }
1152   }
1153 #endif // OMPT_SUPPORT
1154 
1155   if (this_thr->th.th_team != serial_team) {
1156     // Nested level will be an index in the nested nthreads array
1157     int level = this_thr->th.th_team->t.t_level;
1158 
1159     if (serial_team->t.t_serialized) {
1160       /* this serial team was already used
         TODO: increase performance by making these locks more specific */
1162       kmp_team_t *new_team;
1163 
1164       __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1165 
1166       new_team =
1167           __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1168 #if OMPT_SUPPORT
1169                               ompt_parallel_data,
1170 #endif
1171                               proc_bind, &this_thr->th.th_current_task->td_icvs,
1172                               0 USE_NESTED_HOT_ARG(NULL));
1173       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1174       KMP_ASSERT(new_team);
1175 
1176       /* setup new serialized team and install it */
1177       new_team->t.t_threads[0] = this_thr;
1178       new_team->t.t_parent = this_thr->th.th_team;
1179       serial_team = new_team;
1180       this_thr->th.th_serial_team = serial_team;
1181 
1182       KF_TRACE(
1183           10,
1184           ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1185            global_tid, serial_team));
1186 
      /* TODO: the above breaks the requirement that, even if we run out of
         resources, serialized teams are still guaranteed to work, since we may
         need to allocate a new one here */
1190     } else {
1191       KF_TRACE(
1192           10,
1193           ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1194            global_tid, serial_team));
1195     }
1196 
1197     /* we have to initialize this serial team */
1198     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1199     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1200     KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1201     serial_team->t.t_ident = loc;
1202     serial_team->t.t_serialized = 1;
1203     serial_team->t.t_nproc = 1;
1204     serial_team->t.t_parent = this_thr->th.th_team;
1205     serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1206     this_thr->th.th_team = serial_team;
1207     serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1208 
1209     KF_TRACE(10, ("__kmpc_serialized_parallel: T#d curtask=%p\n", global_tid,
1210                   this_thr->th.th_current_task));
1211     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1212     this_thr->th.th_current_task->td_flags.executing = 0;
1213 
1214     __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1215 
1216     /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1217        implicit task for each serialized task represented by
1218        team->t.t_serialized? */
1219     copy_icvs(&this_thr->th.th_current_task->td_icvs,
1220               &this_thr->th.th_current_task->td_parent->td_icvs);
1221 
1222     // Thread value exists in the nested nthreads array for the next nested
1223     // level
1224     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1225       this_thr->th.th_current_task->td_icvs.nproc =
1226           __kmp_nested_nth.nth[level + 1];
1227     }
1228 
1229     if (__kmp_nested_proc_bind.used &&
1230         (level + 1 < __kmp_nested_proc_bind.used)) {
1231       this_thr->th.th_current_task->td_icvs.proc_bind =
1232           __kmp_nested_proc_bind.bind_types[level + 1];
1233     }
1234 
1235 #if USE_DEBUGGER
1236     serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1237 #endif
1238     this_thr->th.th_info.ds.ds_tid = 0;
1239 
1240     /* set thread cache values */
1241     this_thr->th.th_team_nproc = 1;
1242     this_thr->th.th_team_master = this_thr;
1243     this_thr->th.th_team_serialized = 1;
1244 
1245     serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1246     serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1247     serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1248 
1249     propagateFPControl(serial_team);
1250 
1251     /* check if we need to allocate dispatch buffers stack */
1252     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1253     if (!serial_team->t.t_dispatch->th_disp_buffer) {
1254       serial_team->t.t_dispatch->th_disp_buffer =
1255           (dispatch_private_info_t *)__kmp_allocate(
1256               sizeof(dispatch_private_info_t));
1257     }
1258     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1259 
1260     KMP_MB();
1261 
1262   } else {
1263     /* this serialized team is already being used,
1264      * that's fine, just add another nested level */
1265     KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1266     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1267     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1268     ++serial_team->t.t_serialized;
1269     this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1270 
1271     // Nested level will be an index in the nested nthreads array
1272     int level = this_thr->th.th_team->t.t_level;
1273     // Thread value exists in the nested nthreads array for the next nested
1274     // level
1275     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1276       this_thr->th.th_current_task->td_icvs.nproc =
1277           __kmp_nested_nth.nth[level + 1];
1278     }
1279     serial_team->t.t_level++;
1280     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1281                   "of serial team %p to %d\n",
1282                   global_tid, serial_team, serial_team->t.t_level));
1283 
1284     /* allocate/push dispatch buffers stack */
1285     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1286     {
1287       dispatch_private_info_t *disp_buffer =
1288           (dispatch_private_info_t *)__kmp_allocate(
1289               sizeof(dispatch_private_info_t));
1290       disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1291       serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1292     }
1293     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1294 
1295     KMP_MB();
1296   }
1297   KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1298 
1299   // Perform the display affinity functionality for
1300   // serialized parallel regions
1301   if (__kmp_display_affinity) {
1302     if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1303         this_thr->th.th_prev_num_threads != 1) {
1304       // NULL means use the affinity-format-var ICV
1305       __kmp_aux_display_affinity(global_tid, NULL);
1306       this_thr->th.th_prev_level = serial_team->t.t_level;
1307       this_thr->th.th_prev_num_threads = 1;
1308     }
1309   }
1310 
1311   if (__kmp_env_consistency_check)
1312     __kmp_push_parallel(global_tid, NULL);
1313 #if OMPT_SUPPORT
1314   serial_team->t.ompt_team_info.master_return_address = codeptr;
1315   if (ompt_enabled.enabled &&
1316       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1317     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1318         OMPT_GET_FRAME_ADDRESS(0);
1319 
1320     ompt_lw_taskteam_t lw_taskteam;
1321     __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1322                             &ompt_parallel_data, codeptr);
1323 
1324     __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
    // don't use lw_taskteam after linking; its contents were swapped
1326 
1327     /* OMPT implicit task begin */
1328     if (ompt_enabled.ompt_callback_implicit_task) {
1329       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1330           ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1331           OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
1332           ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1333       OMPT_CUR_TASK_INFO(this_thr)->thread_num =
1334           __kmp_tid_from_gtid(global_tid);
1335     }
1336 
1337     /* OMPT state */
1338     this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1339     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1340         OMPT_GET_FRAME_ADDRESS(0);
1341   }
1342 #endif
1343 }
1344 
1345 /* most of the work for a fork */
1346 /* return true if we really went parallel, false if serialized */
1347 int __kmp_fork_call(ident_t *loc, int gtid,
1348                     enum fork_context_e call_context, // Intel, GNU, ...
1349                     kmp_int32 argc, microtask_t microtask, launch_t invoker,
1350                     kmp_va_list ap) {
1351   void **argv;
1352   int i;
1353   int master_tid;
1354   int master_this_cons;
1355   kmp_team_t *team;
1356   kmp_team_t *parent_team;
1357   kmp_info_t *master_th;
1358   kmp_root_t *root;
1359   int nthreads;
1360   int master_active;
1361   int master_set_numthreads;
1362   int level;
1363   int active_level;
1364   int teams_level;
1365 #if KMP_NESTED_HOT_TEAMS
1366   kmp_hot_team_ptr_t **p_hot_teams;
1367 #endif
1368   { // KMP_TIME_BLOCK
1369     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1370     KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1371 
1372     KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1373     if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1374       /* Some systems prefer the stack for the root thread(s) to start with */
1375       /* some gap from the parent stack to prevent false sharing. */
1376       void *dummy = KMP_ALLOCA(__kmp_stkpadding);
      /* The two lines below exist so that this does not get optimized out */
1378       if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1379         __kmp_stkpadding += (short)((kmp_int64)dummy);
1380     }
1381 
1382     /* initialize if needed */
1383     KMP_DEBUG_ASSERT(
1384         __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1385     if (!TCR_4(__kmp_init_parallel))
1386       __kmp_parallel_initialize();
1387     __kmp_resume_if_soft_paused();
1388 
1389     /* setup current data */
1390     master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1391     // shutdown
1392     parent_team = master_th->th.th_team;
1393     master_tid = master_th->th.th_info.ds.ds_tid;
1394     master_this_cons = master_th->th.th_local.this_construct;
1395     root = master_th->th.th_root;
1396     master_active = root->r.r_active;
1397     master_set_numthreads = master_th->th.th_set_nproc;
1398 
1399 #if OMPT_SUPPORT
1400     ompt_data_t ompt_parallel_data = ompt_data_none;
1401     ompt_data_t *parent_task_data;
1402     ompt_frame_t *ompt_frame;
1403     ompt_data_t *implicit_task_data;
1404     void *return_address = NULL;
1405 
1406     if (ompt_enabled.enabled) {
1407       __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1408                                     NULL, NULL);
1409       return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1410     }
1411 #endif
1412 
1413     // Assign affinity to root thread if it hasn't happened yet
1414     __kmp_assign_root_init_mask();
1415 
1416     // Nested level will be an index in the nested nthreads array
1417     level = parent_team->t.t_level;
1418     // used to launch non-serial teams even if nested is not allowed
1419     active_level = parent_team->t.t_active_level;
1420     // needed to check nesting inside the teams
1421     teams_level = master_th->th.th_teams_level;
1422 #if KMP_NESTED_HOT_TEAMS
1423     p_hot_teams = &master_th->th.th_hot_teams;
1424     if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1425       *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1426           sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1427       (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1428       // it is either actual or not needed (when active_level > 0)
1429       (*p_hot_teams)[0].hot_team_nth = 1;
1430     }
1431 #endif
1432 
1433 #if OMPT_SUPPORT
1434     if (ompt_enabled.enabled) {
1435       if (ompt_enabled.ompt_callback_parallel_begin) {
1436         int team_size = master_set_numthreads
1437                             ? master_set_numthreads
1438                             : get__nproc_2(parent_team, master_tid);
1439         int flags = OMPT_INVOKER(call_context) |
1440                     ((microtask == (microtask_t)__kmp_teams_master)
1441                          ? ompt_parallel_league
1442                          : ompt_parallel_team);
1443         ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1444             parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1445             return_address);
1446       }
1447       master_th->th.ompt_thread_info.state = ompt_state_overhead;
1448     }
1449 #endif
1450 
1451     master_th->th.th_ident = loc;
1452 
1453     if (master_th->th.th_teams_microtask && ap &&
1454         microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1455       // AC: This is start of parallel that is nested inside teams construct.
1456       // The team is actual (hot), all workers are ready at the fork barrier.
1457       // No lock needed to initialize the team a bit, then free workers.
1458       parent_team->t.t_ident = loc;
1459       __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1460       parent_team->t.t_argc = argc;
1461       argv = (void **)parent_team->t.t_argv;
1462       for (i = argc - 1; i >= 0; --i)
1463         *argv++ = va_arg(kmp_va_deref(ap), void *);
      // Increment our nested depth levels, but do not increase serialization
1465       if (parent_team == master_th->th.th_serial_team) {
1466         // AC: we are in serialized parallel
1467         __kmpc_serialized_parallel(loc, gtid);
1468         KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1469 
1470         if (call_context == fork_context_gnu) {
1471           // AC: need to decrement t_serialized for enquiry functions to work
1472           // correctly, will restore at join time
1473           parent_team->t.t_serialized--;
1474           return TRUE;
1475         }
1476 
1477 #if OMPD_SUPPORT
1478         parent_team->t.t_pkfn = microtask;
1479 #endif
1480 
1481 #if OMPT_SUPPORT
1482         void *dummy;
1483         void **exit_frame_p;
1484 
1485         ompt_lw_taskteam_t lw_taskteam;
1486 
1487         if (ompt_enabled.enabled) {
1488           __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1489                                   &ompt_parallel_data, return_address);
1490           exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1491 
1492           __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
          // don't use lw_taskteam after linking. Content was swapped.
1494 
1495           /* OMPT implicit task begin */
1496           implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1497           if (ompt_enabled.ompt_callback_implicit_task) {
1498             OMPT_CUR_TASK_INFO(master_th)->thread_num =
1499                 __kmp_tid_from_gtid(gtid);
1500             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1501                 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1502                 implicit_task_data, 1,
1503                 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1504           }
1505 
1506           /* OMPT state */
1507           master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1508         } else {
1509           exit_frame_p = &dummy;
1510         }
1511 #endif
1512         // AC: need to decrement t_serialized for enquiry functions to work
1513         // correctly, will restore at join time
1514         parent_team->t.t_serialized--;
1515 
1516         {
1517           KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1518           KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1519           __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1520 #if OMPT_SUPPORT
1521                                  ,
1522                                  exit_frame_p
1523 #endif
1524           );
1525         }
1526 
1527 #if OMPT_SUPPORT
1528         if (ompt_enabled.enabled) {
1529           *exit_frame_p = NULL;
1530           OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1531           if (ompt_enabled.ompt_callback_implicit_task) {
1532             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1533                 ompt_scope_end, NULL, implicit_task_data, 1,
1534                 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1535           }
1536           ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1537           __ompt_lw_taskteam_unlink(master_th);
1538           if (ompt_enabled.ompt_callback_parallel_end) {
1539             ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1540                 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1541                 OMPT_INVOKER(call_context) | ompt_parallel_team,
1542                 return_address);
1543           }
1544           master_th->th.ompt_thread_info.state = ompt_state_overhead;
1545         }
1546 #endif
1547         return TRUE;
1548       }
1549 
1550       parent_team->t.t_pkfn = microtask;
1551       parent_team->t.t_invoke = invoker;
1552       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1553       parent_team->t.t_active_level++;
1554       parent_team->t.t_level++;
1555       parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1556 
1557 #if OMPT_SUPPORT
1558       if (ompt_enabled.enabled) {
1559         ompt_lw_taskteam_t lw_taskteam;
1560         __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1561                                 &ompt_parallel_data, return_address);
1562         __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1563       }
1564 #endif
1565 
1566       /* Change number of threads in the team if requested */
1567       if (master_set_numthreads) { // The parallel has num_threads clause
1568         if (master_set_numthreads < master_th->th.th_teams_size.nth) {
          // AC: we can only reduce the number of threads dynamically, not
          // increase it
1570           kmp_info_t **other_threads = parent_team->t.t_threads;
1571           parent_team->t.t_nproc = master_set_numthreads;
1572           for (i = 0; i < master_set_numthreads; ++i) {
1573             other_threads[i]->th.th_team_nproc = master_set_numthreads;
1574           }
1575           // Keep extra threads hot in the team for possible next parallels
1576         }
1577         master_th->th.th_set_nproc = 0;
1578       }
1579 
1580 #if USE_DEBUGGER
1581       if (__kmp_debugging) { // Let debugger override number of threads.
1582         int nth = __kmp_omp_num_threads(loc);
1583         if (nth > 0) { // 0 means debugger doesn't want to change num threads
1584           master_set_numthreads = nth;
1585         }
1586       }
1587 #endif
1588 
1589 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1590       if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1591            KMP_ITT_DEBUG) &&
1592           __kmp_forkjoin_frames_mode == 3 &&
1593           parent_team->t.t_active_level == 1 // only report frames at level 1
1594           && master_th->th.th_teams_size.nteams == 1) {
1595         kmp_uint64 tmp_time = __itt_get_timestamp();
1596         master_th->th.th_frame_time = tmp_time;
1597         parent_team->t.t_region_time = tmp_time;
1598       }
1599       if (__itt_stack_caller_create_ptr) {
1600         KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
1601         // create new stack stitching id before entering fork barrier
1602         parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1603       }
1604 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1605 
1606       KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1607                     "master_th=%p, gtid=%d\n",
1608                     root, parent_team, master_th, gtid));
1609       __kmp_internal_fork(loc, gtid, parent_team);
1610       KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1611                     "master_th=%p, gtid=%d\n",
1612                     root, parent_team, master_th, gtid));
1613 
1614       if (call_context == fork_context_gnu)
1615         return TRUE;
1616 
1617       /* Invoke microtask for PRIMARY thread */
1618       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1619                     parent_team->t.t_id, parent_team->t.t_pkfn));
1620 
1621       if (!parent_team->t.t_invoke(gtid)) {
1622         KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
1623       }
1624       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1625                     parent_team->t.t_id, parent_team->t.t_pkfn));
1626       KMP_MB(); /* Flush all pending memory write invalidates.  */
1627 
1628       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1629 
1630       return TRUE;
1631     } // Parallel closely nested in teams construct
1632 
1633 #if KMP_DEBUG
1634     if (__kmp_tasking_mode != tskm_immediate_exec) {
1635       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1636                        parent_team->t.t_task_team[master_th->th.th_task_state]);
1637     }
1638 #endif
1639 
1640     int enter_teams = 0;
1641     if (parent_team->t.t_active_level >=
1642         master_th->th.th_current_task->td_icvs.max_active_levels) {
1643       nthreads = 1;
1644     } else {
1645       enter_teams = ((ap == NULL && active_level == 0) ||
1646                      (ap && teams_level > 0 && teams_level == level));
1647       nthreads =
1648           master_set_numthreads
1649               ? master_set_numthreads
1650               : get__nproc_2(
1651                     parent_team,
1652                     master_tid); // TODO: get nproc directly from current task
1653 
      // Check whether we need to take the forkjoin lock (no need for a
      // serialized parallel outside of a teams construct). This code was moved
      // here from __kmp_reserve_threads() to speed up nested serialized
      // parallels.
1657       if (nthreads > 1) {
1658         if ((get__max_active_levels(master_th) == 1 &&
1659              (root->r.r_in_parallel && !enter_teams)) ||
1660             (__kmp_library == library_serial)) {
1661           KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1662                         " threads\n",
1663                         gtid, nthreads));
1664           nthreads = 1;
1665         }
1666       }
1667       if (nthreads > 1) {
1668         /* determine how many new threads we can use */
1669         __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1670         /* AC: If we execute teams from parallel region (on host), then teams
1671            should be created but each can only have 1 thread if nesting is
1672            disabled. If teams called from serial region, then teams and their
1673            threads should be created regardless of the nesting setting. */
1674         nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
1675                                          nthreads, enter_teams);
1676         if (nthreads == 1) {
1677           // Free lock for single thread execution here; for multi-thread
1678           // execution it will be freed later after team of threads created
1679           // and initialized
1680           __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1681         }
1682       }
1683     }
1684     KMP_DEBUG_ASSERT(nthreads > 0);
1685 
1686     // If we temporarily changed the set number of threads then restore it now
1687     master_th->th.th_set_nproc = 0;
1688 
1689     /* create a serialized parallel region? */
1690     if (nthreads == 1) {
1691 /* josh todo: hypothetical question: what do we do for OS X*? */
1692 #if KMP_OS_LINUX &&                                                            \
1693     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1694       void *args[argc];
1695 #else
1696       void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1697 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1698           KMP_ARCH_AARCH64) */
1699 
1700       KA_TRACE(20,
1701                ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1702 
1703       __kmpc_serialized_parallel(loc, gtid);
1704 
1705 #if OMPD_SUPPORT
1706       master_th->th.th_serial_team->t.t_pkfn = microtask;
1707 #endif
1708 
1709       if (call_context == fork_context_intel) {
1710         /* TODO this sucks, use the compiler itself to pass args! :) */
1711         master_th->th.th_serial_team->t.t_ident = loc;
1712         if (!ap) {
1713           // revert change made in __kmpc_serialized_parallel()
1714           master_th->th.th_serial_team->t.t_level--;
1715           // Get args from parent team for teams construct
1716 
1717 #if OMPT_SUPPORT
1718           void *dummy;
1719           void **exit_frame_p;
1720           ompt_task_info_t *task_info;
1721 
1722           ompt_lw_taskteam_t lw_taskteam;
1723 
1724           if (ompt_enabled.enabled) {
1725             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1726                                     &ompt_parallel_data, return_address);
1727 
1728             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
            // don't use lw_taskteam after linking. Content was swapped.
1730 
1731             task_info = OMPT_CUR_TASK_INFO(master_th);
1732             exit_frame_p = &(task_info->frame.exit_frame.ptr);
1733             if (ompt_enabled.ompt_callback_implicit_task) {
1734               OMPT_CUR_TASK_INFO(master_th)->thread_num =
1735                   __kmp_tid_from_gtid(gtid);
1736               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1737                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1738                   &(task_info->task_data), 1,
1739                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1740                   ompt_task_implicit);
1741             }
1742 
1743             /* OMPT state */
1744             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1745           } else {
1746             exit_frame_p = &dummy;
1747           }
1748 #endif
1749 
1750           {
1751             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1752             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1753             __kmp_invoke_microtask(microtask, gtid, 0, argc,
1754                                    parent_team->t.t_argv
1755 #if OMPT_SUPPORT
1756                                    ,
1757                                    exit_frame_p
1758 #endif
1759             );
1760           }
1761 
1762 #if OMPT_SUPPORT
1763           if (ompt_enabled.enabled) {
1764             *exit_frame_p = NULL;
1765             if (ompt_enabled.ompt_callback_implicit_task) {
1766               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1767                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1768                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1769                   ompt_task_implicit);
1770             }
1771             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1772             __ompt_lw_taskteam_unlink(master_th);
1773             if (ompt_enabled.ompt_callback_parallel_end) {
1774               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1775                   &ompt_parallel_data, parent_task_data,
1776                   OMPT_INVOKER(call_context) | ompt_parallel_team,
1777                   return_address);
1778             }
1779             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1780           }
1781 #endif
1782         } else if (microtask == (microtask_t)__kmp_teams_master) {
1783           KMP_DEBUG_ASSERT(master_th->th.th_team ==
1784                            master_th->th.th_serial_team);
1785           team = master_th->th.th_team;
1786           // team->t.t_pkfn = microtask;
1787           team->t.t_invoke = invoker;
1788           __kmp_alloc_argv_entries(argc, team, TRUE);
1789           team->t.t_argc = argc;
1790           argv = (void **)team->t.t_argv;
1791           if (ap) {
1792             for (i = argc - 1; i >= 0; --i)
1793               *argv++ = va_arg(kmp_va_deref(ap), void *);
1794           } else {
1795             for (i = 0; i < argc; ++i)
1796               // Get args from parent team for teams construct
1797               argv[i] = parent_team->t.t_argv[i];
1798           }
1799           // AC: revert change made in __kmpc_serialized_parallel()
1800           //     because initial code in teams should have level=0
1801           team->t.t_level--;
1802           // AC: call special invoker for outer "parallel" of teams construct
1803           invoker(gtid);
1804 #if OMPT_SUPPORT
1805           if (ompt_enabled.enabled) {
1806             ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1807             if (ompt_enabled.ompt_callback_implicit_task) {
1808               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1809                   ompt_scope_end, NULL, &(task_info->task_data), 0,
1810                   OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1811             }
1812             if (ompt_enabled.ompt_callback_parallel_end) {
1813               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1814                   &ompt_parallel_data, parent_task_data,
1815                   OMPT_INVOKER(call_context) | ompt_parallel_league,
1816                   return_address);
1817             }
1818             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1819           }
1820 #endif
1821         } else {
1822           argv = args;
1823           for (i = argc - 1; i >= 0; --i)
1824             *argv++ = va_arg(kmp_va_deref(ap), void *);
1825           KMP_MB();
1826 
1827 #if OMPT_SUPPORT
1828           void *dummy;
1829           void **exit_frame_p;
1830           ompt_task_info_t *task_info;
1831 
1832           ompt_lw_taskteam_t lw_taskteam;
1833 
1834           if (ompt_enabled.enabled) {
1835             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1836                                     &ompt_parallel_data, return_address);
1837             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
            // don't use lw_taskteam after linking. Content was swapped.
1839             task_info = OMPT_CUR_TASK_INFO(master_th);
1840             exit_frame_p = &(task_info->frame.exit_frame.ptr);
1841 
1842             /* OMPT implicit task begin */
1843             implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1844             if (ompt_enabled.ompt_callback_implicit_task) {
1845               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1846                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1847                   implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1848                   ompt_task_implicit);
1849               OMPT_CUR_TASK_INFO(master_th)->thread_num =
1850                   __kmp_tid_from_gtid(gtid);
1851             }
1852 
1853             /* OMPT state */
1854             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1855           } else {
1856             exit_frame_p = &dummy;
1857           }
1858 #endif
1859 
1860           {
1861             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1862             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1863             __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1864 #if OMPT_SUPPORT
1865                                    ,
1866                                    exit_frame_p
1867 #endif
1868             );
1869           }
1870 
1871 #if OMPT_SUPPORT
1872           if (ompt_enabled.enabled) {
1873             *exit_frame_p = NULL;
1874             if (ompt_enabled.ompt_callback_implicit_task) {
1875               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1876                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1877                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1878                   ompt_task_implicit);
1879             }
1880 
1881             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1882             __ompt_lw_taskteam_unlink(master_th);
1883             if (ompt_enabled.ompt_callback_parallel_end) {
1884               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1885                   &ompt_parallel_data, parent_task_data,
1886                   OMPT_INVOKER(call_context) | ompt_parallel_team,
1887                   return_address);
1888             }
1889             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1890           }
1891 #endif
1892         }
1893       } else if (call_context == fork_context_gnu) {
1894 #if OMPT_SUPPORT
1895         ompt_lw_taskteam_t lwt;
1896         __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1897                                 return_address);
1898 
1899         lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1900         __ompt_lw_taskteam_link(&lwt, master_th, 1);
// don't use lw_taskteam after linking. Content was swapped.
1902 #endif
1903 
1904         // we were called from GNU native code
1905         KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1906         return FALSE;
1907       } else {
1908         KMP_ASSERT2(call_context < fork_context_last,
1909                     "__kmp_fork_call: unknown fork_context parameter");
1910       }
1911 
1912       KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1913       KMP_MB();
1914       return FALSE;
1915     } // if (nthreads == 1)
1916 
1917     // GEH: only modify the executing flag in the case when not serialized
1918     //      serialized case is handled in kmpc_serialized_parallel
1919     KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1920                   "curtask=%p, curtask_max_aclevel=%d\n",
1921                   parent_team->t.t_active_level, master_th,
1922                   master_th->th.th_current_task,
1923                   master_th->th.th_current_task->td_icvs.max_active_levels));
1924     // TODO: GEH - cannot do this assertion because root thread not set up as
1925     // executing
1926     // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1927     master_th->th.th_current_task->td_flags.executing = 0;
1928 
1929     if (!master_th->th.th_teams_microtask || level > teams_level) {
1930       /* Increment our nested depth level */
1931       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1932     }
1933 
1934     // See if we need to make a copy of the ICVs.
1935     int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1936     if ((level + 1 < __kmp_nested_nth.used) &&
1937         (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1938       nthreads_icv = __kmp_nested_nth.nth[level + 1];
1939     } else {
1940       nthreads_icv = 0; // don't update
1941     }
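    // Illustrative note (assumed example): with a nested nthreads list such
    // as OMP_NUM_THREADS="4,2", __kmp_nested_nth.nth holds {4, 2}; a parallel
    // region forked at level 0 then propagates nproc=2 into the new team's
    // ICVs through nthreads_icv, while a single-valued setting leaves
    // nthreads_icv at 0 (no update):
    //
    //   $ OMP_NUM_THREADS=4,2 ./app
    //   level 0 -> team of 4; a nested parallel at level 1 -> teams of 2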
1942 
1943     // Figure out the proc_bind_policy for the new team.
1944     kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1945     kmp_proc_bind_t proc_bind_icv =
1946         proc_bind_default; // proc_bind_default means don't update
1947     if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1948       proc_bind = proc_bind_false;
1949     } else {
1950       if (proc_bind == proc_bind_default) {
1951         // No proc_bind clause specified; use current proc-bind-var for this
1952         // parallel region
1953         proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1954       }
1955       /* else: The proc_bind policy was specified explicitly on parallel clause.
1956          This overrides proc-bind-var for this parallel region, but does not
1957          change proc-bind-var. */
1958       // Figure the value of proc-bind-var for the child threads.
1959       if ((level + 1 < __kmp_nested_proc_bind.used) &&
1960           (__kmp_nested_proc_bind.bind_types[level + 1] !=
1961            master_th->th.th_current_task->td_icvs.proc_bind)) {
1962         proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1963       }
1964     }
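    // Illustrative note (assumed example): with OMP_PROC_BIND="spread,close"
    // and no proc_bind clause on the construct, proc_bind resolves to the
    // current proc-bind-var ("spread" at the outermost level), while
    // proc_bind_icv becomes "close" so the children's proc-bind-var is set
    // for the next nesting level:
    //
    //   $ OMP_PROC_BIND=spread,close OMP_PLACES=cores ./app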
1965 
1966     // Reset for next parallel region
1967     master_th->th.th_set_proc_bind = proc_bind_default;
1968 
1969     if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
1970       kmp_internal_control_t new_icvs;
1971       copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
1972       new_icvs.next = NULL;
1973       if (nthreads_icv > 0) {
1974         new_icvs.nproc = nthreads_icv;
1975       }
1976       if (proc_bind_icv != proc_bind_default) {
1977         new_icvs.proc_bind = proc_bind_icv;
1978       }
1979 
1980       /* allocate a new parallel team */
1981       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
1982       team = __kmp_allocate_team(root, nthreads, nthreads,
1983 #if OMPT_SUPPORT
1984                                  ompt_parallel_data,
1985 #endif
1986                                  proc_bind, &new_icvs,
1987                                  argc USE_NESTED_HOT_ARG(master_th));
1988     } else {
1989       /* allocate a new parallel team */
1990       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
1991       team = __kmp_allocate_team(root, nthreads, nthreads,
1992 #if OMPT_SUPPORT
1993                                  ompt_parallel_data,
1994 #endif
1995                                  proc_bind,
1996                                  &master_th->th.th_current_task->td_icvs,
1997                                  argc USE_NESTED_HOT_ARG(master_th));
1998     }
1999     KF_TRACE(
2000         10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2001 
2002     /* setup the new team */
2003     KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2004     KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2005     KMP_CHECK_UPDATE(team->t.t_ident, loc);
2006     KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2007     KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2008 #if OMPT_SUPPORT
2009     KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2010                           return_address);
2011 #endif
2012     KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2013     // TODO: parent_team->t.t_level == INT_MAX ???
2014     if (!master_th->th.th_teams_microtask || level > teams_level) {
2015       int new_level = parent_team->t.t_level + 1;
2016       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2017       new_level = parent_team->t.t_active_level + 1;
2018       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2019     } else {
2020       // AC: Do not increase parallel level at start of the teams construct
2021       int new_level = parent_team->t.t_level;
2022       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2023       new_level = parent_team->t.t_active_level;
2024       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2025     }
2026     kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2027     // set primary thread's schedule as new run-time schedule
2028     KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2029 
2030     KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2031     KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2032 
2033     // Update the floating point rounding in the team if required.
2034     propagateFPControl(team);
2035 #if OMPD_SUPPORT
2036     if (ompd_state & OMPD_ENABLE_BP)
2037       ompd_bp_parallel_begin();
2038 #endif
2039 
2040     if (__kmp_tasking_mode != tskm_immediate_exec) {
      // Set the primary thread's task team to the team's task team. Unless
      // this is a hot team, it should be NULL.
2043       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2044                        parent_team->t.t_task_team[master_th->th.th_task_state]);
2045       KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team "
2046                     "%p, new task_team %p / team %p\n",
2047                     __kmp_gtid_from_thread(master_th),
2048                     master_th->th.th_task_team, parent_team,
2049                     team->t.t_task_team[master_th->th.th_task_state], team));
2050 
2051       if (active_level || master_th->th.th_task_team) {
2052         // Take a memo of primary thread's task_state
2053         KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2054         if (master_th->th.th_task_state_top >=
2055             master_th->th.th_task_state_stack_sz) { // increase size
2056           kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2057           kmp_uint8 *old_stack, *new_stack;
2058           kmp_uint32 i;
2059           new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2060           for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2061             new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2062           }
2063           for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2064                ++i) { // zero-init rest of stack
2065             new_stack[i] = 0;
2066           }
2067           old_stack = master_th->th.th_task_state_memo_stack;
2068           master_th->th.th_task_state_memo_stack = new_stack;
2069           master_th->th.th_task_state_stack_sz = new_size;
2070           __kmp_free(old_stack);
2071         }
2072         // Store primary thread's task_state on stack
2073         master_th->th
2074             .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2075             master_th->th.th_task_state;
2076         master_th->th.th_task_state_top++;
2077 #if KMP_NESTED_HOT_TEAMS
2078         if (master_th->th.th_hot_teams &&
2079             active_level < __kmp_hot_teams_max_level &&
2080             team == master_th->th.th_hot_teams[active_level].hot_team) {
2081           // Restore primary thread's nested state if nested hot team
2082           master_th->th.th_task_state =
2083               master_th->th
2084                   .th_task_state_memo_stack[master_th->th.th_task_state_top];
2085         } else {
2086 #endif
2087           master_th->th.th_task_state = 0;
2088 #if KMP_NESTED_HOT_TEAMS
2089         }
2090 #endif
2091       }
2092 #if !KMP_NESTED_HOT_TEAMS
2093       KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2094                        (team == root->r.r_hot_team));
2095 #endif
2096     }
2097 
2098     KA_TRACE(
2099         20,
2100         ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2101          gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2102          team->t.t_nproc));
2103     KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2104                      (team->t.t_master_tid == 0 &&
2105                       (team->t.t_parent == root->r.r_root_team ||
2106                        team->t.t_parent->t.t_serialized)));
2107     KMP_MB();
2108 
2109     /* now, setup the arguments */
2110     argv = (void **)team->t.t_argv;
2111     if (ap) {
2112       for (i = argc - 1; i >= 0; --i) {
2113         void *new_argv = va_arg(kmp_va_deref(ap), void *);
2114         KMP_CHECK_UPDATE(*argv, new_argv);
2115         argv++;
2116       }
2117     } else {
2118       for (i = 0; i < argc; ++i) {
2119         // Get args from parent team for teams construct
2120         KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2121       }
2122     }
2123 
2124     /* now actually fork the threads */
2125     KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2126     if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2127       root->r.r_active = TRUE;
2128 
2129     __kmp_fork_team_threads(root, team, master_th, gtid);
2130     __kmp_setup_icv_copy(team, nthreads,
2131                          &master_th->th.th_current_task->td_icvs, loc);
2132 
2133 #if OMPT_SUPPORT
2134     master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2135 #endif
2136 
2137     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2138 
2139 #if USE_ITT_BUILD
2140     if (team->t.t_active_level == 1 // only report frames at level 1
2141         && !master_th->th.th_teams_microtask) { // not in teams construct
2142 #if USE_ITT_NOTIFY
2143       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2144           (__kmp_forkjoin_frames_mode == 3 ||
2145            __kmp_forkjoin_frames_mode == 1)) {
2146         kmp_uint64 tmp_time = 0;
2147         if (__itt_get_timestamp_ptr)
2148           tmp_time = __itt_get_timestamp();
2149         // Internal fork - report frame begin
2150         master_th->th.th_frame_time = tmp_time;
2151         if (__kmp_forkjoin_frames_mode == 3)
2152           team->t.t_region_time = tmp_time;
2153       } else
2154 // only one notification scheme (either "submit" or "forking/joined", not both)
2155 #endif /* USE_ITT_NOTIFY */
2156           if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2157               __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2158         // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2159         __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2160       }
2161     }
2162 #endif /* USE_ITT_BUILD */
2163 
2164     /* now go on and do the work */
2165     KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2166     KMP_MB();
2167     KF_TRACE(10,
2168              ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2169               root, team, master_th, gtid));
2170 
2171 #if USE_ITT_BUILD
2172     if (__itt_stack_caller_create_ptr) {
2173       // create new stack stitching id before entering fork barrier
2174       if (!enter_teams) {
2175         KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
2176         team->t.t_stack_id = __kmp_itt_stack_caller_create();
2177       } else if (parent_team->t.t_serialized) {
2178         // keep stack stitching id in the serialized parent_team;
2179         // current team will be used for parallel inside the teams;
2180         // if parent_team is active, then it already keeps stack stitching id
2181         // for the league of teams
2182         KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
2183         parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
2184       }
2185     }
2186 #endif /* USE_ITT_BUILD */
2187 
2188     // AC: skip __kmp_internal_fork at teams construct, let only primary
2189     // threads execute
2190     if (ap) {
2191       __kmp_internal_fork(loc, gtid, team);
2192       KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2193                     "master_th=%p, gtid=%d\n",
2194                     root, team, master_th, gtid));
2195     }
2196 
2197     if (call_context == fork_context_gnu) {
2198       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2199       return TRUE;
2200     }
2201 
2202     /* Invoke microtask for PRIMARY thread */
2203     KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2204                   team->t.t_id, team->t.t_pkfn));
2205   } // END of timer KMP_fork_call block
2206 
2207 #if KMP_STATS_ENABLED
2208   // If beginning a teams construct, then change thread state
2209   stats_state_e previous_state = KMP_GET_THREAD_STATE();
2210   if (!ap) {
2211     KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2212   }
2213 #endif
2214 
2215   if (!team->t.t_invoke(gtid)) {
2216     KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
2217   }
2218 
2219 #if KMP_STATS_ENABLED
2220   // If was beginning of a teams construct, then reset thread state
2221   if (!ap) {
2222     KMP_SET_THREAD_STATE(previous_state);
2223   }
2224 #endif
2225 
2226   KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2227                 team->t.t_id, team->t.t_pkfn));
2228   KMP_MB(); /* Flush all pending memory write invalidates.  */
2229 
2230   KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2231 #if OMPT_SUPPORT
2232   if (ompt_enabled.enabled) {
2233     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2234   }
2235 #endif
2236 
2237   return TRUE;
2238 }
2239 
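/* Illustrative sketch of the tool side (assumed example, not part of the
   runtime): an OMPT tool that registered ompt_callback_parallel_end receives
   the parallel_data, the encountering task's data, the flags
   (invoker | league/team), and the code pointer that the helpers below pass
   through:

     #include <omp-tools.h>
     static void on_parallel_end(ompt_data_t *parallel_data,
                                 ompt_data_t *encountering_task_data,
                                 int flags, const void *codeptr_ra) {
       // e.g. close a record opened in the matching parallel_begin callback
     }
*/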
2240 #if OMPT_SUPPORT
2241 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2242                                             kmp_team_t *team) {
2243   // restore state outside the region
2244   thread->th.ompt_thread_info.state =
2245       ((team->t.t_serialized) ? ompt_state_work_serial
2246                               : ompt_state_work_parallel);
2247 }
2248 
2249 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2250                                    kmp_team_t *team, ompt_data_t *parallel_data,
2251                                    int flags, void *codeptr) {
2252   ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2253   if (ompt_enabled.ompt_callback_parallel_end) {
2254     ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2255         parallel_data, &(task_info->task_data), flags, codeptr);
2256   }
2257 
2258   task_info->frame.enter_frame = ompt_data_none;
2259   __kmp_join_restore_state(thread, team);
2260 }
2261 #endif
2262 
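/* Illustrative sketch (simplified, assumed example): for the Intel entry
   point, __kmpc_fork_call() pairs the fork with the join roughly as

     __kmp_fork_call(loc, gtid, fork_context_intel, argc, microtask,
                     __kmp_invoke_task_func, ap);
     __kmp_join_call(loc, gtid, fork_context_intel); // OMPT_SUPPORT builds

   so __kmp_join_call() below runs on the primary thread after the team has
   finished the microtask (or right after the inlined body, if the region was
   serialized). */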
2263 void __kmp_join_call(ident_t *loc, int gtid
2264 #if OMPT_SUPPORT
2265                      ,
2266                      enum fork_context_e fork_context
2267 #endif
2268                      ,
2269                      int exit_teams) {
2270   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2271   kmp_team_t *team;
2272   kmp_team_t *parent_team;
2273   kmp_info_t *master_th;
2274   kmp_root_t *root;
2275   int master_active;
2276 
2277   KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2278 
2279   /* setup current data */
2280   master_th = __kmp_threads[gtid];
2281   root = master_th->th.th_root;
2282   team = master_th->th.th_team;
2283   parent_team = team->t.t_parent;
2284 
2285   master_th->th.th_ident = loc;
2286 
2287 #if OMPT_SUPPORT
2288   void *team_microtask = (void *)team->t.t_pkfn;
2289   // For GOMP interface with serialized parallel, need the
2290   // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task
2291   // and end-parallel events.
2292   if (ompt_enabled.enabled &&
2293       !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2294     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2295   }
2296 #endif
2297 
2298 #if KMP_DEBUG
2299   if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2300     KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2301                   "th_task_team = %p\n",
2302                   __kmp_gtid_from_thread(master_th), team,
2303                   team->t.t_task_team[master_th->th.th_task_state],
2304                   master_th->th.th_task_team));
2305     KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2306                      team->t.t_task_team[master_th->th.th_task_state]);
2307   }
2308 #endif
2309 
2310   if (team->t.t_serialized) {
2311     if (master_th->th.th_teams_microtask) {
2312       // We are in teams construct
2313       int level = team->t.t_level;
2314       int tlevel = master_th->th.th_teams_level;
2315       if (level == tlevel) {
2316         // AC: we haven't incremented it earlier at start of teams construct,
2317         //     so do it here - at the end of teams construct
2318         team->t.t_level++;
2319       } else if (level == tlevel + 1) {
2320         // AC: we are exiting parallel inside teams, need to increment
2321         // serialization in order to restore it in the next call to
2322         // __kmpc_end_serialized_parallel
2323         team->t.t_serialized++;
2324       }
2325     }
2326     __kmpc_end_serialized_parallel(loc, gtid);
2327 
2328 #if OMPT_SUPPORT
2329     if (ompt_enabled.enabled) {
2330       __kmp_join_restore_state(master_th, parent_team);
2331     }
2332 #endif
2333 
2334     return;
2335   }
2336 
2337   master_active = team->t.t_master_active;
2338 
2339   if (!exit_teams) {
2340     // AC: No barrier for internal teams at exit from teams construct.
2341     //     But there is barrier for external team (league).
2342     __kmp_internal_join(loc, gtid, team);
2343 #if USE_ITT_BUILD
2344     if (__itt_stack_caller_create_ptr) {
2345       KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
2346       // destroy the stack stitching id after join barrier
2347       __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2348       team->t.t_stack_id = NULL;
2349     }
2350 #endif
2351   } else {
2352     master_th->th.th_task_state =
2353         0; // AC: no tasking in teams (out of any parallel)
2354 #if USE_ITT_BUILD
2355     if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
2356       KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
2357       // destroy the stack stitching id on exit from the teams construct
2358       // if parent_team is active, then the id will be destroyed later on
2359       // by master of the league of teams
2360       __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
2361       parent_team->t.t_stack_id = NULL;
2362     }
2363 #endif
2364   }
2365 
2366   KMP_MB();
2367 
2368 #if OMPT_SUPPORT
2369   ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2370   void *codeptr = team->t.ompt_team_info.master_return_address;
2371 #endif
2372 
2373 #if USE_ITT_BUILD
2374   // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2375   if (team->t.t_active_level == 1 &&
2376       (!master_th->th.th_teams_microtask || /* not in teams construct */
2377        master_th->th.th_teams_size.nteams == 1)) {
2378     master_th->th.th_ident = loc;
2379     // only one notification scheme (either "submit" or "forking/joined", not
2380     // both)
2381     if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2382         __kmp_forkjoin_frames_mode == 3)
2383       __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2384                              master_th->th.th_frame_time, 0, loc,
2385                              master_th->th.th_team_nproc, 1);
2386     else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2387              !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2388       __kmp_itt_region_joined(gtid);
2389   } // active_level == 1
2390 #endif /* USE_ITT_BUILD */
2391 
2392   if (master_th->th.th_teams_microtask && !exit_teams &&
2393       team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2394       team->t.t_level == master_th->th.th_teams_level + 1) {
// AC: We need to leave the team structure intact at the end of a parallel
// inside the teams construct, so that the same (hot) team works at the next
// parallel; only adjust the nesting levels
2398 #if OMPT_SUPPORT
2399     ompt_data_t ompt_parallel_data = ompt_data_none;
2400     if (ompt_enabled.enabled) {
2401       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2402       if (ompt_enabled.ompt_callback_implicit_task) {
2403         int ompt_team_size = team->t.t_nproc;
2404         ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2405             ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2406             OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2407       }
2408       task_info->frame.exit_frame = ompt_data_none;
2409       task_info->task_data = ompt_data_none;
2410       ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2411       __ompt_lw_taskteam_unlink(master_th);
2412     }
2413 #endif
2414     /* Decrement our nested depth level */
2415     team->t.t_level--;
2416     team->t.t_active_level--;
2417     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2418 
2419     // Restore number of threads in the team if needed. This code relies on
2420     // the proper adjustment of th_teams_size.nth after the fork in
2421     // __kmp_teams_master on each teams primary thread in the case that
2422     // __kmp_reserve_threads reduced it.
2423     if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2424       int old_num = master_th->th.th_team_nproc;
2425       int new_num = master_th->th.th_teams_size.nth;
2426       kmp_info_t **other_threads = team->t.t_threads;
2427       team->t.t_nproc = new_num;
2428       for (int i = 0; i < old_num; ++i) {
2429         other_threads[i]->th.th_team_nproc = new_num;
2430       }
2431       // Adjust states of non-used threads of the team
2432       for (int i = old_num; i < new_num; ++i) {
2433         // Re-initialize thread's barrier data.
2434         KMP_DEBUG_ASSERT(other_threads[i]);
2435         kmp_balign_t *balign = other_threads[i]->th.th_bar;
2436         for (int b = 0; b < bs_last_barrier; ++b) {
2437           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2438           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2439 #if USE_DEBUGGER
2440           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2441 #endif
2442         }
2443         if (__kmp_tasking_mode != tskm_immediate_exec) {
2444           // Synchronize thread's task state
2445           other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2446         }
2447       }
2448     }
2449 
2450 #if OMPT_SUPPORT
2451     if (ompt_enabled.enabled) {
2452       __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2453                       OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2454     }
2455 #endif
2456 
2457     return;
2458   }
2459 
2460   /* do cleanup and restore the parent team */
2461   master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2462   master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2463 
2464   master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2465 
2466   /* jc: The following lock has instructions with REL and ACQ semantics,
2467      separating the parallel user code called in this parallel region
2468      from the serial user code called after this function returns. */
2469   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2470 
2471   if (!master_th->th.th_teams_microtask ||
2472       team->t.t_level > master_th->th.th_teams_level) {
2473     /* Decrement our nested depth level */
2474     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2475   }
2476   KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2477 
2478 #if OMPT_SUPPORT
2479   if (ompt_enabled.enabled) {
2480     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2481     if (ompt_enabled.ompt_callback_implicit_task) {
2482       int flags = (team_microtask == (void *)__kmp_teams_master)
2483                       ? ompt_task_initial
2484                       : ompt_task_implicit;
2485       int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2486       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2487           ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2488           OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2489     }
2490     task_info->frame.exit_frame = ompt_data_none;
2491     task_info->task_data = ompt_data_none;
2492   }
2493 #endif
2494 
2495   KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2496                 master_th, team));
2497   __kmp_pop_current_task_from_thread(master_th);
2498 
2499 #if KMP_AFFINITY_SUPPORTED
2500   // Restore master thread's partition.
2501   master_th->th.th_first_place = team->t.t_first_place;
2502   master_th->th.th_last_place = team->t.t_last_place;
2503 #endif // KMP_AFFINITY_SUPPORTED
2504   master_th->th.th_def_allocator = team->t.t_def_allocator;
2505 
2506 #if OMPD_SUPPORT
2507   if (ompd_state & OMPD_ENABLE_BP)
2508     ompd_bp_parallel_end();
2509 #endif
2510   updateHWFPControl(team);
2511 
2512   if (root->r.r_active != master_active)
2513     root->r.r_active = master_active;
2514 
2515   __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2516                             master_th)); // this will free worker threads
2517 
  /* This race was fun to find. Make sure the following is inside the critical
     region; otherwise assertions may fail occasionally since the old team may
     be reallocated and the hierarchy appears inconsistent. It is actually safe
     to run and won't cause any bugs, but it will cause those assertion
     failures. It's only one deref & assign, so we might as well put it in the
     critical region. */
2523   master_th->th.th_team = parent_team;
2524   master_th->th.th_team_nproc = parent_team->t.t_nproc;
2525   master_th->th.th_team_master = parent_team->t.t_threads[0];
2526   master_th->th.th_team_serialized = parent_team->t.t_serialized;
2527 
2528   /* restore serialized team, if need be */
2529   if (parent_team->t.t_serialized &&
2530       parent_team != master_th->th.th_serial_team &&
2531       parent_team != root->r.r_root_team) {
2532     __kmp_free_team(root,
2533                     master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2534     master_th->th.th_serial_team = parent_team;
2535   }
2536 
2537   if (__kmp_tasking_mode != tskm_immediate_exec) {
2538     if (master_th->th.th_task_state_top >
2539         0) { // Restore task state from memo stack
2540       KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2541       // Remember primary thread's state if we re-use this nested hot team
2542       master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2543           master_th->th.th_task_state;
2544       --master_th->th.th_task_state_top; // pop
2545       // Now restore state at this level
2546       master_th->th.th_task_state =
2547           master_th->th
2548               .th_task_state_memo_stack[master_th->th.th_task_state_top];
2549     }
2550     // Copy the task team from the parent team to the primary thread
2551     master_th->th.th_task_team =
2552         parent_team->t.t_task_team[master_th->th.th_task_state];
2553     KA_TRACE(20,
2554              ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
2555               __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2556               parent_team));
2557   }
2558 
2559   // TODO: GEH - cannot do this assertion because root thread not set up as
2560   // executing
2561   // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2562   master_th->th.th_current_task->td_flags.executing = 1;
2563 
2564   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2565 
2566 #if OMPT_SUPPORT
2567   int flags =
2568       OMPT_INVOKER(fork_context) |
2569       ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2570                                                       : ompt_parallel_team);
2571   if (ompt_enabled.enabled) {
2572     __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2573                     codeptr);
2574   }
2575 #endif
2576 
2577   KMP_MB();
2578   KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2579 }
2580 
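/* Illustrative sketch (assumed example): the control-stack push below matters
   when ICVs are modified inside a nested, serialized parallel region, e.g.

     #pragma omp parallel num_threads(1)   // serialized, t_serialized == 1
     {
       #pragma omp parallel num_threads(1) // nested, t_serialized == 2
       {
         omp_set_num_threads(8);           // push a record for level 2, so
       }                                   // the outer level's ICVs can be
     }                                     // restored when the level ends
*/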
2581 /* Check whether we should push an internal control record onto the
2582    serial team stack.  If so, do it.  */
2583 void __kmp_save_internal_controls(kmp_info_t *thread) {
2584 
2585   if (thread->th.th_team != thread->th.th_serial_team) {
2586     return;
2587   }
2588   if (thread->th.th_team->t.t_serialized > 1) {
2589     int push = 0;
2590 
2591     if (thread->th.th_team->t.t_control_stack_top == NULL) {
2592       push = 1;
2593     } else {
2594       if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2595           thread->th.th_team->t.t_serialized) {
2596         push = 1;
2597       }
2598     }
2599     if (push) { /* push a record on the serial team's stack */
2600       kmp_internal_control_t *control =
2601           (kmp_internal_control_t *)__kmp_allocate(
2602               sizeof(kmp_internal_control_t));
2603 
2604       copy_icvs(control, &thread->th.th_current_task->td_icvs);
2605 
2606       control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2607 
2608       control->next = thread->th.th_team->t.t_control_stack_top;
2609       thread->th.th_team->t.t_control_stack_top = control;
2610     }
2611   }
2612 }
2613 
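/* Illustrative sketch (assumed example): the user-level API call

     omp_set_num_threads(8);   // forwarded to __kmp_set_num_threads(8, gtid)
     #pragma omp parallel      // subsequent regions use up to 8 threads
     { ... }

   reaches __kmp_set_num_threads() through the libomp entry-point layer. As a
   side effect, if the hot team is currently larger than the new value and the
   root is not active, the extra hot-team threads are released here rather
   than at the next fork. */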
2614 /* Changes set_nproc */
2615 void __kmp_set_num_threads(int new_nth, int gtid) {
2616   kmp_info_t *thread;
2617   kmp_root_t *root;
2618 
2619   KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2620   KMP_DEBUG_ASSERT(__kmp_init_serial);
2621 
2622   if (new_nth < 1)
2623     new_nth = 1;
2624   else if (new_nth > __kmp_max_nth)
2625     new_nth = __kmp_max_nth;
2626 
2627   KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2628   thread = __kmp_threads[gtid];
2629   if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2630     return; // nothing to do
2631 
2632   __kmp_save_internal_controls(thread);
2633 
2634   set__nproc(thread, new_nth);
2635 
2636   // If this omp_set_num_threads() call will cause the hot team size to be
2637   // reduced (in the absence of a num_threads clause), then reduce it now,
2638   // rather than waiting for the next parallel region.
2639   root = thread->th.th_root;
2640   if (__kmp_init_parallel && (!root->r.r_active) &&
2641       (root->r.r_hot_team->t.t_nproc > new_nth)
2642 #if KMP_NESTED_HOT_TEAMS
2643       && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2644 #endif
2645   ) {
2646     kmp_team_t *hot_team = root->r.r_hot_team;
2647     int f;
2648 
2649     __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2650 
2651     // Release the extra threads we don't need any more.
2652     for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2653       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2654       if (__kmp_tasking_mode != tskm_immediate_exec) {
2655         // When decreasing team size, threads no longer in the team should unref
2656         // task team.
2657         hot_team->t.t_threads[f]->th.th_task_team = NULL;
2658       }
2659       __kmp_free_thread(hot_team->t.t_threads[f]);
2660       hot_team->t.t_threads[f] = NULL;
2661     }
2662     hot_team->t.t_nproc = new_nth;
2663 #if KMP_NESTED_HOT_TEAMS
2664     if (thread->th.th_hot_teams) {
2665       KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2666       thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2667     }
2668 #endif
2669 
2670     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2671 
2672     // Update the t_nproc field in the threads that are still active.
2673     for (f = 0; f < new_nth; f++) {
2674       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2675       hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2676     }
    // Special flag to note the size was changed by omp_set_num_threads()
2678     hot_team->t.t_size_changed = -1;
2679   }
2680 }
2681 
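/* Illustrative sketch (assumed example): the OpenMP API routine

     omp_set_max_active_levels(2);   // allow two nested active levels
     #pragma omp parallel
     #pragma omp parallel            // still active with max-active-levels 2

   is forwarded to __kmp_set_max_active_levels() below; negative values are
   ignored with a warning and values above KMP_MAX_ACTIVE_LEVELS_LIMIT are
   clamped, as the validation comments in the function describe. */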
2682 /* Changes max_active_levels */
2683 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2684   kmp_info_t *thread;
2685 
2686   KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2687                 "%d = (%d)\n",
2688                 gtid, max_active_levels));
2689   KMP_DEBUG_ASSERT(__kmp_init_serial);
2690 
2691   // validate max_active_levels
2692   if (max_active_levels < 0) {
2693     KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2694     // We ignore this call if the user has specified a negative value.
2695     // The current setting won't be changed. The last valid setting will be
2696     // used. A warning will be issued (if warnings are allowed as controlled by
2697     // the KMP_WARNINGS env var).
2698     KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2699                   "max_active_levels for thread %d = (%d)\n",
2700                   gtid, max_active_levels));
2701     return;
2702   }
2703   if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
    // OK: max_active_levels is within the valid range
    // [0, KMP_MAX_ACTIVE_LEVELS_LIMIT].
    // A zero value is allowed. (implementation-defined behavior)
2707   } else {
2708     KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2709                 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2710     max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
    // The current upper limit is MAX_INT. (implementation-defined behavior)
    // If the input exceeds the upper limit, we clamp it to the upper limit.
    // (implementation-defined behavior)
    // In practice this branch is unreachable as long as the limit is MAX_INT.
2715   }
2716   KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2717                 "max_active_levels for thread %d = (%d)\n",
2718                 gtid, max_active_levels));
2719 
2720   thread = __kmp_threads[gtid];
2721 
2722   __kmp_save_internal_controls(thread);
2723 
2724   set__max_active_levels(thread, max_active_levels);
2725 }
2726 
2727 /* Gets max_active_levels */
2728 int __kmp_get_max_active_levels(int gtid) {
2729   kmp_info_t *thread;
2730 
2731   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2732   KMP_DEBUG_ASSERT(__kmp_init_serial);
2733 
2734   thread = __kmp_threads[gtid];
2735   KMP_DEBUG_ASSERT(thread->th.th_current_task);
2736   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2737                 "curtask_maxaclevel=%d\n",
2738                 gtid, thread->th.th_current_task,
2739                 thread->th.th_current_task->td_icvs.max_active_levels));
2740   return thread->th.th_current_task->td_icvs.max_active_levels;
2741 }
2742 
2743 // nteams-var per-device ICV
2744 void __kmp_set_num_teams(int num_teams) {
2745   if (num_teams > 0)
2746     __kmp_nteams = num_teams;
2747 }
2748 int __kmp_get_max_teams(void) { return __kmp_nteams; }
2749 // teams-thread-limit-var per-device ICV
2750 void __kmp_set_teams_thread_limit(int limit) {
2751   if (limit > 0)
2752     __kmp_teams_thread_limit = limit;
2753 }
2754 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
2755 
2756 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2757 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2758 
2759 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2760 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2761   kmp_info_t *thread;
2762   kmp_sched_t orig_kind;
2763   //    kmp_team_t *team;
2764 
2765   KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2766                 gtid, (int)kind, chunk));
2767   KMP_DEBUG_ASSERT(__kmp_init_serial);
2768 
2769   // Check if the kind parameter is valid, correct if needed.
2770   // Valid parameters should fit in one of two intervals - standard or extended:
2771   //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2772   // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
2773   orig_kind = kind;
2774   kind = __kmp_sched_without_mods(kind);
2775 
2776   if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2777       (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2778     // TODO: Hint needs attention in case we change the default schedule.
2779     __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2780               KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2781               __kmp_msg_null);
2782     kind = kmp_sched_default;
2783     chunk = 0; // ignore chunk value in case of bad kind
2784   }
2785 
2786   thread = __kmp_threads[gtid];
2787 
2788   __kmp_save_internal_controls(thread);
2789 
2790   if (kind < kmp_sched_upper_std) {
2791     if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
      // differentiate static chunked vs. unchunked: an invalid chunk value
      // indicates the unchunked schedule (which is the default)
2794       thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2795     } else {
2796       thread->th.th_current_task->td_icvs.sched.r_sched_type =
2797           __kmp_sch_map[kind - kmp_sched_lower - 1];
2798     }
2799   } else {
2800     //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2801     //    kmp_sched_lower - 2 ];
2802     thread->th.th_current_task->td_icvs.sched.r_sched_type =
2803         __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2804                       kmp_sched_lower - 2];
2805   }
2806   __kmp_sched_apply_mods_intkind(
2807       orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2808   if (kind == kmp_sched_auto || chunk < 1) {
2809     // ignore parameter chunk for schedule auto
2810     thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2811   } else {
2812     thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2813   }
2814 }
2815 
2816 /* Gets def_sched_var ICV values */
2817 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2818   kmp_info_t *thread;
2819   enum sched_type th_type;
2820 
2821   KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2822   KMP_DEBUG_ASSERT(__kmp_init_serial);
2823 
2824   thread = __kmp_threads[gtid];
2825 
2826   th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2827   switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2828   case kmp_sch_static:
2829   case kmp_sch_static_greedy:
2830   case kmp_sch_static_balanced:
2831     *kind = kmp_sched_static;
2832     __kmp_sched_apply_mods_stdkind(kind, th_type);
    *chunk = 0; // chunk was not set; report zero to convey that
2834     return;
2835   case kmp_sch_static_chunked:
2836     *kind = kmp_sched_static;
2837     break;
2838   case kmp_sch_dynamic_chunked:
2839     *kind = kmp_sched_dynamic;
2840     break;
2841   case kmp_sch_guided_chunked:
2842   case kmp_sch_guided_iterative_chunked:
2843   case kmp_sch_guided_analytical_chunked:
2844     *kind = kmp_sched_guided;
2845     break;
2846   case kmp_sch_auto:
2847     *kind = kmp_sched_auto;
2848     break;
2849   case kmp_sch_trapezoidal:
2850     *kind = kmp_sched_trapezoidal;
2851     break;
2852 #if KMP_STATIC_STEAL_ENABLED
2853   case kmp_sch_static_steal:
2854     *kind = kmp_sched_static_steal;
2855     break;
2856 #endif
2857   default:
2858     KMP_FATAL(UnknownSchedulingType, th_type);
2859   }
2860 
2861   __kmp_sched_apply_mods_stdkind(kind, th_type);
2862   *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2863 }
2864 
2865 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2866 
2867   int ii, dd;
2868   kmp_team_t *team;
2869   kmp_info_t *thr;
2870 
2871   KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2872   KMP_DEBUG_ASSERT(__kmp_init_serial);
2873 
2874   // validate level
2875   if (level == 0)
2876     return 0;
2877   if (level < 0)
2878     return -1;
2879   thr = __kmp_threads[gtid];
2880   team = thr->th.th_team;
2881   ii = team->t.t_level;
2882   if (level > ii)
2883     return -1;
2884 
2885   if (thr->th.th_teams_microtask) {
    // AC: we are in a teams region where multiple nested teams have the same
    // level
2887     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2888     if (level <=
2889         tlevel) { // otherwise usual algorithm works (will not touch the teams)
2890       KMP_DEBUG_ASSERT(ii >= tlevel);
      // AC: since we have to step over the teams league, artificially
      // increase ii
2893       if (ii == tlevel) {
2894         ii += 2; // three teams have same level
2895       } else {
2896         ii++; // two teams have same level
2897       }
2898     }
2899   }
2900 
2901   if (ii == level)
2902     return __kmp_tid_from_gtid(gtid);
2903 
2904   dd = team->t.t_serialized;
2905   level++;
2906   while (ii > level) {
2907     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2908     }
2909     if ((team->t.t_serialized) && (!dd)) {
2910       team = team->t.t_parent;
2911       continue;
2912     }
2913     if (ii > level) {
2914       team = team->t.t_parent;
2915       dd = team->t.t_serialized;
2916       ii--;
2917     }
2918   }
2919 
2920   return (dd > 1) ? (0) : (team->t.t_master_tid);
2921 }
2922 
2923 int __kmp_get_team_size(int gtid, int level) {
2924 
2925   int ii, dd;
2926   kmp_team_t *team;
2927   kmp_info_t *thr;
2928 
2929   KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2930   KMP_DEBUG_ASSERT(__kmp_init_serial);
2931 
2932   // validate level
2933   if (level == 0)
2934     return 1;
2935   if (level < 0)
2936     return -1;
2937   thr = __kmp_threads[gtid];
2938   team = thr->th.th_team;
2939   ii = team->t.t_level;
2940   if (level > ii)
2941     return -1;
2942 
2943   if (thr->th.th_teams_microtask) {
    // AC: we are in a teams region where multiple nested teams have the same
    // level
2945     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2946     if (level <=
2947         tlevel) { // otherwise usual algorithm works (will not touch the teams)
2948       KMP_DEBUG_ASSERT(ii >= tlevel);
      // AC: since we have to step over the teams league, artificially
      // increase ii
2951       if (ii == tlevel) {
2952         ii += 2; // three teams have same level
2953       } else {
2954         ii++; // two teams have same level
2955       }
2956     }
2957   }
2958 
2959   while (ii > level) {
2960     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2961     }
2962     if (team->t.t_serialized && (!dd)) {
2963       team = team->t.t_parent;
2964       continue;
2965     }
2966     if (ii > level) {
2967       team = team->t.t_parent;
2968       ii--;
2969     }
2970   }
2971 
2972   return team->t.t_nproc;
2973 }
2974 
2975 kmp_r_sched_t __kmp_get_schedule_global() {
  // This routine exists because the pairs (__kmp_sched, __kmp_chunk) and
  // (__kmp_static, __kmp_guided) may be changed independently by
  // kmp_set_defaults(), so the up-to-date schedule has to be assembled here.
2979 
2980   kmp_r_sched_t r_sched;
2981 
  // create the schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
  // __kmp_guided. __kmp_sched should keep its original value, so that the user
  // can set KMP_SCHEDULE multiple times and thus have different run-time
  // schedules in different roots (even in OMP 2.5)
2986   enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
2987   enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
2988   if (s == kmp_sch_static) {
2989     // replace STATIC with more detailed schedule (balanced or greedy)
2990     r_sched.r_sched_type = __kmp_static;
2991   } else if (s == kmp_sch_guided_chunked) {
2992     // replace GUIDED with more detailed schedule (iterative or analytical)
2993     r_sched.r_sched_type = __kmp_guided;
2994   } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2995     r_sched.r_sched_type = __kmp_sched;
2996   }
2997   SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
2998 
2999   if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
    // __kmp_chunk may be wrong here (if it was never set)
3001     r_sched.chunk = KMP_DEFAULT_CHUNK;
3002   } else {
3003     r_sched.chunk = __kmp_chunk;
3004   }
3005 
3006   return r_sched;
3007 }
3008 
/* Allocate (realloc == FALSE) or reallocate (realloc == TRUE) at least argc
   *t_argv entries for the requested team. */
3011 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3012 
3013   KMP_DEBUG_ASSERT(team);
3014   if (!realloc || argc > team->t.t_max_argc) {
3015 
3016     KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3017                    "current entries=%d\n",
3018                    team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3019     /* if previously allocated heap space for args, free them */
3020     if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3021       __kmp_free((void *)team->t.t_argv);
3022 
3023     if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3024       /* use unused space in the cache line for arguments */
3025       team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3026       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3027                      "argv entries\n",
3028                      team->t.t_id, team->t.t_max_argc));
3029       team->t.t_argv = &team->t.t_inline_argv[0];
3030       if (__kmp_storage_map) {
3031         __kmp_print_storage_map_gtid(
3032             -1, &team->t.t_inline_argv[0],
3033             &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3034             (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3035             team->t.t_id);
3036       }
3037     } else {
3038       /* allocate space for arguments in the heap */
3039       team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3040                                ? KMP_MIN_MALLOC_ARGV_ENTRIES
3041                                : 2 * argc;
3042       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3043                      "argv entries\n",
3044                      team->t.t_id, team->t.t_max_argc));
3045       team->t.t_argv =
3046           (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3047       if (__kmp_storage_map) {
3048         __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3049                                      &team->t.t_argv[team->t.t_max_argc],
3050                                      sizeof(void *) * team->t.t_max_argc,
3051                                      "team_%d.t_argv", team->t.t_id);
3052       }
3053     }
3054   }
3055 }
3056 
3057 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3058   int i;
3059   int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3060   team->t.t_threads =
3061       (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3062   team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3063       sizeof(dispatch_shared_info_t) * num_disp_buff);
3064   team->t.t_dispatch =
3065       (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3066   team->t.t_implicit_task_taskdata =
3067       (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3068   team->t.t_max_nproc = max_nth;
3069 
3070   /* setup dispatch buffers */
3071   for (i = 0; i < num_disp_buff; ++i) {
3072     team->t.t_disp_buffer[i].buffer_index = i;
3073     team->t.t_disp_buffer[i].doacross_buf_idx = i;
3074   }
3075 }
3076 
3077 static void __kmp_free_team_arrays(kmp_team_t *team) {
3078   /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3079   int i;
3080   for (i = 0; i < team->t.t_max_nproc; ++i) {
3081     if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3082       __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3083       team->t.t_dispatch[i].th_disp_buffer = NULL;
3084     }
3085   }
3086 #if KMP_USE_HIER_SCHED
3087   __kmp_dispatch_free_hierarchies(team);
3088 #endif
3089   __kmp_free(team->t.t_threads);
3090   __kmp_free(team->t.t_disp_buffer);
3091   __kmp_free(team->t.t_dispatch);
3092   __kmp_free(team->t.t_implicit_task_taskdata);
3093   team->t.t_threads = NULL;
3094   team->t.t_disp_buffer = NULL;
3095   team->t.t_dispatch = NULL;
3096   team->t.t_implicit_task_taskdata = 0;
3097 }
3098 
3099 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3100   kmp_info_t **oldThreads = team->t.t_threads;
3101 
3102   __kmp_free(team->t.t_disp_buffer);
3103   __kmp_free(team->t.t_dispatch);
3104   __kmp_free(team->t.t_implicit_task_taskdata);
3105   __kmp_allocate_team_arrays(team, max_nth);
3106 
3107   KMP_MEMCPY(team->t.t_threads, oldThreads,
3108              team->t.t_nproc * sizeof(kmp_info_t *));
3109 
3110   __kmp_free(oldThreads);
3111 }
3112 
3113 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3114 
3115   kmp_r_sched_t r_sched =
3116       __kmp_get_schedule_global(); // get current state of scheduling globals
3117 
3118   KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3119 
3120   kmp_internal_control_t g_icvs = {
3121     0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3122     (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3123     // adjustment of threads (per thread)
3124     (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3125     // whether blocktime is explicitly set
3126     __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3127 #if KMP_USE_MONITOR
3128     __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3129 // intervals
3130 #endif
3131     __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3132     // next parallel region (per thread)
3133     // (use a max ub on value if __kmp_parallel_initialize not called yet)
3134     __kmp_cg_max_nth, // int thread_limit;
3135     __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3136     // for max_active_levels
3137     r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3138     // {sched,chunk} pair
3139     __kmp_nested_proc_bind.bind_types[0],
3140     __kmp_default_device,
3141     NULL // struct kmp_internal_control *next;
3142   };
3143 
3144   return g_icvs;
3145 }
3146 
3147 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3148 
3149   kmp_internal_control_t gx_icvs;
  gx_icvs.serial_nesting_level =
      0; // probably = team->t.t_serialized, as in __kmp_save_internal_controls
3152   copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3153   gx_icvs.next = NULL;
3154 
3155   return gx_icvs;
3156 }
3157 
3158 static void __kmp_initialize_root(kmp_root_t *root) {
3159   int f;
3160   kmp_team_t *root_team;
3161   kmp_team_t *hot_team;
3162   int hot_team_max_nth;
3163   kmp_r_sched_t r_sched =
3164       __kmp_get_schedule_global(); // get current state of scheduling globals
3165   kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3166   KMP_DEBUG_ASSERT(root);
3167   KMP_ASSERT(!root->r.r_begin);
3168 
3169   /* setup the root state structure */
3170   __kmp_init_lock(&root->r.r_begin_lock);
3171   root->r.r_begin = FALSE;
3172   root->r.r_active = FALSE;
3173   root->r.r_in_parallel = 0;
3174   root->r.r_blocktime = __kmp_dflt_blocktime;
3175 #if KMP_AFFINITY_SUPPORTED
3176   root->r.r_affinity_assigned = FALSE;
3177 #endif
3178 
3179   /* setup the root team for this task */
3180   /* allocate the root team structure */
3181   KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3182 
3183   root_team =
3184       __kmp_allocate_team(root,
3185                           1, // new_nproc
3186                           1, // max_nproc
3187 #if OMPT_SUPPORT
3188                           ompt_data_none, // root parallel id
3189 #endif
3190                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3191                           0 // argc
3192                           USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3193                           );
3194 #if USE_DEBUGGER
3195   // Non-NULL value should be assigned to make the debugger display the root
3196   // team.
3197   TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3198 #endif
3199 
3200   KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3201 
3202   root->r.r_root_team = root_team;
3203   root_team->t.t_control_stack_top = NULL;
3204 
3205   /* initialize root team */
3206   root_team->t.t_threads[0] = NULL;
3207   root_team->t.t_nproc = 1;
3208   root_team->t.t_serialized = 1;
3209   // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3210   root_team->t.t_sched.sched = r_sched.sched;
3211   KA_TRACE(
3212       20,
3213       ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3214        root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3215 
  /* setup the hot team for this task */
3217   /* allocate the hot team structure */
3218   KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3219 
3220   hot_team =
3221       __kmp_allocate_team(root,
3222                           1, // new_nproc
3223                           __kmp_dflt_team_nth_ub * 2, // max_nproc
3224 #if OMPT_SUPPORT
3225                           ompt_data_none, // root parallel id
3226 #endif
3227                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3228                           0 // argc
3229                           USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3230                           );
3231   KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3232 
3233   root->r.r_hot_team = hot_team;
3234   root_team->t.t_control_stack_top = NULL;
3235 
3236   /* first-time initialization */
3237   hot_team->t.t_parent = root_team;
3238 
3239   /* initialize hot team */
3240   hot_team_max_nth = hot_team->t.t_max_nproc;
3241   for (f = 0; f < hot_team_max_nth; ++f) {
3242     hot_team->t.t_threads[f] = NULL;
3243   }
3244   hot_team->t.t_nproc = 1;
3245   // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3246   hot_team->t.t_sched.sched = r_sched.sched;
3247   hot_team->t.t_size_changed = 0;
3248 }
3249 
3250 #ifdef KMP_DEBUG
3251 
3252 typedef struct kmp_team_list_item {
3253   kmp_team_p const *entry;
3254   struct kmp_team_list_item *next;
3255 } kmp_team_list_item_t;
3256 typedef kmp_team_list_item_t *kmp_team_list_t;
3257 
3258 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3259     kmp_team_list_t list, // List of teams.
3260     kmp_team_p const *team // Team to add.
3261 ) {
3262 
3263   // List must terminate with item where both entry and next are NULL.
3264   // Team is added to the list only once.
3265   // List is sorted in ascending order by team id.
3266   // Team id is *not* a key.
3267 
3268   kmp_team_list_t l;
3269 
3270   KMP_DEBUG_ASSERT(list != NULL);
3271   if (team == NULL) {
3272     return;
3273   }
3274 
3275   __kmp_print_structure_team_accum(list, team->t.t_parent);
3276   __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3277 
3278   // Search list for the team.
3279   l = list;
3280   while (l->next != NULL && l->entry != team) {
3281     l = l->next;
3282   }
3283   if (l->next != NULL) {
3284     return; // Team has been added before, exit.
3285   }
3286 
3287   // Team is not found. Search list again for insertion point.
3288   l = list;
3289   while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3290     l = l->next;
3291   }
3292 
3293   // Insert team.
3294   {
3295     kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3296         sizeof(kmp_team_list_item_t));
3297     *item = *l;
3298     l->entry = team;
3299     l->next = item;
3300   }
3301 }
3302 
static void __kmp_print_structure_team(char const *title,
                                       kmp_team_p const *team) {
3306   __kmp_printf("%s", title);
3307   if (team != NULL) {
3308     __kmp_printf("%2x %p\n", team->t.t_id, team);
3309   } else {
3310     __kmp_printf(" - (nil)\n");
3311   }
3312 }
3313 
3314 static void __kmp_print_structure_thread(char const *title,
3315                                          kmp_info_p const *thread) {
3316   __kmp_printf("%s", title);
3317   if (thread != NULL) {
3318     __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3319   } else {
3320     __kmp_printf(" - (nil)\n");
3321   }
3322 }
3323 
3324 void __kmp_print_structure(void) {
3325 
3326   kmp_team_list_t list;
3327 
3328   // Initialize list of teams.
3329   list =
3330       (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3331   list->entry = NULL;
3332   list->next = NULL;
3333 
3334   __kmp_printf("\n------------------------------\nGlobal Thread "
3335                "Table\n------------------------------\n");
3336   {
3337     int gtid;
3338     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3339       __kmp_printf("%2d", gtid);
3340       if (__kmp_threads != NULL) {
3341         __kmp_printf(" %p", __kmp_threads[gtid]);
3342       }
3343       if (__kmp_root != NULL) {
3344         __kmp_printf(" %p", __kmp_root[gtid]);
3345       }
3346       __kmp_printf("\n");
3347     }
3348   }
3349 
3350   // Print out __kmp_threads array.
3351   __kmp_printf("\n------------------------------\nThreads\n--------------------"
3352                "----------\n");
3353   if (__kmp_threads != NULL) {
3354     int gtid;
3355     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3356       kmp_info_t const *thread = __kmp_threads[gtid];
3357       if (thread != NULL) {
3358         __kmp_printf("GTID %2d %p:\n", gtid, thread);
3359         __kmp_printf("    Our Root:        %p\n", thread->th.th_root);
3360         __kmp_print_structure_team("    Our Team:     ", thread->th.th_team);
3361         __kmp_print_structure_team("    Serial Team:  ",
3362                                    thread->th.th_serial_team);
3363         __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
3364         __kmp_print_structure_thread("    Primary:      ",
3365                                      thread->th.th_team_master);
3366         __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
3367         __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
3368         __kmp_printf("    Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3369         __kmp_print_structure_thread("    Next in pool: ",
3370                                      thread->th.th_next_pool);
3371         __kmp_printf("\n");
3372         __kmp_print_structure_team_accum(list, thread->th.th_team);
3373         __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3374       }
3375     }
3376   } else {
3377     __kmp_printf("Threads array is not allocated.\n");
3378   }
3379 
3380   // Print out __kmp_root array.
3381   __kmp_printf("\n------------------------------\nUbers\n----------------------"
3382                "--------\n");
3383   if (__kmp_root != NULL) {
3384     int gtid;
3385     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3386       kmp_root_t const *root = __kmp_root[gtid];
3387       if (root != NULL) {
3388         __kmp_printf("GTID %2d %p:\n", gtid, root);
3389         __kmp_print_structure_team("    Root Team:    ", root->r.r_root_team);
3390         __kmp_print_structure_team("    Hot Team:     ", root->r.r_hot_team);
3391         __kmp_print_structure_thread("    Uber Thread:  ",
3392                                      root->r.r_uber_thread);
3393         __kmp_printf("    Active?:      %2d\n", root->r.r_active);
3394         __kmp_printf("    In Parallel:  %2d\n",
3395                      KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3396         __kmp_printf("\n");
3397         __kmp_print_structure_team_accum(list, root->r.r_root_team);
3398         __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3399       }
3400     }
3401   } else {
3402     __kmp_printf("Ubers array is not allocated.\n");
3403   }
3404 
3405   __kmp_printf("\n------------------------------\nTeams\n----------------------"
3406                "--------\n");
3407   while (list->next != NULL) {
3408     kmp_team_p const *team = list->entry;
3409     int i;
3410     __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3411     __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
3412     __kmp_printf("    Primary TID:      %2d\n", team->t.t_master_tid);
3413     __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
3414     __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
3415     __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
3416     for (i = 0; i < team->t.t_nproc; ++i) {
3417       __kmp_printf("    Thread %2d:      ", i);
3418       __kmp_print_structure_thread("", team->t.t_threads[i]);
3419     }
3420     __kmp_print_structure_team("    Next in pool:     ", team->t.t_next_pool);
3421     __kmp_printf("\n");
3422     list = list->next;
3423   }
3424 
3425   // Print out __kmp_thread_pool and __kmp_team_pool.
3426   __kmp_printf("\n------------------------------\nPools\n----------------------"
3427                "--------\n");
3428   __kmp_print_structure_thread("Thread pool:          ",
3429                                CCAST(kmp_info_t *, __kmp_thread_pool));
3430   __kmp_print_structure_team("Team pool:            ",
3431                              CCAST(kmp_team_t *, __kmp_team_pool));
3432   __kmp_printf("\n");
3433 
3434   // Free team list.
3435   while (list != NULL) {
3436     kmp_team_list_item_t *item = list;
3437     list = list->next;
3438     KMP_INTERNAL_FREE(item);
3439   }
3440 }
3441 
3442 #endif
3443 
3444 //---------------------------------------------------------------------------
3445 //  Stuff for per-thread fast random number generator
3446 //  Table of primes
3447 static const unsigned __kmp_primes[] = {
3448     0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3449     0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3450     0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3451     0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3452     0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3453     0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3454     0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3455     0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3456     0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3457     0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3458     0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3459 
3460 //---------------------------------------------------------------------------
3461 //  __kmp_get_random: Get a random number using a linear congruential method.
3462 unsigned short __kmp_get_random(kmp_info_t *thread) {
3463   unsigned x = thread->th.th_x;
3464   unsigned short r = (unsigned short)(x >> 16);
3465 
3466   thread->th.th_x = x * thread->th.th_a + 1;
3467 
3468   KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3469                 thread->th.th_info.ds.ds_tid, r));
3470 
3471   return r;
3472 }
3473 //--------------------------------------------------------
3474 // __kmp_init_random: Initialize a random number generator
3475 void __kmp_init_random(kmp_info_t *thread) {
3476   unsigned seed = thread->th.th_info.ds.ds_tid;
3477 
3478   thread->th.th_a =
3479       __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3480   thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3481   KA_TRACE(30,
3482            ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3483 }
3484 
3485 #if KMP_OS_WINDOWS
3486 /* reclaim array entries for root threads that are already dead, returns number
3487  * reclaimed */
3488 static int __kmp_reclaim_dead_roots(void) {
3489   int i, r = 0;
3490 
3491   for (i = 0; i < __kmp_threads_capacity; ++i) {
3492     if (KMP_UBER_GTID(i) &&
3493         !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3494         !__kmp_root[i]
3495              ->r.r_active) { // AC: reclaim only roots died in non-active state
3496       r += __kmp_unregister_root_other_thread(i);
3497     }
3498   }
3499   return r;
3500 }
3501 #endif
3502 
3503 /* This function attempts to create free entries in __kmp_threads and
3504    __kmp_root, and returns the number of free entries generated.
3505 
3506    For Windows* OS static library, the first mechanism used is to reclaim array
3507    entries for root threads that are already dead.
3508 
   On all platforms, expansion is attempted on the arrays __kmp_threads and
3510    __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3511    capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3512    threadprivate cache array has been created. Synchronization with
3513    __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3514 
3515    After any dead root reclamation, if the clipping value allows array expansion
3516    to result in the generation of a total of nNeed free slots, the function does
3517    that expansion. If not, nothing is done beyond the possible initial root
3518    thread reclamation.
3519 
3520    If any argument is negative, the behavior is undefined. */
3521 static int __kmp_expand_threads(int nNeed) {
3522   int added = 0;
3523   int minimumRequiredCapacity;
3524   int newCapacity;
3525   kmp_info_t **newThreads;
3526   kmp_root_t **newRoot;
3527 
3528   // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3529   // resizing __kmp_threads does not need additional protection if foreign
3530   // threads are present
3531 
3532 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3533   /* only for Windows static library */
3534   /* reclaim array entries for root threads that are already dead */
3535   added = __kmp_reclaim_dead_roots();
3536 
3537   if (nNeed) {
3538     nNeed -= added;
3539     if (nNeed < 0)
3540       nNeed = 0;
3541   }
3542 #endif
3543   if (nNeed <= 0)
3544     return added;
3545 
3546   // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3547   // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3548   // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3549   // > __kmp_max_nth in one of two ways:
3550   //
3551   // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
3552   //    may not be reused by another thread, so we may need to increase
3553   //    __kmp_threads_capacity to __kmp_max_nth + 1.
3554   //
3555   // 2) New foreign root(s) are encountered.  We always register new foreign
3556   //    roots. This may cause a smaller # of threads to be allocated at
3557   //    subsequent parallel regions, but the worker threads hang around (and
3558   //    eventually go to sleep) and need slots in the __kmp_threads[] array.
3559   //
3560   // Anyway, that is the reason for moving the check to see if
3561   // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3562   // instead of having it performed here. -BB
3563 
3564   KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3565 
3566   /* compute expansion headroom to check if we can expand */
3567   if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3568     /* possible expansion too small -- give up */
3569     return added;
3570   }
3571   minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3572 
3573   newCapacity = __kmp_threads_capacity;
3574   do {
3575     newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3576                                                           : __kmp_sys_max_nth;
3577   } while (newCapacity < minimumRequiredCapacity);
3578   newThreads = (kmp_info_t **)__kmp_allocate(
3579       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3580   newRoot =
3581       (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3582   KMP_MEMCPY(newThreads, __kmp_threads,
3583              __kmp_threads_capacity * sizeof(kmp_info_t *));
3584   KMP_MEMCPY(newRoot, __kmp_root,
3585              __kmp_threads_capacity * sizeof(kmp_root_t *));
3586 
3587   kmp_info_t **temp_threads = __kmp_threads;
3588   *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3589   *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3590   __kmp_free(temp_threads);
3591   added += newCapacity - __kmp_threads_capacity;
3592   *(volatile int *)&__kmp_threads_capacity = newCapacity;
3593 
3594   if (newCapacity > __kmp_tp_capacity) {
3595     __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3596     if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3597       __kmp_threadprivate_resize_cache(newCapacity);
3598     } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3599       *(volatile int *)&__kmp_tp_capacity = newCapacity;
3600     }
3601     __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3602   }
3603 
3604   return added;
3605 }
3606 
3607 /* Register the current thread as a root thread and obtain our gtid. We must
3608    have the __kmp_initz_lock held at this point. Argument TRUE only if are the
3609    thread that calls from __kmp_do_serial_initialize() */
3610 int __kmp_register_root(int initial_thread) {
3611   kmp_info_t *root_thread;
3612   kmp_root_t *root;
3613   int gtid;
3614   int capacity;
3615   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3616   KA_TRACE(20, ("__kmp_register_root: entered\n"));
3617   KMP_MB();
3618 
3619   /* 2007-03-02:
3620      If initial thread did not invoke OpenMP RTL yet, and this thread is not an
3621      initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not
3622      work as expected -- it may return false (that means there is at least one
3623      empty slot in __kmp_threads array), but it is possible the only free slot
3624      is #0, which is reserved for initial thread and so cannot be used for this
3625      one. Following code workarounds this bug.
3626 
3627      However, right solution seems to be not reserving slot #0 for initial
3628      thread because:
3629      (1) there is no magic in slot #0,
3630      (2) we cannot detect initial thread reliably (the first thread which does
3631         serial initialization may be not a real initial thread).
3632   */
3633   capacity = __kmp_threads_capacity;
3634   if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3635     --capacity;
3636   }
3637 
  // If this call is not for initializing the hidden helper team, subtract
  // __kmp_hidden_helper_threads_num from the capacity, since those slots are
  // included in __kmp_threads_capacity.
3641   if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
3642     capacity -= __kmp_hidden_helper_threads_num;
3643   }
3644 
3645   /* see if there are too many threads */
3646   if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3647     if (__kmp_tp_cached) {
3648       __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3649                   KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3650                   KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3651     } else {
3652       __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3653                   __kmp_msg_null);
3654     }
3655   }
3656 
3657   // When hidden helper task is enabled, __kmp_threads is organized as follows:
3658   // 0: initial thread, also a regular OpenMP thread.
3659   // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3660   // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3661   // regular OpenMP threads.
3662   if (TCR_4(__kmp_init_hidden_helper_threads)) {
3663     // Find an available thread slot for hidden helper thread. Slots for hidden
3664     // helper threads start from 1 to __kmp_hidden_helper_threads_num.
3665     for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3666                    gtid <= __kmp_hidden_helper_threads_num;
3667          gtid++)
3668       ;
3669     KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3670     KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3671                  "hidden helper thread: T#%d\n",
3672                  gtid));
3673   } else {
3674     /* find an available thread slot */
3675     // Don't reassign the zero slot since we need that to only be used by
3676     // initial thread. Slots for hidden helper threads should also be skipped.
3677     if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3678       gtid = 0;
3679     } else {
3680       for (gtid = __kmp_hidden_helper_threads_num + 1;
3681            TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3682         ;
3683     }
3684     KA_TRACE(
3685         1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3686     KMP_ASSERT(gtid < __kmp_threads_capacity);
3687   }
3688 
3689   /* update global accounting */
3690   __kmp_all_nth++;
3691   TCW_4(__kmp_nth, __kmp_nth + 1);
3692 
3693   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3694   // numbers of procs, and method #2 (keyed API call) for higher numbers.
3695   if (__kmp_adjust_gtid_mode) {
3696     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3697       if (TCR_4(__kmp_gtid_mode) != 2) {
3698         TCW_4(__kmp_gtid_mode, 2);
3699       }
3700     } else {
3701       if (TCR_4(__kmp_gtid_mode) != 1) {
3702         TCW_4(__kmp_gtid_mode, 1);
3703       }
3704     }
3705   }
3706 
3707 #ifdef KMP_ADJUST_BLOCKTIME
3708   /* Adjust blocktime to zero if necessary            */
3709   /* Middle initialization might not have occurred yet */
3710   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3711     if (__kmp_nth > __kmp_avail_proc) {
3712       __kmp_zero_bt = TRUE;
3713     }
3714   }
3715 #endif /* KMP_ADJUST_BLOCKTIME */
3716 
3717   /* setup this new hierarchy */
3718   if (!(root = __kmp_root[gtid])) {
3719     root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3720     KMP_DEBUG_ASSERT(!root->r.r_root_team);
3721   }
3722 
3723 #if KMP_STATS_ENABLED
3724   // Initialize stats as soon as possible (right after gtid assignment).
3725   __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3726   __kmp_stats_thread_ptr->startLife();
3727   KMP_SET_THREAD_STATE(SERIAL_REGION);
3728   KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3729 #endif
3730   __kmp_initialize_root(root);
3731 
3732   /* setup new root thread structure */
3733   if (root->r.r_uber_thread) {
3734     root_thread = root->r.r_uber_thread;
3735   } else {
3736     root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3737     if (__kmp_storage_map) {
3738       __kmp_print_thread_storage_map(root_thread, gtid);
3739     }
3740     root_thread->th.th_info.ds.ds_gtid = gtid;
3741 #if OMPT_SUPPORT
3742     root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3743 #endif
3744     root_thread->th.th_root = root;
3745     if (__kmp_env_consistency_check) {
3746       root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3747     }
3748 #if USE_FAST_MEMORY
3749     __kmp_initialize_fast_memory(root_thread);
3750 #endif /* USE_FAST_MEMORY */
3751 
3752 #if KMP_USE_BGET
3753     KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3754     __kmp_initialize_bget(root_thread);
3755 #endif
3756     __kmp_init_random(root_thread); // Initialize random number generator
3757   }
3758 
3759   /* setup the serial team held in reserve by the root thread */
3760   if (!root_thread->th.th_serial_team) {
3761     kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3762     KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3763     root_thread->th.th_serial_team = __kmp_allocate_team(
3764         root, 1, 1,
3765 #if OMPT_SUPPORT
3766         ompt_data_none, // root parallel id
3767 #endif
3768         proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3769   }
3770   KMP_ASSERT(root_thread->th.th_serial_team);
3771   KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3772                 root_thread->th.th_serial_team));
3773 
3774   /* drop root_thread into place */
3775   TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3776 
3777   root->r.r_root_team->t.t_threads[0] = root_thread;
3778   root->r.r_hot_team->t.t_threads[0] = root_thread;
3779   root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
  // AC: the team is created in reserve, not for execution (it is unused for
  // now).
3781   root_thread->th.th_serial_team->t.t_serialized = 0;
3782   root->r.r_uber_thread = root_thread;
3783 
3784   /* initialize the thread, get it ready to go */
3785   __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3786   TCW_4(__kmp_init_gtid, TRUE);
3787 
3788   /* prepare the primary thread for get_gtid() */
3789   __kmp_gtid_set_specific(gtid);
3790 
3791 #if USE_ITT_BUILD
3792   __kmp_itt_thread_name(gtid);
3793 #endif /* USE_ITT_BUILD */
3794 
3795 #ifdef KMP_TDATA_GTID
3796   __kmp_gtid = gtid;
3797 #endif
3798   __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3799   KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3800 
3801   KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3802                 "plain=%u\n",
3803                 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3804                 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3805                 KMP_INIT_BARRIER_STATE));
3806   { // Initialize barrier data.
3807     int b;
3808     for (b = 0; b < bs_last_barrier; ++b) {
3809       root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3810 #if USE_DEBUGGER
3811       root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3812 #endif
3813     }
3814   }
3815   KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3816                    KMP_INIT_BARRIER_STATE);
3817 
3818 #if KMP_AFFINITY_SUPPORTED
3819   root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3820   root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3821   root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3822   root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3823 #endif /* KMP_AFFINITY_SUPPORTED */
3824   root_thread->th.th_def_allocator = __kmp_def_allocator;
3825   root_thread->th.th_prev_level = 0;
3826   root_thread->th.th_prev_num_threads = 1;
3827 
3828   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3829   tmp->cg_root = root_thread;
3830   tmp->cg_thread_limit = __kmp_cg_max_nth;
3831   tmp->cg_nthreads = 1;
3832   KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3833                  " cg_nthreads init to 1\n",
3834                  root_thread, tmp));
3835   tmp->up = NULL;
3836   root_thread->th.th_cg_roots = tmp;
3837 
3838   __kmp_root_counter++;
3839 
3840 #if OMPT_SUPPORT
3841   if (!initial_thread && ompt_enabled.enabled) {
3842 
3843     kmp_info_t *root_thread = ompt_get_thread();
3844 
3845     ompt_set_thread_state(root_thread, ompt_state_overhead);
3846 
3847     if (ompt_enabled.ompt_callback_thread_begin) {
3848       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3849           ompt_thread_initial, __ompt_get_thread_data_internal());
3850     }
3851     ompt_data_t *task_data;
3852     ompt_data_t *parallel_data;
3853     __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
3854                                   NULL);
3855     if (ompt_enabled.ompt_callback_implicit_task) {
3856       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3857           ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3858     }
3859 
3860     ompt_set_thread_state(root_thread, ompt_state_work_serial);
3861   }
3862 #endif
3863 #if OMPD_SUPPORT
3864   if (ompd_state & OMPD_ENABLE_BP)
3865     ompd_bp_thread_begin();
3866 #endif
3867 
3868   KMP_MB();
3869   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3870 
3871   return gtid;
3872 }
3873 
3874 #if KMP_NESTED_HOT_TEAMS
3875 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3876                                 const int max_level) {
3877   int i, n, nth;
3878   kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3879   if (!hot_teams || !hot_teams[level].hot_team) {
3880     return 0;
3881   }
3882   KMP_DEBUG_ASSERT(level < max_level);
3883   kmp_team_t *team = hot_teams[level].hot_team;
3884   nth = hot_teams[level].hot_team_nth;
3885   n = nth - 1; // primary thread is not freed
3886   if (level < max_level - 1) {
3887     for (i = 0; i < nth; ++i) {
3888       kmp_info_t *th = team->t.t_threads[i];
3889       n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3890       if (i > 0 && th->th.th_hot_teams) {
3891         __kmp_free(th->th.th_hot_teams);
3892         th->th.th_hot_teams = NULL;
3893       }
3894     }
3895   }
3896   __kmp_free_team(root, team, NULL);
3897   return n;
3898 }
3899 #endif
3900 
// Resets a root thread and clears its root and hot teams.
3902 // Returns the number of __kmp_threads entries directly and indirectly freed.
3903 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3904   kmp_team_t *root_team = root->r.r_root_team;
3905   kmp_team_t *hot_team = root->r.r_hot_team;
3906   int n = hot_team->t.t_nproc;
3907   int i;
3908 
3909   KMP_DEBUG_ASSERT(!root->r.r_active);
3910 
3911   root->r.r_root_team = NULL;
3912   root->r.r_hot_team = NULL;
3913   // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3914   // before call to __kmp_free_team().
3915   __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3916 #if KMP_NESTED_HOT_TEAMS
3917   if (__kmp_hot_teams_max_level >
3918       0) { // need to free nested hot teams and their threads if any
3919     for (i = 0; i < hot_team->t.t_nproc; ++i) {
3920       kmp_info_t *th = hot_team->t.t_threads[i];
3921       if (__kmp_hot_teams_max_level > 1) {
3922         n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3923       }
3924       if (th->th.th_hot_teams) {
3925         __kmp_free(th->th.th_hot_teams);
3926         th->th.th_hot_teams = NULL;
3927       }
3928     }
3929   }
3930 #endif
3931   __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3932 
3933   // Before we can reap the thread, we need to make certain that all other
3934   // threads in the teams that had this root as ancestor have stopped trying to
3935   // steal tasks.
3936   if (__kmp_tasking_mode != tskm_immediate_exec) {
3937     __kmp_wait_to_unref_task_teams();
3938   }
3939 
3940 #if KMP_OS_WINDOWS
3941   /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3942   KA_TRACE(
3943       10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3944            "\n",
3945            (LPVOID) & (root->r.r_uber_thread->th),
3946            root->r.r_uber_thread->th.th_info.ds.ds_thread));
3947   __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3948 #endif /* KMP_OS_WINDOWS */
3949 
3950 #if OMPD_SUPPORT
3951   if (ompd_state & OMPD_ENABLE_BP)
3952     ompd_bp_thread_end();
3953 #endif
3954 
3955 #if OMPT_SUPPORT
3956   ompt_data_t *task_data;
3957   ompt_data_t *parallel_data;
3958   __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
3959                                 NULL);
3960   if (ompt_enabled.ompt_callback_implicit_task) {
3961     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3962         ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
3963   }
3964   if (ompt_enabled.ompt_callback_thread_end) {
3965     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3966         &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3967   }
3968 #endif
3969 
3970   TCW_4(__kmp_nth,
3971         __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3972   i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
3973   KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
3974                  " to %d\n",
3975                  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
3976                  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
3977   if (i == 1) {
3978     // need to free contention group structure
3979     KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
3980                      root->r.r_uber_thread->th.th_cg_roots->cg_root);
3981     KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
3982     __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
3983     root->r.r_uber_thread->th.th_cg_roots = NULL;
3984   }
3985   __kmp_reap_thread(root->r.r_uber_thread, 1);
3986 
  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
  // it instead of freeing it.
3989   root->r.r_uber_thread = NULL;
3990   /* mark root as no longer in use */
3991   root->r.r_begin = FALSE;
3992 
3993   return n;
3994 }
3995 
3996 void __kmp_unregister_root_current_thread(int gtid) {
3997   KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
3998   /* this lock should be ok, since unregister_root_current_thread is never
3999      called during an abort, only during a normal close. furthermore, if you
4000      have the forkjoin lock, you should never try to get the initz lock */
4001   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
4002   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
4003     KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
4004                   "exiting T#%d\n",
4005                   gtid));
4006     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4007     return;
4008   }
4009   kmp_root_t *root = __kmp_root[gtid];
4010 
4011   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4012   KMP_ASSERT(KMP_UBER_GTID(gtid));
4013   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4014   KMP_ASSERT(root->r.r_active == FALSE);
4015 
4016   KMP_MB();
4017 
4018   kmp_info_t *thread = __kmp_threads[gtid];
4019   kmp_team_t *team = thread->th.th_team;
4020   kmp_task_team_t *task_team = thread->th.th_task_team;
4021 
4022   // we need to wait for the proxy tasks before finishing the thread
4023   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
4024 #if OMPT_SUPPORT
4025     // the runtime is shutting down so we won't report any events
4026     thread->th.ompt_thread_info.state = ompt_state_undefined;
4027 #endif
4028     __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4029   }
4030 
4031   __kmp_reset_root(gtid, root);
4032 
4033   KMP_MB();
4034   KC_TRACE(10,
4035            ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4036 
4037   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4038 }
4039 
4040 #if KMP_OS_WINDOWS
4041 /* __kmp_forkjoin_lock must be already held
4042    Unregisters a root thread that is not the current thread.  Returns the number
4043    of __kmp_threads entries freed as a result. */
4044 static int __kmp_unregister_root_other_thread(int gtid) {
4045   kmp_root_t *root = __kmp_root[gtid];
4046   int r;
4047 
4048   KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4049   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4050   KMP_ASSERT(KMP_UBER_GTID(gtid));
4051   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4052   KMP_ASSERT(root->r.r_active == FALSE);
4053 
4054   r = __kmp_reset_root(gtid, root);
4055   KC_TRACE(10,
4056            ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4057   return r;
4058 }
4059 #endif
4060 
4061 #if KMP_DEBUG
4062 void __kmp_task_info() {
4063 
4064   kmp_int32 gtid = __kmp_entry_gtid();
4065   kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4066   kmp_info_t *this_thr = __kmp_threads[gtid];
4067   kmp_team_t *steam = this_thr->th.th_serial_team;
4068   kmp_team_t *team = this_thr->th.th_team;
4069 
4070   __kmp_printf(
4071       "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4072       "ptask=%p\n",
4073       gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4074       team->t.t_implicit_task_taskdata[tid].td_parent);
4075 }
4076 #endif // KMP_DEBUG
4077 
4078 /* TODO optimize with one big memclr, take out what isn't needed, split
4079    responsibility to workers as much as possible, and delay initialization of
4080    features as much as possible  */
4081 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4082                                   int tid, int gtid) {
4083   /* this_thr->th.th_info.ds.ds_gtid is setup in
4084      kmp_allocate_thread/create_worker.
4085      this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4086   KMP_DEBUG_ASSERT(this_thr != NULL);
4087   KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4088   KMP_DEBUG_ASSERT(team);
4089   KMP_DEBUG_ASSERT(team->t.t_threads);
4090   KMP_DEBUG_ASSERT(team->t.t_dispatch);
4091   kmp_info_t *master = team->t.t_threads[0];
4092   KMP_DEBUG_ASSERT(master);
4093   KMP_DEBUG_ASSERT(master->th.th_root);
4094 
4095   KMP_MB();
4096 
4097   TCW_SYNC_PTR(this_thr->th.th_team, team);
4098 
4099   this_thr->th.th_info.ds.ds_tid = tid;
4100   this_thr->th.th_set_nproc = 0;
4101   if (__kmp_tasking_mode != tskm_immediate_exec)
4102     // When tasking is possible, threads are not safe to reap until they are
4103     // done tasking; this will be set when tasking code is exited in wait
4104     this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4105   else // no tasking --> always safe to reap
4106     this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4107   this_thr->th.th_set_proc_bind = proc_bind_default;
4108 #if KMP_AFFINITY_SUPPORTED
4109   this_thr->th.th_new_place = this_thr->th.th_current_place;
4110 #endif
4111   this_thr->th.th_root = master->th.th_root;
4112 
4113   /* setup the thread's cache of the team structure */
4114   this_thr->th.th_team_nproc = team->t.t_nproc;
4115   this_thr->th.th_team_master = master;
4116   this_thr->th.th_team_serialized = team->t.t_serialized;
4117   TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4118 
4119   KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4120 
4121   KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4122                 tid, gtid, this_thr, this_thr->th.th_current_task));
4123 
4124   __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4125                            team, tid, TRUE);
4126 
4127   KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4128                 tid, gtid, this_thr, this_thr->th.th_current_task));
4129   // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4130   // __kmp_initialize_team()?
4131 
4132   /* TODO no worksharing in speculative threads */
4133   this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4134 
4135   this_thr->th.th_local.this_construct = 0;
4136 
4137   if (!this_thr->th.th_pri_common) {
4138     this_thr->th.th_pri_common =
4139         (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4140     if (__kmp_storage_map) {
4141       __kmp_print_storage_map_gtid(
4142           gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4143           sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4144     }
4145     this_thr->th.th_pri_head = NULL;
4146   }
4147 
4148   if (this_thr != master && // Primary thread's CG root is initialized elsewhere
4149       this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4150     // Make new thread's CG root same as primary thread's
4151     KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4152     kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4153     if (tmp) {
4154       // worker changes CG, need to check if old CG should be freed
4155       int i = tmp->cg_nthreads--;
4156       KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4157                      " on node %p of thread %p to %d\n",
4158                      this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4159       if (i == 1) {
4160         __kmp_free(tmp); // last thread left CG --> free it
4161       }
4162     }
4163     this_thr->th.th_cg_roots = master->th.th_cg_roots;
4164     // Increment new thread's CG root's counter to add the new thread
4165     this_thr->th.th_cg_roots->cg_nthreads++;
4166     KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4167                    " node %p of thread %p to %d\n",
4168                    this_thr, this_thr->th.th_cg_roots,
4169                    this_thr->th.th_cg_roots->cg_root,
4170                    this_thr->th.th_cg_roots->cg_nthreads));
4171     this_thr->th.th_current_task->td_icvs.thread_limit =
4172         this_thr->th.th_cg_roots->cg_thread_limit;
4173   }
4174 
4175   /* Initialize dynamic dispatch */
4176   {
4177     volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4178     // Use team max_nproc since this will never change for the team.
4179     size_t disp_size =
4180         sizeof(dispatch_private_info_t) *
4181         (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
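    // A serialized (single-thread) team needs only one dispatch buffer;
    // otherwise __kmp_dispatch_num_buffers buffers are reserved, presumably so
    // that several nowait loop schedules can be in flight concurrently.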
4182     KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4183                   team->t.t_max_nproc));
4184     KMP_ASSERT(dispatch);
4185     KMP_DEBUG_ASSERT(team->t.t_dispatch);
4186     KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4187 
4188     dispatch->th_disp_index = 0;
4189     dispatch->th_doacross_buf_idx = 0;
4190     if (!dispatch->th_disp_buffer) {
4191       dispatch->th_disp_buffer =
4192           (dispatch_private_info_t *)__kmp_allocate(disp_size);
4193 
4194       if (__kmp_storage_map) {
4195         __kmp_print_storage_map_gtid(
4196             gtid, &dispatch->th_disp_buffer[0],
4197             &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4198                                           ? 1
4199                                           : __kmp_dispatch_num_buffers],
4200             disp_size,
4201             "th_%d.th_dispatch.th_disp_buffer "
4202             "(team_%d.t_dispatch[%d].th_disp_buffer)",
4203             gtid, team->t.t_id, gtid);
4204       }
4205     } else {
4206       memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4207     }
4208 
4209     dispatch->th_dispatch_pr_current = 0;
4210     dispatch->th_dispatch_sh_current = 0;
4211 
4212     dispatch->th_deo_fcn = 0; /* ORDERED     */
4213     dispatch->th_dxo_fcn = 0; /* END ORDERED */
4214   }
4215 
4216   this_thr->th.th_next_pool = NULL;
4217 
4218   if (!this_thr->th.th_task_state_memo_stack) {
4219     size_t i;
4220     this_thr->th.th_task_state_memo_stack =
4221         (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4222     this_thr->th.th_task_state_top = 0;
4223     this_thr->th.th_task_state_stack_sz = 4;
4224     for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4225          ++i) // zero init the stack
4226       this_thr->th.th_task_state_memo_stack[i] = 0;
4227   }
4228 
4229   KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4230   KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4231 
4232   KMP_MB();
4233 }
4234 
/* Allocate a new thread for the requesting team. This is only called from
   within a forkjoin critical section. We will first try to get an available
   thread from the thread pool. If none is available, we will fork a new one,
   assuming we are allowed to create one -- this should be assured, as the
   caller is expected to check this first. */
4240 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4241                                   int new_tid) {
4242   kmp_team_t *serial_team;
4243   kmp_info_t *new_thr;
4244   int new_gtid;
4245 
4246   KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4247   KMP_DEBUG_ASSERT(root && team);
4248 #if !KMP_NESTED_HOT_TEAMS
4249   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4250 #endif
4251   KMP_MB();
4252 
4253   /* first, try to get one from the thread pool */
4254   if (__kmp_thread_pool) {
4255     new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4256     __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4257     if (new_thr == __kmp_thread_pool_insert_pt) {
4258       __kmp_thread_pool_insert_pt = NULL;
4259     }
4260     TCW_4(new_thr->th.th_in_pool, FALSE);
4261     __kmp_suspend_initialize_thread(new_thr);
4262     __kmp_lock_suspend_mx(new_thr);
4263     if (new_thr->th.th_active_in_pool == TRUE) {
4264       KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4265       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4266       new_thr->th.th_active_in_pool = FALSE;
4267     }
4268     __kmp_unlock_suspend_mx(new_thr);
4269 
4270     KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4271                   __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4272     KMP_ASSERT(!new_thr->th.th_team);
4273     KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4274 
4275     /* setup the thread structure */
4276     __kmp_initialize_info(new_thr, team, new_tid,
4277                           new_thr->th.th_info.ds.ds_gtid);
4278     KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4279 
4280     TCW_4(__kmp_nth, __kmp_nth + 1);
4281 
4282     new_thr->th.th_task_state = 0;
4283     new_thr->th.th_task_state_top = 0;
4284     new_thr->th.th_task_state_stack_sz = 4;
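    // The task-state bookkeeping above is reset for the recycled pool thread;
    // the memo stack storage itself is left as previously allocated.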
4285 
4286 #ifdef KMP_ADJUST_BLOCKTIME
4287     /* Adjust blocktime back to zero if necessary */
4288     /* Middle initialization might not have occurred yet */
4289     if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4290       if (__kmp_nth > __kmp_avail_proc) {
4291         __kmp_zero_bt = TRUE;
4292       }
4293     }
4294 #endif /* KMP_ADJUST_BLOCKTIME */
4295 
4296 #if KMP_DEBUG
    // If the thread entered the pool via __kmp_free_thread, wait_flag should
    // not be KMP_BARRIER_PARENT_FLAG.
4299     int b;
4300     kmp_balign_t *balign = new_thr->th.th_bar;
4301     for (b = 0; b < bs_last_barrier; ++b)
4302       KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4303 #endif
4304 
4305     KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4306                   __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4307 
4308     KMP_MB();
4309     return new_thr;
4310   }
4311 
  /* no thread available in the pool, so we'll fork a new one */
4313   KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4314   KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4315 
4316 #if KMP_USE_MONITOR
4317   // If this is the first worker thread the RTL is creating, then also
4318   // launch the monitor thread.  We try to do this as early as possible.
4319   if (!TCR_4(__kmp_init_monitor)) {
4320     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4321     if (!TCR_4(__kmp_init_monitor)) {
4322       KF_TRACE(10, ("before __kmp_create_monitor\n"));
4323       TCW_4(__kmp_init_monitor, 1);
4324       __kmp_create_monitor(&__kmp_monitor);
4325       KF_TRACE(10, ("after __kmp_create_monitor\n"));
4326 #if KMP_OS_WINDOWS
      // AC: wait until the monitor has started. This is a fix for CQ232808.
      // The reason is that if the library is loaded/unloaded in a loop with
      // small (parallel) work in between, then there is a high probability
      // that the monitor thread only starts after the library has shut down.
      // At shutdown it is too late to cope with the problem, because when the
      // primary thread is in DllMain (process detach) the monitor has no
      // chance to start (it is blocked), and the primary thread has no means
      // to inform the monitor that the library has gone, because all the
      // memory the monitor can access is about to be released/reset.
4336       while (TCR_4(__kmp_init_monitor) < 2) {
4337         KMP_YIELD(TRUE);
4338       }
4339       KF_TRACE(10, ("after monitor thread has started\n"));
4340 #endif
4341     }
4342     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4343   }
4344 #endif
4345 
4346   KMP_MB();
4347 
4348   {
4349     int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4350                              ? 1
4351                              : __kmp_hidden_helper_threads_num + 1;
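    // While the hidden helper threads are being initialized they take the low
    // gtids (1 .. __kmp_hidden_helper_threads_num); once that is done, regular
    // workers get gtids above that reserved range.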
4352 
4353     for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4354          ++new_gtid) {
4355       KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4356     }
4357 
4358     if (TCR_4(__kmp_init_hidden_helper_threads)) {
4359       KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4360     }
4361   }
4362 
4363   /* allocate space for it. */
4364   new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4365 
4366   TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4367 
4368 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
  // Suppress race-condition detection on synchronization flags in debug mode;
  // this helps to analyze library internals by eliminating false positives.
4371   __itt_suppress_mark_range(
4372       __itt_suppress_range, __itt_suppress_threading_errors,
4373       &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4374   __itt_suppress_mark_range(
4375       __itt_suppress_range, __itt_suppress_threading_errors,
4376       &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4377 #if KMP_OS_WINDOWS
4378   __itt_suppress_mark_range(
4379       __itt_suppress_range, __itt_suppress_threading_errors,
4380       &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4381 #else
4382   __itt_suppress_mark_range(__itt_suppress_range,
4383                             __itt_suppress_threading_errors,
4384                             &new_thr->th.th_suspend_init_count,
4385                             sizeof(new_thr->th.th_suspend_init_count));
4386 #endif
4387   // TODO: check if we need to also suppress b_arrived flags
4388   __itt_suppress_mark_range(__itt_suppress_range,
4389                             __itt_suppress_threading_errors,
4390                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4391                             sizeof(new_thr->th.th_bar[0].bb.b_go));
4392   __itt_suppress_mark_range(__itt_suppress_range,
4393                             __itt_suppress_threading_errors,
4394                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4395                             sizeof(new_thr->th.th_bar[1].bb.b_go));
4396   __itt_suppress_mark_range(__itt_suppress_range,
4397                             __itt_suppress_threading_errors,
4398                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4399                             sizeof(new_thr->th.th_bar[2].bb.b_go));
4400 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4401   if (__kmp_storage_map) {
4402     __kmp_print_thread_storage_map(new_thr, new_gtid);
4403   }
4404 
4405   // add the reserve serialized team, initialized from the team's primary thread
4406   {
4407     kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4408     KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4409     new_thr->th.th_serial_team = serial_team =
4410         (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4411 #if OMPT_SUPPORT
4412                                           ompt_data_none, // root parallel id
4413 #endif
4414                                           proc_bind_default, &r_icvs,
4415                                           0 USE_NESTED_HOT_ARG(NULL));
4416   }
4417   KMP_ASSERT(serial_team);
  serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not
  // for execution (it is unused for now).
4420   serial_team->t.t_threads[0] = new_thr;
4421   KF_TRACE(10,
4422            ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4423             new_thr));
4424 
4425   /* setup the thread structures */
4426   __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4427 
4428 #if USE_FAST_MEMORY
4429   __kmp_initialize_fast_memory(new_thr);
4430 #endif /* USE_FAST_MEMORY */
4431 
4432 #if KMP_USE_BGET
4433   KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4434   __kmp_initialize_bget(new_thr);
4435 #endif
4436 
4437   __kmp_init_random(new_thr); // Initialize random number generator
4438 
4439   /* Initialize these only once when thread is grabbed for a team allocation */
4440   KA_TRACE(20,
4441            ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4442             __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4443 
4444   int b;
4445   kmp_balign_t *balign = new_thr->th.th_bar;
4446   for (b = 0; b < bs_last_barrier; ++b) {
4447     balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4448     balign[b].bb.team = NULL;
4449     balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4450     balign[b].bb.use_oncore_barrier = 0;
4451   }
4452 
4453   new_thr->th.th_spin_here = FALSE;
4454   new_thr->th.th_next_waiting = 0;
4455 #if KMP_OS_UNIX
4456   new_thr->th.th_blocking = false;
4457 #endif
4458 
4459 #if KMP_AFFINITY_SUPPORTED
4460   new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4461   new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4462   new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4463   new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4464 #endif
4465   new_thr->th.th_def_allocator = __kmp_def_allocator;
4466   new_thr->th.th_prev_level = 0;
4467   new_thr->th.th_prev_num_threads = 1;
4468 
4469   TCW_4(new_thr->th.th_in_pool, FALSE);
4470   new_thr->th.th_active_in_pool = FALSE;
4471   TCW_4(new_thr->th.th_active, TRUE);
4472 
4473   /* adjust the global counters */
4474   __kmp_all_nth++;
4475   __kmp_nth++;
4476 
4477   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4478   // numbers of procs, and method #2 (keyed API call) for higher numbers.
4479   if (__kmp_adjust_gtid_mode) {
4480     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4481       if (TCR_4(__kmp_gtid_mode) != 2) {
4482         TCW_4(__kmp_gtid_mode, 2);
4483       }
4484     } else {
4485       if (TCR_4(__kmp_gtid_mode) != 1) {
4486         TCW_4(__kmp_gtid_mode, 1);
4487       }
4488     }
4489   }
4490 
4491 #ifdef KMP_ADJUST_BLOCKTIME
4492   /* Adjust blocktime back to zero if necessary       */
4493   /* Middle initialization might not have occurred yet */
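  /* If creating this thread oversubscribes the machine (__kmp_nth exceeds
     __kmp_avail_proc), force blocktime to zero so idle threads stop spinning
     promptly. */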
4494   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4495     if (__kmp_nth > __kmp_avail_proc) {
4496       __kmp_zero_bt = TRUE;
4497     }
4498   }
4499 #endif /* KMP_ADJUST_BLOCKTIME */
4500 
4501   /* actually fork it and create the new worker thread */
4502   KF_TRACE(
4503       10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4504   __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4505   KF_TRACE(10,
4506            ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4507 
4508   KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4509                 new_gtid));
4510   KMP_MB();
4511   return new_thr;
4512 }
4513 
/* Reinitialize team for reuse.
   The hot team code calls this routine at every fork barrier, so the EPCC
   barrier tests are extremely sensitive to changes in it, esp. writes to the
   team struct, which cause a cache invalidation in all threads.
   IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4519 static void __kmp_reinitialize_team(kmp_team_t *team,
4520                                     kmp_internal_control_t *new_icvs,
4521                                     ident_t *loc) {
4522   KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4523                 team->t.t_threads[0], team));
4524   KMP_DEBUG_ASSERT(team && new_icvs);
4525   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4526   KMP_CHECK_UPDATE(team->t.t_ident, loc);
4527 
4528   KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4529   // Copy ICVs to the primary thread's implicit taskdata
4530   __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4531   copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4532 
4533   KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4534                 team->t.t_threads[0], team));
4535 }
4536 
4537 /* Initialize the team data structure.
4538    This assumes the t_threads and t_max_nproc are already set.
4539    Also, we don't touch the arguments */
4540 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4541                                   kmp_internal_control_t *new_icvs,
4542                                   ident_t *loc) {
4543   KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4544 
4545   /* verify */
4546   KMP_DEBUG_ASSERT(team);
4547   KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4548   KMP_DEBUG_ASSERT(team->t.t_threads);
4549   KMP_MB();
4550 
4551   team->t.t_master_tid = 0; /* not needed */
4552   /* team->t.t_master_bar;        not needed */
4553   team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4554   team->t.t_nproc = new_nproc;
4555 
4556   /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
4557   team->t.t_next_pool = NULL;
4558   /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4559    * up hot team */
4560 
4561   TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4562   team->t.t_invoke = NULL; /* not needed */
4563 
4564   // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
4565   team->t.t_sched.sched = new_icvs->sched.sched;
4566 
4567 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4568   team->t.t_fp_control_saved = FALSE; /* not needed */
4569   team->t.t_x87_fpu_control_word = 0; /* not needed */
4570   team->t.t_mxcsr = 0; /* not needed */
4571 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4572 
4573   team->t.t_construct = 0;
4574 
4575   team->t.t_ordered.dt.t_value = 0;
4576   team->t.t_master_active = FALSE;
4577 
4578 #ifdef KMP_DEBUG
4579   team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4580 #endif
4581 #if KMP_OS_WINDOWS
4582   team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4583 #endif
4584 
4585   team->t.t_control_stack_top = NULL;
4586 
4587   __kmp_reinitialize_team(team, new_icvs, loc);
4588 
4589   KMP_MB();
4590   KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4591 }
4592 
4593 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
/* Sets the full affinity mask for the calling thread and stores the previous
   mask in *old_mask (if non-NULL); no runtime structures are modified. */
4595 static void
4596 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4597   if (KMP_AFFINITY_CAPABLE()) {
4598     int status;
4599     if (old_mask != NULL) {
4600       status = __kmp_get_system_affinity(old_mask, TRUE);
4601       int error = errno;
4602       if (status != 0) {
4603         __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4604                     __kmp_msg_null);
4605       }
4606     }
4607     __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4608   }
4609 }
4610 #endif
4611 
4612 #if KMP_AFFINITY_SUPPORTED
4613 
4614 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4615 // It calculates the worker + primary thread's partition based upon the parent
4616 // thread's partition, and binds each worker to a thread in their partition.
4617 // The primary thread's partition should already include its current binding.
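// In brief: proc_bind_primary pins every worker to the primary thread's place;
// proc_bind_close packs workers into consecutive places of the partition,
// starting from the primary thread's place; proc_bind_spread spreads workers
// (and their sub-partitions) as evenly as possible across the partition.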
4618 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4619   // Do not partition places for the hidden helper team
4620   if (KMP_HIDDEN_HELPER_TEAM(team))
4621     return;
4622   // Copy the primary thread's place partition to the team struct
4623   kmp_info_t *master_th = team->t.t_threads[0];
4624   KMP_DEBUG_ASSERT(master_th != NULL);
4625   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4626   int first_place = master_th->th.th_first_place;
4627   int last_place = master_th->th.th_last_place;
4628   int masters_place = master_th->th.th_current_place;
4629   team->t.t_first_place = first_place;
4630   team->t.t_last_place = last_place;
4631 
4632   KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4633                 "bound to place %d partition = [%d,%d]\n",
4634                 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4635                 team->t.t_id, masters_place, first_place, last_place));
4636 
4637   switch (proc_bind) {
4638 
4639   case proc_bind_default:
4640     // Serial teams might have the proc_bind policy set to proc_bind_default.
4641     // Not an issue -- we don't rebind primary thread for any proc_bind policy.
4642     KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4643     break;
4644 
4645   case proc_bind_primary: {
4646     int f;
4647     int n_th = team->t.t_nproc;
4648     for (f = 1; f < n_th; f++) {
4649       kmp_info_t *th = team->t.t_threads[f];
4650       KMP_DEBUG_ASSERT(th != NULL);
4651       th->th.th_first_place = first_place;
4652       th->th.th_last_place = last_place;
4653       th->th.th_new_place = masters_place;
4654       if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4655           team->t.t_display_affinity != 1) {
4656         team->t.t_display_affinity = 1;
4657       }
4658 
4659       KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
4660                      "partition = [%d,%d]\n",
4661                      __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4662                      f, masters_place, first_place, last_place));
4663     }
4664   } break;
4665 
4666   case proc_bind_close: {
4667     int f;
4668     int n_th = team->t.t_nproc;
4669     int n_places;
4670     if (first_place <= last_place) {
4671       n_places = last_place - first_place + 1;
4672     } else {
4673       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4674     }
4675     if (n_th <= n_places) {
4676       int place = masters_place;
4677       for (f = 1; f < n_th; f++) {
4678         kmp_info_t *th = team->t.t_threads[f];
4679         KMP_DEBUG_ASSERT(th != NULL);
4680 
4681         if (place == last_place) {
4682           place = first_place;
4683         } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4684           place = 0;
4685         } else {
4686           place++;
4687         }
4688         th->th.th_first_place = first_place;
4689         th->th.th_last_place = last_place;
4690         th->th.th_new_place = place;
4691         if (__kmp_display_affinity && place != th->th.th_current_place &&
4692             team->t.t_display_affinity != 1) {
4693           team->t.t_display_affinity = 1;
4694         }
4695 
4696         KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4697                        "partition = [%d,%d]\n",
4698                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4699                        team->t.t_id, f, place, first_place, last_place));
4700       }
4701     } else {
4702       int S, rem, gap, s_count;
4703       S = n_th / n_places;
4704       s_count = 0;
4705       rem = n_th - (S * n_places);
4706       gap = rem > 0 ? n_places / rem : n_places;
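      // Each place gets S threads; the remaining rem threads are added one per
      // place every 'gap' places. E.g. (hypothetical) n_th = 10, n_places = 4
      // gives S = 2, rem = 2, gap = 2, so two of the four places get 3 threads.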
4707       int place = masters_place;
4708       int gap_ct = gap;
4709       for (f = 0; f < n_th; f++) {
4710         kmp_info_t *th = team->t.t_threads[f];
4711         KMP_DEBUG_ASSERT(th != NULL);
4712 
4713         th->th.th_first_place = first_place;
4714         th->th.th_last_place = last_place;
4715         th->th.th_new_place = place;
4716         if (__kmp_display_affinity && place != th->th.th_current_place &&
4717             team->t.t_display_affinity != 1) {
4718           team->t.t_display_affinity = 1;
4719         }
4720         s_count++;
4721 
4722         if ((s_count == S) && rem && (gap_ct == gap)) {
4723           // do nothing, add an extra thread to place on next iteration
4724         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4725           // we added an extra thread to this place; move to next place
4726           if (place == last_place) {
4727             place = first_place;
4728           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4729             place = 0;
4730           } else {
4731             place++;
4732           }
4733           s_count = 0;
4734           gap_ct = 1;
4735           rem--;
4736         } else if (s_count == S) { // place full; don't add extra
4737           if (place == last_place) {
4738             place = first_place;
4739           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4740             place = 0;
4741           } else {
4742             place++;
4743           }
4744           gap_ct++;
4745           s_count = 0;
4746         }
4747 
4748         KA_TRACE(100,
4749                  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4750                   "partition = [%d,%d]\n",
4751                   __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4752                   th->th.th_new_place, first_place, last_place));
4753       }
4754       KMP_DEBUG_ASSERT(place == masters_place);
4755     }
4756   } break;
4757 
4758   case proc_bind_spread: {
4759     int f;
4760     int n_th = team->t.t_nproc;
4761     int n_places;
4762     int thidx;
4763     if (first_place <= last_place) {
4764       n_places = last_place - first_place + 1;
4765     } else {
4766       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4767     }
4768     if (n_th <= n_places) {
4769       int place = -1;
4770 
4771       if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4772         int S = n_places / n_th;
4773         int s_count, rem, gap, gap_ct;
4774 
4775         place = masters_place;
4776         rem = n_places - n_th * S;
4777         gap = rem ? n_th / rem : 1;
4778         gap_ct = gap;
4779         thidx = n_th;
4780         if (update_master_only == 1)
4781           thidx = 1;
4782         for (f = 0; f < thidx; f++) {
4783           kmp_info_t *th = team->t.t_threads[f];
4784           KMP_DEBUG_ASSERT(th != NULL);
4785 
4786           th->th.th_first_place = place;
4787           th->th.th_new_place = place;
4788           if (__kmp_display_affinity && place != th->th.th_current_place &&
4789               team->t.t_display_affinity != 1) {
4790             team->t.t_display_affinity = 1;
4791           }
4792           s_count = 1;
4793           while (s_count < S) {
4794             if (place == last_place) {
4795               place = first_place;
4796             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4797               place = 0;
4798             } else {
4799               place++;
4800             }
4801             s_count++;
4802           }
4803           if (rem && (gap_ct == gap)) {
4804             if (place == last_place) {
4805               place = first_place;
4806             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4807               place = 0;
4808             } else {
4809               place++;
4810             }
4811             rem--;
4812             gap_ct = 0;
4813           }
4814           th->th.th_last_place = place;
4815           gap_ct++;
4816 
4817           if (place == last_place) {
4818             place = first_place;
4819           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4820             place = 0;
4821           } else {
4822             place++;
4823           }
4824 
4825           KA_TRACE(100,
4826                    ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4827                     "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4828                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4829                     f, th->th.th_new_place, th->th.th_first_place,
4830                     th->th.th_last_place, __kmp_affinity_num_masks));
4831         }
4832       } else {
        /* With a uniform space of available computation places, we can create
           T partitions of roughly P/T places each and put a thread into the
           first place of each partition. */
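        /* Illustrative (hypothetical) numbers: with n_places = 8, n_th = 3 and
           the primary thread on place 0, spacing = 9/3 = 3.0, yielding
           partitions [0,2], [3,5] and [6,7], with each thread bound to the
           first place of its partition. */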
4836         double current = static_cast<double>(masters_place);
4837         double spacing =
4838             (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
4839         int first, last;
4840         kmp_info_t *th;
4841 
4842         thidx = n_th + 1;
4843         if (update_master_only == 1)
4844           thidx = 1;
4845         for (f = 0; f < thidx; f++) {
4846           first = static_cast<int>(current);
4847           last = static_cast<int>(current + spacing) - 1;
4848           KMP_DEBUG_ASSERT(last >= first);
4849           if (first >= n_places) {
4850             if (masters_place) {
4851               first -= n_places;
4852               last -= n_places;
4853               if (first == (masters_place + 1)) {
4854                 KMP_DEBUG_ASSERT(f == n_th);
4855                 first--;
4856               }
4857               if (last == masters_place) {
4858                 KMP_DEBUG_ASSERT(f == (n_th - 1));
4859                 last--;
4860               }
4861             } else {
4862               KMP_DEBUG_ASSERT(f == n_th);
4863               first = 0;
4864               last = 0;
4865             }
4866           }
4867           if (last >= n_places) {
4868             last = (n_places - 1);
4869           }
4870           place = first;
4871           current += spacing;
4872           if (f < n_th) {
4873             KMP_DEBUG_ASSERT(0 <= first);
4874             KMP_DEBUG_ASSERT(n_places > first);
4875             KMP_DEBUG_ASSERT(0 <= last);
4876             KMP_DEBUG_ASSERT(n_places > last);
4877             KMP_DEBUG_ASSERT(last_place >= first_place);
4878             th = team->t.t_threads[f];
4879             KMP_DEBUG_ASSERT(th);
4880             th->th.th_first_place = first;
4881             th->th.th_new_place = place;
4882             th->th.th_last_place = last;
4883             if (__kmp_display_affinity && place != th->th.th_current_place &&
4884                 team->t.t_display_affinity != 1) {
4885               team->t.t_display_affinity = 1;
4886             }
4887             KA_TRACE(100,
4888                      ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4889                       "partition = [%d,%d], spacing = %.4f\n",
4890                       __kmp_gtid_from_thread(team->t.t_threads[f]),
4891                       team->t.t_id, f, th->th.th_new_place,
4892                       th->th.th_first_place, th->th.th_last_place, spacing));
4893           }
4894         }
4895       }
4896       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4897     } else {
4898       int S, rem, gap, s_count;
4899       S = n_th / n_places;
4900       s_count = 0;
4901       rem = n_th - (S * n_places);
4902       gap = rem > 0 ? n_places / rem : n_places;
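      // Same round-robin fill as the oversubscribed proc_bind_close case
      // above, except each thread's partition collapses to its single assigned
      // place (th_first_place == th_last_place == th_new_place).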
4903       int place = masters_place;
4904       int gap_ct = gap;
4905       thidx = n_th;
4906       if (update_master_only == 1)
4907         thidx = 1;
4908       for (f = 0; f < thidx; f++) {
4909         kmp_info_t *th = team->t.t_threads[f];
4910         KMP_DEBUG_ASSERT(th != NULL);
4911 
4912         th->th.th_first_place = place;
4913         th->th.th_last_place = place;
4914         th->th.th_new_place = place;
4915         if (__kmp_display_affinity && place != th->th.th_current_place &&
4916             team->t.t_display_affinity != 1) {
4917           team->t.t_display_affinity = 1;
4918         }
4919         s_count++;
4920 
4921         if ((s_count == S) && rem && (gap_ct == gap)) {
4922           // do nothing, add an extra thread to place on next iteration
4923         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4924           // we added an extra thread to this place; move on to next place
4925           if (place == last_place) {
4926             place = first_place;
4927           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4928             place = 0;
4929           } else {
4930             place++;
4931           }
4932           s_count = 0;
4933           gap_ct = 1;
4934           rem--;
4935         } else if (s_count == S) { // place is full; don't add extra thread
4936           if (place == last_place) {
4937             place = first_place;
4938           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4939             place = 0;
4940           } else {
4941             place++;
4942           }
4943           gap_ct++;
4944           s_count = 0;
4945         }
4946 
4947         KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4948                        "partition = [%d,%d]\n",
4949                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4950                        team->t.t_id, f, th->th.th_new_place,
4951                        th->th.th_first_place, th->th.th_last_place));
4952       }
4953       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4954     }
4955   } break;
4956 
4957   default:
4958     break;
4959   }
4960 
4961   KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4962 }
4963 
4964 #endif // KMP_AFFINITY_SUPPORTED
4965 
4966 /* allocate a new team data structure to use.  take one off of the free pool if
4967    available */
4968 kmp_team_t *
4969 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4970 #if OMPT_SUPPORT
4971                     ompt_data_t ompt_parallel_data,
4972 #endif
4973                     kmp_proc_bind_t new_proc_bind,
4974                     kmp_internal_control_t *new_icvs,
4975                     int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4976   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4977   int f;
4978   kmp_team_t *team;
4979   int use_hot_team = !root->r.r_active;
4980   int level = 0;
4981 
4982   KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4983   KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4984   KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4985   KMP_MB();
4986 
4987 #if KMP_NESTED_HOT_TEAMS
4988   kmp_hot_team_ptr_t *hot_teams;
4989   if (master) {
4990     team = master->th.th_team;
4991     level = team->t.t_active_level;
4992     if (master->th.th_teams_microtask) { // in teams construct?
4993       if (master->th.th_teams_size.nteams > 1 &&
4994           ( // #teams > 1
4995               team->t.t_pkfn ==
4996                   (microtask_t)__kmp_teams_master || // inner fork of the teams
4997               master->th.th_teams_level <
4998                   team->t.t_level)) { // or nested parallel inside the teams
4999         ++level; // not increment if #teams==1, or for outer fork of the teams;
5000         // increment otherwise
5001       }
5002     }
5003     hot_teams = master->th.th_hot_teams;
5004     if (level < __kmp_hot_teams_max_level && hot_teams &&
5005         hot_teams[level].hot_team) {
5006       // hot team has already been allocated for given level
5007       use_hot_team = 1;
5008     } else {
5009       use_hot_team = 0;
5010     }
5011   } else {
5012     // check we won't access uninitialized hot_teams, just in case
5013     KMP_DEBUG_ASSERT(new_nproc == 1);
5014   }
5015 #endif
5016   // Optimization to use a "hot" team
5017   if (use_hot_team && new_nproc > 1) {
5018     KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
5019 #if KMP_NESTED_HOT_TEAMS
5020     team = hot_teams[level].hot_team;
5021 #else
5022     team = root->r.r_hot_team;
5023 #endif
5024 #if KMP_DEBUG
5025     if (__kmp_tasking_mode != tskm_immediate_exec) {
5026       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5027                     "task_team[1] = %p before reinit\n",
5028                     team->t.t_task_team[0], team->t.t_task_team[1]));
5029     }
5030 #endif
5031 
5032     // Has the number of threads changed?
5033     /* Let's assume the most common case is that the number of threads is
5034        unchanged, and put that case first. */
5035     if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5036       KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5037       // This case can mean that omp_set_num_threads() was called and the hot
5038       // team size was already reduced, so we check the special flag
5039       if (team->t.t_size_changed == -1) {
5040         team->t.t_size_changed = 1;
5041       } else {
5042         KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5043       }
5044 
5045       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5046       kmp_r_sched_t new_sched = new_icvs->sched;
5047       // set primary thread's schedule as new run-time schedule
5048       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5049 
5050       __kmp_reinitialize_team(team, new_icvs,
5051                               root->r.r_uber_thread->th.th_ident);
5052 
5053       KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5054                     team->t.t_threads[0], team));
5055       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5056 
5057 #if KMP_AFFINITY_SUPPORTED
5058       if ((team->t.t_size_changed == 0) &&
5059           (team->t.t_proc_bind == new_proc_bind)) {
5060         if (new_proc_bind == proc_bind_spread) {
5061           __kmp_partition_places(
5062               team, 1); // add flag to update only master for spread
5063         }
5064         KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5065                        "proc_bind = %d, partition = [%d,%d]\n",
5066                        team->t.t_id, new_proc_bind, team->t.t_first_place,
5067                        team->t.t_last_place));
5068       } else {
5069         KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5070         __kmp_partition_places(team);
5071       }
5072 #else
5073       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5074 #endif /* KMP_AFFINITY_SUPPORTED */
5075     } else if (team->t.t_nproc > new_nproc) {
5076       KA_TRACE(20,
5077                ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5078                 new_nproc));
5079 
5080       team->t.t_size_changed = 1;
5081 #if KMP_NESTED_HOT_TEAMS
5082       if (__kmp_hot_teams_mode == 0) {
        // AC: the saved number of threads should match the team's value in
        // this mode; it can be bigger in mode 1, when the hot team keeps
        // threads in reserve
5085         KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5086         hot_teams[level].hot_team_nth = new_nproc;
5087 #endif // KMP_NESTED_HOT_TEAMS
5088         /* release the extra threads we don't need any more */
5089         for (f = new_nproc; f < team->t.t_nproc; f++) {
5090           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5091           if (__kmp_tasking_mode != tskm_immediate_exec) {
5092             // When decreasing team size, threads no longer in the team should
5093             // unref task team.
5094             team->t.t_threads[f]->th.th_task_team = NULL;
5095           }
5096           __kmp_free_thread(team->t.t_threads[f]);
5097           team->t.t_threads[f] = NULL;
5098         }
5099 #if KMP_NESTED_HOT_TEAMS
5100       } // (__kmp_hot_teams_mode == 0)
5101       else {
5102         // When keeping extra threads in team, switch threads to wait on own
5103         // b_go flag
5104         for (f = new_nproc; f < team->t.t_nproc; ++f) {
5105           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5106           kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5107           for (int b = 0; b < bs_last_barrier; ++b) {
5108             if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5109               balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5110             }
5111             KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5112           }
5113         }
5114       }
5115 #endif // KMP_NESTED_HOT_TEAMS
5116       team->t.t_nproc = new_nproc;
5117       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5118       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5119       __kmp_reinitialize_team(team, new_icvs,
5120                               root->r.r_uber_thread->th.th_ident);
5121 
5122       // Update remaining threads
5123       for (f = 0; f < new_nproc; ++f) {
5124         team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5125       }
5126 
5127       // restore the current task state of the primary thread: should be the
5128       // implicit task
5129       KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5130                     team->t.t_threads[0], team));
5131 
5132       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5133 
5134 #ifdef KMP_DEBUG
5135       for (f = 0; f < team->t.t_nproc; f++) {
5136         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5137                          team->t.t_threads[f]->th.th_team_nproc ==
5138                              team->t.t_nproc);
5139       }
5140 #endif
5141 
5142       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5143 #if KMP_AFFINITY_SUPPORTED
5144       __kmp_partition_places(team);
5145 #endif
5146     } else { // team->t.t_nproc < new_nproc
5147 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5148       kmp_affin_mask_t *old_mask;
5149       if (KMP_AFFINITY_CAPABLE()) {
5150         KMP_CPU_ALLOC(old_mask);
5151       }
5152 #endif
5153 
5154       KA_TRACE(20,
5155                ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5156                 new_nproc));
5157 
5158       team->t.t_size_changed = 1;
5159 
5160 #if KMP_NESTED_HOT_TEAMS
5161       int avail_threads = hot_teams[level].hot_team_nth;
5162       if (new_nproc < avail_threads)
5163         avail_threads = new_nproc;
5164       kmp_info_t **other_threads = team->t.t_threads;
5165       for (f = team->t.t_nproc; f < avail_threads; ++f) {
5166         // Adjust barrier data of reserved threads (if any) of the team
5167         // Other data will be set in __kmp_initialize_info() below.
5168         int b;
5169         kmp_balign_t *balign = other_threads[f]->th.th_bar;
5170         for (b = 0; b < bs_last_barrier; ++b) {
5171           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5172           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5173 #if USE_DEBUGGER
5174           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5175 #endif
5176         }
5177       }
5178       if (hot_teams[level].hot_team_nth >= new_nproc) {
        // we have all the needed threads in reserve, so no need to allocate
        // any. This is only possible in mode 1; mode 0 cannot have reserved
        // threads
5181         KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5182         team->t.t_nproc = new_nproc; // just get reserved threads involved
5183       } else {
5184         // we may have some threads in reserve, but not enough
5185         team->t.t_nproc =
5186             hot_teams[level]
5187                 .hot_team_nth; // get reserved threads involved if any
5188         hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5189 #endif // KMP_NESTED_HOT_TEAMS
5190         if (team->t.t_max_nproc < new_nproc) {
5191           /* reallocate larger arrays */
5192           __kmp_reallocate_team_arrays(team, new_nproc);
5193           __kmp_reinitialize_team(team, new_icvs, NULL);
5194         }
5195 
5196 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5197         /* Temporarily set full mask for primary thread before creation of
5198            workers. The reason is that workers inherit the affinity from the
5199            primary thread, so if a lot of workers are created on the single
5200            core quickly, they don't get a chance to set their own affinity for
5201            a long time. */
5202         __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5203 #endif
5204 
5205         /* allocate new threads for the hot team */
5206         for (f = team->t.t_nproc; f < new_nproc; f++) {
5207           kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5208           KMP_DEBUG_ASSERT(new_worker);
5209           team->t.t_threads[f] = new_worker;
5210 
5211           KA_TRACE(20,
5212                    ("__kmp_allocate_team: team %d init T#%d arrived: "
5213                     "join=%llu, plain=%llu\n",
5214                     team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5215                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5216                     team->t.t_bar[bs_plain_barrier].b_arrived));
5217 
5218           { // Initialize barrier data for new threads.
5219             int b;
5220             kmp_balign_t *balign = new_worker->th.th_bar;
5221             for (b = 0; b < bs_last_barrier; ++b) {
5222               balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5223               KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5224                                KMP_BARRIER_PARENT_FLAG);
5225 #if USE_DEBUGGER
5226               balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5227 #endif
5228             }
5229           }
5230         }
5231 
5232 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5233         if (KMP_AFFINITY_CAPABLE()) {
5234           /* Restore initial primary thread's affinity mask */
5235           __kmp_set_system_affinity(old_mask, TRUE);
5236           KMP_CPU_FREE(old_mask);
5237         }
5238 #endif
5239 #if KMP_NESTED_HOT_TEAMS
5240       } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5241 #endif // KMP_NESTED_HOT_TEAMS
      /* make sure everyone is synchronized */
5243       int old_nproc = team->t.t_nproc; // save old value and use to update only
5244       // new threads below
5245       __kmp_initialize_team(team, new_nproc, new_icvs,
5246                             root->r.r_uber_thread->th.th_ident);
5247 
5248       /* reinitialize the threads */
5249       KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5250       for (f = 0; f < team->t.t_nproc; ++f)
5251         __kmp_initialize_info(team->t.t_threads[f], team, f,
5252                               __kmp_gtid_from_tid(f, team));
5253 
5254       if (level) { // set th_task_state for new threads in nested hot team
5255         // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5256         // only need to set the th_task_state for the new threads. th_task_state
5257         // for primary thread will not be accurate until after this in
5258         // __kmp_fork_call(), so we look to the primary thread's memo_stack to
5259         // get the correct value.
5260         for (f = old_nproc; f < team->t.t_nproc; ++f)
5261           team->t.t_threads[f]->th.th_task_state =
5262               team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5263       } else { // set th_task_state for new threads in non-nested hot team
5264         // copy primary thread's state
5265         kmp_uint8 old_state = team->t.t_threads[0]->th.th_task_state;
5266         for (f = old_nproc; f < team->t.t_nproc; ++f)
5267           team->t.t_threads[f]->th.th_task_state = old_state;
5268       }
5269 
5270 #ifdef KMP_DEBUG
5271       for (f = 0; f < team->t.t_nproc; ++f) {
5272         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5273                          team->t.t_threads[f]->th.th_team_nproc ==
5274                              team->t.t_nproc);
5275       }
5276 #endif
5277 
5278       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5279 #if KMP_AFFINITY_SUPPORTED
5280       __kmp_partition_places(team);
5281 #endif
5282     } // Check changes in number of threads
5283 
5284     kmp_info_t *master = team->t.t_threads[0];
5285     if (master->th.th_teams_microtask) {
5286       for (f = 1; f < new_nproc; ++f) {
5287         // propagate teams construct specific info to workers
5288         kmp_info_t *thr = team->t.t_threads[f];
5289         thr->th.th_teams_microtask = master->th.th_teams_microtask;
5290         thr->th.th_teams_level = master->th.th_teams_level;
5291         thr->th.th_teams_size = master->th.th_teams_size;
5292       }
5293     }
5294 #if KMP_NESTED_HOT_TEAMS
5295     if (level) {
5296       // Sync barrier state for nested hot teams, not needed for outermost hot
5297       // team.
5298       for (f = 1; f < new_nproc; ++f) {
5299         kmp_info_t *thr = team->t.t_threads[f];
5300         int b;
5301         kmp_balign_t *balign = thr->th.th_bar;
5302         for (b = 0; b < bs_last_barrier; ++b) {
5303           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5304           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5305 #if USE_DEBUGGER
5306           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5307 #endif
5308         }
5309       }
5310     }
5311 #endif // KMP_NESTED_HOT_TEAMS
5312 
5313     /* reallocate space for arguments if necessary */
5314     __kmp_alloc_argv_entries(argc, team, TRUE);
5315     KMP_CHECK_UPDATE(team->t.t_argc, argc);
5316     // The hot team re-uses the previous task team,
5317     // if untouched during the previous release->gather phase.
5318 
5319     KF_TRACE(10, (" hot_team = %p\n", team));
5320 
5321 #if KMP_DEBUG
5322     if (__kmp_tasking_mode != tskm_immediate_exec) {
5323       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5324                     "task_team[1] = %p after reinit\n",
5325                     team->t.t_task_team[0], team->t.t_task_team[1]));
5326     }
5327 #endif
5328 
5329 #if OMPT_SUPPORT
5330     __ompt_team_assign_id(team, ompt_parallel_data);
5331 #endif
5332 
5333     KMP_MB();
5334 
5335     return team;
5336   }
5337 
5338   /* next, let's try to take one from the team pool */
5339   KMP_MB();
5340   for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5341     /* TODO: consider resizing undersized teams instead of reaping them, now
5342        that we have a resizing mechanism */
5343     if (team->t.t_max_nproc >= max_nproc) {
5344       /* take this team from the team pool */
5345       __kmp_team_pool = team->t.t_next_pool;
5346 
5347       /* setup the team for fresh use */
5348       __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5349 
5350       KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5351                     "task_team[1] %p to NULL\n",
5352                     &team->t.t_task_team[0], &team->t.t_task_team[1]));
5353       team->t.t_task_team[0] = NULL;
5354       team->t.t_task_team[1] = NULL;
5355 
5356       /* reallocate space for arguments if necessary */
5357       __kmp_alloc_argv_entries(argc, team, TRUE);
5358       KMP_CHECK_UPDATE(team->t.t_argc, argc);
5359 
5360       KA_TRACE(
5361           20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5362                team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5363       { // Initialize barrier data.
5364         int b;
5365         for (b = 0; b < bs_last_barrier; ++b) {
5366           team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5367 #if USE_DEBUGGER
5368           team->t.t_bar[b].b_master_arrived = 0;
5369           team->t.t_bar[b].b_team_arrived = 0;
5370 #endif
5371         }
5372       }
5373 
5374       team->t.t_proc_bind = new_proc_bind;
5375 
5376       KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5377                     team->t.t_id));
5378 
5379 #if OMPT_SUPPORT
5380       __ompt_team_assign_id(team, ompt_parallel_data);
5381 #endif
5382 
5383       KMP_MB();
5384 
5385       return team;
5386     }
5387 
5388     /* reap team if it is too small, then loop back and check the next one */
    // not sure if this is wise, but it will be redone during the hot-teams
    // rewrite.
5391     /* TODO: Use technique to find the right size hot-team, don't reap them */
5392     team = __kmp_reap_team(team);
5393     __kmp_team_pool = team;
5394   }
5395 
5396   /* nothing available in the pool, no matter, make a new team! */
5397   KMP_MB();
5398   team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5399 
5400   /* and set it up */
5401   team->t.t_max_nproc = max_nproc;
  /* NOTE well, for some reason allocating one big buffer and dividing it up
     seems to really hurt performance a lot on the P4, so let's not use this */
5404   __kmp_allocate_team_arrays(team, max_nproc);
5405 
5406   KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5407   __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5408 
5409   KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5410                 "%p to NULL\n",
5411                 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5412   team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5413   // memory, no need to duplicate
5414   team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5415   // memory, no need to duplicate
5416 
5417   if (__kmp_storage_map) {
5418     __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5419   }
5420 
5421   /* allocate space for arguments */
5422   __kmp_alloc_argv_entries(argc, team, FALSE);
5423   team->t.t_argc = argc;
5424 
5425   KA_TRACE(20,
5426            ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5427             team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5428   { // Initialize barrier data.
5429     int b;
5430     for (b = 0; b < bs_last_barrier; ++b) {
5431       team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5432 #if USE_DEBUGGER
5433       team->t.t_bar[b].b_master_arrived = 0;
5434       team->t.t_bar[b].b_team_arrived = 0;
5435 #endif
5436     }
5437   }
5438 
5439   team->t.t_proc_bind = new_proc_bind;
5440 
5441 #if OMPT_SUPPORT
5442   __ompt_team_assign_id(team, ompt_parallel_data);
5443   team->t.ompt_serialized_team_info = NULL;
5444 #endif
5445 
5446   KMP_MB();
5447 
5448   KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5449                 team->t.t_id));
5450 
5451   return team;
5452 }
5453 
5454 /* TODO implement hot-teams at all levels */
5455 /* TODO implement lazy thread release on demand (disband request) */
5456 
5457 /* free the team.  return it to the team pool.  release all the threads
5458  * associated with it */
5459 void __kmp_free_team(kmp_root_t *root,
5460                      kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5461   int f;
5462   KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5463                 team->t.t_id));
5464 
5465   /* verify state */
5466   KMP_DEBUG_ASSERT(root);
5467   KMP_DEBUG_ASSERT(team);
5468   KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5469   KMP_DEBUG_ASSERT(team->t.t_threads);
5470 
5471   int use_hot_team = team == root->r.r_hot_team;
5472 #if KMP_NESTED_HOT_TEAMS
5473   int level;
5474   kmp_hot_team_ptr_t *hot_teams;
5475   if (master) {
5476     level = team->t.t_active_level - 1;
5477     if (master->th.th_teams_microtask) { // in teams construct?
5478       if (master->th.th_teams_size.nteams > 1) {
5479         ++level; // level was not increased in teams construct for
5480         // team_of_masters
5481       }
5482       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5483           master->th.th_teams_level == team->t.t_level) {
5484         ++level; // level was not increased in teams construct for
5485         // team_of_workers before the parallel
5486       } // team->t.t_level will be increased inside parallel
5487     }
5488     hot_teams = master->th.th_hot_teams;
5489     if (level < __kmp_hot_teams_max_level) {
5490       KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5491       use_hot_team = 1;
5492     }
5493   }
5494 #endif // KMP_NESTED_HOT_TEAMS
5495 
5496   /* team is done working */
5497   TCW_SYNC_PTR(team->t.t_pkfn,
5498                NULL); // Important for Debugging Support Library.
5499 #if KMP_OS_WINDOWS
5500   team->t.t_copyin_counter = 0; // init counter for possible reuse
5501 #endif
5502   // Do not reset pointer to parent team to NULL for hot teams.
5503 
  /* if this is not a hot team, release its threads */
5505   if (!use_hot_team) {
5506     if (__kmp_tasking_mode != tskm_immediate_exec) {
5507       // Wait for threads to reach reapable state
5508       for (f = 1; f < team->t.t_nproc; ++f) {
5509         KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5510         kmp_info_t *th = team->t.t_threads[f];
5511         volatile kmp_uint32 *state = &th->th.th_reap_state;
5512         while (*state != KMP_SAFE_TO_REAP) {
5513 #if KMP_OS_WINDOWS
5514           // On Windows a thread can be killed at any time, check this
5515           DWORD ecode;
5516           if (!__kmp_is_thread_alive(th, &ecode)) {
5517             *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5518             break;
5519           }
5520 #endif
5521           // first check if thread is sleeping
5522           kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5523           if (fl.is_sleeping())
5524             fl.resume(__kmp_gtid_from_thread(th));
5525           KMP_CPU_PAUSE();
5526         }
5527       }
5528 
5529       // Delete task teams
5530       int tt_idx;
5531       for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5532         kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5533         if (task_team != NULL) {
5534           for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5535             KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5536             team->t.t_threads[f]->th.th_task_team = NULL;
5537           }
5538           KA_TRACE(
5539               20,
5540               ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5541                __kmp_get_gtid(), task_team, team->t.t_id));
5542 #if KMP_NESTED_HOT_TEAMS
5543           __kmp_free_task_team(master, task_team);
5544 #endif
5545           team->t.t_task_team[tt_idx] = NULL;
5546         }
5547       }
5548     }
5549 
5550     // Reset pointer to parent team only for non-hot teams.
5551     team->t.t_parent = NULL;
5552     team->t.t_level = 0;
5553     team->t.t_active_level = 0;
5554 
5555     /* free the worker threads */
5556     for (f = 1; f < team->t.t_nproc; ++f) {
5557       KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5558       __kmp_free_thread(team->t.t_threads[f]);
5559       team->t.t_threads[f] = NULL;
5560     }
5561 
5562     /* put the team back in the team pool */
5563     /* TODO limit size of team pool, call reap_team if pool too large */
5564     team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5565     __kmp_team_pool = (volatile kmp_team_t *)team;
5566   } else { // Check if team was created for primary threads in teams construct
5567     // See if first worker is a CG root
5568     KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5569                      team->t.t_threads[1]->th.th_cg_roots);
5570     if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5571       // Clean up the CG root nodes on workers so that this team can be re-used
5572       for (f = 1; f < team->t.t_nproc; ++f) {
5573         kmp_info_t *thr = team->t.t_threads[f];
5574         KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5575                          thr->th.th_cg_roots->cg_root == thr);
5576         // Pop current CG root off list
5577         kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5578         thr->th.th_cg_roots = tmp->up;
5579         KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5580                        " up to node %p. cg_nthreads was %d\n",
5581                        thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5582         int i = tmp->cg_nthreads--;
5583         if (i == 1) {
5584           __kmp_free(tmp); // free CG if we are the last thread in it
5585         }
5586         // Restore current task's thread_limit from CG root
5587         if (thr->th.th_cg_roots)
5588           thr->th.th_current_task->td_icvs.thread_limit =
5589               thr->th.th_cg_roots->cg_thread_limit;
5590       }
5591     }
5592   }
5593 
5594   KMP_MB();
5595 }
5596 
5597 /* reap the team.  destroy it, reclaim all its resources and free its memory */
5598 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5599   kmp_team_t *next_pool = team->t.t_next_pool;
5600 
5601   KMP_DEBUG_ASSERT(team);
5602   KMP_DEBUG_ASSERT(team->t.t_dispatch);
5603   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5604   KMP_DEBUG_ASSERT(team->t.t_threads);
5605   KMP_DEBUG_ASSERT(team->t.t_argv);
5606 
5607   /* TODO clean the threads that are a part of this? */
5608 
5609   /* free stuff */
5610   __kmp_free_team_arrays(team);
5611   if (team->t.t_argv != &team->t.t_inline_argv[0])
5612     __kmp_free((void *)team->t.t_argv);
5613   __kmp_free(team);
5614 
5615   KMP_MB();
5616   return next_pool;
5617 }
5618 
5619 // Free the thread.  Don't reap it, just place it on the pool of available
5620 // threads.
5621 //
5622 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5623 // binding for the affinity mechanism to be useful.
5624 //
5625 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5626 // However, we want to avoid a potential performance problem by always
5627 // scanning through the list to find the correct point at which to insert
5628 // the thread (potential N**2 behavior).  To do this we keep track of the
5629 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5630 // With single-level parallelism, threads will always be added to the tail
5631 // of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
5632 // parallelism, all bets are off and we may need to scan through the entire
5633 // free list.
5634 //
5635 // This change also has a potentially large performance benefit, for some
5636 // applications.  Previously, as threads were freed from the hot team, they
// would be placed back on the free list in inverse order.  If the hot team
// grew back to its original size, then the freed threads would be placed
// back on the hot team in reverse order.  This could cause bad cache
// locality problems in programs where the size of the hot team regularly
// grew and shrunk.
5642 //
5643 // Now, for single-level parallelism, the OMP tid is always == gtid.
5644 void __kmp_free_thread(kmp_info_t *this_th) {
5645   int gtid;
5646   kmp_info_t **scan;
5647 
5648   KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5649                 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5650 
5651   KMP_DEBUG_ASSERT(this_th);
5652 
  // When moving the thread to the pool, switch it to waiting on its own b_go
  // flag and to an uninitialized (NULL) team.
5655   int b;
5656   kmp_balign_t *balign = this_th->th.th_bar;
5657   for (b = 0; b < bs_last_barrier; ++b) {
5658     if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5659       balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5660     balign[b].bb.team = NULL;
5661     balign[b].bb.leaf_kids = 0;
5662   }
5663   this_th->th.th_task_state = 0;
5664   this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5665 
5666   /* put thread back on the free pool */
5667   TCW_PTR(this_th->th.th_team, NULL);
5668   TCW_PTR(this_th->th.th_root, NULL);
5669   TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5670 
5671   while (this_th->th.th_cg_roots) {
5672     this_th->th.th_cg_roots->cg_nthreads--;
5673     KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5674                    " %p of thread  %p to %d\n",
5675                    this_th, this_th->th.th_cg_roots,
5676                    this_th->th.th_cg_roots->cg_root,
5677                    this_th->th.th_cg_roots->cg_nthreads));
5678     kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5679     if (tmp->cg_root == this_th) { // Thread is a cg_root
5680       KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5681       KA_TRACE(
5682           5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5683       this_th->th.th_cg_roots = tmp->up;
5684       __kmp_free(tmp);
5685     } else { // Worker thread
5686       if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5687         __kmp_free(tmp);
5688       }
5689       this_th->th.th_cg_roots = NULL;
5690       break;
5691     }
5692   }
5693 
  /* If the implicit task assigned to this thread can be used by other threads
   * -> multiple threads can share the data and try to free the task at
   * __kmp_reap_thread at exit. This duplicate use of the task data can happen
   * with higher probability when the hot team is disabled, but can occur even
   * when the hot team is enabled */
5699   __kmp_free_implicit_task(this_th);
5700   this_th->th.th_current_task = NULL;
5701 
5702   // If the __kmp_thread_pool_insert_pt is already past the new insert
5703   // point, then we need to re-scan the entire list.
5704   gtid = this_th->th.th_info.ds.ds_gtid;
5705   if (__kmp_thread_pool_insert_pt != NULL) {
5706     KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5707     if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5708       __kmp_thread_pool_insert_pt = NULL;
5709     }
5710   }
5711 
5712   // Scan down the list to find the place to insert the thread.
5713   // scan is the address of a link in the list, possibly the address of
5714   // __kmp_thread_pool itself.
5715   //
5716   // In the absence of nested parallelism, the for loop will have 0 iterations.
5717   if (__kmp_thread_pool_insert_pt != NULL) {
5718     scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5719   } else {
5720     scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5721   }
5722   for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5723        scan = &((*scan)->th.th_next_pool))
5724     ;
5725 
5726   // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5727   // to its address.
5728   TCW_PTR(this_th->th.th_next_pool, *scan);
5729   __kmp_thread_pool_insert_pt = *scan = this_th;
5730   KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5731                    (this_th->th.th_info.ds.ds_gtid <
5732                     this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5733   TCW_4(this_th->th.th_in_pool, TRUE);
5734   __kmp_suspend_initialize_thread(this_th);
5735   __kmp_lock_suspend_mx(this_th);
5736   if (this_th->th.th_active == TRUE) {
5737     KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5738     this_th->th.th_active_in_pool = TRUE;
5739   }
5740 #if KMP_DEBUG
5741   else {
5742     KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5743   }
5744 #endif
5745   __kmp_unlock_suspend_mx(this_th);
5746 
5747   TCW_4(__kmp_nth, __kmp_nth - 1);
5748 
5749 #ifdef KMP_ADJUST_BLOCKTIME
5750   /* Adjust blocktime back to user setting or default if necessary */
5751   /* Middle initialization might never have occurred                */
5752   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5753     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5754     if (__kmp_nth <= __kmp_avail_proc) {
5755       __kmp_zero_bt = FALSE;
5756     }
5757   }
5758 #endif /* KMP_ADJUST_BLOCKTIME */
5759 
5760   KMP_MB();
5761 }
5762 
5763 /* ------------------------------------------------------------------------ */
5764 
5765 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5766 #if OMP_PROFILING_SUPPORT
5767   ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
5768   // TODO: add a configuration option for time granularity
5769   if (ProfileTraceFile)
5770     llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
5771 #endif
5772 
5773   int gtid = this_thr->th.th_info.ds.ds_gtid;
5774   /*    void                 *stack_data;*/
5775   kmp_team_t **volatile pteam;
5776 
5777   KMP_MB();
5778   KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5779 
5780   if (__kmp_env_consistency_check) {
5781     this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5782   }
5783 
5784 #if OMPD_SUPPORT
5785   if (ompd_state & OMPD_ENABLE_BP)
5786     ompd_bp_thread_begin();
5787 #endif
5788 
5789 #if OMPT_SUPPORT
5790   ompt_data_t *thread_data = nullptr;
5791   if (ompt_enabled.enabled) {
5792     thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5793     *thread_data = ompt_data_none;
5794 
5795     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5796     this_thr->th.ompt_thread_info.wait_id = 0;
5797     this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5798     this_thr->th.ompt_thread_info.parallel_flags = 0;
5799     if (ompt_enabled.ompt_callback_thread_begin) {
5800       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5801           ompt_thread_worker, thread_data);
5802     }
5803     this_thr->th.ompt_thread_info.state = ompt_state_idle;
5804   }
5805 #endif
5806 
5807   /* This is the place where threads wait for work */
5808   while (!TCR_4(__kmp_global.g.g_done)) {
5809     KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5810     KMP_MB();
5811 
5812     /* wait for work to do */
5813     KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5814 
5815     /* No tid yet since not part of a team */
5816     __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5817 
5818 #if OMPT_SUPPORT
5819     if (ompt_enabled.enabled) {
5820       this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5821     }
5822 #endif
5823 
5824     pteam = &this_thr->th.th_team;
5825 
5826     /* have we been allocated? */
5827     if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5828       /* we were just woken up, so run our new task */
5829       if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5830         int rc;
5831         KA_TRACE(20,
5832                  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5833                   gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5834                   (*pteam)->t.t_pkfn));
5835 
5836         updateHWFPControl(*pteam);
5837 
5838 #if OMPT_SUPPORT
5839         if (ompt_enabled.enabled) {
5840           this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5841         }
5842 #endif
5843 
5844         rc = (*pteam)->t.t_invoke(gtid);
5845         KMP_ASSERT(rc);
5846 
5847         KMP_MB();
5848         KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5849                       gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5850                       (*pteam)->t.t_pkfn));
5851       }
5852 #if OMPT_SUPPORT
5853       if (ompt_enabled.enabled) {
5854         /* no frame set while outside task */
5855         __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
5856 
5857         this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5858       }
5859 #endif
5860       /* join barrier after parallel region */
5861       __kmp_join_barrier(gtid);
5862     }
5863   }
5864   TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5865 
5866 #if OMPD_SUPPORT
5867   if (ompd_state & OMPD_ENABLE_BP)
5868     ompd_bp_thread_end();
5869 #endif
5870 
5871 #if OMPT_SUPPORT
5872   if (ompt_enabled.ompt_callback_thread_end) {
5873     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5874   }
5875 #endif
5876 
5877   this_thr->th.th_task_team = NULL;
5878   /* run the destructors for the threadprivate data for this thread */
5879   __kmp_common_destroy_gtid(gtid);
5880 
5881   KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5882   KMP_MB();
5883 
5884 #if OMP_PROFILING_SUPPORT
5885   llvm::timeTraceProfilerFinishThread();
5886 #endif
5887   return this_thr;
5888 }
5889 
5890 /* ------------------------------------------------------------------------ */
5891 
5892 void __kmp_internal_end_dest(void *specific_gtid) {
5893   // Make sure no significant bits are lost
5894   int gtid;
5895   __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
5896 
5897   KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
  /* NOTE: the gtid is stored as gtid+1 in the thread-local storage
   * because 0 is reserved for the nothing-stored case */
5900 
5901   __kmp_internal_end_thread(gtid);
5902 }
5903 
5904 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5905 
5906 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5907   __kmp_internal_end_atexit();
5908 }
5909 
5910 #endif
5911 
5912 /* [Windows] josh: when the atexit handler is called, there may still be more
5913    than one thread alive */
5914 void __kmp_internal_end_atexit(void) {
5915   KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5916   /* [Windows]
     josh: ideally, we want to completely shut down the library in this atexit
5918      handler, but stat code that depends on thread specific data for gtid fails
5919      because that data becomes unavailable at some point during the shutdown, so
5920      we call __kmp_internal_end_thread instead. We should eventually remove the
5921      dependency on __kmp_get_specific_gtid in the stat code and use
     __kmp_internal_end_library to cleanly shut down the library.
5923 
5924      // TODO: Can some of this comment about GVS be removed?
5925      I suspect that the offending stat code is executed when the calling thread
5926      tries to clean up a dead root thread's data structures, resulting in GVS
5927      code trying to close the GVS structures for that thread, but since the stat
5928      code uses __kmp_get_specific_gtid to get the gtid with the assumption that
     the calling thread is cleaning up itself instead of another thread, it gets
     confused. This happens because allowing a thread to unregister and clean up
     another thread is a recent modification for addressing an issue.
5932      Based on the current design (20050722), a thread may end up
5933      trying to unregister another thread only if thread death does not trigger
5934      the calling of __kmp_internal_end_thread.  For Linux* OS, there is the
5935      thread specific data destructor function to detect thread death. For
5936      Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5937      is nothing.  Thus, the workaround is applicable only for Windows static
5938      stat library. */
5939   __kmp_internal_end_library(-1);
5940 #if KMP_OS_WINDOWS
5941   __kmp_close_console();
5942 #endif
5943 }
5944 
5945 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5946   // It is assumed __kmp_forkjoin_lock is acquired.
5947 
5948   int gtid;
5949 
5950   KMP_DEBUG_ASSERT(thread != NULL);
5951 
5952   gtid = thread->th.th_info.ds.ds_gtid;
5953 
5954   if (!is_root) {
5955     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5956       /* Assume the threads are at the fork barrier here */
5957       KA_TRACE(
5958           20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5959                gtid));
5960       /* Need release fence here to prevent seg faults for tree forkjoin barrier
5961        * (GEH) */
5962       ANNOTATE_HAPPENS_BEFORE(thread);
5963       kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
5964                          thread);
5965       __kmp_release_64(&flag);
5966     }
5967 
5968     // Terminate OS thread.
5969     __kmp_reap_worker(thread);
5970 
5971     // The thread was killed asynchronously.  If it was actively
5972     // spinning in the thread pool, decrement the global count.
5973     //
5974     // There is a small timing hole here - if the worker thread was just waking
    // up after sleeping in the pool, had reset its th_active_in_pool flag but
5976     // not decremented the global counter __kmp_thread_pool_active_nth yet, then
5977     // the global counter might not get updated.
5978     //
5979     // Currently, this can only happen as the library is unloaded,
5980     // so there are no harmful side effects.
5981     if (thread->th.th_active_in_pool) {
5982       thread->th.th_active_in_pool = FALSE;
5983       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
5984       KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
5985     }
5986   }
5987 
5988   __kmp_free_implicit_task(thread);
5989 
5990 // Free the fast memory for tasking
5991 #if USE_FAST_MEMORY
5992   __kmp_free_fast_memory(thread);
5993 #endif /* USE_FAST_MEMORY */
5994 
5995   __kmp_suspend_uninitialize_thread(thread);
5996 
5997   KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5998   TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5999 
6000   --__kmp_all_nth;
  // __kmp_nth was decremented when the thread was added to the pool.
6002 
6003 #ifdef KMP_ADJUST_BLOCKTIME
6004   /* Adjust blocktime back to user setting or default if necessary */
6005   /* Middle initialization might never have occurred                */
6006   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6007     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6008     if (__kmp_nth <= __kmp_avail_proc) {
6009       __kmp_zero_bt = FALSE;
6010     }
6011   }
6012 #endif /* KMP_ADJUST_BLOCKTIME */
6013 
6014   /* free the memory being used */
6015   if (__kmp_env_consistency_check) {
6016     if (thread->th.th_cons) {
6017       __kmp_free_cons_stack(thread->th.th_cons);
6018       thread->th.th_cons = NULL;
6019     }
6020   }
6021 
6022   if (thread->th.th_pri_common != NULL) {
6023     __kmp_free(thread->th.th_pri_common);
6024     thread->th.th_pri_common = NULL;
6025   }
6026 
6027   if (thread->th.th_task_state_memo_stack != NULL) {
6028     __kmp_free(thread->th.th_task_state_memo_stack);
6029     thread->th.th_task_state_memo_stack = NULL;
6030   }
6031 
6032 #if KMP_USE_BGET
6033   if (thread->th.th_local.bget_data != NULL) {
6034     __kmp_finalize_bget(thread);
6035   }
6036 #endif
6037 
6038 #if KMP_AFFINITY_SUPPORTED
6039   if (thread->th.th_affin_mask != NULL) {
6040     KMP_CPU_FREE(thread->th.th_affin_mask);
6041     thread->th.th_affin_mask = NULL;
6042   }
6043 #endif /* KMP_AFFINITY_SUPPORTED */
6044 
6045 #if KMP_USE_HIER_SCHED
6046   if (thread->th.th_hier_bar_data != NULL) {
6047     __kmp_free(thread->th.th_hier_bar_data);
6048     thread->th.th_hier_bar_data = NULL;
6049   }
6050 #endif
6051 
6052   __kmp_reap_team(thread->th.th_serial_team);
6053   thread->th.th_serial_team = NULL;
6054   __kmp_free(thread);
6055 
6056   KMP_MB();
6057 
6058 } // __kmp_reap_thread
6059 
6060 static void __kmp_internal_end(void) {
6061   int i;
6062 
6063   /* First, unregister the library */
6064   __kmp_unregister_library();
6065 
6066 #if KMP_OS_WINDOWS
6067   /* In Win static library, we can't tell when a root actually dies, so we
6068      reclaim the data structures for any root threads that have died but not
6069      unregistered themselves, in order to shut down cleanly.
6070      In Win dynamic library we also can't tell when a thread dies.  */
6071   __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6072 // dead roots
6073 #endif
6074 
6075   for (i = 0; i < __kmp_threads_capacity; i++)
6076     if (__kmp_root[i])
6077       if (__kmp_root[i]->r.r_active)
6078         break;
6079   KMP_MB(); /* Flush all pending memory write invalidates.  */
6080   TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6081 
6082   if (i < __kmp_threads_capacity) {
6083 #if KMP_USE_MONITOR
6084     // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6085     KMP_MB(); /* Flush all pending memory write invalidates.  */
6086 
6087     // Need to check that monitor was initialized before reaping it. If we are
    // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6089     // __kmp_monitor will appear to contain valid data, but it is only valid in
6090     // the parent process, not the child.
6091     // New behavior (201008): instead of keying off of the flag
6092     // __kmp_init_parallel, the monitor thread creation is keyed off
6093     // of the new flag __kmp_init_monitor.
6094     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6095     if (TCR_4(__kmp_init_monitor)) {
6096       __kmp_reap_monitor(&__kmp_monitor);
6097       TCW_4(__kmp_init_monitor, 0);
6098     }
6099     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6100     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6101 #endif // KMP_USE_MONITOR
6102   } else {
6103 /* TODO move this to cleanup code */
6104 #ifdef KMP_DEBUG
6105     /* make sure that everything has properly ended */
6106     for (i = 0; i < __kmp_threads_capacity; i++) {
6107       if (__kmp_root[i]) {
6108         //                    KMP_ASSERT( ! KMP_UBER_GTID( i ) );         // AC:
6109         //                    there can be uber threads alive here
6110         KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6111       }
6112     }
6113 #endif
6114 
6115     KMP_MB();
6116 
6117     // Reap the worker threads.
6118     // This is valid for now, but be careful if threads are reaped sooner.
    while (__kmp_thread_pool != NULL) { // Loop thru all threads in the pool.
6120       // Get the next thread from the pool.
6121       kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6122       __kmp_thread_pool = thread->th.th_next_pool;
6123       // Reap it.
6124       KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6125       thread->th.th_next_pool = NULL;
6126       thread->th.th_in_pool = FALSE;
6127       __kmp_reap_thread(thread, 0);
6128     }
6129     __kmp_thread_pool_insert_pt = NULL;
6130 
6131     // Reap teams.
6132     while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
6133       // Get the next team from the pool.
6134       kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6135       __kmp_team_pool = team->t.t_next_pool;
6136       // Reap it.
6137       team->t.t_next_pool = NULL;
6138       __kmp_reap_team(team);
6139     }
6140 
6141     __kmp_reap_task_teams();
6142 
6143 #if KMP_OS_UNIX
6144     // Threads that are not reaped should not access any resources since they
6145     // are going to be deallocated soon, so the shutdown sequence should wait
6146     // until all threads either exit the final spin-waiting loop or begin
6147     // sleeping after the given blocktime.
6148     for (i = 0; i < __kmp_threads_capacity; i++) {
6149       kmp_info_t *thr = __kmp_threads[i];
6150       while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6151         KMP_CPU_PAUSE();
6152     }
6153 #endif
6154 
6155     for (i = 0; i < __kmp_threads_capacity; ++i) {
6156       // TBD: Add some checking...
6157       // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6158     }
6159 
6160     /* Make sure all threadprivate destructors get run by joining with all
6161        worker threads before resetting this flag */
6162     TCW_SYNC_4(__kmp_init_common, FALSE);
6163 
6164     KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6165     KMP_MB();
6166 
6167 #if KMP_USE_MONITOR
6168     // See note above: One of the possible fixes for CQ138434 / CQ140126
6169     //
6170     // FIXME: push both code fragments down and CSE them?
6171     // push them into __kmp_cleanup() ?
6172     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6173     if (TCR_4(__kmp_init_monitor)) {
6174       __kmp_reap_monitor(&__kmp_monitor);
6175       TCW_4(__kmp_init_monitor, 0);
6176     }
6177     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6178     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6179 #endif
6180   } /* else !__kmp_global.t_active */
6181   TCW_4(__kmp_init_gtid, FALSE);
6182   KMP_MB(); /* Flush all pending memory write invalidates.  */
6183 
6184   __kmp_cleanup();
6185 #if OMPT_SUPPORT
6186   ompt_fini();
6187 #endif
6188 }
6189 
6190 void __kmp_internal_end_library(int gtid_req) {
6191   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6192   /* this shouldn't be a race condition because __kmp_internal_end() is the
6193      only place to clear __kmp_serial_init */
6194   /* we'll check this later too, after we get the lock */
6195   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6196   // redundant, because the next check will work in any case.
6197   if (__kmp_global.g.g_abort) {
6198     KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6199     /* TODO abort? */
6200     return;
6201   }
6202   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6203     KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6204     return;
6205   }
6206 
  // If the hidden helper team has been initialized, we need to deinit it
6208   if (TCR_4(__kmp_init_hidden_helper) &&
6209       !TCR_4(__kmp_hidden_helper_team_done)) {
6210     TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6211     // First release the main thread to let it continue its work
6212     __kmp_hidden_helper_main_thread_release();
6213     // Wait until the hidden helper team has been destroyed
6214     __kmp_hidden_helper_threads_deinitz_wait();
6215   }
6216 
6217   KMP_MB(); /* Flush all pending memory write invalidates.  */
6218   /* find out who we are and what we should do */
6219   {
6220     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6221     KA_TRACE(
6222         10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req));
6223     if (gtid == KMP_GTID_SHUTDOWN) {
6224       KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6225                     "already shutdown\n"));
6226       return;
6227     } else if (gtid == KMP_GTID_MONITOR) {
6228       KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6229                     "registered, or system shutdown\n"));
6230       return;
6231     } else if (gtid == KMP_GTID_DNE) {
6232       KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6233                     "shutdown\n"));
6234       /* we don't know who we are, but we may still shutdown the library */
6235     } else if (KMP_UBER_GTID(gtid)) {
6236       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6237       if (__kmp_root[gtid]->r.r_active) {
6238         __kmp_global.g.g_abort = -1;
6239         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6240         __kmp_unregister_library();
6241         KA_TRACE(10,
6242                  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6243                   gtid));
6244         return;
6245       } else {
6246         KA_TRACE(
6247             10,
6248             ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6249         __kmp_unregister_root_current_thread(gtid);
6250       }
6251     } else {
6252 /* worker threads may call this function through the atexit handler, if they
6253  * call exit() */
6254 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6255    TODO: do a thorough shutdown instead */
6256 #ifdef DUMP_DEBUG_ON_EXIT
6257       if (__kmp_debug_buf)
6258         __kmp_dump_debug_buffer();
6259 #endif
      // The unregister-library call was added here when we switched to shared
      // memory on Linux; without it, lots of files would be left in /dev/shm.
      // Clean up the shared memory file before exiting.
6263       __kmp_unregister_library();
6264       return;
6265     }
6266   }
6267   /* synchronize the termination process */
6268   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6269 
6270   /* have we already finished */
6271   if (__kmp_global.g.g_abort) {
6272     KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6273     /* TODO abort? */
6274     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6275     return;
6276   }
6277   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6278     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6279     return;
6280   }
6281 
6282   /* We need this lock to enforce mutex between this reading of
6283      __kmp_threads_capacity and the writing by __kmp_register_root.
6284      Alternatively, we can use a counter of roots that is atomically updated by
6285      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6286      __kmp_internal_end_*.  */
6287   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6288 
6289   /* now we can safely conduct the actual termination */
6290   __kmp_internal_end();
6291 
6292   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6293   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6294 
6295   KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6296 
6297 #ifdef DUMP_DEBUG_ON_EXIT
6298   if (__kmp_debug_buf)
6299     __kmp_dump_debug_buffer();
6300 #endif
6301 
6302 #if KMP_OS_WINDOWS
6303   __kmp_close_console();
6304 #endif
6305 
6306   __kmp_fini_allocator();
6307 
6308 } // __kmp_internal_end_library
6309 
6310 void __kmp_internal_end_thread(int gtid_req) {
6311   int i;
6312 
6313   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6314   /* this shouldn't be a race condition because __kmp_internal_end() is the
6315    * only place to clear __kmp_serial_init */
6316   /* we'll check this later too, after we get the lock */
6317   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6318   // redundant, because the next check will work in any case.
6319   if (__kmp_global.g.g_abort) {
6320     KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6321     /* TODO abort? */
6322     return;
6323   }
6324   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6325     KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6326     return;
6327   }
6328 
  // If the hidden helper team has been initialized, we need to deinit it
6330   if (TCR_4(__kmp_init_hidden_helper) &&
6331       !TCR_4(__kmp_hidden_helper_team_done)) {
6332     TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6333     // First release the main thread to let it continue its work
6334     __kmp_hidden_helper_main_thread_release();
6335     // Wait until the hidden helper team has been destroyed
6336     __kmp_hidden_helper_threads_deinitz_wait();
6337   }
6338 
6339   KMP_MB(); /* Flush all pending memory write invalidates.  */
6340 
6341   /* find out who we are and what we should do */
6342   {
6343     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6344     KA_TRACE(10,
6345              ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req));
6346     if (gtid == KMP_GTID_SHUTDOWN) {
6347       KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6348                     "already shutdown\n"));
6349       return;
6350     } else if (gtid == KMP_GTID_MONITOR) {
6351       KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6352                     "registered, or system shutdown\n"));
6353       return;
6354     } else if (gtid == KMP_GTID_DNE) {
6355       KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6356                     "shutdown\n"));
6357       return;
6358       /* we don't know who we are */
6359     } else if (KMP_UBER_GTID(gtid)) {
6360       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6361       if (__kmp_root[gtid]->r.r_active) {
6362         __kmp_global.g.g_abort = -1;
6363         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6364         KA_TRACE(10,
6365                  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6366                   gtid));
6367         return;
6368       } else {
6369         KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6370                       gtid));
6371         __kmp_unregister_root_current_thread(gtid);
6372       }
6373     } else {
6374       /* just a worker thread, let's leave */
6375       KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6376 
6377       if (gtid >= 0) {
6378         __kmp_threads[gtid]->th.th_task_team = NULL;
6379       }
6380 
6381       KA_TRACE(10,
6382                ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6383                 gtid));
6384       return;
6385     }
6386   }
6387 #if KMP_DYNAMIC_LIB
6388   if (__kmp_pause_status != kmp_hard_paused)
  // AC: let's not shut down the dynamic library at the exit of an uber thread,
  // because it is better to shut down later in the library destructor.
6391   {
6392     KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6393     return;
6394   }
6395 #endif
6396   /* synchronize the termination process */
6397   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6398 
6399   /* have we already finished */
6400   if (__kmp_global.g.g_abort) {
6401     KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6402     /* TODO abort? */
6403     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6404     return;
6405   }
6406   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6407     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6408     return;
6409   }
6410 
6411   /* We need this lock to enforce mutex between this reading of
6412      __kmp_threads_capacity and the writing by __kmp_register_root.
6413      Alternatively, we can use a counter of roots that is atomically updated by
6414      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6415      __kmp_internal_end_*.  */
6416 
6417   /* should we finish the run-time?  are all siblings done? */
6418   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6419 
6420   for (i = 0; i < __kmp_threads_capacity; ++i) {
6421     if (KMP_UBER_GTID(i)) {
6422       KA_TRACE(
6423           10,
6424           ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6425       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6426       __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6427       return;
6428     }
6429   }
6430 
6431   /* now we can safely conduct the actual termination */
6432 
6433   __kmp_internal_end();
6434 
6435   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6436   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6437 
6438   KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6439 
6440 #ifdef DUMP_DEBUG_ON_EXIT
6441   if (__kmp_debug_buf)
6442     __kmp_dump_debug_buffer();
6443 #endif
6444 } // __kmp_internal_end_thread
6445 
6446 // -----------------------------------------------------------------------------
6447 // Library registration stuff.
6448 
6449 static long __kmp_registration_flag = 0;
6450 // Random value used to indicate library initialization.
6451 static char *__kmp_registration_str = NULL;
6452 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6453 
6454 static inline char *__kmp_reg_status_name() {
/* On RHEL 3u5, if linked statically, getpid() returns different values in
   each thread. If registration and unregistration happen in different threads
   (omp_misc_other_root_exit.cpp test case), the registered_lib_env env var
   cannot be found, because its name will contain a different pid. */
// macOS* complains about the name being too long when getuid() is appended.
6460 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6461   return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6462                           (int)getuid());
6463 #else
6464   return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6465 #endif
} // __kmp_reg_status_name
6467 
6468 void __kmp_register_library_startup(void) {
6469 
6470   char *name = __kmp_reg_status_name(); // Name of the environment variable.
6471   int done = 0;
6472   union {
6473     double dtime;
6474     long ltime;
6475   } time;
6476 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6477   __kmp_initialize_system_tick();
6478 #endif
6479   __kmp_read_system_time(&time.dtime);
6480   __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6481   __kmp_registration_str =
6482       __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6483                        __kmp_registration_flag, KMP_LIBRARY_FILE);
6484 
6485   KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6486                 __kmp_registration_str));
6487 
6488   while (!done) {
6489 
6490     char *value = NULL; // Actual value of the environment variable.
6491 
6492 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6493     char *shm_name = __kmp_str_format("/%s", name);
6494     int shm_preexist = 0;
6495     char *data1;
6496     int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6497     if ((fd1 == -1) && (errno == EEXIST)) {
6498       // file didn't open because it already exists.
6499       // try opening existing file
6500       fd1 = shm_open(shm_name, O_RDWR, 0666);
6501       if (fd1 == -1) { // file didn't open
6502         // error out here
6503         __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0),
6504                     __kmp_msg_null);
6505       } else {
6506         // able to open existing file
6507         shm_preexist = 1;
6508       }
    } else if (fd1 == -1) {
      // SHM didn't open due to an error other than already-exists;
      // error out here.
6512       __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno),
6513                   __kmp_msg_null);
6514     }
6515     if (shm_preexist == 0) {
      // we created the SHM; now set its size
      if (ftruncate(fd1, SHM_SIZE) == -1) {
        // error occurred setting size;
6519         __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"),
6520                     KMP_ERR(errno), __kmp_msg_null);
6521       }
6522     }
6523     data1 =
6524         (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
6525     if (data1 == MAP_FAILED) {
6526       // failed to map shared memory
6527       __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno),
6528                   __kmp_msg_null);
6529     }
6530     if (shm_preexist == 0) { // set data to SHM, set value
6531       KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6532     }
6533     // Read value from either what we just wrote or existing file.
6534     value = __kmp_str_format("%s", data1); // read value from SHM
6535     munmap(data1, SHM_SIZE);
6536     close(fd1);
6537 #else // Windows and unix with static library
    // Set environment variable, but do not overwrite if it already exists.
6539     __kmp_env_set(name, __kmp_registration_str, 0);
6540     // read value to see if it got set
6541     value = __kmp_env_get(name);
6542 #endif
6543 
6544     if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6545       done = 1; // Ok, environment variable set successfully, exit the loop.
6546     } else {
6547       // Oops. Write failed. Another copy of OpenMP RTL is in memory.
      // Check whether it is alive or dead.
6549       int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6550       char *tail = value;
6551       char *flag_addr_str = NULL;
6552       char *flag_val_str = NULL;
6553       char const *file_name = NULL;
6554       __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6555       __kmp_str_split(tail, '-', &flag_val_str, &tail);
6556       file_name = tail;
6557       if (tail != NULL) {
6558         long *flag_addr = 0;
6559         unsigned long flag_val = 0;
6560         KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6561         KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6562         if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6563           // First, check whether environment-encoded address is mapped into
6564           // addr space.
6565           // If so, dereference it to see if it still has the right value.
6566           if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6567             neighbor = 1;
6568           } else {
6569             // If not, then we know the other copy of the library is no longer
6570             // running.
6571             neighbor = 2;
6572           }
6573         }
6574       }
6575       switch (neighbor) {
6576       case 0: // Cannot parse environment variable -- neighbor status unknown.
        // Assume it is the incompatible format of a future version of the
        // library and that the other library is alive.
6579         // WARN( ... ); // TODO: Issue a warning.
6580         file_name = "unknown library";
6581         KMP_FALLTHROUGH();
      // Attention! Falling through to the next case. That's intentional.
6583       case 1: { // Neighbor is alive.
6584         // Check it is allowed.
6585         char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6586         if (!__kmp_str_match_true(duplicate_ok)) {
6587           // That's not allowed. Issue fatal error.
6588           __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6589                       KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6590         }
6591         KMP_INTERNAL_FREE(duplicate_ok);
6592         __kmp_duplicate_library_ok = 1;
6593         done = 1; // Exit the loop.
6594       } break;
6595       case 2: { // Neighbor is dead.
6596 
6597 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6598         // close shared memory.
6599         shm_unlink(shm_name); // this removes file in /dev/shm
6600 #else
6601         // Clear the variable and try to register library again.
6602         __kmp_env_unset(name);
6603 #endif
6604       } break;
6605       default: {
6606         KMP_DEBUG_ASSERT(0);
6607       } break;
6608       }
6609     }
6610     KMP_INTERNAL_FREE((void *)value);
6611 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6612     KMP_INTERNAL_FREE((void *)shm_name);
6613 #endif
6614   } // while
6615   KMP_INTERNAL_FREE((void *)name);
6616 
6617 } // func __kmp_register_library_startup
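
/* An illustrative, stand-alone sketch (kept out of the build with "#if 0") of
   the "%p-%lx-%s" registration value used above: compose the string, split it
   back on the first two '-' separators, and treat the previous owner as alive
   only if the encoded address still holds the encoded value.  It uses only the
   standard C library; demo_registration_flag, the file name, and
   demo_check_registration are hypothetical, and the runtime's additional
   address-mapping check is not modeled here. */
#if 0
#include <cstdio>
#include <cstring>

static long demo_registration_flag = 0xCAFE1234L;

static int demo_check_registration(void) {
  // Compose a value with the same shape as __kmp_registration_str.
  char value[256];
  std::snprintf(value, sizeof(value), "%p-%lx-%s",
                (void *)&demo_registration_flag,
                (unsigned long)demo_registration_flag, "libdemo.so");

  // Split on the first two '-' separators: address, flag value, file name.
  char *tail = value;
  char *flag_addr_str = tail;
  tail = std::strchr(tail, '-');
  *tail++ = '\0';
  char *flag_val_str = tail;
  tail = std::strchr(tail, '-');
  *tail++ = '\0';
  const char *file_name = tail;

  void *flag_addr = nullptr;
  unsigned long flag_val = 0;
  std::sscanf(flag_addr_str, "%p", &flag_addr);
  std::sscanf(flag_val_str, "%lx", &flag_val);

  // "Neighbor alive" means the address still holds the expected flag value
  // (the real runtime first verifies that the address is actually mapped).
  return flag_addr != nullptr && file_name[0] != '\0' &&
         *(long *)flag_addr == (long)flag_val;
}
#endif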
6618 
6619 void __kmp_unregister_library(void) {
6620 
6621   char *name = __kmp_reg_status_name();
6622   char *value = NULL;
6623 
6624 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6625   char *shm_name = __kmp_str_format("/%s", name);
6626   int fd1 = shm_open(shm_name, O_RDONLY, 0666);
6627   if (fd1 == -1) {
6628     // file did not open. return.
6629     return;
6630   }
6631   char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6632   if (data1 != MAP_FAILED) {
6633     value = __kmp_str_format("%s", data1); // read value from SHM
6634     munmap(data1, SHM_SIZE);
6635   }
6636   close(fd1);
6637 #else
6638   value = __kmp_env_get(name);
6639 #endif
6640 
6641   KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6642   KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6643   if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6644 //  Ok, this is our variable. Delete it.
6645 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6646     shm_unlink(shm_name); // this removes file in /dev/shm
6647 #else
6648     __kmp_env_unset(name);
6649 #endif
6650   }
6651 
6652 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6653   KMP_INTERNAL_FREE(shm_name);
6654 #endif
6655 
6656   KMP_INTERNAL_FREE(__kmp_registration_str);
6657   KMP_INTERNAL_FREE(value);
6658   KMP_INTERNAL_FREE(name);
6659 
6660   __kmp_registration_flag = 0;
6661   __kmp_registration_str = NULL;
6662 
6663 } // __kmp_unregister_library
6664 
6665 // End of Library registration stuff.
6666 // -----------------------------------------------------------------------------
6667 
6668 #if KMP_MIC_SUPPORTED
6669 
6670 static void __kmp_check_mic_type() {
6671   kmp_cpuid_t cpuid_state = {0};
6672   kmp_cpuid_t *cs_p = &cpuid_state;
6673   __kmp_x86_cpuid(1, 0, cs_p);
6674   // We don't support mic1 at the moment
6675   if ((cs_p->eax & 0xff0) == 0xB10) {
6676     __kmp_mic_type = mic2;
6677   } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6678     __kmp_mic_type = mic3;
6679   } else {
6680     __kmp_mic_type = non_mic;
6681   }
6682 }
6683 
6684 #endif /* KMP_MIC_SUPPORTED */
6685 
6686 #if KMP_HAVE_UMWAIT
6687 static void __kmp_user_level_mwait_init() {
6688   struct kmp_cpuid buf;
6689   __kmp_x86_cpuid(7, 0, &buf);
6690   __kmp_umwait_enabled = ((buf.ecx >> 5) & 1) && __kmp_user_level_mwait;
6691   KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
6692                 __kmp_umwait_enabled));
6693 }
6694 #elif KMP_HAVE_MWAIT
6695 #ifndef AT_INTELPHIUSERMWAIT
6696 // Spurious, non-existent value that should always fail to return anything.
6697 // Will be replaced with the correct value when we know that.
6698 #define AT_INTELPHIUSERMWAIT 10000
6699 #endif
// The getauxval() function is available in RHEL7 and SLES12. If a system with
// an earlier OS is used to build the RTL, we'll use the following internal
// function when the entry is not found.
6703 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
6704 unsigned long getauxval(unsigned long) { return 0; }
6705 
6706 static void __kmp_user_level_mwait_init() {
  // When getauxval() and the correct value of AT_INTELPHIUSERMWAIT are
  // available, use them to determine whether user-level mwait is enabled.
  // Otherwise, forcibly set __kmp_mwait_enabled=TRUE on Intel MIC if the
  // environment variable KMP_USER_LEVEL_MWAIT was set to TRUE.
6711   if (__kmp_mic_type == mic3) {
6712     unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
6713     if ((res & 0x1) || __kmp_user_level_mwait) {
6714       __kmp_mwait_enabled = TRUE;
6715       if (__kmp_user_level_mwait) {
6716         KMP_INFORM(EnvMwaitWarn);
6717       }
6718     } else {
6719       __kmp_mwait_enabled = FALSE;
6720     }
6721   }
6722   KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
6723                 "__kmp_mwait_enabled = %d\n",
6724                 __kmp_mic_type, __kmp_mwait_enabled));
6725 }
6726 #endif /* KMP_HAVE_UMWAIT */
6727 
6728 static void __kmp_do_serial_initialize(void) {
6729   int i, gtid;
6730   size_t size;
6731 
6732   KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6733 
6734   KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6735   KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6736   KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6737   KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6738   KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6739 
6740 #if OMPT_SUPPORT
6741   ompt_pre_init();
6742 #endif
6743 #if OMPD_SUPPORT
6744   __kmp_env_dump();
6745   ompd_init();
6746 #endif
6747 
6748   __kmp_validate_locks();
6749 
6750   /* Initialize internal memory allocator */
6751   __kmp_init_allocator();
6752 
6753   /* Register the library startup via an environment variable and check to see
6754      whether another copy of the library is already registered. */
6755 
6756   __kmp_register_library_startup();
6757 
6758   /* TODO reinitialization of library */
6759   if (TCR_4(__kmp_global.g.g_done)) {
6760     KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6761   }
6762 
6763   __kmp_global.g.g_abort = 0;
6764   TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6765 
6766 /* initialize the locks */
6767 #if KMP_USE_ADAPTIVE_LOCKS
6768 #if KMP_DEBUG_ADAPTIVE_LOCKS
6769   __kmp_init_speculative_stats();
6770 #endif
6771 #endif
6772 #if KMP_STATS_ENABLED
6773   __kmp_stats_init();
6774 #endif
6775   __kmp_init_lock(&__kmp_global_lock);
6776   __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6777   __kmp_init_lock(&__kmp_debug_lock);
6778   __kmp_init_atomic_lock(&__kmp_atomic_lock);
6779   __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6780   __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6781   __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6782   __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6783   __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6784   __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6785   __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6786   __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6787   __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6788   __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6789   __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6790   __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6791   __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6792   __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6793 #if KMP_USE_MONITOR
6794   __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6795 #endif
6796   __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6797 
6798   /* conduct initialization and initial setup of configuration */
6799 
6800   __kmp_runtime_initialize();
6801 
6802 #if KMP_MIC_SUPPORTED
6803   __kmp_check_mic_type();
6804 #endif
6805 
6806 // Some global variable initialization moved here from kmp_env_initialize()
6807 #ifdef KMP_DEBUG
6808   kmp_diag = 0;
6809 #endif
6810   __kmp_abort_delay = 0;
6811 
6812   // From __kmp_init_dflt_team_nth()
6813   /* assume the entire machine will be used */
6814   __kmp_dflt_team_nth_ub = __kmp_xproc;
6815   if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6816     __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6817   }
6818   if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6819     __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6820   }
6821   __kmp_max_nth = __kmp_sys_max_nth;
6822   __kmp_cg_max_nth = __kmp_sys_max_nth;
6823   __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6824   if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6825     __kmp_teams_max_nth = __kmp_sys_max_nth;
6826   }
6827 
6828   // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6829   // part
6830   __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6831 #if KMP_USE_MONITOR
6832   __kmp_monitor_wakeups =
6833       KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6834   __kmp_bt_intervals =
6835       KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6836 #endif
6837   // From "KMP_LIBRARY" part of __kmp_env_initialize()
6838   __kmp_library = library_throughput;
6839   // From KMP_SCHEDULE initialization
6840   __kmp_static = kmp_sch_static_balanced;
// AC: do not use analytical here, because it is non-monotonic
6842 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6843 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6844 // need to repeat assignment
// Barrier initialization. Moved here from the __kmp_env_initialize() barrier
// branch bit control and barrier method control parts
6847 #if KMP_FAST_REDUCTION_BARRIER
6848 #define kmp_reduction_barrier_gather_bb ((int)1)
6849 #define kmp_reduction_barrier_release_bb ((int)1)
6850 #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6851 #define kmp_reduction_barrier_release_pat bp_hyper_bar
6852 #endif // KMP_FAST_REDUCTION_BARRIER
6853   for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6854     __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6855     __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6856     __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6857     __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6858 #if KMP_FAST_REDUCTION_BARRIER
    if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only
      // (lin_64): hyper,1
6861       __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6862       __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6863       __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6864       __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6865     }
6866 #endif // KMP_FAST_REDUCTION_BARRIER
6867   }
6868 #if KMP_FAST_REDUCTION_BARRIER
6869 #undef kmp_reduction_barrier_release_pat
6870 #undef kmp_reduction_barrier_gather_pat
6871 #undef kmp_reduction_barrier_release_bb
6872 #undef kmp_reduction_barrier_gather_bb
6873 #endif // KMP_FAST_REDUCTION_BARRIER
6874 #if KMP_MIC_SUPPORTED
6875   if (__kmp_mic_type == mic2) { // KNC
6876     // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6877     __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6878     __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6879         1; // forkjoin release
6880     __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6881     __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6882   }
6883 #if KMP_FAST_REDUCTION_BARRIER
6884   if (__kmp_mic_type == mic2) { // KNC
6885     __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6886     __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6887   }
6888 #endif // KMP_FAST_REDUCTION_BARRIER
6889 #endif // KMP_MIC_SUPPORTED
6890 
6891 // From KMP_CHECKS initialization
6892 #ifdef KMP_DEBUG
6893   __kmp_env_checks = TRUE; /* development versions have the extra checks */
6894 #else
6895   __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6896 #endif
6897 
6898   // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6899   __kmp_foreign_tp = TRUE;
6900 
6901   __kmp_global.g.g_dynamic = FALSE;
6902   __kmp_global.g.g_dynamic_mode = dynamic_default;
6903 
6904   __kmp_init_nesting_mode();
6905 
6906   __kmp_env_initialize(NULL);
6907 
6908 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
6909   __kmp_user_level_mwait_init();
6910 #endif
6911 // Print all messages in message catalog for testing purposes.
6912 #ifdef KMP_DEBUG
6913   char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6914   if (__kmp_str_match_true(val)) {
6915     kmp_str_buf_t buffer;
6916     __kmp_str_buf_init(&buffer);
6917     __kmp_i18n_dump_catalog(&buffer);
6918     __kmp_printf("%s", buffer.str);
6919     __kmp_str_buf_free(&buffer);
6920   }
6921   __kmp_env_free(&val);
6922 #endif
6923 
6924   __kmp_threads_capacity =
6925       __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6926   // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6927   __kmp_tp_capacity = __kmp_default_tp_capacity(
6928       __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6929 
6930   // If the library is shut down properly, both pools must be NULL. Just in
6931   // case, set them to NULL -- some memory may leak, but subsequent code will
6932   // work even if pools are not freed.
6933   KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6934   KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6935   KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6936   __kmp_thread_pool = NULL;
6937   __kmp_thread_pool_insert_pt = NULL;
6938   __kmp_team_pool = NULL;
6939 
6940   /* Allocate all of the variable sized records */
6941   /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6942    * expandable */
6943   /* Since allocation is cache-aligned, just add extra padding at the end */
6944   size =
6945       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6946       CACHE_LINE;
6947   __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6948   __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6949                                sizeof(kmp_info_t *) * __kmp_threads_capacity);
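  // Layout note: __kmp_threads and __kmp_root share this one cache-aligned
  // block; the root-pointer array starts immediately after the last
  // __kmp_threads slot, so __kmp_cleanup() frees only __kmp_threads and simply
  // NULLs __kmp_root.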
6950 
6951   /* init thread counts */
  KMP_DEBUG_ASSERT(__kmp_all_nth ==
                   0); // These asserts fail if the library is reinitializing
  KMP_DEBUG_ASSERT(__kmp_nth == 0); // and something went wrong in termination.
6955   __kmp_all_nth = 0;
6956   __kmp_nth = 0;
6957 
6958   /* setup the uber master thread and hierarchy */
6959   gtid = __kmp_register_root(TRUE);
6960   KA_TRACE(10, ("__kmp_do_serial_initialize  T#%d\n", gtid));
6961   KMP_ASSERT(KMP_UBER_GTID(gtid));
6962   KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6963 
6964   KMP_MB(); /* Flush all pending memory write invalidates.  */
6965 
6966   __kmp_common_initialize();
6967 
6968 #if KMP_OS_UNIX
6969   /* invoke the child fork handler */
6970   __kmp_register_atfork();
6971 #endif
6972 
6973 #if !KMP_DYNAMIC_LIB
6974   {
6975     /* Invoke the exit handler when the program finishes, only for static
6976        library. For dynamic library, we already have _fini and DllMain. */
6977     int rc = atexit(__kmp_internal_end_atexit);
6978     if (rc != 0) {
6979       __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6980                   __kmp_msg_null);
6981     }
6982   }
6983 #endif
6984 
6985 #if KMP_HANDLE_SIGNALS
6986 #if KMP_OS_UNIX
6987   /* NOTE: make sure that this is called before the user installs their own
6988      signal handlers so that the user handlers are called first. this way they
6989      can return false, not call our handler, avoid terminating the library, and
6990      continue execution where they left off. */
6991   __kmp_install_signals(FALSE);
6992 #endif /* KMP_OS_UNIX */
6993 #if KMP_OS_WINDOWS
6994   __kmp_install_signals(TRUE);
6995 #endif /* KMP_OS_WINDOWS */
6996 #endif
6997 
6998   /* we have finished the serial initialization */
6999   __kmp_init_counter++;
7000 
7001   __kmp_init_serial = TRUE;
7002 
7003   if (__kmp_settings) {
7004     __kmp_env_print();
7005   }
7006 
7007   if (__kmp_display_env || __kmp_display_env_verbose) {
7008     __kmp_env_print_2();
7009   }
7010 
7011 #if OMPT_SUPPORT
7012   ompt_post_init();
7013 #endif
7014 
7015   KMP_MB();
7016 
7017   KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
7018 }
7019 
7020 void __kmp_serial_initialize(void) {
7021   if (__kmp_init_serial) {
7022     return;
7023   }
7024   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7025   if (__kmp_init_serial) {
7026     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7027     return;
7028   }
7029   __kmp_do_serial_initialize();
7030   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7031 }
7032 
7033 static void __kmp_do_middle_initialize(void) {
7034   int i, j;
7035   int prev_dflt_team_nth;
7036 
7037   if (!__kmp_init_serial) {
7038     __kmp_do_serial_initialize();
7039   }
7040 
  KA_TRACE(10, ("__kmp_do_middle_initialize: enter\n"));
7042 
7043   // Save the previous value for the __kmp_dflt_team_nth so that
7044   // we can avoid some reinitialization if it hasn't changed.
7045   prev_dflt_team_nth = __kmp_dflt_team_nth;
7046 
7047 #if KMP_AFFINITY_SUPPORTED
7048   // __kmp_affinity_initialize() will try to set __kmp_ncores to the
7049   // number of cores on the machine.
7050   __kmp_affinity_initialize();
7051 
7052 #endif /* KMP_AFFINITY_SUPPORTED */
7053 
7054   KMP_ASSERT(__kmp_xproc > 0);
7055   if (__kmp_avail_proc == 0) {
7056     __kmp_avail_proc = __kmp_xproc;
7057   }
7058 
7059   // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
7060   // correct them now
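  // (e.g., with __kmp_avail_proc == 8, OMP_NUM_THREADS=",,2,3" becomes the
  // list 8,8,2,3, and __kmp_dflt_team_nth / __kmp_dflt_team_nth_ub are set to
  // 8 as well)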
7061   j = 0;
7062   while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
7063     __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7064         __kmp_avail_proc;
7065     j++;
7066   }
7067 
7068   if (__kmp_dflt_team_nth == 0) {
7069 #ifdef KMP_DFLT_NTH_CORES
7070     // Default #threads = #cores
7071     __kmp_dflt_team_nth = __kmp_ncores;
7072     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7073                   "__kmp_ncores (%d)\n",
7074                   __kmp_dflt_team_nth));
7075 #else
7076     // Default #threads = #available OS procs
7077     __kmp_dflt_team_nth = __kmp_avail_proc;
7078     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7079                   "__kmp_avail_proc(%d)\n",
7080                   __kmp_dflt_team_nth));
7081 #endif /* KMP_DFLT_NTH_CORES */
7082   }
7083 
7084   if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7085     __kmp_dflt_team_nth = KMP_MIN_NTH;
7086   }
7087   if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7088     __kmp_dflt_team_nth = __kmp_sys_max_nth;
7089   }
7090 
7091   if (__kmp_nesting_mode > 0)
7092     __kmp_set_nesting_mode_threads();
7093 
7094   // There's no harm in continuing if the following check fails,
7095   // but it indicates an error in the previous logic.
7096   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7097 
7098   if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7099     // Run through the __kmp_threads array and set the num threads icv for each
7100     // root thread that is currently registered with the RTL (which has not
7101     // already explicitly set its nthreads-var with a call to
7102     // omp_set_num_threads()).
7103     for (i = 0; i < __kmp_threads_capacity; i++) {
7104       kmp_info_t *thread = __kmp_threads[i];
7105       if (thread == NULL)
7106         continue;
7107       if (thread->th.th_current_task->td_icvs.nproc != 0)
7108         continue;
7109 
7110       set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7111     }
7112   }
7113   KA_TRACE(
7114       20,
7115       ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7116        __kmp_dflt_team_nth));
7117 
7118 #ifdef KMP_ADJUST_BLOCKTIME
7119   /* Adjust blocktime to zero if necessary  now that __kmp_avail_proc is set */
7120   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7121     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7122     if (__kmp_nth > __kmp_avail_proc) {
7123       __kmp_zero_bt = TRUE;
7124     }
7125   }
7126 #endif /* KMP_ADJUST_BLOCKTIME */
7127 
7128   /* we have finished middle initialization */
7129   TCW_SYNC_4(__kmp_init_middle, TRUE);
7130 
7131   KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7132 }
7133 
7134 void __kmp_middle_initialize(void) {
7135   if (__kmp_init_middle) {
7136     return;
7137   }
7138   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7139   if (__kmp_init_middle) {
7140     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7141     return;
7142   }
7143   __kmp_do_middle_initialize();
7144   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7145 }
7146 
7147 void __kmp_parallel_initialize(void) {
7148   int gtid = __kmp_entry_gtid(); // this might be a new root
7149 
7150   /* synchronize parallel initialization (for sibling) */
7151   if (TCR_4(__kmp_init_parallel))
7152     return;
7153   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7154   if (TCR_4(__kmp_init_parallel)) {
7155     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7156     return;
7157   }
7158 
7159   /* TODO reinitialization after we have already shut down */
7160   if (TCR_4(__kmp_global.g.g_done)) {
7161     KA_TRACE(
7162         10,
7163         ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7164     __kmp_infinite_loop();
7165   }
7166 
7167   /* jc: The lock __kmp_initz_lock is already held, so calling
7168      __kmp_serial_initialize would cause a deadlock.  So we call
7169      __kmp_do_serial_initialize directly. */
7170   if (!__kmp_init_middle) {
7171     __kmp_do_middle_initialize();
7172   }
7173   __kmp_assign_root_init_mask();
7174   __kmp_resume_if_hard_paused();
7175 
7176   /* begin initialization */
7177   KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7178   KMP_ASSERT(KMP_UBER_GTID(gtid));
7179 
7180 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7181   // Save the FP control regs.
7182   // Worker threads will set theirs to these values at thread startup.
7183   __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7184   __kmp_store_mxcsr(&__kmp_init_mxcsr);
7185   __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7186 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7187 
7188 #if KMP_OS_UNIX
7189 #if KMP_HANDLE_SIGNALS
7190   /*  must be after __kmp_serial_initialize  */
7191   __kmp_install_signals(TRUE);
7192 #endif
7193 #endif
7194 
7195   __kmp_suspend_initialize();
7196 
7197 #if defined(USE_LOAD_BALANCE)
7198   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7199     __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7200   }
7201 #else
7202   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7203     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7204   }
7205 #endif
7206 
7207   if (__kmp_version) {
7208     __kmp_print_version_2();
7209   }
7210 
7211   /* we have finished parallel initialization */
7212   TCW_SYNC_4(__kmp_init_parallel, TRUE);
7213 
7214   KMP_MB();
7215   KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7216 
7217   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7218 }
7219 
7220 void __kmp_hidden_helper_initialize() {
7221   if (TCR_4(__kmp_init_hidden_helper))
7222     return;
7223 
7224   // __kmp_parallel_initialize is required before we initialize hidden helper
7225   if (!TCR_4(__kmp_init_parallel))
7226     __kmp_parallel_initialize();
7227 
  // Double check. Note that this double check should not be placed before
  // __kmp_parallel_initialize, as that would cause a deadlock.
7230   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7231   if (TCR_4(__kmp_init_hidden_helper)) {
7232     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7233     return;
7234   }
7235 
7236   // Set the count of hidden helper tasks to be executed to zero
7237   KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7238 
7239   // Set the global variable indicating that we're initializing hidden helper
7240   // team/threads
7241   TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7242 
7243   // Platform independent initialization
7244   __kmp_do_initialize_hidden_helper_threads();
7245 
7246   // Wait here for the finish of initialization of hidden helper teams
7247   __kmp_hidden_helper_threads_initz_wait();
7248 
7249   // We have finished hidden helper initialization
7250   TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7251 
7252   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7253 }
7254 
7255 /* ------------------------------------------------------------------------ */
7256 
7257 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7258                                    kmp_team_t *team) {
7259   kmp_disp_t *dispatch;
7260 
7261   KMP_MB();
7262 
7263   /* none of the threads have encountered any constructs, yet. */
7264   this_thr->th.th_local.this_construct = 0;
7265 #if KMP_CACHE_MANAGE
7266   KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7267 #endif /* KMP_CACHE_MANAGE */
7268   dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7269   KMP_DEBUG_ASSERT(dispatch);
7270   KMP_DEBUG_ASSERT(team->t.t_dispatch);
7271   // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7272   // this_thr->th.th_info.ds.ds_tid ] );
7273 
7274   dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7275   dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7276   if (__kmp_env_consistency_check)
7277     __kmp_push_parallel(gtid, team->t.t_ident);
7278 
7279   KMP_MB(); /* Flush all pending memory write invalidates.  */
7280 }
7281 
7282 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7283                                   kmp_team_t *team) {
7284   if (__kmp_env_consistency_check)
7285     __kmp_pop_parallel(gtid, team->t.t_ident);
7286 
7287   __kmp_finish_implicit_task(this_thr);
7288 }
7289 
7290 int __kmp_invoke_task_func(int gtid) {
7291   int rc;
7292   int tid = __kmp_tid_from_gtid(gtid);
7293   kmp_info_t *this_thr = __kmp_threads[gtid];
7294   kmp_team_t *team = this_thr->th.th_team;
7295 
7296   __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7297 #if USE_ITT_BUILD
7298   if (__itt_stack_caller_create_ptr) {
7299     // inform ittnotify about entering user's code
7300     if (team->t.t_stack_id != NULL) {
7301       __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id);
7302     } else {
7303       KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7304       __kmp_itt_stack_callee_enter(
7305           (__itt_caller)team->t.t_parent->t.t_stack_id);
7306     }
7307   }
7308 #endif /* USE_ITT_BUILD */
7309 #if INCLUDE_SSC_MARKS
7310   SSC_MARK_INVOKING();
7311 #endif
7312 
7313 #if OMPT_SUPPORT
7314   void *dummy;
7315   void **exit_frame_p;
7316   ompt_data_t *my_task_data;
7317   ompt_data_t *my_parallel_data;
7318   int ompt_team_size;
7319 
7320   if (ompt_enabled.enabled) {
7321     exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
7322                          .ompt_task_info.frame.exit_frame.ptr);
7323   } else {
7324     exit_frame_p = &dummy;
7325   }
7326 
7327   my_task_data =
7328       &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7329   my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7330   if (ompt_enabled.ompt_callback_implicit_task) {
7331     ompt_team_size = team->t.t_nproc;
7332     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7333         ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7334         __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7335     OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7336   }
7337 #endif
7338 
7339 #if KMP_STATS_ENABLED
7340   stats_state_e previous_state = KMP_GET_THREAD_STATE();
7341   if (previous_state == stats_state_e::TEAMS_REGION) {
7342     KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7343   } else {
7344     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7345   }
7346   KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7347 #endif
7348 
7349   rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7350                               tid, (int)team->t.t_argc, (void **)team->t.t_argv
7351 #if OMPT_SUPPORT
7352                               ,
7353                               exit_frame_p
7354 #endif
7355   );
7356 #if OMPT_SUPPORT
7357   *exit_frame_p = NULL;
7358   this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7359 #endif
7360 
7361 #if KMP_STATS_ENABLED
7362   if (previous_state == stats_state_e::TEAMS_REGION) {
7363     KMP_SET_THREAD_STATE(previous_state);
7364   }
7365   KMP_POP_PARTITIONED_TIMER();
7366 #endif
7367 
7368 #if USE_ITT_BUILD
7369   if (__itt_stack_caller_create_ptr) {
7370     // inform ittnotify about leaving user's code
7371     if (team->t.t_stack_id != NULL) {
7372       __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id);
7373     } else {
7374       KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7375       __kmp_itt_stack_callee_leave(
7376           (__itt_caller)team->t.t_parent->t.t_stack_id);
7377     }
7378   }
7379 #endif /* USE_ITT_BUILD */
7380   __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7381 
7382   return rc;
7383 }
7384 
7385 void __kmp_teams_master(int gtid) {
  // This routine is called by all primary threads in a teams construct
7387   kmp_info_t *thr = __kmp_threads[gtid];
7388   kmp_team_t *team = thr->th.th_team;
7389   ident_t *loc = team->t.t_ident;
7390   thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7391   KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7392   KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7393   KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7394                 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7395 
7396   // This thread is a new CG root.  Set up the proper variables.
7397   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7398   tmp->cg_root = thr; // Make thr the CG root
7399   // Init to thread limit stored when league primary threads were forked
7400   tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7401   tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7402   KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7403                  " cg_nthreads to 1\n",
7404                  thr, tmp));
7405   tmp->up = thr->th.th_cg_roots;
7406   thr->th.th_cg_roots = tmp;
7407 
// Launch the league of teams now, but do not let workers execute
// (they hang on the fork barrier until the next parallel region)
7410 #if INCLUDE_SSC_MARKS
7411   SSC_MARK_FORKING();
7412 #endif
7413   __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7414                   (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7415                   VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7416 #if INCLUDE_SSC_MARKS
7417   SSC_MARK_JOINING();
7418 #endif
7419   // If the team size was reduced from the limit, set it to the new size
7420   if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7421     thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7422   // AC: last parameter "1" eliminates join barrier which won't work because
7423   // worker threads are in a fork barrier waiting for more parallel regions
7424   __kmp_join_call(loc, gtid
7425 #if OMPT_SUPPORT
7426                   ,
7427                   fork_context_intel
7428 #endif
7429                   ,
7430                   1);
7431 }
7432 
7433 int __kmp_invoke_teams_master(int gtid) {
7434   kmp_info_t *this_thr = __kmp_threads[gtid];
7435   kmp_team_t *team = this_thr->th.th_team;
7436 #if KMP_DEBUG
7437   if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7438     KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7439                      (void *)__kmp_teams_master);
7440 #endif
7441   __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7442 #if OMPT_SUPPORT
7443   int tid = __kmp_tid_from_gtid(gtid);
7444   ompt_data_t *task_data =
7445       &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7446   ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7447   if (ompt_enabled.ompt_callback_implicit_task) {
7448     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7449         ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7450         ompt_task_initial);
7451     OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7452   }
7453 #endif
7454   __kmp_teams_master(gtid);
7455 #if OMPT_SUPPORT
7456   this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7457 #endif
7458   __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7459   return 1;
7460 }
7461 
/* This sets the requested number of threads for the next parallel region
   encountered by this team. Since this should be enclosed in the fork/join
   critical section, it should avoid race conditions with asymmetrical nested
   parallelism. */
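/* For context, a sketch of how this is typically reached (names below mirror
   the public __kmpc_* entry points; the outlined function name is purely
   illustrative): compiler-generated code for

     #pragma omp parallel num_threads(4)

   calls __kmpc_push_num_threads(&loc, gtid, 4) (which forwards here) right
   before __kmpc_fork_call(&loc, argc, outlined_fn, ...). The stored
   th_set_nproc value is then consumed by the subsequent fork. */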
7466 
7467 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7468   kmp_info_t *thr = __kmp_threads[gtid];
7469 
7470   if (num_threads > 0)
7471     thr->th.th_set_nproc = num_threads;
7472 }
7473 
7474 static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
7475                                     int num_threads) {
7476   KMP_DEBUG_ASSERT(thr);
7477   // Remember the number of threads for inner parallel regions
7478   if (!TCR_4(__kmp_init_middle))
7479     __kmp_middle_initialize(); // get internal globals calculated
7480   __kmp_assign_root_init_mask();
7481   KMP_DEBUG_ASSERT(__kmp_avail_proc);
7482   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
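
  // Worked example for the no-thread_limit path below (illustrative numbers):
  // with __kmp_avail_proc = 16, num_teams = 4, __kmp_teams_thread_limit unset,
  // nthreads-var = 16, thread-limit-var = 16 and __kmp_teams_max_nth >= 16,
  // the per-team size starts at 16 / 4 = 4 and none of the later caps reduce
  // it further.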
7483 
7484   if (num_threads == 0) {
7485     if (__kmp_teams_thread_limit > 0) {
7486       num_threads = __kmp_teams_thread_limit;
7487     } else {
7488       num_threads = __kmp_avail_proc / num_teams;
7489     }
    // adjust num_threads w/o warning as it is not a user setting
    // num_threads = min(num_threads, nthreads-var, thread-limit-var)
    // no thread_limit clause specified - do not change thread-limit-var ICV
7493     if (num_threads > __kmp_dflt_team_nth) {
7494       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7495     }
7496     if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7497       num_threads = thr->th.th_current_task->td_icvs.thread_limit;
    } // prevent team size from exceeding thread-limit-var
7499     if (num_teams * num_threads > __kmp_teams_max_nth) {
7500       num_threads = __kmp_teams_max_nth / num_teams;
7501     }
7502     if (num_threads == 0) {
7503       num_threads = 1;
7504     }
7505   } else {
    // This thread will be the primary thread of the league's primary threads.
    // Store the new thread limit; the old limit is saved in the th_cg_roots list
7508     thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7509     // num_threads = min(num_threads, nthreads-var)
7510     if (num_threads > __kmp_dflt_team_nth) {
7511       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7512     }
7513     if (num_teams * num_threads > __kmp_teams_max_nth) {
7514       int new_threads = __kmp_teams_max_nth / num_teams;
7515       if (new_threads == 0) {
7516         new_threads = 1;
7517       }
7518       if (new_threads != num_threads) {
7519         if (!__kmp_reserve_warn) { // user asked for too many threads
7520           __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7521           __kmp_msg(kmp_ms_warning,
7522                     KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7523                     KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7524         }
7525       }
7526       num_threads = new_threads;
7527     }
7528   }
7529   thr->th.th_teams_size.nth = num_threads;
7530 }
7531 
7532 /* this sets the requested number of teams for the teams region and/or
7533    the number of threads for the next parallel region encountered  */
7534 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7535                           int num_threads) {
7536   kmp_info_t *thr = __kmp_threads[gtid];
7537   KMP_DEBUG_ASSERT(num_teams >= 0);
7538   KMP_DEBUG_ASSERT(num_threads >= 0);
7539 
7540   if (num_teams == 0) {
7541     if (__kmp_nteams > 0) {
7542       num_teams = __kmp_nteams;
7543     } else {
7544       num_teams = 1; // default number of teams is 1.
7545     }
7546   }
  if (num_teams > __kmp_teams_max_nth) { // too many teams requested?
7548     if (!__kmp_reserve_warn) {
7549       __kmp_reserve_warn = 1;
7550       __kmp_msg(kmp_ms_warning,
7551                 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7552                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7553     }
7554     num_teams = __kmp_teams_max_nth;
7555   }
7556   // Set number of teams (number of threads in the outer "parallel" of the
7557   // teams)
7558   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7559 
7560   __kmp_push_thread_limit(thr, num_teams, num_threads);
7561 }
7562 
7563 /* This sets the requested number of teams for the teams region and/or
7564    the number of threads for the next parallel region encountered  */
7565 void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
7566                              int num_teams_ub, int num_threads) {
7567   kmp_info_t *thr = __kmp_threads[gtid];
7568   KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
7569   KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
7570   KMP_DEBUG_ASSERT(num_threads >= 0);
7571 
7572   if (num_teams_lb > num_teams_ub) {
7573     __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
7574                 KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
7575   }
7576 
  int num_teams = 1; // default number of teams is 1.
7578 
7579   if (num_teams_lb == 0 && num_teams_ub > 0)
7580     num_teams_lb = num_teams_ub;
7581 
7582   if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
7583     num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
7584     if (num_teams > __kmp_teams_max_nth) {
7585       if (!__kmp_reserve_warn) {
7586         __kmp_reserve_warn = 1;
7587         __kmp_msg(kmp_ms_warning,
7588                   KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7589                   KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7590       }
7591       num_teams = __kmp_teams_max_nth;
7592     }
7593   } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
7594     num_teams = num_teams_ub;
7595   } else { // num_teams_lb <= num_teams <= num_teams_ub
7596     if (num_threads == 0) {
7597       if (num_teams_ub > __kmp_teams_max_nth) {
7598         num_teams = num_teams_lb;
7599       } else {
7600         num_teams = num_teams_ub;
7601       }
7602     } else {
7603       num_teams = (num_threads > __kmp_teams_max_nth)
7604                       ? num_teams
7605                       : __kmp_teams_max_nth / num_threads;
7606       if (num_teams < num_teams_lb) {
7607         num_teams = num_teams_lb;
7608       } else if (num_teams > num_teams_ub) {
7609         num_teams = num_teams_ub;
7610       }
7611     }
7612   }
7613   // Set number of teams (number of threads in the outer "parallel" of the
7614   // teams)
7615   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7616 
7617   __kmp_push_thread_limit(thr, num_teams, num_threads);
7618 }
7619 
7620 // Set the proc_bind var to use in the following parallel region.
7621 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7622   kmp_info_t *thr = __kmp_threads[gtid];
7623   thr->th.th_set_proc_bind = proc_bind;
7624 }
7625 
7626 /* Launch the worker threads into the microtask. */
7627 
7628 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7629   kmp_info_t *this_thr = __kmp_threads[gtid];
7630 
7631 #ifdef KMP_DEBUG
7632   int f;
7633 #endif /* KMP_DEBUG */
7634 
7635   KMP_DEBUG_ASSERT(team);
7636   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7637   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7638   KMP_MB(); /* Flush all pending memory write invalidates.  */
7639 
7640   team->t.t_construct = 0; /* no single directives seen yet */
7641   team->t.t_ordered.dt.t_value =
7642       0; /* thread 0 enters the ordered section first */
7643 
7644   /* Reset the identifiers on the dispatch buffer */
7645   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7646   if (team->t.t_max_nproc > 1) {
7647     int i;
7648     for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7649       team->t.t_disp_buffer[i].buffer_index = i;
7650       team->t.t_disp_buffer[i].doacross_buf_idx = i;
7651     }
7652   } else {
7653     team->t.t_disp_buffer[0].buffer_index = 0;
7654     team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7655   }
7656 
7657   KMP_MB(); /* Flush all pending memory write invalidates.  */
7658   KMP_ASSERT(this_thr->th.th_team == team);
7659 
7660 #ifdef KMP_DEBUG
7661   for (f = 0; f < team->t.t_nproc; f++) {
7662     KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7663                      team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7664   }
7665 #endif /* KMP_DEBUG */
7666 
7667   /* release the worker threads so they may begin working */
7668   __kmp_fork_barrier(gtid, 0);
7669 }
7670 
7671 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7672   kmp_info_t *this_thr = __kmp_threads[gtid];
7673 
7674   KMP_DEBUG_ASSERT(team);
7675   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7676   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7677   KMP_MB(); /* Flush all pending memory write invalidates.  */
7678 
7679   /* Join barrier after fork */
7680 
7681 #ifdef KMP_DEBUG
7682   if (__kmp_threads[gtid] &&
7683       __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7684     __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7685                  __kmp_threads[gtid]);
7686     __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7687                  "team->t.t_nproc=%d\n",
7688                  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7689                  team->t.t_nproc);
7690     __kmp_print_structure();
7691   }
7692   KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7693                    __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7694 #endif /* KMP_DEBUG */
7695 
7696   __kmp_join_barrier(gtid); /* wait for everyone */
7697 #if OMPT_SUPPORT
7698   if (ompt_enabled.enabled &&
7699       this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7700     int ds_tid = this_thr->th.th_info.ds.ds_tid;
7701     ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7702     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7703 #if OMPT_OPTIONAL
7704     void *codeptr = NULL;
7705     if (KMP_MASTER_TID(ds_tid) &&
7706         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7707          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7708       codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7709 
7710     if (ompt_enabled.ompt_callback_sync_region_wait) {
7711       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7712           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7713           codeptr);
7714     }
7715     if (ompt_enabled.ompt_callback_sync_region) {
7716       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7717           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7718           codeptr);
7719     }
7720 #endif
7721     if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7722       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7723           ompt_scope_end, NULL, task_data, 0, ds_tid,
7724           ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7725     }
7726   }
7727 #endif
7728 
7729   KMP_MB(); /* Flush all pending memory write invalidates.  */
7730   KMP_ASSERT(this_thr->th.th_team == team);
7731 }
7732 
7733 /* ------------------------------------------------------------------------ */
7734 
7735 #ifdef USE_LOAD_BALANCE
7736 
// Return the number of worker threads actively spinning in the hot team, if we
// are at the outermost level of parallelism. Otherwise, return 0.
7739 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7740   int i;
7741   int retval;
7742   kmp_team_t *hot_team;
7743 
7744   if (root->r.r_active) {
7745     return 0;
7746   }
7747   hot_team = root->r.r_hot_team;
7748   if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7749     return hot_team->t.t_nproc - 1; // Don't count primary thread
7750   }
7751 
7752   // Skip the primary thread - it is accounted for elsewhere.
7753   retval = 0;
7754   for (i = 1; i < hot_team->t.t_nproc; i++) {
7755     if (hot_team->t.t_threads[i]->th.th_active) {
7756       retval++;
7757     }
7758   }
7759   return retval;
7760 }
7761 
7762 // Perform an automatic adjustment to the number of
7763 // threads used by the next parallel region.
7764 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7765   int retval;
7766   int pool_active;
7767   int hot_team_active;
7768   int team_curr_active;
7769   int system_active;
7770 
7771   KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7772                 set_nproc));
7773   KMP_DEBUG_ASSERT(root);
7774   KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7775                        ->th.th_current_task->td_icvs.dynamic == TRUE);
7776   KMP_DEBUG_ASSERT(set_nproc > 1);
7777 
7778   if (set_nproc == 1) {
7779     KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7780     return 1;
7781   }
7782 
7783   // Threads that are active in the thread pool, active in the hot team for this
7784   // particular root (if we are at the outer par level), and the currently
7785   // executing thread (to become the primary thread) are available to add to the
7786   // new team, but are currently contributing to the system load, and must be
7787   // accounted for.
7788   pool_active = __kmp_thread_pool_active_nth;
7789   hot_team_active = __kmp_active_hot_team_nproc(root);
7790   team_curr_active = pool_active + hot_team_active + 1;
7791 
7792   // Check the system load.
7793   system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7794   KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7795                 "hot team active = %d\n",
7796                 system_active, pool_active, hot_team_active));
7797 
7798   if (system_active < 0) {
7799     // There was an error reading the necessary info from /proc, so use the
7800     // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7801     // = dynamic_thread_limit, we shouldn't wind up getting back here.
7802     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7803     KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7804 
7805     // Make this call behave like the thread limit algorithm.
7806     retval = __kmp_avail_proc - __kmp_nth +
7807              (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7808     if (retval > set_nproc) {
7809       retval = set_nproc;
7810     }
7811     if (retval < KMP_MIN_NTH) {
7812       retval = KMP_MIN_NTH;
7813     }
7814 
7815     KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7816                   retval));
7817     return retval;
7818   }
7819 
  // There is a slight delay in the load balance algorithm in detecting new
  // running procs. The real system load at this instant should be at least as
  // large as the number of active OMP threads that are available to add to the
  // team.
7823   if (system_active < team_curr_active) {
7824     system_active = team_curr_active;
7825   }
7826   retval = __kmp_avail_proc - system_active + team_curr_active;
7827   if (retval > set_nproc) {
7828     retval = set_nproc;
7829   }
7830   if (retval < KMP_MIN_NTH) {
7831     retval = KMP_MIN_NTH;
7832   }
7833 
7834   KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7835   return retval;
7836 } // __kmp_load_balance_nproc()
7837 
7838 #endif /* USE_LOAD_BALANCE */
7839 
7840 /* ------------------------------------------------------------------------ */
7841 
7842 /* NOTE: this is called with the __kmp_init_lock held */
7843 void __kmp_cleanup(void) {
7844   int f;
7845 
7846   KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7847 
7848   if (TCR_4(__kmp_init_parallel)) {
7849 #if KMP_HANDLE_SIGNALS
7850     __kmp_remove_signals();
7851 #endif
7852     TCW_4(__kmp_init_parallel, FALSE);
7853   }
7854 
7855   if (TCR_4(__kmp_init_middle)) {
7856 #if KMP_AFFINITY_SUPPORTED
7857     __kmp_affinity_uninitialize();
7858 #endif /* KMP_AFFINITY_SUPPORTED */
7859     __kmp_cleanup_hierarchy();
7860     TCW_4(__kmp_init_middle, FALSE);
7861   }
7862 
7863   KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7864 
7865   if (__kmp_init_serial) {
7866     __kmp_runtime_destroy();
7867     __kmp_init_serial = FALSE;
7868   }
7869 
7870   __kmp_cleanup_threadprivate_caches();
7871 
7872   for (f = 0; f < __kmp_threads_capacity; f++) {
7873     if (__kmp_root[f] != NULL) {
7874       __kmp_free(__kmp_root[f]);
7875       __kmp_root[f] = NULL;
7876     }
7877   }
7878   __kmp_free(__kmp_threads);
  // __kmp_threads and __kmp_root were allocated at once, as a single block, so
  // there is no need to free __kmp_root separately.
7881   __kmp_threads = NULL;
7882   __kmp_root = NULL;
7883   __kmp_threads_capacity = 0;
7884 
7885 #if KMP_USE_DYNAMIC_LOCK
7886   __kmp_cleanup_indirect_user_locks();
7887 #else
7888   __kmp_cleanup_user_locks();
7889 #endif
7890 #if OMPD_SUPPORT
7891   if (ompd_state) {
7892     __kmp_free(ompd_env_block);
7893     ompd_env_block = NULL;
7894     ompd_env_block_size = 0;
7895   }
7896 #endif
7897 
7898 #if KMP_AFFINITY_SUPPORTED
7899   KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7900   __kmp_cpuinfo_file = NULL;
7901 #endif /* KMP_AFFINITY_SUPPORTED */
7902 
7903 #if KMP_USE_ADAPTIVE_LOCKS
7904 #if KMP_DEBUG_ADAPTIVE_LOCKS
7905   __kmp_print_speculative_stats();
7906 #endif
7907 #endif
7908   KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7909   __kmp_nested_nth.nth = NULL;
7910   __kmp_nested_nth.size = 0;
7911   __kmp_nested_nth.used = 0;
7912   KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7913   __kmp_nested_proc_bind.bind_types = NULL;
7914   __kmp_nested_proc_bind.size = 0;
7915   __kmp_nested_proc_bind.used = 0;
7916   if (__kmp_affinity_format) {
7917     KMP_INTERNAL_FREE(__kmp_affinity_format);
7918     __kmp_affinity_format = NULL;
7919   }
7920 
7921   __kmp_i18n_catclose();
7922 
7923 #if KMP_USE_HIER_SCHED
7924   __kmp_hier_scheds.deallocate();
7925 #endif
7926 
7927 #if KMP_STATS_ENABLED
7928   __kmp_stats_fini();
7929 #endif
7930 
7931   KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7932 }
7933 
7934 /* ------------------------------------------------------------------------ */
7935 
7936 int __kmp_ignore_mppbeg(void) {
7937   char *env;
7938 
7939   if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7940     if (__kmp_str_match_false(env))
7941       return FALSE;
7942   }
7943   // By default __kmpc_begin() is no-op.
7944   return TRUE;
7945 }
7946 
7947 int __kmp_ignore_mppend(void) {
7948   char *env;
7949 
7950   if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7951     if (__kmp_str_match_false(env))
7952       return FALSE;
7953   }
7954   // By default __kmpc_end() is no-op.
7955   return TRUE;
7956 }
7957 
7958 void __kmp_internal_begin(void) {
7959   int gtid;
7960   kmp_root_t *root;
7961 
7962   /* this is a very important step as it will register new sibling threads
7963      and assign these new uber threads a new gtid */
7964   gtid = __kmp_entry_gtid();
7965   root = __kmp_threads[gtid]->th.th_root;
7966   KMP_ASSERT(KMP_UBER_GTID(gtid));
7967 
7968   if (root->r.r_begin)
7969     return;
7970   __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7971   if (root->r.r_begin) {
7972     __kmp_release_lock(&root->r.r_begin_lock, gtid);
7973     return;
7974   }
7975 
7976   root->r.r_begin = TRUE;
7977 
7978   __kmp_release_lock(&root->r.r_begin_lock, gtid);
7979 }
7980 
7981 /* ------------------------------------------------------------------------ */
7982 
7983 void __kmp_user_set_library(enum library_type arg) {
7984   int gtid;
7985   kmp_root_t *root;
7986   kmp_info_t *thread;
7987 
7988   /* first, make sure we are initialized so we can get our gtid */
7989 
7990   gtid = __kmp_entry_gtid();
7991   thread = __kmp_threads[gtid];
7992 
7993   root = thread->th.th_root;
7994 
7995   KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7996                 library_serial));
7997   if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7998                                   thread */
7999     KMP_WARNING(SetLibraryIncorrectCall);
8000     return;
8001   }
8002 
8003   switch (arg) {
8004   case library_serial:
8005     thread->th.th_set_nproc = 0;
8006     set__nproc(thread, 1);
8007     break;
8008   case library_turnaround:
8009     thread->th.th_set_nproc = 0;
8010     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8011                                            : __kmp_dflt_team_nth_ub);
8012     break;
8013   case library_throughput:
8014     thread->th.th_set_nproc = 0;
8015     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8016                                            : __kmp_dflt_team_nth_ub);
8017     break;
8018   default:
8019     KMP_FATAL(UnknownLibraryType, arg);
8020   }
8021 
8022   __kmp_aux_set_library(arg);
8023 }
8024 
8025 void __kmp_aux_set_stacksize(size_t arg) {
8026   if (!__kmp_init_serial)
8027     __kmp_serial_initialize();
8028 
8029 #if KMP_OS_DARWIN
8030   if (arg & (0x1000 - 1)) {
8031     arg &= ~(0x1000 - 1);
8032     if (arg + 0x1000) /* check for overflow if we round up */
8033       arg += 0x1000;
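    // e.g., arg = 5000 rounds down to 4096 (one 4 KiB page) and then up to 8192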
8034   }
8035 #endif
8036   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8037 
8038   /* only change the default stacksize before the first parallel region */
8039   if (!TCR_4(__kmp_init_parallel)) {
8040     size_t value = arg; /* argument is in bytes */
8041 
8042     if (value < __kmp_sys_min_stksize)
8043       value = __kmp_sys_min_stksize;
8044     else if (value > KMP_MAX_STKSIZE)
8045       value = KMP_MAX_STKSIZE;
8046 
8047     __kmp_stksize = value;
8048 
8049     __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
8050   }
8051 
8052   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8053 }
8054 
8055 /* set the behaviour of the runtime library */
8056 /* TODO this can cause some odd behaviour with sibling parallelism... */
8057 void __kmp_aux_set_library(enum library_type arg) {
8058   __kmp_library = arg;
8059 
8060   switch (__kmp_library) {
8061   case library_serial: {
8062     KMP_INFORM(LibraryIsSerial);
8063   } break;
8064   case library_turnaround:
8065     if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
8066       __kmp_use_yield = 2; // only yield when oversubscribed
8067     break;
8068   case library_throughput:
8069     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
8070       __kmp_dflt_blocktime = 200;
8071     break;
8072   default:
8073     KMP_FATAL(UnknownLibraryType, arg);
8074   }
8075 }
8076 
8077 /* Getting team information common for all team API */
8078 // Returns NULL if not in teams construct
8079 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
8080   kmp_info_t *thr = __kmp_entry_thread();
8081   teams_serialized = 0;
8082   if (thr->th.th_teams_microtask) {
8083     kmp_team_t *team = thr->th.th_team;
8084     int tlevel = thr->th.th_teams_level; // the level of the teams construct
8085     int ii = team->t.t_level;
8086     teams_serialized = team->t.t_serialized;
8087     int level = tlevel + 1;
8088     KMP_DEBUG_ASSERT(ii >= tlevel);
8089     while (ii > level) {
8090       for (teams_serialized = team->t.t_serialized;
8091            (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
8092       }
8093       if (team->t.t_serialized && (!teams_serialized)) {
8094         team = team->t.t_parent;
8095         continue;
8096       }
8097       if (ii > level) {
8098         team = team->t.t_parent;
8099         ii--;
8100       }
8101     }
8102     return team;
8103   }
8104   return NULL;
8105 }
8106 
8107 int __kmp_aux_get_team_num() {
8108   int serialized;
8109   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8110   if (team) {
8111     if (serialized > 1) {
8112       return 0; // teams region is serialized ( 1 team of 1 thread ).
8113     } else {
8114       return team->t.t_master_tid;
8115     }
8116   }
8117   return 0;
8118 }
8119 
8120 int __kmp_aux_get_num_teams() {
8121   int serialized;
8122   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8123   if (team) {
8124     if (serialized > 1) {
8125       return 1;
8126     } else {
8127       return team->t.t_parent->t.t_nproc;
8128     }
8129   }
8130   return 1;
8131 }
8132 
8133 /* ------------------------------------------------------------------------ */
8134 
8135 /*
8136  * Affinity Format Parser
8137  *
8138  * Field is in form of: %[[[0].]size]type
8139  * % and type are required (%% means print a literal '%')
8140  * type is either single char or long name surrounded by {},
8141  * e.g., N or {num_threads}
8142  * 0 => leading zeros
8143  * . => right justified when size is specified
8144  * by default output is left justified
8145  * size is the *minimum* field length
8146  * All other characters are printed as is
8147  *
 * Available field types:
 * t {team_num}          - omp_get_team_num()
 * T {num_teams}         - omp_get_num_teams()
 * L {nesting_level}     - omp_get_level()
 * n {thread_num}        - omp_get_thread_num()
 * N {num_threads}       - omp_get_num_threads()
 * a {ancestor_tnum}     - omp_get_ancestor_thread_num(omp_get_level()-1)
 * H {host}              - name of host machine
 * P {process_id}        - process id (integer)
 * i {native_thread_id}  - native thread identifier (integer)
 * A {thread_affinity}   - comma separated list of integers or integer ranges
 *                         (values of affinity mask)
8158  *
8159  * Implementation-specific field types can be added
8160  * If a type is unknown, print "undefined"
8161  */
8162 
// Structure holding the short name, long name, and corresponding data type
// for snprintf. A table of these represents the entire set of valid keyword
// field types.
8166 typedef struct kmp_affinity_format_field_t {
8167   char short_name; // from spec e.g., L -> thread level
8168   const char *long_name; // from spec thread_level -> thread level
8169   char field_format; // data type for snprintf (typically 'd' or 's'
8170   // for integer or string)
8171 } kmp_affinity_format_field_t;
8172 
8173 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
8174 #if KMP_AFFINITY_SUPPORTED
8175     {'A', "thread_affinity", 's'},
8176 #endif
8177     {'t', "team_num", 'd'},
8178     {'T', "num_teams", 'd'},
8179     {'L', "nesting_level", 'd'},
8180     {'n', "thread_num", 'd'},
8181     {'N', "num_threads", 'd'},
8182     {'a', "ancestor_tnum", 'd'},
8183     {'H', "host", 's'},
8184     {'P', "process_id", 'd'},
8185     {'i', "native_thread_id", 'd'}};
8186 
8187 // Return the number of characters it takes to hold field
8188 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8189                                             const char **ptr,
8190                                             kmp_str_buf_t *field_buffer) {
8191   int rc, format_index, field_value;
8192   const char *width_left, *width_right;
8193   bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8194   static const int FORMAT_SIZE = 20;
8195   char format[FORMAT_SIZE] = {0};
8196   char absolute_short_name = 0;
8197 
8198   KMP_DEBUG_ASSERT(gtid >= 0);
8199   KMP_DEBUG_ASSERT(th);
8200   KMP_DEBUG_ASSERT(**ptr == '%');
8201   KMP_DEBUG_ASSERT(field_buffer);
8202 
8203   __kmp_str_buf_clear(field_buffer);
8204 
8205   // Skip the initial %
8206   (*ptr)++;
8207 
8208   // Check for %% first
8209   if (**ptr == '%') {
8210     __kmp_str_buf_cat(field_buffer, "%", 1);
8211     (*ptr)++; // skip over the second %
8212     return 1;
8213   }
8214 
8215   // Parse field modifiers if they are present
8216   pad_zeros = false;
8217   if (**ptr == '0') {
8218     pad_zeros = true;
8219     (*ptr)++; // skip over 0
8220   }
8221   right_justify = false;
8222   if (**ptr == '.') {
8223     right_justify = true;
8224     (*ptr)++; // skip over .
8225   }
8226   // Parse width of field: [width_left, width_right)
8227   width_left = width_right = NULL;
8228   if (**ptr >= '0' && **ptr <= '9') {
8229     width_left = *ptr;
8230     SKIP_DIGITS(*ptr);
8231     width_right = *ptr;
8232   }
8233 
8234   // Create the format for KMP_SNPRINTF based on flags parsed above
8235   format_index = 0;
8236   format[format_index++] = '%';
8237   if (!right_justify)
8238     format[format_index++] = '-';
8239   if (pad_zeros)
8240     format[format_index++] = '0';
8241   if (width_left && width_right) {
8242     int i = 0;
    // Only allow 8-digit number widths.
    // This also prevents overflowing the format variable.
8245     while (i < 8 && width_left < width_right) {
8246       format[format_index++] = *width_left;
8247       width_left++;
8248       i++;
8249     }
8250   }
8251 
8252   // Parse a name (long or short)
8253   // Canonicalize the name into absolute_short_name
8254   found_valid_name = false;
8255   parse_long_name = (**ptr == '{');
8256   if (parse_long_name)
8257     (*ptr)++; // skip initial left brace
8258   for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8259                              sizeof(__kmp_affinity_format_table[0]);
8260        ++i) {
8261     char short_name = __kmp_affinity_format_table[i].short_name;
8262     const char *long_name = __kmp_affinity_format_table[i].long_name;
8263     char field_format = __kmp_affinity_format_table[i].field_format;
8264     if (parse_long_name) {
8265       size_t length = KMP_STRLEN(long_name);
8266       if (strncmp(*ptr, long_name, length) == 0) {
8267         found_valid_name = true;
8268         (*ptr) += length; // skip the long name
8269       }
8270     } else if (**ptr == short_name) {
8271       found_valid_name = true;
8272       (*ptr)++; // skip the short name
8273     }
8274     if (found_valid_name) {
8275       format[format_index++] = field_format;
8276       format[format_index++] = '\0';
8277       absolute_short_name = short_name;
8278       break;
8279     }
8280   }
8281   if (parse_long_name) {
8282     if (**ptr != '}') {
8283       absolute_short_name = 0;
8284     } else {
8285       (*ptr)++; // skip over the right brace
8286     }
8287   }
8288 
8289   // Attempt to fill the buffer with the requested
8290   // value using snprintf within __kmp_str_buf_print()
8291   switch (absolute_short_name) {
8292   case 't':
8293     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8294     break;
8295   case 'T':
8296     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8297     break;
8298   case 'L':
8299     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8300     break;
8301   case 'n':
8302     rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8303     break;
8304   case 'H': {
8305     static const int BUFFER_SIZE = 256;
8306     char buf[BUFFER_SIZE];
8307     __kmp_expand_host_name(buf, BUFFER_SIZE);
8308     rc = __kmp_str_buf_print(field_buffer, format, buf);
8309   } break;
8310   case 'P':
8311     rc = __kmp_str_buf_print(field_buffer, format, getpid());
8312     break;
8313   case 'i':
8314     rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8315     break;
8316   case 'N':
8317     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8318     break;
8319   case 'a':
8320     field_value =
8321         __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8322     rc = __kmp_str_buf_print(field_buffer, format, field_value);
8323     break;
8324 #if KMP_AFFINITY_SUPPORTED
8325   case 'A': {
8326     kmp_str_buf_t buf;
8327     __kmp_str_buf_init(&buf);
8328     __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8329     rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8330     __kmp_str_buf_free(&buf);
8331   } break;
8332 #endif
8333   default:
    // According to the spec, if an implementation does not have info for a
    // field type, then "undefined" is printed
8336     rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8337     // Skip the field
8338     if (parse_long_name) {
8339       SKIP_TOKEN(*ptr);
8340       if (**ptr == '}')
8341         (*ptr)++;
8342     } else {
8343       (*ptr)++;
8344     }
8345   }
8346 
8347   KMP_ASSERT(format_index <= FORMAT_SIZE);
8348   return rc;
8349 }
8350 
8351 /*
8352  * Returns the number of characters needed to hold the affinity string
8353  * (not including the terminating null byte).
8354  * The resulting string is printed to buffer, which the caller can then
8355  * handle afterwards.
8356  */
8357 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8358                                   kmp_str_buf_t *buffer) {
8359   const char *parse_ptr;
8360   size_t retval;
8361   const kmp_info_t *th;
8362   kmp_str_buf_t field;
8363 
8364   KMP_DEBUG_ASSERT(buffer);
8365   KMP_DEBUG_ASSERT(gtid >= 0);
8366 
8367   __kmp_str_buf_init(&field);
8368   __kmp_str_buf_clear(buffer);
8369 
8370   th = __kmp_threads[gtid];
8371   retval = 0;
8372 
8373   // If format is NULL or a zero-length string, then we use the
8374   // affinity-format-var ICV
8375   parse_ptr = format;
8376   if (parse_ptr == NULL || *parse_ptr == '\0') {
8377     parse_ptr = __kmp_affinity_format;
8378   }
8379   KMP_DEBUG_ASSERT(parse_ptr);
8380 
8381   while (*parse_ptr != '\0') {
8382     // Parse a field
8383     if (*parse_ptr == '%') {
8384       // Put field in the buffer
8385       int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8386       __kmp_str_buf_catbuf(buffer, &field);
8387       retval += rc;
8388     } else {
8389       // Put literal character in buffer
8390       __kmp_str_buf_cat(buffer, parse_ptr, 1);
8391       retval++;
8392       parse_ptr++;
8393     }
8394   }
8395   __kmp_str_buf_free(&field);
8396   return retval;
8397 }
8398 
8399 // Displays the affinity string to stdout
8400 void __kmp_aux_display_affinity(int gtid, const char *format) {
8401   kmp_str_buf_t buf;
8402   __kmp_str_buf_init(&buf);
8403   __kmp_aux_capture_affinity(gtid, format, &buf);
8404   __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8405   __kmp_str_buf_free(&buf);
8406 }
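/* Illustrative example (user code, not part of this file): the routines above
   back the OpenMP 5.0 affinity-format API.  The short field specifiers used
   below mirror the cases handled in __kmp_aux_capture_affinity_field(); the
   long-name spellings live in __kmp_affinity_format_table.

     #include <omp.h>
     #include <stdio.h>

     int main(void) {
       // Equivalent to setting OMP_AFFINITY_FORMAT in the environment.
       omp_set_affinity_format("host=%H pid=%P thread %n of %N bound to %A");
     #pragma omp parallel
       {
         char buf[256];
         // A NULL (or empty) format falls back to affinity-format-var,
         // exactly as __kmp_aux_capture_affinity() does above.
         size_t needed = omp_capture_affinity(buf, sizeof(buf), NULL);
         if (needed < sizeof(buf))
           printf("%s\n", buf);
         // Or let the runtime print it directly: omp_display_affinity(NULL);
       }
       return 0;
     }
*/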
8407 
8408 /* ------------------------------------------------------------------------ */
8409 
8410 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8411   int blocktime = arg; /* argument is in milliseconds */
8412 #if KMP_USE_MONITOR
8413   int bt_intervals;
8414 #endif
8415   kmp_int8 bt_set;
8416 
8417   __kmp_save_internal_controls(thread);
8418 
8419   /* Normalize and set blocktime for the teams */
8420   if (blocktime < KMP_MIN_BLOCKTIME)
8421     blocktime = KMP_MIN_BLOCKTIME;
8422   else if (blocktime > KMP_MAX_BLOCKTIME)
8423     blocktime = KMP_MAX_BLOCKTIME;
8424 
8425   set__blocktime_team(thread->th.th_team, tid, blocktime);
8426   set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8427 
8428 #if KMP_USE_MONITOR
8429   /* Calculate and set blocktime intervals for the teams */
8430   bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8431 
8432   set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8433   set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8434 #endif
8435 
8436   /* Record that blocktime has been explicitly set (bt_set = TRUE) for the teams */
8437   bt_set = TRUE;
8438 
8439   set__bt_set_team(thread->th.th_team, tid, bt_set);
8440   set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8441 #if KMP_USE_MONITOR
8442   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8443                 "bt_intervals=%d, monitor_updates=%d\n",
8444                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8445                 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8446                 __kmp_monitor_wakeups));
8447 #else
8448   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8449                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8450                 thread->th.th_team->t.t_id, tid, blocktime));
8451 #endif
8452 }
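/* Illustrative usage (not part of the runtime): this routine is reached from,
   among other places, the KMP_BLOCKTIME environment-variable handling and the
   kmp_set_blocktime() extension API; the argument is in milliseconds and is
   clamped to [KMP_MIN_BLOCKTIME, KMP_MAX_BLOCKTIME].  A minimal sketch:

     #include <omp.h>

     int main(void) {
       // Make idle worker threads go to sleep right after a parallel region
       // instead of spin-waiting for new work (same as KMP_BLOCKTIME=0).
       kmp_set_blocktime(0);
     #pragma omp parallel
       {
       }
       return 0;
     }
*/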
8453 
8454 void __kmp_aux_set_defaults(char const *str, size_t len) {
8455   if (!__kmp_init_serial) {
8456     __kmp_serial_initialize();
8457   }
8458   __kmp_env_initialize(str);
8459 
8460   if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8461     __kmp_env_print();
8462   }
8463 } // __kmp_aux_set_defaults
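/* Illustrative usage (an assumption about the public wrapper, not stated in
   this file): __kmp_aux_set_defaults() backs the kmp_set_defaults() extension,
   which accepts an environment-style "NAME=value" string and is typically
   called before the first parallel region so the setting takes hold:

     #include <omp.h>

     int main(void) {
       kmp_set_defaults("KMP_BLOCKTIME=0");
     #pragma omp parallel
       {
       }
       return 0;
     }
*/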
8464 
8465 /* ------------------------------------------------------------------------ */
8466 /* internal fast reduction routines */
8467 
8468 PACKED_REDUCTION_METHOD_T
8469 __kmp_determine_reduction_method(
8470     ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8471     void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8472     kmp_critical_name *lck) {
8473 
8474   // Default reduction method: the critical construct ( lck != NULL, as in the
8475   // current PAROPT ).
8476   // If ( reduce_data != NULL && reduce_func != NULL ): the tree-reduction
8477   // method can be selected by the RTL.
8478   // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE: the atomic reduce method
8479   // can be selected by the RTL.
8480   // Finally, it is up to the OpenMP RTL to decide which method to select
8481   // among those generated by PAROPT.
8482 
8483   PACKED_REDUCTION_METHOD_T retval;
8484 
8485   int team_size;
8486 
8487   KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8488   KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8489 
8490 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
8491   ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
8492 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8493 
8494   retval = critical_reduce_block;
8495 
8496   // Another way of getting the team size (with one dynamic dereference) is slower
8497   team_size = __kmp_get_team_num_threads(global_tid);
8498   if (team_size == 1) {
8499 
8500     retval = empty_reduce_block;
8501 
8502   } else {
8503 
8504     int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8505 
8506 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 ||                   \
8507     KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64
8508 
8509 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
8510     KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8511 
8512     int teamsize_cutoff = 4;
8513 
8514 #if KMP_MIC_SUPPORTED
8515     if (__kmp_mic_type != non_mic) {
8516       teamsize_cutoff = 8;
8517     }
8518 #endif
8519     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8520     if (tree_available) {
8521       if (team_size <= teamsize_cutoff) {
8522         if (atomic_available) {
8523           retval = atomic_reduce_block;
8524         }
8525       } else {
8526         retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8527       }
8528     } else if (atomic_available) {
8529       retval = atomic_reduce_block;
8530     }
8531 #else
8532 #error "Unknown or unsupported OS"
8533 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8534        // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8535 
8536 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8537 
8538 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD
8539 
8540     // basic tuning
8541 
8542     if (atomic_available) {
8543       if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8544         retval = atomic_reduce_block;
8545       }
8546     } // otherwise: use critical section
8547 
8548 #elif KMP_OS_DARWIN
8549 
8550     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8551     if (atomic_available && (num_vars <= 3)) {
8552       retval = atomic_reduce_block;
8553     } else if (tree_available) {
8554       if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8555           (reduce_size < (2000 * sizeof(kmp_real64)))) {
8556         retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8557       }
8558     } // otherwise: use critical section
8559 
8560 #else
8561 #error "Unknown or unsupported OS"
8562 #endif
8563 
8564 #else
8565 #error "Unknown or unsupported architecture"
8566 #endif
8567   }
8568 
8569   // KMP_FORCE_REDUCTION
8570 
8571   // If the team is serialized (team_size == 1), ignore the forced reduction
8572   // method and stay with the unsynchronized method (empty_reduce_block)
8573   if (__kmp_force_reduction_method != reduction_method_not_defined &&
8574       team_size != 1) {
8575 
8576     PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8577 
8578     int atomic_available, tree_available;
8579 
8580     switch ((forced_retval = __kmp_force_reduction_method)) {
8581     case critical_reduce_block:
8582       KMP_ASSERT(lck); // lck should be != 0
8583       break;
8584 
8585     case atomic_reduce_block:
8586       atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8587       if (!atomic_available) {
8588         KMP_WARNING(RedMethodNotSupported, "atomic");
8589         forced_retval = critical_reduce_block;
8590       }
8591       break;
8592 
8593     case tree_reduce_block:
8594       tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8595       if (!tree_available) {
8596         KMP_WARNING(RedMethodNotSupported, "tree");
8597         forced_retval = critical_reduce_block;
8598       } else {
8599 #if KMP_FAST_REDUCTION_BARRIER
8600         forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8601 #endif
8602       }
8603       break;
8604 
8605     default:
8606       KMP_ASSERT(0); // "unsupported method specified"
8607     }
8608 
8609     retval = forced_retval;
8610   }
8611 
8612   KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8613 
8614 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
8615 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8616 
8617   return (retval);
8618 }
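/* Summary of the selection above on the 64-bit Linux-like branch (a worked
   example of the logic, not new policy; the 32-bit Linux/Windows branch
   instead uses the atomic method only when num_vars <= 2):
     1) team_size == 1                             -> empty_reduce_block
     2) tree variant generated, team_size > cutoff (4, or 8 on MIC)
                                       -> TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER
     3) atomic variant generated, team_size <= cutoff
                                       -> atomic_reduce_block
     4) no tree variant, atomic variant generated  -> atomic_reduce_block
     5) otherwise                                  -> critical_reduce_block
   __kmp_force_reduction_method (normally set via the KMP_FORCE_REDUCTION
   environment variable) overrides this choice for teams of more than one
   thread, falling back to the critical method when the forced variant was not
   generated by the compiler. */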
8619 // This function is used to test the set/get/determine reduce method machinery
8620 kmp_int32 __kmp_get_reduce_method(void) {
8621   return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8622 }
8623 
8624 // Soft pause sets up threads to ignore blocktime and just go to sleep.
8625 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
8626 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8627 
8628 // Hard pause shuts down the runtime completely.  Resume happens naturally when
8629 // OpenMP is used subsequently.
8630 void __kmp_hard_pause() {
8631   __kmp_pause_status = kmp_hard_paused;
8632   __kmp_internal_end_thread(-1);
8633 }
8634 
8635 // Soft resume sets __kmp_pause_status, and wakes up all threads.
8636 void __kmp_resume_if_soft_paused() {
8637   if (__kmp_pause_status == kmp_soft_paused) {
8638     __kmp_pause_status = kmp_not_paused;
8639 
8640     for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8641       kmp_info_t *thread = __kmp_threads[gtid];
8642       if (thread) { // Wake it if sleeping
8643         kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
8644                          thread);
8645         if (fl.is_sleeping())
8646           fl.resume(gtid);
8647         else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8648           __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8649         } else { // thread holds the lock and may sleep soon
8650           do { // until either the thread sleeps, or we can get the lock
8651             if (fl.is_sleeping()) {
8652               fl.resume(gtid);
8653               break;
8654             } else if (__kmp_try_suspend_mx(thread)) {
8655               __kmp_unlock_suspend_mx(thread);
8656               break;
8657             }
8658           } while (1);
8659         }
8660       }
8661     }
8662   }
8663 }
8664 
8665 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
8666 // TODO: add warning messages
8667 int __kmp_pause_resource(kmp_pause_status_t level) {
8668   if (level == kmp_not_paused) { // requesting resume
8669     if (__kmp_pause_status == kmp_not_paused) {
8670       // error message about runtime not being paused, so can't resume
8671       return 1;
8672     } else {
8673       KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
8674                        __kmp_pause_status == kmp_hard_paused);
8675       __kmp_pause_status = kmp_not_paused;
8676       return 0;
8677     }
8678   } else if (level == kmp_soft_paused) { // requesting soft pause
8679     if (__kmp_pause_status != kmp_not_paused) {
8680       // error message about already being paused
8681       return 1;
8682     } else {
8683       __kmp_soft_pause();
8684       return 0;
8685     }
8686   } else if (level == kmp_hard_paused) { // requesting hard pause
8687     if (__kmp_pause_status != kmp_not_paused) {
8688       // error message about already being paused
8689       return 1;
8690     } else {
8691       __kmp_hard_pause();
8692       return 0;
8693     }
8694   } else {
8695     // error message about invalid level
8696     return 1;
8697   }
8698 }
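/* Illustrative example (user code, not part of this file; assumes an <omp.h>
   that declares the OpenMP 5.0 pause API): a nonzero result means the request
   was rejected, matching the return-1 paths above.

     #include <omp.h>
     #include <stdio.h>

     int main(void) {
     #pragma omp parallel
       {
       } // warm up the thread pool
       // Ask the runtime to release resources; idle threads go to sleep.
       if (omp_pause_resource_all(omp_pause_soft) != 0)
         printf("pause request was rejected\n");
     #pragma omp parallel
       {
       } // touching OpenMP again resumes a soft-paused runtime
       return 0;
     }
*/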
8699 
8700 void __kmp_omp_display_env(int verbose) {
8701   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8702   if (__kmp_init_serial == 0)
8703     __kmp_do_serial_initialize();
8704   __kmp_display_env_impl(!verbose, verbose);
8705   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8706 }
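/* Illustrative usage (assumption: this is the backend reached from the
   omp_display_env() entry point added in OpenMP 5.1, and the example assumes
   an <omp.h> that declares it; OMP_DISPLAY_ENV=TRUE or =VERBOSE produces the
   same report during startup):

     #include <omp.h>

     int main(void) {
       omp_display_env(0); // brief listing; pass nonzero for the verbose form
       return 0;
     }
*/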
8707 
8708 // Globals and functions for hidden helper task
8709 kmp_info_t **__kmp_hidden_helper_threads;
8710 kmp_info_t *__kmp_hidden_helper_main_thread;
8711 std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
8712 #if KMP_OS_LINUX
8713 kmp_int32 __kmp_hidden_helper_threads_num = 8;
8714 kmp_int32 __kmp_enable_hidden_helper = TRUE;
8715 #else
8716 kmp_int32 __kmp_hidden_helper_threads_num = 0;
8717 kmp_int32 __kmp_enable_hidden_helper = FALSE;
8718 #endif
8719 
8720 namespace {
8721 std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
8722 
8723 void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
8724   // This is an explicit synchronization of all hidden helper threads. It
8725   // covers the case where a regular thread pushes a hidden helper task to a
8726   // hidden helper thread that has not yet been awakened since being released
8727   // by the main thread after the team was created.
8728   KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
8729   while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
8730          __kmp_hidden_helper_threads_num)
8731     ;
8732 
8733   // The master thread of the hidden helper team waits for the signal
8734   if (__kmpc_master(nullptr, *gtid)) {
8735     // First, unset the initial state and release the initial thread
8736     TCW_4(__kmp_init_hidden_helper_threads, FALSE);
8737     __kmp_hidden_helper_initz_release();
8738     __kmp_hidden_helper_main_thread_wait();
8739     // Now wake up all worker threads
8740     for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
8741       __kmp_hidden_helper_worker_thread_signal();
8742     }
8743   }
8744 }
8745 } // namespace
8746 
8747 void __kmp_hidden_helper_threads_initz_routine() {
8748   // Create a new root for hidden helper team/threads
8749   const int gtid = __kmp_register_root(TRUE);
8750   __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
8751   __kmp_hidden_helper_threads = &__kmp_threads[gtid];
8752   __kmp_hidden_helper_main_thread->th.th_set_nproc =
8753       __kmp_hidden_helper_threads_num;
8754 
8755   KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
8756 
8757   __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
8758 
8759   // Set the initialization flag to FALSE
8760   TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
8761 
8762   __kmp_hidden_helper_threads_deinitz_release();
8763 }
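/* Context (hedged summary, not stated in this file): hidden helper threads
   exist to execute "hidden helper" tasks, so that asynchronous work -- most
   notably tasks generated for "target nowait" regions -- can make progress
   without occupying the user's OpenMP threads.  On Linux the team defaults to
   8 threads (see __kmp_hidden_helper_threads_num above); elsewhere the feature
   is disabled by default.  A minimal sketch of code that may exercise it:

     #include <omp.h>

     int main(void) {
     #pragma omp target nowait // the deferred target task may be run by a
       {                       // hidden helper thread while the host continues
       }
     #pragma omp taskwait      // wait for the deferred target task
       return 0;
     }
*/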
8764 
8765 /* Nesting Mode:
8766    Set via KMP_NESTING_MODE, which takes an integer.
8767    Note: we skip duplicate topology levels, and skip levels with only
8768       one entity.
8769    KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
8770    KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
8771       in the topology, and initializes the number of threads at each of those
8772       levels to the number of entities at each level, respectively, below the
8773       entity at the parent level.
8774    KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
8775       but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
8776       the user to turn nesting on explicitly. This is an even more experimental
8777       option on top of an already experimental feature, and may change or go
8778       away in the future. An illustrative example follows this comment.
8779 */
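/* Illustrative example (not part of the runtime; actual thread counts depend
   on the detected topology and on other limits such as OMP_NUM_THREADS): on a
   machine with 2 sockets x 8 cores x 2 hardware threads, KMP_NESTING_MODE=1
   enables nesting and seeds the per-level thread counts from the topology:

     #include <omp.h>
     #include <stdio.h>

     int main(void) {
     #pragma omp parallel          // expect 2 threads: one per socket
       {
     #pragma omp parallel          // expect 8 threads: cores in this socket
         {
     #pragma omp parallel          // expect 2 threads: HW threads in this core
           {
             printf("level %d: %d threads\n", omp_get_level(),
                    omp_get_num_threads());
           }
         }
       }
       return 0;
     }
*/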
8780 
8781 // Allocate space to store nesting levels
8782 void __kmp_init_nesting_mode() {
8783   int levels = KMP_HW_LAST;
8784   __kmp_nesting_mode_nlevels = levels;
8785   __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
8786   for (int i = 0; i < levels; ++i)
8787     __kmp_nesting_nth_level[i] = 0;
8788   if (__kmp_nested_nth.size < levels) {
8789     __kmp_nested_nth.nth =
8790         (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
8791     __kmp_nested_nth.size = levels;
8792   }
8793 }
8794 
8795 // Set # threads for top levels of nesting; must be called after topology set
8796 void __kmp_set_nesting_mode_threads() {
8797   kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];
8798 
8799   if (__kmp_nesting_mode == 1)
8800     __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
8801   else if (__kmp_nesting_mode > 1)
8802     __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
8803 
8804   if (__kmp_topology) { // use topology info
8805     int loc, hw_level;
8806     for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
8807                                 loc < __kmp_nesting_mode_nlevels;
8808          loc++, hw_level++) {
8809       __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
8810       if (__kmp_nesting_nth_level[loc] == 1)
8811         loc--;
8812     }
8813     // Make sure all cores are used
8814     if (__kmp_nesting_mode > 1 && loc > 1) {
8815       int core_level = __kmp_topology->get_level(KMP_HW_CORE);
8816       int num_cores = __kmp_topology->get_count(core_level);
8817       int upper_levels = 1;
8818       for (int level = 0; level < loc - 1; ++level)
8819         upper_levels *= __kmp_nesting_nth_level[level];
8820       if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
8821         __kmp_nesting_nth_level[loc - 1] =
8822             num_cores / __kmp_nesting_nth_level[loc - 2];
8823     }
8824     __kmp_nesting_mode_nlevels = loc;
8825     __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
8826   } else { // no topology info available; provide a reasonable estimate
8827     if (__kmp_avail_proc >= 4) {
8828       __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
8829       __kmp_nesting_nth_level[1] = 2;
8830       __kmp_nesting_mode_nlevels = 2;
8831     } else {
8832       __kmp_nesting_nth_level[0] = __kmp_avail_proc;
8833       __kmp_nesting_mode_nlevels = 1;
8834     }
8835     __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
8836   }
8837   for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
8838     __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
8839   }
8840   set__nproc(thread, __kmp_nesting_nth_level[0]);
8841   if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
8842     __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
8843   if (get__max_active_levels(thread) > 1) {
8844     // if max levels was set, set nesting mode levels to same
8845     __kmp_nesting_mode_nlevels = get__max_active_levels(thread);
8846   }
8847   if (__kmp_nesting_mode == 1) // turn on nesting for this case only
8848     set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
8849 }
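/* Worked example of the computation above (illustrative; real values depend on
   the machine): with a topology of 2 sockets x 8 cores x 2 hardware threads,
   __kmp_topology->get_ratio() yields 2, 8 and 2, so __kmp_nesting_nth_level
   becomes {2, 8, 2} and __kmp_nesting_mode_nlevels == 3; any level whose ratio
   is 1 (e.g. the socket level of a single-socket machine) is skipped.  Without
   topology information and with __kmp_avail_proc == 16, the fallback produces
   {8, 2}.  The calling thread's nthreads-var is then seeded with the level-0
   value (2 and 8 in these two cases, respectively). */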
8850