1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
29 #endif
30 
31 #if OMPT_SUPPORT
32 #include "ompt-specific.h"
33 #endif
34 #if OMPD_SUPPORT
35 #include "ompd-specific.h"
36 #endif
37 
38 #if OMP_PROFILING_SUPPORT
39 #include "llvm/Support/TimeProfiler.h"
40 static char *ProfileTraceFile = nullptr;
41 #endif
42 
43 /* these are temporary issues to be dealt with */
44 #define KMP_USE_PRCTL 0
45 
46 #if KMP_OS_WINDOWS
47 #include <process.h>
48 #endif
49 
50 #include "tsan_annotations.h"
51 
52 #if KMP_OS_WINDOWS
53 // Windows does not need these include files as it doesn't use shared memory
54 #else
55 #include <sys/mman.h>
56 #include <sys/stat.h>
57 #include <fcntl.h>
58 #define SHM_SIZE 1024
59 #endif
60 
61 #if defined(KMP_GOMP_COMPAT)
62 char const __kmp_version_alt_comp[] =
63     KMP_VERSION_PREFIX "alternative compiler support: yes";
64 #endif /* defined(KMP_GOMP_COMPAT) */
65 
66 char const __kmp_version_omp_api[] =
67     KMP_VERSION_PREFIX "API version: 5.0 (201611)";
68 
69 #ifdef KMP_DEBUG
70 char const __kmp_version_lock[] =
71     KMP_VERSION_PREFIX "lock type: run time selectable";
72 #endif /* KMP_DEBUG */
73 
74 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
75 
76 /* ------------------------------------------------------------------------ */
77 
78 #if KMP_USE_MONITOR
79 kmp_info_t __kmp_monitor;
80 #endif
81 
82 /* Forward declarations */
83 
84 void __kmp_cleanup(void);
85 
86 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
87                                   int gtid);
88 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
89                                   kmp_internal_control_t *new_icvs,
90                                   ident_t *loc);
91 #if KMP_AFFINITY_SUPPORTED
92 static void __kmp_partition_places(kmp_team_t *team,
93                                    int update_master_only = 0);
94 #endif
95 static void __kmp_do_serial_initialize(void);
96 void __kmp_fork_barrier(int gtid, int tid);
97 void __kmp_join_barrier(int gtid);
98 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
99                           kmp_internal_control_t *new_icvs, ident_t *loc);
100 
101 #ifdef USE_LOAD_BALANCE
102 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
103 #endif
104 
105 static int __kmp_expand_threads(int nNeed);
106 #if KMP_OS_WINDOWS
107 static int __kmp_unregister_root_other_thread(int gtid);
108 #endif
109 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
110 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
111 
112 /* Calculate the identifier of the current thread */
113 /* fast (and somewhat portable) way to get unique identifier of executing
114    thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
115 int __kmp_get_global_thread_id() {
116   int i;
117   kmp_info_t **other_threads;
118   size_t stack_data;
119   char *stack_addr;
120   size_t stack_size;
121   char *stack_base;
122 
123   KA_TRACE(
124       1000,
125       ("*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
126        __kmp_nth, __kmp_all_nth));
127 
128   /* JPH - to handle the case where __kmpc_end(0) is called immediately prior
129      to a parallel region, this was made to return KMP_GTID_DNE to force
130      serial_initialize by the caller. KMP_GTID_DNE must then be handled at all
131      call sites, or else __kmp_init_gtid must be guaranteed for this to work. */
132 
133   if (!TCR_4(__kmp_init_gtid))
134     return KMP_GTID_DNE;
135 
136 #ifdef KMP_TDATA_GTID
137   if (TCR_4(__kmp_gtid_mode) >= 3) {
138     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
139     return __kmp_gtid;
140   }
141 #endif
142   if (TCR_4(__kmp_gtid_mode) >= 2) {
143     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
144     return __kmp_gtid_get_specific();
145   }
146   KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
147 
148   stack_addr = (char *)&stack_data;
149   other_threads = __kmp_threads;
150 
151   /* ATT: The code below is a source of potential bugs due to unsynchronized
152      access to __kmp_threads array. For example:
153      1. Current thread loads other_threads[i] to thr and checks it, it is
154         non-NULL.
155      2. Current thread is suspended by OS.
156      3. Another thread unregisters and finishes (debug versions of free()
157         may fill memory with something like 0xEF).
158      4. Current thread is resumed.
159      5. Current thread reads junk from *thr.
160      TODO: Fix it.  --ln  */
161 
162   for (i = 0; i < __kmp_threads_capacity; i++) {
163 
164     kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
165     if (!thr)
166       continue;
167 
168     stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
169     stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
170 
171     /* stack grows down -- search through all of the active threads */
172 
173     if (stack_addr <= stack_base) {
174       size_t stack_diff = stack_base - stack_addr;
175 
176       if (stack_diff <= stack_size) {
177         /* The only way we can be closer than the allocated */
178         /* stack size is if we are running on this thread. */
179         KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
180         return i;
181       }
182     }
183   }
184 
185   /* get specific to try and determine our gtid */
186   KA_TRACE(1000,
187            ("*** __kmp_get_global_thread_id: internal alg. failed to find "
188             "thread, using TLS\n"));
189   i = __kmp_gtid_get_specific();
190 
191   /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
192 
193   /* if we haven't been assigned a gtid, then return the error code */
194   if (i < 0)
195     return i;
196 
197   /* dynamically updated stack window for uber threads to avoid get_specific
198      call */
199   if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
200     KMP_FATAL(StackOverflow, i);
201   }
202 
203   stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
204   if (stack_addr > stack_base) {
205     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
206     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
207             other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
208                 stack_base);
209   } else {
210     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
211             stack_base - stack_addr);
212   }
213 
214   /* Reprint stack bounds for ubermaster since they have been refined */
215   if (__kmp_storage_map) {
216     char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
217     char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
218     __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
219                                  other_threads[i]->th.th_info.ds.ds_stacksize,
220                                  "th_%d stack (refinement)", i);
221   }
222   return i;
223 }
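
/* A minimal sketch (illustrative only, not used by the runtime) of the stack
   containment test performed above: because stacks grow down, an address
   belongs to a thread's stack exactly when it lies no more than ds_stacksize
   bytes below ds_stackbase.

     // Hypothetical helper mirroring the check in __kmp_get_global_thread_id().
     static bool addr_on_stack(const char *addr, const char *stack_base,
                               size_t stack_size) {
       // stack_base is the highest address; the stack occupies the range
       // [stack_base - stack_size, stack_base].
       return addr <= stack_base && (size_t)(stack_base - addr) <= stack_size;
     }
*/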
224 
225 int __kmp_get_global_thread_id_reg() {
226   int gtid;
227 
228   if (!__kmp_init_serial) {
229     gtid = KMP_GTID_DNE;
230   } else
231 #ifdef KMP_TDATA_GTID
232       if (TCR_4(__kmp_gtid_mode) >= 3) {
233     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
234     gtid = __kmp_gtid;
235   } else
236 #endif
237       if (TCR_4(__kmp_gtid_mode) >= 2) {
238     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
239     gtid = __kmp_gtid_get_specific();
240   } else {
241     KA_TRACE(1000,
242              ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
243     gtid = __kmp_get_global_thread_id();
244   }
245 
246   /* we must be a new uber master sibling thread */
247   if (gtid == KMP_GTID_DNE) {
248     KA_TRACE(10,
249              ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
250               "Registering a new gtid.\n"));
251     __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
252     if (!__kmp_init_serial) {
253       __kmp_do_serial_initialize();
254       gtid = __kmp_gtid_get_specific();
255     } else {
256       gtid = __kmp_register_root(FALSE);
257     }
258     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
259     /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
260   }
261 
262   KMP_DEBUG_ASSERT(gtid >= 0);
263 
264   return gtid;
265 }
266 
267 /* caller must hold forkjoin_lock */
268 void __kmp_check_stack_overlap(kmp_info_t *th) {
269   int f;
270   char *stack_beg = NULL;
271   char *stack_end = NULL;
272   int gtid;
273 
274   KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
275   if (__kmp_storage_map) {
276     stack_end = (char *)th->th.th_info.ds.ds_stackbase;
277     stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
278 
279     gtid = __kmp_gtid_from_thread(th);
280 
281     if (gtid == KMP_GTID_MONITOR) {
282       __kmp_print_storage_map_gtid(
283           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
284           "th_%s stack (%s)", "mon",
285           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
286     } else {
287       __kmp_print_storage_map_gtid(
288           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
289           "th_%d stack (%s)", gtid,
290           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
291     }
292   }
293 
294   /* No point in checking ubermaster threads since they use refinement and
295    * cannot overlap */
296   gtid = __kmp_gtid_from_thread(th);
297   if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
298     KA_TRACE(10,
299              ("__kmp_check_stack_overlap: performing extensive checking\n"));
300     if (stack_beg == NULL) {
301       stack_end = (char *)th->th.th_info.ds.ds_stackbase;
302       stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
303     }
304 
305     for (f = 0; f < __kmp_threads_capacity; f++) {
306       kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
307 
308       if (f_th && f_th != th) {
309         char *other_stack_end =
310             (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
311         char *other_stack_beg =
312             other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
313         if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
314             (stack_end > other_stack_beg && stack_end < other_stack_end)) {
315 
316           /* Print the other stack values before the abort */
317           if (__kmp_storage_map)
318             __kmp_print_storage_map_gtid(
319                 -1, other_stack_beg, other_stack_end,
320                 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
321                 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
322 
323           __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
324                       __kmp_msg_null);
325         }
326       }
327     }
328   }
329   KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
330 }
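
/* For illustration (a simplified sketch, not part of the runtime): the overlap
   test above flags a conflict when either end of this thread's stack falls
   strictly inside another thread's [beg, end) range.

     // Hypothetical predicate equivalent to the check in the loop above.
     static bool stacks_overlap(const char *beg, const char *end,
                                const char *other_beg, const char *other_end) {
       return (beg > other_beg && beg < other_end) ||
              (end > other_beg && end < other_end);
     }
*/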
331 
332 /* ------------------------------------------------------------------------ */
333 
334 void __kmp_infinite_loop(void) {
335   static int done = FALSE;
336 
337   while (!done) {
338     KMP_YIELD(TRUE);
339   }
340 }
341 
342 #define MAX_MESSAGE 512
343 
344 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
345                                   char const *format, ...) {
346   char buffer[MAX_MESSAGE];
347   va_list ap;
348 
349   va_start(ap, format);
350   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
351                p2, (unsigned long)size, format);
352   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
353   __kmp_vprintf(kmp_err, buffer, ap);
354 #if KMP_PRINT_DATA_PLACEMENT
355   int node;
356   if (gtid >= 0) {
357     if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
358       if (__kmp_storage_map_verbose) {
359         node = __kmp_get_host_node(p1);
360         if (node < 0) /* doesn't work, so don't try this next time */
361           __kmp_storage_map_verbose = FALSE;
362         else {
363           char *last;
364           int lastNode;
365           int localProc = __kmp_get_cpu_from_gtid(gtid);
366 
367           const int page_size = KMP_GET_PAGE_SIZE();
368 
369           p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
370           p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
371           if (localProc >= 0)
372             __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
373                                  localProc >> 1);
374           else
375             __kmp_printf_no_lock("  GTID %d\n", gtid);
376 #if KMP_USE_PRCTL
377           /* The more elaborate format is disabled for now because of the prctl
378            * hanging bug. */
379           do {
380             last = p1;
381             lastNode = node;
382             /* This loop collates adjacent pages with the same host node. */
383             do {
384               p1 = (char *)p1 + page_size;
385             } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
386             __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
387                                  lastNode);
388           } while (p1 <= p2);
389 #else
390           __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
391                                (char *)p1 + (page_size - 1),
392                                __kmp_get_host_node(p1));
393           if (p1 < p2) {
394             __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
395                                  (char *)p2 + (page_size - 1),
396                                  __kmp_get_host_node(p2));
397           }
398 #endif
399         }
400       }
401     } else
402       __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
403   }
404 #endif /* KMP_PRINT_DATA_PLACEMENT */
405   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
406 }
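
/* A small worked example (assumed values: page_size is a power of two, here
   4096) of the page-alignment masking used above:

     // addr & ~(page_size - 1) rounds an address down to its page boundary.
     size_t page_size = 4096;               // 0x1000
     size_t addr = 0x7ffd1234;
     size_t page = addr & ~(page_size - 1); // 0x7ffd1000
     // ((addr - 1) & ~(page_size - 1)) is used for p2 so that an exclusive
     // end pointer maps to the last page actually covered by the range.
*/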
407 
408 void __kmp_warn(char const *format, ...) {
409   char buffer[MAX_MESSAGE];
410   va_list ap;
411 
412   if (__kmp_generate_warnings == kmp_warnings_off) {
413     return;
414   }
415 
416   va_start(ap, format);
417 
418   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
419   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
420   __kmp_vprintf(kmp_err, buffer, ap);
421   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
422 
423   va_end(ap);
424 }
425 
426 void __kmp_abort_process() {
427   // Later threads may stall here, but that's ok because abort() will kill them.
428   __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
429 
430   if (__kmp_debug_buf) {
431     __kmp_dump_debug_buffer();
432   }
433 
434   if (KMP_OS_WINDOWS) {
435     // Let other threads know of abnormal termination and prevent deadlock
436     // if abort happened during library initialization or shutdown
437     __kmp_global.g.g_abort = SIGABRT;
438 
439     /* On Windows* OS, by default abort() causes a pop-up error box, which
440        stalls nightly testing. Unfortunately, we cannot reliably suppress pop-up
441        error boxes. _set_abort_behavior() works well, but this function is not
442        available in VS7 (this is not a problem for the DLL, but it is a problem
443        for the static OpenMP RTL). SetErrorMode (and so, the timelimit utility)
444        does not help, at least in some versions of the MS C RTL.
445 
446        It seems the following sequence is the only way to simulate abort() and
447        avoid the pop-up error box. */
448     raise(SIGABRT);
449     _exit(3); // Just in case, if signal ignored, exit anyway.
450   } else {
451     __kmp_unregister_library();
452     abort();
453   }
454 
455   __kmp_infinite_loop();
456   __kmp_release_bootstrap_lock(&__kmp_exit_lock);
457 
458 } // __kmp_abort_process
459 
460 void __kmp_abort_thread(void) {
461   // TODO: Eliminate g_abort global variable and this function.
462   // In case of abort just call abort(), it will kill all the threads.
463   __kmp_infinite_loop();
464 } // __kmp_abort_thread
465 
466 /* Print out the storage map for the major kmp_info_t thread data structures
467    that are allocated together. */
468 
469 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
470   __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
471                                gtid);
472 
473   __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
474                                sizeof(kmp_desc_t), "th_%d.th_info", gtid);
475 
476   __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
477                                sizeof(kmp_local_t), "th_%d.th_local", gtid);
478 
479   __kmp_print_storage_map_gtid(
480       gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
481       sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
482 
483   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
484                                &thr->th.th_bar[bs_plain_barrier + 1],
485                                sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
486                                gtid);
487 
488   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
489                                &thr->th.th_bar[bs_forkjoin_barrier + 1],
490                                sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
491                                gtid);
492 
493 #if KMP_FAST_REDUCTION_BARRIER
494   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
495                                &thr->th.th_bar[bs_reduction_barrier + 1],
496                                sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
497                                gtid);
498 #endif // KMP_FAST_REDUCTION_BARRIER
499 }
500 
501 /* Print out the storage map for the major kmp_team_t team data structures
502    that are allocated together. */
503 
504 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
505                                          int team_id, int num_thr) {
506   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
507   __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
508                                header, team_id);
509 
510   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
511                                &team->t.t_bar[bs_last_barrier],
512                                sizeof(kmp_balign_team_t) * bs_last_barrier,
513                                "%s_%d.t_bar", header, team_id);
514 
515   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
516                                &team->t.t_bar[bs_plain_barrier + 1],
517                                sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
518                                header, team_id);
519 
520   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
521                                &team->t.t_bar[bs_forkjoin_barrier + 1],
522                                sizeof(kmp_balign_team_t),
523                                "%s_%d.t_bar[forkjoin]", header, team_id);
524 
525 #if KMP_FAST_REDUCTION_BARRIER
526   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
527                                &team->t.t_bar[bs_reduction_barrier + 1],
528                                sizeof(kmp_balign_team_t),
529                                "%s_%d.t_bar[reduction]", header, team_id);
530 #endif // KMP_FAST_REDUCTION_BARRIER
531 
532   __kmp_print_storage_map_gtid(
533       -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
534       sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
535 
536   __kmp_print_storage_map_gtid(
537       -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
538       sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
539 
540   __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
541                                &team->t.t_disp_buffer[num_disp_buff],
542                                sizeof(dispatch_shared_info_t) * num_disp_buff,
543                                "%s_%d.t_disp_buffer", header, team_id);
544 }
545 
546 static void __kmp_init_allocator() {
547   __kmp_init_memkind();
548   __kmp_init_target_mem();
549 }
550 static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
551 
552 /* ------------------------------------------------------------------------ */
553 
554 #if KMP_DYNAMIC_LIB
555 #if KMP_OS_WINDOWS
556 
557 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
558   //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
559 
560   switch (fdwReason) {
561 
562   case DLL_PROCESS_ATTACH:
563     KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
564 
565     return TRUE;
566 
567   case DLL_PROCESS_DETACH:
568     KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
569 
570     // According to Windows* documentation for DllMain entry point:
571     // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
572     //   lpReserved == NULL when FreeLibrary() is called,
573     //   lpReserved != NULL when the process is terminated.
574     // When FreeLibrary() is called, worker threads remain alive. So the
575     // runtime's state is consistent and executing proper shutdown is OK.
576     // When the process is terminated, worker threads have exited or been
577     // forcefully terminated by the OS and only the shutdown thread remains.
578     // This can leave the runtime in an inconsistent state.
579     // Hence, only attempt proper cleanup when FreeLibrary() is called.
580     // Otherwise, rely on OS to reclaim resources.
581     if (lpReserved == NULL)
582       __kmp_internal_end_library(__kmp_gtid_get_specific());
583 
584     return TRUE;
585 
586   case DLL_THREAD_ATTACH:
587     KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
588 
589     /* if we want to register new siblings all the time, call
590      * __kmp_get_gtid() here */
591     return TRUE;
592 
593   case DLL_THREAD_DETACH:
594     KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
595 
596     __kmp_internal_end_thread(__kmp_gtid_get_specific());
597     return TRUE;
598   }
599 
600   return TRUE;
601 }
602 
603 #endif /* KMP_OS_WINDOWS */
604 #endif /* KMP_DYNAMIC_LIB */
605 
606 /* __kmp_parallel_deo -- Wait until it's our turn. */
607 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
608   int gtid = *gtid_ref;
609 #ifdef BUILD_PARALLEL_ORDERED
610   kmp_team_t *team = __kmp_team_from_gtid(gtid);
611 #endif /* BUILD_PARALLEL_ORDERED */
612 
613   if (__kmp_env_consistency_check) {
614     if (__kmp_threads[gtid]->th.th_root->r.r_active)
615 #if KMP_USE_DYNAMIC_LOCK
616       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
617 #else
618       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
619 #endif
620   }
621 #ifdef BUILD_PARALLEL_ORDERED
622   if (!team->t.t_serialized) {
623     KMP_MB();
624     KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
625              NULL);
626     KMP_MB();
627   }
628 #endif /* BUILD_PARALLEL_ORDERED */
629 }
630 
631 /* __kmp_parallel_dxo -- Signal the next task. */
632 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
633   int gtid = *gtid_ref;
634 #ifdef BUILD_PARALLEL_ORDERED
635   int tid = __kmp_tid_from_gtid(gtid);
636   kmp_team_t *team = __kmp_team_from_gtid(gtid);
637 #endif /* BUILD_PARALLEL_ORDERED */
638 
639   if (__kmp_env_consistency_check) {
640     if (__kmp_threads[gtid]->th.th_root->r.r_active)
641       __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
642   }
643 #ifdef BUILD_PARALLEL_ORDERED
644   if (!team->t.t_serialized) {
645     KMP_MB(); /* Flush all pending memory write invalidates.  */
646 
647     /* use the tid of the next thread in this team */
648     /* TODO replace with general release procedure */
649     team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
650 
651     KMP_MB(); /* Flush all pending memory write invalidates.  */
652   }
653 #endif /* BUILD_PARALLEL_ORDERED */
654 }
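
/* A minimal sketch (illustrative only) of the ticket scheme implemented by
   __kmp_parallel_deo/__kmp_parallel_dxo above: t_ordered.dt.t_value names the
   tid whose turn it is; each thread waits until its own tid comes up, performs
   its ordered work, then passes the turn to (tid + 1) % nproc.

     // Hypothetical stand-alone analogue using plain C++ atomics.
     #include <atomic>
     std::atomic<int> turn{0};
     void ordered_section(int tid, int nproc) {
       while (turn.load(std::memory_order_acquire) != tid) {
       } // spin until it is this thread's turn
       // ... ordered work owned by this tid ...
       turn.store((tid + 1) % nproc, std::memory_order_release);
     }
*/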
655 
656 /* ------------------------------------------------------------------------ */
657 /* The BARRIER for a SINGLE process section is always explicit   */
658 
659 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
660   int status;
661   kmp_info_t *th;
662   kmp_team_t *team;
663 
664   if (!TCR_4(__kmp_init_parallel))
665     __kmp_parallel_initialize();
666   __kmp_resume_if_soft_paused();
667 
668   th = __kmp_threads[gtid];
669   team = th->th.th_team;
670   status = 0;
671 
672   th->th.th_ident = id_ref;
673 
674   if (team->t.t_serialized) {
675     status = 1;
676   } else {
677     kmp_int32 old_this = th->th.th_local.this_construct;
678 
679     ++th->th.th_local.this_construct;
680     /* try to set team count to thread count--success means thread got the
681        single block */
682     /* TODO: Should this be acquire or release? */
683     if (team->t.t_construct == old_this) {
684       status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
685                                               th->th.th_local.this_construct);
686     }
687 #if USE_ITT_BUILD
688     if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
689         KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
690         team->t.t_active_level == 1) {
691       // Only report metadata by primary thread of active team at level 1
692       __kmp_itt_metadata_single(id_ref);
693     }
694 #endif /* USE_ITT_BUILD */
695   }
696 
697   if (__kmp_env_consistency_check) {
698     if (status && push_ws) {
699       __kmp_push_workshare(gtid, ct_psingle, id_ref);
700     } else {
701       __kmp_check_workshare(gtid, ct_psingle, id_ref);
702     }
703   }
704 #if USE_ITT_BUILD
705   if (status) {
706     __kmp_itt_single_start(gtid);
707   }
708 #endif /* USE_ITT_BUILD */
709   return status;
710 }
711 
712 void __kmp_exit_single(int gtid) {
713 #if USE_ITT_BUILD
714   __kmp_itt_single_end(gtid);
715 #endif /* USE_ITT_BUILD */
716   if (__kmp_env_consistency_check)
717     __kmp_pop_workshare(gtid, ct_psingle, NULL);
718 }
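
/* Usage sketch (hedged: exact compiler lowering varies): a user-level
   "#pragma omp single" is typically lowered so that every thread of the team
   reaches __kmp_enter_single(); exactly one thread gets a nonzero status back
   (it won the atomic compare-and-store on t_construct), executes the block,
   and then calls __kmp_exit_single().

     // Hypothetical user code driving this path.
     #pragma omp parallel
     {
       #pragma omp single
       {
         // ... work done by exactly one thread of the team ...
       }
     } // implicit barrier at the end of single (unless nowait)
*/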
719 
720 /* Determine whether we can go parallel or must use a serialized parallel region,
721  * and how many threads we can use.
722  * set_nthreads is the number of threads requested for the team.
723  * Returns 0 if we should serialize or only use one thread,
724  * otherwise the number of threads to use.
725  * The forkjoin lock is held by the caller. */
726 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
727                                  int master_tid, int set_nthreads,
728                                  int enter_teams) {
729   int capacity;
730   int new_nthreads;
731   KMP_DEBUG_ASSERT(__kmp_init_serial);
732   KMP_DEBUG_ASSERT(root && parent_team);
733   kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
734 
735   // If dyn-var is set, dynamically adjust the number of desired threads,
736   // according to the method specified by dynamic_mode.
737   new_nthreads = set_nthreads;
738   if (!get__dynamic_2(parent_team, master_tid)) {
739     ;
740   }
741 #ifdef USE_LOAD_BALANCE
742   else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
743     new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
744     if (new_nthreads == 1) {
745       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
746                     "reservation to 1 thread\n",
747                     master_tid));
748       return 1;
749     }
750     if (new_nthreads < set_nthreads) {
751       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
752                     "reservation to %d threads\n",
753                     master_tid, new_nthreads));
754     }
755   }
756 #endif /* USE_LOAD_BALANCE */
757   else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
758     new_nthreads = __kmp_avail_proc - __kmp_nth +
759                    (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
760     if (new_nthreads <= 1) {
761       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
762                     "reservation to 1 thread\n",
763                     master_tid));
764       return 1;
765     }
766     if (new_nthreads < set_nthreads) {
767       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
768                     "reservation to %d threads\n",
769                     master_tid, new_nthreads));
770     } else {
771       new_nthreads = set_nthreads;
772     }
773   } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
774     if (set_nthreads > 2) {
775       new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
776       new_nthreads = (new_nthreads % set_nthreads) + 1;
777       if (new_nthreads == 1) {
778         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
779                       "reservation to 1 thread\n",
780                       master_tid));
781         return 1;
782       }
783       if (new_nthreads < set_nthreads) {
784         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
785                       "reservation to %d threads\n",
786                       master_tid, new_nthreads));
787       }
788     }
789   } else {
790     KMP_ASSERT(0);
791   }
792 
793   // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
794   if (__kmp_nth + new_nthreads -
795           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
796       __kmp_max_nth) {
797     int tl_nthreads = __kmp_max_nth - __kmp_nth +
798                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
799     if (tl_nthreads <= 0) {
800       tl_nthreads = 1;
801     }
802 
803     // If dyn-var is false, emit a 1-time warning.
804     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
805       __kmp_reserve_warn = 1;
806       __kmp_msg(kmp_ms_warning,
807                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
808                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
809     }
810     if (tl_nthreads == 1) {
811       KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
812                     "reduced reservation to 1 thread\n",
813                     master_tid));
814       return 1;
815     }
816     KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
817                   "reservation to %d threads\n",
818                   master_tid, tl_nthreads));
819     new_nthreads = tl_nthreads;
820   }
821 
822   // Respect OMP_THREAD_LIMIT
823   int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
824   int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
825   if (cg_nthreads + new_nthreads -
826           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
827       max_cg_threads) {
828     int tl_nthreads = max_cg_threads - cg_nthreads +
829                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
830     if (tl_nthreads <= 0) {
831       tl_nthreads = 1;
832     }
833 
834     // If dyn-var is false, emit a 1-time warning.
835     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
836       __kmp_reserve_warn = 1;
837       __kmp_msg(kmp_ms_warning,
838                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
839                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
840     }
841     if (tl_nthreads == 1) {
842       KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
843                     "reduced reservation to 1 thread\n",
844                     master_tid));
845       return 1;
846     }
847     KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
848                   "reservation to %d threads\n",
849                   master_tid, tl_nthreads));
850     new_nthreads = tl_nthreads;
851   }
852 
853   // Check if the threads array is large enough, or needs expanding.
854   // See comment in __kmp_register_root() about the adjustment if
855   // __kmp_threads[0] == NULL.
856   capacity = __kmp_threads_capacity;
857   if (TCR_PTR(__kmp_threads[0]) == NULL) {
858     --capacity;
859   }
860   // If it is not for initializing the hidden helper team, we need to take
861   // __kmp_hidden_helper_threads_num out of the capacity because it is included
862   // in __kmp_threads_capacity.
863   if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
864     capacity -= __kmp_hidden_helper_threads_num;
865   }
866   if (__kmp_nth + new_nthreads -
867           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
868       capacity) {
869     // Expand the threads array.
870     int slotsRequired = __kmp_nth + new_nthreads -
871                         (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
872                         capacity;
873     int slotsAdded = __kmp_expand_threads(slotsRequired);
874     if (slotsAdded < slotsRequired) {
875       // The threads array was not expanded enough.
876       new_nthreads -= (slotsRequired - slotsAdded);
877       KMP_ASSERT(new_nthreads >= 1);
878 
879       // If dyn-var is false, emit a 1-time warning.
880       if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
881         __kmp_reserve_warn = 1;
882         if (__kmp_tp_cached) {
883           __kmp_msg(kmp_ms_warning,
884                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
885                     KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
886                     KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
887         } else {
888           __kmp_msg(kmp_ms_warning,
889                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
890                     KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
891         }
892       }
893     }
894   }
895 
896 #ifdef KMP_DEBUG
897   if (new_nthreads == 1) {
898     KC_TRACE(10,
899              ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
900               "dead roots and rechecking; requested %d threads\n",
901               __kmp_get_gtid(), set_nthreads));
902   } else {
903     KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
904                   " %d threads\n",
905                   __kmp_get_gtid(), new_nthreads, set_nthreads));
906   }
907 #endif // KMP_DEBUG
908   return new_nthreads;
909 }
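
/* A worked example (hypothetical numbers) of the KMP_DEVICE_THREAD_LIMIT clamp
   above: suppose __kmp_max_nth = 8, __kmp_nth = 5, the root is not active and
   its hot team currently has 4 threads, and set_nthreads = 8.  The request
   would raise the total to 5 + 8 - 4 = 9 > 8, so

     tl_nthreads = __kmp_max_nth - __kmp_nth + hot_team_nproc
                 = 8 - 5 + 4 = 7

   and the reservation is reduced to 7 threads (with a one-time warning when
   dyn-var is false).  The OMP_THREAD_LIMIT block applies the same arithmetic
   against the contention group's cg_thread_limit. */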
910 
911 /* Allocate threads from the thread pool and assign them to the new team. We are
912    assured that there are enough threads available, because we checked on that
913    earlier while holding the forkjoin lock. */
914 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
915                                     kmp_info_t *master_th, int master_gtid) {
916   int i;
917   int use_hot_team;
918 
919   KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
920   KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
921   KMP_MB();
922 
923   /* first, let's setup the primary thread */
924   master_th->th.th_info.ds.ds_tid = 0;
925   master_th->th.th_team = team;
926   master_th->th.th_team_nproc = team->t.t_nproc;
927   master_th->th.th_team_master = master_th;
928   master_th->th.th_team_serialized = FALSE;
929   master_th->th.th_dispatch = &team->t.t_dispatch[0];
930 
931 /* make sure we are not the optimized hot team */
932 #if KMP_NESTED_HOT_TEAMS
933   use_hot_team = 0;
934   kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
935   if (hot_teams) { // hot teams array is not allocated if
936     // KMP_HOT_TEAMS_MAX_LEVEL=0
937     int level = team->t.t_active_level - 1; // index in array of hot teams
938     if (master_th->th.th_teams_microtask) { // are we inside the teams?
939       if (master_th->th.th_teams_size.nteams > 1) {
940         ++level; // level was not increased in teams construct for
941         // team_of_masters
942       }
943       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
944           master_th->th.th_teams_level == team->t.t_level) {
945         ++level; // level was not increased in teams construct for
946         // team_of_workers before the parallel
947       } // team->t.t_level will be increased inside parallel
948     }
949     if (level < __kmp_hot_teams_max_level) {
950       if (hot_teams[level].hot_team) {
951         // hot team has already been allocated for given level
952         KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
953         use_hot_team = 1; // the team is ready to use
954       } else {
955         use_hot_team = 0; // AC: threads are not allocated yet
956         hot_teams[level].hot_team = team; // remember new hot team
957         hot_teams[level].hot_team_nth = team->t.t_nproc;
958       }
959     } else {
960       use_hot_team = 0;
961     }
962   }
963 #else
964   use_hot_team = team == root->r.r_hot_team;
965 #endif
966   if (!use_hot_team) {
967 
968     /* install the primary thread */
969     team->t.t_threads[0] = master_th;
970     __kmp_initialize_info(master_th, team, 0, master_gtid);
971 
972     /* now, install the worker threads */
973     for (i = 1; i < team->t.t_nproc; i++) {
974 
975       /* fork or reallocate a new thread and install it in team */
976       kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
977       team->t.t_threads[i] = thr;
978       KMP_DEBUG_ASSERT(thr);
979       KMP_DEBUG_ASSERT(thr->th.th_team == team);
980       /* align team and thread arrived states */
981       KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
982                     "T#%d(%d:%d) join =%llu, plain=%llu\n",
983                     __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
984                     __kmp_gtid_from_tid(i, team), team->t.t_id, i,
985                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
986                     team->t.t_bar[bs_plain_barrier].b_arrived));
987       thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
988       thr->th.th_teams_level = master_th->th.th_teams_level;
989       thr->th.th_teams_size = master_th->th.th_teams_size;
990       { // Initialize threads' barrier data.
991         int b;
992         kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
993         for (b = 0; b < bs_last_barrier; ++b) {
994           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
995           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
996 #if USE_DEBUGGER
997           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
998 #endif
999         }
1000       }
1001     }
1002 
1003 #if KMP_AFFINITY_SUPPORTED
1004     __kmp_partition_places(team);
1005 #endif
1006   }
1007 
1008   if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1009     for (i = 0; i < team->t.t_nproc; i++) {
1010       kmp_info_t *thr = team->t.t_threads[i];
1011       if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1012           thr->th.th_prev_level != team->t.t_level) {
1013         team->t.t_display_affinity = 1;
1014         break;
1015       }
1016     }
1017   }
1018 
1019   KMP_MB();
1020 }
1021 
1022 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1023 // Propagate any changes to the floating point control registers out to the
1024 // team. We try to avoid unnecessary writes to the relevant cache line in the
1025 // team structure, so we don't make changes unless they are needed.
1026 inline static void propagateFPControl(kmp_team_t *team) {
1027   if (__kmp_inherit_fp_control) {
1028     kmp_int16 x87_fpu_control_word;
1029     kmp_uint32 mxcsr;
1030 
1031     // Get primary thread's values of FPU control flags (both X87 and vector)
1032     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1033     __kmp_store_mxcsr(&mxcsr);
1034     mxcsr &= KMP_X86_MXCSR_MASK;
1035 
1036     // There is no point looking at t_fp_control_saved here.
1037     // If it is TRUE, we still have to update the values if they are different
1038     // from those we now have. If it is FALSE we didn't save anything yet, but
1039     // our objective is the same. We have to ensure that the values in the team
1040     // are the same as those we have.
1041     // So, this code achieves what we need whether or not t_fp_control_saved is
1042     // true. By checking whether the value needs updating we avoid unnecessary
1043     // writes that would put the cache-line into a written state, causing all
1044     // threads in the team to have to read it again.
1045     KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1046     KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1047     // Although we don't use this value, other code in the runtime wants to know
1048     // whether it should restore them. So we must ensure it is correct.
1049     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1050   } else {
1051     // Similarly here. Don't write to this cache-line in the team structure
1052     // unless we have to.
1053     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1054   }
1055 }
1056 
1057 // Do the opposite, setting the hardware registers to the updated values from
1058 // the team.
1059 inline static void updateHWFPControl(kmp_team_t *team) {
1060   if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1061     // Only reset the fp control regs if the current hardware values differ
1062     // from those saved in the team for the parallel region we are exiting.
1063     kmp_int16 x87_fpu_control_word;
1064     kmp_uint32 mxcsr;
1065     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1066     __kmp_store_mxcsr(&mxcsr);
1067     mxcsr &= KMP_X86_MXCSR_MASK;
1068 
1069     if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1070       __kmp_clear_x87_fpu_status_word();
1071       __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1072     }
1073 
1074     if (team->t.t_mxcsr != mxcsr) {
1075       __kmp_load_mxcsr(&team->t.t_mxcsr);
1076     }
1077   }
1078 }
1079 #else
1080 #define propagateFPControl(x) ((void)0)
1081 #define updateHWFPControl(x) ((void)0)
1082 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
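
/* For illustration (a minimal sketch of the write-avoidance idiom used by
   propagateFPControl() above): KMP_CHECK_UPDATE-style updates only dirty the
   team's cache line when the stored value actually changes, so worker threads
   that merely read it are not forced to re-fetch the line.

     // Hypothetical equivalent of the conditional update.
     template <typename T> static inline void check_update(T &dst, const T &src) {
       if (dst != src) // compare first; write only when the value really changes
         dst = src;
     }
*/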
1083 
1084 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1085                                      int realloc); // forward declaration
1086 
1087 /* Run a parallel region that has been serialized, so it runs in a team
1088    consisting only of the single primary thread. */
1089 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1090   kmp_info_t *this_thr;
1091   kmp_team_t *serial_team;
1092 
1093   KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1094 
1095   /* Skip all this code for autopar serialized loops since it results in
1096      unacceptable overhead */
1097   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1098     return;
1099 
1100   if (!TCR_4(__kmp_init_parallel))
1101     __kmp_parallel_initialize();
1102   __kmp_resume_if_soft_paused();
1103 
1104   this_thr = __kmp_threads[global_tid];
1105   serial_team = this_thr->th.th_serial_team;
1106 
1107   /* utilize the serialized team held by this thread */
1108   KMP_DEBUG_ASSERT(serial_team);
1109   KMP_MB();
1110 
1111   if (__kmp_tasking_mode != tskm_immediate_exec) {
1112     KMP_DEBUG_ASSERT(
1113         this_thr->th.th_task_team ==
1114         this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1115     KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1116                      NULL);
1117     KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1118                   "team %p, new task_team = NULL\n",
1119                   global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1120     this_thr->th.th_task_team = NULL;
1121   }
1122 
1123   kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1124   if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1125     proc_bind = proc_bind_false;
1126   } else if (proc_bind == proc_bind_default) {
1127     // No proc_bind clause was specified, so use the current value
1128     // of proc-bind-var for this parallel region.
1129     proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1130   }
1131   // Reset for next parallel region
1132   this_thr->th.th_set_proc_bind = proc_bind_default;
1133 
1134 #if OMPT_SUPPORT
1135   ompt_data_t ompt_parallel_data = ompt_data_none;
1136   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1137   if (ompt_enabled.enabled &&
1138       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1139 
1140     ompt_task_info_t *parent_task_info;
1141     parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1142 
1143     parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1144     if (ompt_enabled.ompt_callback_parallel_begin) {
1145       int team_size = 1;
1146 
1147       ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1148           &(parent_task_info->task_data), &(parent_task_info->frame),
1149           &ompt_parallel_data, team_size,
1150           ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1151     }
1152   }
1153 #endif // OMPT_SUPPORT
1154 
1155   if (this_thr->th.th_team != serial_team) {
1156     // Nested level will be an index in the nested nthreads array
1157     int level = this_thr->th.th_team->t.t_level;
1158 
1159     if (serial_team->t.t_serialized) {
1160       /* this serial team was already used
1161          TODO: increase performance by making these locks more specific */
1162       kmp_team_t *new_team;
1163 
1164       __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1165 
1166       new_team =
1167           __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1168 #if OMPT_SUPPORT
1169                               ompt_parallel_data,
1170 #endif
1171                               proc_bind, &this_thr->th.th_current_task->td_icvs,
1172                               0 USE_NESTED_HOT_ARG(NULL));
1173       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1174       KMP_ASSERT(new_team);
1175 
1176       /* setup new serialized team and install it */
1177       new_team->t.t_threads[0] = this_thr;
1178       new_team->t.t_parent = this_thr->th.th_team;
1179       serial_team = new_team;
1180       this_thr->th.th_serial_team = serial_team;
1181 
1182       KF_TRACE(
1183           10,
1184           ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1185            global_tid, serial_team));
1186 
1187       /* TODO the above breaks the requirement that if we run out of resources,
1188          then we can still guarantee that serialized teams are ok, since we may
1189          need to allocate a new one */
1190     } else {
1191       KF_TRACE(
1192           10,
1193           ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1194            global_tid, serial_team));
1195     }
1196 
1197     /* we have to initialize this serial team */
1198     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1199     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1200     KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1201     serial_team->t.t_ident = loc;
1202     serial_team->t.t_serialized = 1;
1203     serial_team->t.t_nproc = 1;
1204     serial_team->t.t_parent = this_thr->th.th_team;
1205     serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1206     this_thr->th.th_team = serial_team;
1207     serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1208 
1209     KF_TRACE(10, ("__kmpc_serialized_parallel: T#d curtask=%p\n", global_tid,
1210                   this_thr->th.th_current_task));
1211     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1212     this_thr->th.th_current_task->td_flags.executing = 0;
1213 
1214     __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1215 
1216     /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1217        implicit task for each serialized task represented by
1218        team->t.t_serialized? */
1219     copy_icvs(&this_thr->th.th_current_task->td_icvs,
1220               &this_thr->th.th_current_task->td_parent->td_icvs);
1221 
1222     // Thread value exists in the nested nthreads array for the next nested
1223     // level
1224     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1225       this_thr->th.th_current_task->td_icvs.nproc =
1226           __kmp_nested_nth.nth[level + 1];
1227     }
1228 
1229     if (__kmp_nested_proc_bind.used &&
1230         (level + 1 < __kmp_nested_proc_bind.used)) {
1231       this_thr->th.th_current_task->td_icvs.proc_bind =
1232           __kmp_nested_proc_bind.bind_types[level + 1];
1233     }
1234 
1235 #if USE_DEBUGGER
1236     serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1237 #endif
1238     this_thr->th.th_info.ds.ds_tid = 0;
1239 
1240     /* set thread cache values */
1241     this_thr->th.th_team_nproc = 1;
1242     this_thr->th.th_team_master = this_thr;
1243     this_thr->th.th_team_serialized = 1;
1244 
1245     serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1246     serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1247     serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1248 
1249     propagateFPControl(serial_team);
1250 
1251     /* check if we need to allocate dispatch buffers stack */
1252     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1253     if (!serial_team->t.t_dispatch->th_disp_buffer) {
1254       serial_team->t.t_dispatch->th_disp_buffer =
1255           (dispatch_private_info_t *)__kmp_allocate(
1256               sizeof(dispatch_private_info_t));
1257     }
1258     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1259 
1260     KMP_MB();
1261 
1262   } else {
1263     /* this serialized team is already being used,
1264      * that's fine, just add another nested level */
1265     KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1266     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1267     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1268     ++serial_team->t.t_serialized;
1269     this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1270 
1271     // Nested level will be an index in the nested nthreads array
1272     int level = this_thr->th.th_team->t.t_level;
1273     // Thread value exists in the nested nthreads array for the next nested
1274     // level
1275     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1276       this_thr->th.th_current_task->td_icvs.nproc =
1277           __kmp_nested_nth.nth[level + 1];
1278     }
1279     serial_team->t.t_level++;
1280     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1281                   "of serial team %p to %d\n",
1282                   global_tid, serial_team, serial_team->t.t_level));
1283 
1284     /* allocate/push dispatch buffers stack */
1285     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1286     {
1287       dispatch_private_info_t *disp_buffer =
1288           (dispatch_private_info_t *)__kmp_allocate(
1289               sizeof(dispatch_private_info_t));
1290       disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1291       serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1292     }
1293     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1294 
1295     KMP_MB();
1296   }
1297   KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1298 
1299   // Perform the display affinity functionality for
1300   // serialized parallel regions
1301   if (__kmp_display_affinity) {
1302     if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1303         this_thr->th.th_prev_num_threads != 1) {
1304       // NULL means use the affinity-format-var ICV
1305       __kmp_aux_display_affinity(global_tid, NULL);
1306       this_thr->th.th_prev_level = serial_team->t.t_level;
1307       this_thr->th.th_prev_num_threads = 1;
1308     }
1309   }
1310 
1311   if (__kmp_env_consistency_check)
1312     __kmp_push_parallel(global_tid, NULL);
1313 #if OMPT_SUPPORT
1314   serial_team->t.ompt_team_info.master_return_address = codeptr;
1315   if (ompt_enabled.enabled &&
1316       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1317     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1318         OMPT_GET_FRAME_ADDRESS(0);
1319 
1320     ompt_lw_taskteam_t lw_taskteam;
1321     __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1322                             &ompt_parallel_data, codeptr);
1323 
1324     __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1325     // don't use lw_taskteam after linking. Its content was swapped.
1326 
1327     /* OMPT implicit task begin */
1328     if (ompt_enabled.ompt_callback_implicit_task) {
1329       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1330           ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1331           OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
1332           ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1333       OMPT_CUR_TASK_INFO(this_thr)->thread_num =
1334           __kmp_tid_from_gtid(global_tid);
1335     }
1336 
1337     /* OMPT state */
1338     this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1339     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1340         OMPT_GET_FRAME_ADDRESS(0);
1341   }
1342 #endif
1343 }
1344 
1345 /* most of the work for a fork */
1346 /* return true if we really went parallel, false if serialized */
1347 int __kmp_fork_call(ident_t *loc, int gtid,
1348                     enum fork_context_e call_context, // Intel, GNU, ...
1349                     kmp_int32 argc, microtask_t microtask, launch_t invoker,
1350                     kmp_va_list ap) {
1351   void **argv;
1352   int i;
1353   int master_tid;
1354   int master_this_cons;
1355   kmp_team_t *team;
1356   kmp_team_t *parent_team;
1357   kmp_info_t *master_th;
1358   kmp_root_t *root;
1359   int nthreads;
1360   int master_active;
1361   int master_set_numthreads;
1362   int level;
1363   int active_level;
1364   int teams_level;
1365 #if KMP_NESTED_HOT_TEAMS
1366   kmp_hot_team_ptr_t **p_hot_teams;
1367 #endif
1368   { // KMP_TIME_BLOCK
1369     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1370     KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1371 
1372     KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1373     if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1374       /* Some systems prefer the stack for the root thread(s) to start with */
1375       /* some gap from the parent stack to prevent false sharing. */
1376       void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1377       /* These 2 lines below are so this does not get optimized out */
1378       if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1379         __kmp_stkpadding += (short)((kmp_int64)dummy);
1380     }
1381 
1382     /* initialize if needed */
1383     KMP_DEBUG_ASSERT(
1384         __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1385     if (!TCR_4(__kmp_init_parallel))
1386       __kmp_parallel_initialize();
1387     __kmp_resume_if_soft_paused();
1388 
1389     /* setup current data */
1390     master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1391     // shutdown
1392     parent_team = master_th->th.th_team;
1393     master_tid = master_th->th.th_info.ds.ds_tid;
1394     master_this_cons = master_th->th.th_local.this_construct;
1395     root = master_th->th.th_root;
1396     master_active = root->r.r_active;
1397     master_set_numthreads = master_th->th.th_set_nproc;
1398 
1399 #if OMPT_SUPPORT
1400     ompt_data_t ompt_parallel_data = ompt_data_none;
1401     ompt_data_t *parent_task_data;
1402     ompt_frame_t *ompt_frame;
1403     ompt_data_t *implicit_task_data;
1404     void *return_address = NULL;
1405 
1406     if (ompt_enabled.enabled) {
1407       __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1408                                     NULL, NULL);
1409       return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1410     }
1411 #endif
1412 
1413     // Assign affinity to root thread if it hasn't happened yet
1414     __kmp_assign_root_init_mask();
1415 
1416     // Nested level will be an index in the nested nthreads array
1417     level = parent_team->t.t_level;
    // used to launch non-serial teams even if nesting is not allowed
1419     active_level = parent_team->t.t_active_level;
    // needed to check nesting inside a teams construct
1421     teams_level = master_th->th.th_teams_level;
1422 #if KMP_NESTED_HOT_TEAMS
1423     p_hot_teams = &master_th->th.th_hot_teams;
1424     if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1425       *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1426           sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1427       (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
      // the root hot team is either the actual one or not needed
      // (when active_level > 0)
1429       (*p_hot_teams)[0].hot_team_nth = 1;
1430     }
1431 #endif
1432 
1433 #if OMPT_SUPPORT
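    // If OMPT is enabled, report the parallel-begin event before the team is
    // formed; the reported team size is the requested number of threads (from
    // the num_threads clause if present, otherwise the nproc ICV).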
1434     if (ompt_enabled.enabled) {
1435       if (ompt_enabled.ompt_callback_parallel_begin) {
1436         int team_size = master_set_numthreads
1437                             ? master_set_numthreads
1438                             : get__nproc_2(parent_team, master_tid);
1439         int flags = OMPT_INVOKER(call_context) |
1440                     ((microtask == (microtask_t)__kmp_teams_master)
1441                          ? ompt_parallel_league
1442                          : ompt_parallel_team);
1443         ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1444             parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1445             return_address);
1446       }
1447       master_th->th.ompt_thread_info.state = ompt_state_overhead;
1448     }
1449 #endif
1450 
1451     master_th->th.th_ident = loc;
1452 
1453     if (master_th->th.th_teams_microtask && ap &&
1454         microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
      // AC: This is the start of a parallel region nested inside a teams
      // construct. The team is actual (hot); all workers are ready at the
      // fork barrier. No lock is needed to initialize the team a bit before
      // releasing the workers.
1458       parent_team->t.t_ident = loc;
1459       __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1460       parent_team->t.t_argc = argc;
1461       argv = (void **)parent_team->t.t_argv;
1462       for (i = argc - 1; i >= 0; --i)
1463         *argv++ = va_arg(kmp_va_deref(ap), void *);
      // Increment our nested depth level, but do not increase serialization
1465       if (parent_team == master_th->th.th_serial_team) {
1466         // AC: we are in serialized parallel
1467         __kmpc_serialized_parallel(loc, gtid);
1468         KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1469 
1470         if (call_context == fork_context_gnu) {
1471           // AC: need to decrement t_serialized for enquiry functions to work
1472           // correctly, will restore at join time
1473           parent_team->t.t_serialized--;
1474           return TRUE;
1475         }
1476 
1477 #if OMPD_SUPPORT
1478         parent_team->t.t_pkfn = microtask;
1479 #endif
1480 
1481 #if OMPT_SUPPORT
1482         void *dummy;
1483         void **exit_frame_p;
1484 
1485         ompt_lw_taskteam_t lw_taskteam;
1486 
1487         if (ompt_enabled.enabled) {
1488           __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1489                                   &ompt_parallel_data, return_address);
1490           exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1491 
1492           __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
          // don't use lw_taskteam after linking. content was swapped
1494 
1495           /* OMPT implicit task begin */
1496           implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1497           if (ompt_enabled.ompt_callback_implicit_task) {
1498             OMPT_CUR_TASK_INFO(master_th)->thread_num =
1499                 __kmp_tid_from_gtid(gtid);
1500             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1501                 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1502                 implicit_task_data, 1,
1503                 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1504           }
1505 
1506           /* OMPT state */
1507           master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1508         } else {
1509           exit_frame_p = &dummy;
1510         }
1511 #endif
1512         // AC: need to decrement t_serialized for enquiry functions to work
1513         // correctly, will restore at join time
1514         parent_team->t.t_serialized--;
1515 
1516         {
1517           KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1518           KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1519           __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1520 #if OMPT_SUPPORT
1521                                  ,
1522                                  exit_frame_p
1523 #endif
1524           );
1525         }
1526 
1527 #if OMPT_SUPPORT
1528         if (ompt_enabled.enabled) {
1529           *exit_frame_p = NULL;
1530           OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1531           if (ompt_enabled.ompt_callback_implicit_task) {
1532             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1533                 ompt_scope_end, NULL, implicit_task_data, 1,
1534                 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1535           }
1536           ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1537           __ompt_lw_taskteam_unlink(master_th);
1538           if (ompt_enabled.ompt_callback_parallel_end) {
1539             ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1540                 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1541                 OMPT_INVOKER(call_context) | ompt_parallel_team,
1542                 return_address);
1543           }
1544           master_th->th.ompt_thread_info.state = ompt_state_overhead;
1545         }
1546 #endif
1547         return TRUE;
1548       }
1549 
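      // Non-serialized case: reuse the parent (hot) team for the parallel
      // region nested inside the teams construct; set up its microtask and
      // bump the nesting levels before releasing the workers.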
1550       parent_team->t.t_pkfn = microtask;
1551       parent_team->t.t_invoke = invoker;
1552       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1553       parent_team->t.t_active_level++;
1554       parent_team->t.t_level++;
1555       parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1556 
1557 #if OMPT_SUPPORT
1558       if (ompt_enabled.enabled) {
1559         ompt_lw_taskteam_t lw_taskteam;
1560         __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1561                                 &ompt_parallel_data, return_address);
1562         __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1563       }
1564 #endif
1565 
1566       /* Change number of threads in the team if requested */
1567       if (master_set_numthreads) { // The parallel has num_threads clause
1568         if (master_set_numthreads < master_th->th.th_teams_size.nth) {
          // AC: we can only reduce the number of threads dynamically;
          // we cannot increase it
1570           kmp_info_t **other_threads = parent_team->t.t_threads;
1571           parent_team->t.t_nproc = master_set_numthreads;
1572           for (i = 0; i < master_set_numthreads; ++i) {
1573             other_threads[i]->th.th_team_nproc = master_set_numthreads;
1574           }
1575           // Keep extra threads hot in the team for possible next parallels
1576         }
1577         master_th->th.th_set_nproc = 0;
1578       }
1579 
1580 #if USE_DEBUGGER
1581       if (__kmp_debugging) { // Let debugger override number of threads.
1582         int nth = __kmp_omp_num_threads(loc);
1583         if (nth > 0) { // 0 means debugger doesn't want to change num threads
1584           master_set_numthreads = nth;
1585         }
1586       }
1587 #endif
1588 
1589 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1590       if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1591            KMP_ITT_DEBUG) &&
1592           __kmp_forkjoin_frames_mode == 3 &&
1593           parent_team->t.t_active_level == 1 // only report frames at level 1
1594           && master_th->th.th_teams_size.nteams == 1) {
1595         kmp_uint64 tmp_time = __itt_get_timestamp();
1596         master_th->th.th_frame_time = tmp_time;
1597         parent_team->t.t_region_time = tmp_time;
1598       }
1599       if (__itt_stack_caller_create_ptr) {
1600         KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
1601         // create new stack stitching id before entering fork barrier
1602         parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1603       }
1604 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1605 
1606       KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1607                     "master_th=%p, gtid=%d\n",
1608                     root, parent_team, master_th, gtid));
1609       __kmp_internal_fork(loc, gtid, parent_team);
1610       KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1611                     "master_th=%p, gtid=%d\n",
1612                     root, parent_team, master_th, gtid));
1613 
1614       if (call_context == fork_context_gnu)
1615         return TRUE;
1616 
1617       /* Invoke microtask for PRIMARY thread */
1618       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1619                     parent_team->t.t_id, parent_team->t.t_pkfn));
1620 
1621       if (!parent_team->t.t_invoke(gtid)) {
1622         KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
1623       }
1624       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1625                     parent_team->t.t_id, parent_team->t.t_pkfn));
1626       KMP_MB(); /* Flush all pending memory write invalidates.  */
1627 
1628       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1629 
1630       return TRUE;
1631     } // Parallel closely nested in teams construct
1632 
1633 #if KMP_DEBUG
1634     if (__kmp_tasking_mode != tskm_immediate_exec) {
1635       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1636                        parent_team->t.t_task_team[master_th->th.th_task_state]);
1637     }
1638 #endif
1639 
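    // Determine how many threads this parallel region may use: respect
    // max_active_levels, then a num_threads clause (master_set_numthreads) or
    // the nproc ICV, and finally let __kmp_reserve_threads apply the global
    // thread limits.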
1640     int enter_teams = 0;
1641     if (parent_team->t.t_active_level >=
1642         master_th->th.th_current_task->td_icvs.max_active_levels) {
1643       nthreads = 1;
1644     } else {
1645       enter_teams = ((ap == NULL && active_level == 0) ||
1646                      (ap && teams_level > 0 && teams_level == level));
1647       nthreads =
1648           master_set_numthreads
1649               ? master_set_numthreads
1650               : get__nproc_2(
1651                     parent_team,
1652                     master_tid); // TODO: get nproc directly from current task
1653 
      // Check if we need to take the forkjoin lock (no need for a serialized
      // parallel region outside of a teams construct). This code was moved
      // here from __kmp_reserve_threads() to speed up nested serialized
      // parallel regions.
1657       if (nthreads > 1) {
1658         if ((get__max_active_levels(master_th) == 1 &&
1659              (root->r.r_in_parallel && !enter_teams)) ||
1660             (__kmp_library == library_serial)) {
1661           KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1662                         " threads\n",
1663                         gtid, nthreads));
1664           nthreads = 1;
1665         }
1666       }
1667       if (nthreads > 1) {
1668         /* determine how many new threads we can use */
1669         __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
        /* AC: If we execute a teams construct from a parallel region (on the
           host), then the teams should be created, but each can only have 1
           thread if nesting is disabled. If the teams construct is called
           from a serial region, then the teams and their threads should be
           created regardless of the nesting setting. */
1674         nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
1675                                          nthreads, enter_teams);
1676         if (nthreads == 1) {
          // Free the lock for single-thread execution here; for multi-thread
          // execution it will be freed later, after the team of threads has
          // been created and initialized
1680           __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1681         }
1682       }
1683     }
1684     KMP_DEBUG_ASSERT(nthreads > 0);
1685 
1686     // If we temporarily changed the set number of threads then restore it now
1687     master_th->th.th_set_nproc = 0;
1688 
1689     /* create a serialized parallel region? */
1690     if (nthreads == 1) {
1691 /* josh todo: hypothetical question: what do we do for OS X*? */
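// Reserve stack space for copies of the microtask arguments: a
// variable-length array where the toolchain is known to support it,
// otherwise KMP_ALLOCA.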
1692 #if KMP_OS_LINUX &&                                                            \
1693     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1694       void *args[argc];
1695 #else
1696       void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1697 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1698           KMP_ARCH_AARCH64) */
1699 
1700       KA_TRACE(20,
1701                ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1702 
1703       __kmpc_serialized_parallel(loc, gtid);
1704 
1705 #if OMPD_SUPPORT
1706       master_th->th.th_serial_team->t.t_pkfn = microtask;
1707 #endif
1708 
1709       if (call_context == fork_context_intel) {
1710         /* TODO this sucks, use the compiler itself to pass args! :) */
1711         master_th->th.th_serial_team->t.t_ident = loc;
1712         if (!ap) {
1713           // revert change made in __kmpc_serialized_parallel()
1714           master_th->th.th_serial_team->t.t_level--;
1715           // Get args from parent team for teams construct
1716 
1717 #if OMPT_SUPPORT
1718           void *dummy;
1719           void **exit_frame_p;
1720           ompt_task_info_t *task_info;
1721 
1722           ompt_lw_taskteam_t lw_taskteam;
1723 
1724           if (ompt_enabled.enabled) {
1725             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1726                                     &ompt_parallel_data, return_address);
1727 
1728             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
            // don't use lw_taskteam after linking. content was swapped
1730 
1731             task_info = OMPT_CUR_TASK_INFO(master_th);
1732             exit_frame_p = &(task_info->frame.exit_frame.ptr);
1733             if (ompt_enabled.ompt_callback_implicit_task) {
1734               OMPT_CUR_TASK_INFO(master_th)->thread_num =
1735                   __kmp_tid_from_gtid(gtid);
1736               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1737                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1738                   &(task_info->task_data), 1,
1739                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1740                   ompt_task_implicit);
1741             }
1742 
1743             /* OMPT state */
1744             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1745           } else {
1746             exit_frame_p = &dummy;
1747           }
1748 #endif
1749 
1750           {
1751             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1752             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1753             __kmp_invoke_microtask(microtask, gtid, 0, argc,
1754                                    parent_team->t.t_argv
1755 #if OMPT_SUPPORT
1756                                    ,
1757                                    exit_frame_p
1758 #endif
1759             );
1760           }
1761 
1762 #if OMPT_SUPPORT
1763           if (ompt_enabled.enabled) {
1764             *exit_frame_p = NULL;
1765             if (ompt_enabled.ompt_callback_implicit_task) {
1766               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1767                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1768                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1769                   ompt_task_implicit);
1770             }
1771             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1772             __ompt_lw_taskteam_unlink(master_th);
1773             if (ompt_enabled.ompt_callback_parallel_end) {
1774               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1775                   &ompt_parallel_data, parent_task_data,
1776                   OMPT_INVOKER(call_context) | ompt_parallel_team,
1777                   return_address);
1778             }
1779             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1780           }
1781 #endif
1782         } else if (microtask == (microtask_t)__kmp_teams_master) {
1783           KMP_DEBUG_ASSERT(master_th->th.th_team ==
1784                            master_th->th.th_serial_team);
1785           team = master_th->th.th_team;
1786           // team->t.t_pkfn = microtask;
1787           team->t.t_invoke = invoker;
1788           __kmp_alloc_argv_entries(argc, team, TRUE);
1789           team->t.t_argc = argc;
1790           argv = (void **)team->t.t_argv;
1791           if (ap) {
1792             for (i = argc - 1; i >= 0; --i)
1793               *argv++ = va_arg(kmp_va_deref(ap), void *);
1794           } else {
1795             for (i = 0; i < argc; ++i)
1796               // Get args from parent team for teams construct
1797               argv[i] = parent_team->t.t_argv[i];
1798           }
1799           // AC: revert change made in __kmpc_serialized_parallel()
1800           //     because initial code in teams should have level=0
1801           team->t.t_level--;
1802           // AC: call special invoker for outer "parallel" of teams construct
1803           invoker(gtid);
1804 #if OMPT_SUPPORT
1805           if (ompt_enabled.enabled) {
1806             ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1807             if (ompt_enabled.ompt_callback_implicit_task) {
1808               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1809                   ompt_scope_end, NULL, &(task_info->task_data), 0,
1810                   OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1811             }
1812             if (ompt_enabled.ompt_callback_parallel_end) {
1813               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1814                   &ompt_parallel_data, parent_task_data,
1815                   OMPT_INVOKER(call_context) | ompt_parallel_league,
1816                   return_address);
1817             }
1818             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1819           }
1820 #endif
1821         } else {
1822           argv = args;
1823           for (i = argc - 1; i >= 0; --i)
1824             *argv++ = va_arg(kmp_va_deref(ap), void *);
1825           KMP_MB();
1826 
1827 #if OMPT_SUPPORT
1828           void *dummy;
1829           void **exit_frame_p;
1830           ompt_task_info_t *task_info;
1831 
1832           ompt_lw_taskteam_t lw_taskteam;
1833 
1834           if (ompt_enabled.enabled) {
1835             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1836                                     &ompt_parallel_data, return_address);
1837             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
            // don't use lw_taskteam after linking. content was swapped
1839             task_info = OMPT_CUR_TASK_INFO(master_th);
1840             exit_frame_p = &(task_info->frame.exit_frame.ptr);
1841 
1842             /* OMPT implicit task begin */
1843             implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1844             if (ompt_enabled.ompt_callback_implicit_task) {
1845               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1846                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1847                   implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1848                   ompt_task_implicit);
1849               OMPT_CUR_TASK_INFO(master_th)->thread_num =
1850                   __kmp_tid_from_gtid(gtid);
1851             }
1852 
1853             /* OMPT state */
1854             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1855           } else {
1856             exit_frame_p = &dummy;
1857           }
1858 #endif
1859 
1860           {
1861             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1862             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1863             __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1864 #if OMPT_SUPPORT
1865                                    ,
1866                                    exit_frame_p
1867 #endif
1868             );
1869           }
1870 
1871 #if OMPT_SUPPORT
1872           if (ompt_enabled.enabled) {
1873             *exit_frame_p = NULL;
1874             if (ompt_enabled.ompt_callback_implicit_task) {
1875               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1876                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1877                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1878                   ompt_task_implicit);
1879             }
1880 
1881             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1882             __ompt_lw_taskteam_unlink(master_th);
1883             if (ompt_enabled.ompt_callback_parallel_end) {
1884               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1885                   &ompt_parallel_data, parent_task_data,
1886                   OMPT_INVOKER(call_context) | ompt_parallel_team,
1887                   return_address);
1888             }
1889             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1890           }
1891 #endif
1892         }
1893       } else if (call_context == fork_context_gnu) {
1894 #if OMPT_SUPPORT
1895         ompt_lw_taskteam_t lwt;
1896         __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1897                                 return_address);
1898 
1899         lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1900         __ompt_lw_taskteam_link(&lwt, master_th, 1);
// don't use lw_taskteam after linking. content was swapped
1902 #endif
1903 
1904         // we were called from GNU native code
1905         KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1906         return FALSE;
1907       } else {
1908         KMP_ASSERT2(call_context < fork_context_last,
1909                     "__kmp_fork_call: unknown fork_context parameter");
1910       }
1911 
1912       KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1913       KMP_MB();
1914       return FALSE;
1915     } // if (nthreads == 1)
1916 
    // GEH: only modify the executing flag in the case when not serialized;
    //      the serialized case is handled in __kmpc_serialized_parallel
1919     KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1920                   "curtask=%p, curtask_max_aclevel=%d\n",
1921                   parent_team->t.t_active_level, master_th,
1922                   master_th->th.th_current_task,
1923                   master_th->th.th_current_task->td_icvs.max_active_levels));
1924     // TODO: GEH - cannot do this assertion because root thread not set up as
1925     // executing
1926     // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1927     master_th->th.th_current_task->td_flags.executing = 0;
1928 
1929     if (!master_th->th.th_teams_microtask || level > teams_level) {
1930       /* Increment our nested depth level */
1931       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1932     }
1933 
1934     // See if we need to make a copy of the ICVs.
1935     int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1936     if ((level + 1 < __kmp_nested_nth.used) &&
1937         (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1938       nthreads_icv = __kmp_nested_nth.nth[level + 1];
1939     } else {
1940       nthreads_icv = 0; // don't update
1941     }
1942 
1943     // Figure out the proc_bind_policy for the new team.
1944     kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1945     kmp_proc_bind_t proc_bind_icv =
1946         proc_bind_default; // proc_bind_default means don't update
1947     if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1948       proc_bind = proc_bind_false;
1949     } else {
1950       if (proc_bind == proc_bind_default) {
1951         // No proc_bind clause specified; use current proc-bind-var for this
1952         // parallel region
1953         proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1954       }
      /* else: The proc_bind policy was specified explicitly on the parallel
         clause. This overrides proc-bind-var for this parallel region, but
         does not change proc-bind-var. */
      // Figure out the value of proc-bind-var for the child threads.
1959       if ((level + 1 < __kmp_nested_proc_bind.used) &&
1960           (__kmp_nested_proc_bind.bind_types[level + 1] !=
1961            master_th->th.th_current_task->td_icvs.proc_bind)) {
1962         proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1963       }
1964     }
1965 
1966     // Reset for next parallel region
1967     master_th->th.th_set_proc_bind = proc_bind_default;
1968 
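    // Allocate (or reuse) the team: if either the nproc or proc_bind ICV must
    // be overridden for the new team, pass a modified copy of the current
    // ICVs; otherwise pass the current task's ICVs unchanged.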
1969     if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
1970       kmp_internal_control_t new_icvs;
1971       copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
1972       new_icvs.next = NULL;
1973       if (nthreads_icv > 0) {
1974         new_icvs.nproc = nthreads_icv;
1975       }
1976       if (proc_bind_icv != proc_bind_default) {
1977         new_icvs.proc_bind = proc_bind_icv;
1978       }
1979 
1980       /* allocate a new parallel team */
1981       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
1982       team = __kmp_allocate_team(root, nthreads, nthreads,
1983 #if OMPT_SUPPORT
1984                                  ompt_parallel_data,
1985 #endif
1986                                  proc_bind, &new_icvs,
1987                                  argc USE_NESTED_HOT_ARG(master_th));
1988     } else {
1989       /* allocate a new parallel team */
1990       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
1991       team = __kmp_allocate_team(root, nthreads, nthreads,
1992 #if OMPT_SUPPORT
1993                                  ompt_parallel_data,
1994 #endif
1995                                  proc_bind,
1996                                  &master_th->th.th_current_task->td_icvs,
1997                                  argc USE_NESTED_HOT_ARG(master_th));
1998     }
1999     KF_TRACE(
2000         10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2001 
2002     /* setup the new team */
2003     KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2004     KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2005     KMP_CHECK_UPDATE(team->t.t_ident, loc);
2006     KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2007     KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2008 #if OMPT_SUPPORT
2009     KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2010                           return_address);
2011 #endif
2012     KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2013     // TODO: parent_team->t.t_level == INT_MAX ???
2014     if (!master_th->th.th_teams_microtask || level > teams_level) {
2015       int new_level = parent_team->t.t_level + 1;
2016       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2017       new_level = parent_team->t.t_active_level + 1;
2018       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2019     } else {
2020       // AC: Do not increase parallel level at start of the teams construct
2021       int new_level = parent_team->t.t_level;
2022       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2023       new_level = parent_team->t.t_active_level;
2024       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2025     }
2026     kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2027     // set primary thread's schedule as new run-time schedule
2028     KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2029 
2030     KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2031     KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2032 
2033     // Update the floating point rounding in the team if required.
2034     propagateFPControl(team);
2035 #if OMPD_SUPPORT
2036     if (ompd_state & OMPD_ENABLE_BP)
2037       ompd_bp_parallel_begin();
2038 #endif
2039 
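    // Save the primary thread's task_state on its memo stack so it can be
    // restored at join time; except for nested hot teams, the new team starts
    // with a fresh task_state of 0.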
2040     if (__kmp_tasking_mode != tskm_immediate_exec) {
      // Set the primary thread's task team to the team's task team. Unless
      // this is a hot team, it should be NULL.
2043       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2044                        parent_team->t.t_task_team[master_th->th.th_task_state]);
2045       KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team "
2046                     "%p, new task_team %p / team %p\n",
2047                     __kmp_gtid_from_thread(master_th),
2048                     master_th->th.th_task_team, parent_team,
2049                     team->t.t_task_team[master_th->th.th_task_state], team));
2050 
2051       if (active_level || master_th->th.th_task_team) {
        // Remember the primary thread's task_state
2053         KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2054         if (master_th->th.th_task_state_top >=
2055             master_th->th.th_task_state_stack_sz) { // increase size
2056           kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2057           kmp_uint8 *old_stack, *new_stack;
2058           kmp_uint32 i;
2059           new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2060           for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2061             new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2062           }
2063           for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2064                ++i) { // zero-init rest of stack
2065             new_stack[i] = 0;
2066           }
2067           old_stack = master_th->th.th_task_state_memo_stack;
2068           master_th->th.th_task_state_memo_stack = new_stack;
2069           master_th->th.th_task_state_stack_sz = new_size;
2070           __kmp_free(old_stack);
2071         }
2072         // Store primary thread's task_state on stack
2073         master_th->th
2074             .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2075             master_th->th.th_task_state;
2076         master_th->th.th_task_state_top++;
2077 #if KMP_NESTED_HOT_TEAMS
2078         if (master_th->th.th_hot_teams &&
2079             active_level < __kmp_hot_teams_max_level &&
2080             team == master_th->th.th_hot_teams[active_level].hot_team) {
2081           // Restore primary thread's nested state if nested hot team
2082           master_th->th.th_task_state =
2083               master_th->th
2084                   .th_task_state_memo_stack[master_th->th.th_task_state_top];
2085         } else {
2086 #endif
2087           master_th->th.th_task_state = 0;
2088 #if KMP_NESTED_HOT_TEAMS
2089         }
2090 #endif
2091       }
2092 #if !KMP_NESTED_HOT_TEAMS
2093       KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2094                        (team == root->r.r_hot_team));
2095 #endif
2096     }
2097 
2098     KA_TRACE(
2099         20,
2100         ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2101          gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2102          team->t.t_nproc));
2103     KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2104                      (team->t.t_master_tid == 0 &&
2105                       (team->t.t_parent == root->r.r_root_team ||
2106                        team->t.t_parent->t.t_serialized)));
2107     KMP_MB();
2108 
2109     /* now, setup the arguments */
2110     argv = (void **)team->t.t_argv;
2111     if (ap) {
2112       for (i = argc - 1; i >= 0; --i) {
2113         void *new_argv = va_arg(kmp_va_deref(ap), void *);
2114         KMP_CHECK_UPDATE(*argv, new_argv);
2115         argv++;
2116       }
2117     } else {
2118       for (i = 0; i < argc; ++i) {
2119         // Get args from parent team for teams construct
2120         KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2121       }
2122     }
2123 
2124     /* now actually fork the threads */
2125     KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2126     if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2127       root->r.r_active = TRUE;
2128 
2129     __kmp_fork_team_threads(root, team, master_th, gtid);
2130     __kmp_setup_icv_copy(team, nthreads,
2131                          &master_th->th.th_current_task->td_icvs, loc);
2132 
2133 #if OMPT_SUPPORT
2134     master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2135 #endif
2136 
2137     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2138 
2139 #if USE_ITT_BUILD
2140     if (team->t.t_active_level == 1 // only report frames at level 1
2141         && !master_th->th.th_teams_microtask) { // not in teams construct
2142 #if USE_ITT_NOTIFY
2143       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2144           (__kmp_forkjoin_frames_mode == 3 ||
2145            __kmp_forkjoin_frames_mode == 1)) {
2146         kmp_uint64 tmp_time = 0;
2147         if (__itt_get_timestamp_ptr)
2148           tmp_time = __itt_get_timestamp();
2149         // Internal fork - report frame begin
2150         master_th->th.th_frame_time = tmp_time;
2151         if (__kmp_forkjoin_frames_mode == 3)
2152           team->t.t_region_time = tmp_time;
2153       } else
2154 // only one notification scheme (either "submit" or "forking/joined", not both)
2155 #endif /* USE_ITT_NOTIFY */
2156           if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2157               __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2158         // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2159         __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2160       }
2161     }
2162 #endif /* USE_ITT_BUILD */
2163 
2164     /* now go on and do the work */
2165     KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2166     KMP_MB();
2167     KF_TRACE(10,
2168              ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2169               root, team, master_th, gtid));
2170 
2171 #if USE_ITT_BUILD
2172     if (__itt_stack_caller_create_ptr) {
2173       // create new stack stitching id before entering fork barrier
2174       if (!enter_teams) {
2175         KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
2176         team->t.t_stack_id = __kmp_itt_stack_caller_create();
2177       } else if (parent_team->t.t_serialized) {
2178         // keep stack stitching id in the serialized parent_team;
2179         // current team will be used for parallel inside the teams;
2180         // if parent_team is active, then it already keeps stack stitching id
2181         // for the league of teams
2182         KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
2183         parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
2184       }
2185     }
2186 #endif /* USE_ITT_BUILD */
2187 
    // AC: skip __kmp_internal_fork at the teams construct; let only the
    // primary threads execute
2190     if (ap) {
2191       __kmp_internal_fork(loc, gtid, team);
2192       KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2193                     "master_th=%p, gtid=%d\n",
2194                     root, team, master_th, gtid));
2195     }
2196 
2197     if (call_context == fork_context_gnu) {
2198       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2199       return TRUE;
2200     }
2201 
2202     /* Invoke microtask for PRIMARY thread */
2203     KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2204                   team->t.t_id, team->t.t_pkfn));
2205   } // END of timer KMP_fork_call block
2206 
2207 #if KMP_STATS_ENABLED
2208   // If beginning a teams construct, then change thread state
2209   stats_state_e previous_state = KMP_GET_THREAD_STATE();
2210   if (!ap) {
2211     KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2212   }
2213 #endif
2214 
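  // The primary thread executes the microtask itself via the team's invoker.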
2215   if (!team->t.t_invoke(gtid)) {
2216     KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
2217   }
2218 
2219 #if KMP_STATS_ENABLED
  // If this was the beginning of a teams construct, reset the thread state
2221   if (!ap) {
2222     KMP_SET_THREAD_STATE(previous_state);
2223   }
2224 #endif
2225 
2226   KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2227                 team->t.t_id, team->t.t_pkfn));
2228   KMP_MB(); /* Flush all pending memory write invalidates.  */
2229 
2230   KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2231 #if OMPT_SUPPORT
2232   if (ompt_enabled.enabled) {
2233     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2234   }
2235 #endif
2236 
2237   return TRUE;
2238 }
2239 
2240 #if OMPT_SUPPORT
2241 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2242                                             kmp_team_t *team) {
2243   // restore state outside the region
2244   thread->th.ompt_thread_info.state =
2245       ((team->t.t_serialized) ? ompt_state_work_serial
2246                               : ompt_state_work_parallel);
2247 }
2248 
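// Report the OMPT parallel-end event (if a callback is registered), clear the
// enter frame, and restore the thread state for the enclosing region.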
2249 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2250                                    kmp_team_t *team, ompt_data_t *parallel_data,
2251                                    int flags, void *codeptr) {
2252   ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2253   if (ompt_enabled.ompt_callback_parallel_end) {
2254     ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2255         parallel_data, &(task_info->task_data), flags, codeptr);
2256   }
2257 
2258   task_info->frame.enter_frame = ompt_data_none;
2259   __kmp_join_restore_state(thread, team);
2260 }
2261 #endif
2262 
2263 void __kmp_join_call(ident_t *loc, int gtid
2264 #if OMPT_SUPPORT
2265                      ,
2266                      enum fork_context_e fork_context
2267 #endif
2268                      ,
2269                      int exit_teams) {
2270   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2271   kmp_team_t *team;
2272   kmp_team_t *parent_team;
2273   kmp_info_t *master_th;
2274   kmp_root_t *root;
2275   int master_active;
2276 
2277   KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2278 
2279   /* setup current data */
2280   master_th = __kmp_threads[gtid];
2281   root = master_th->th.th_root;
2282   team = master_th->th.th_team;
2283   parent_team = team->t.t_parent;
2284 
2285   master_th->th.th_ident = loc;
2286 
2287 #if OMPT_SUPPORT
2288   void *team_microtask = (void *)team->t.t_pkfn;
  // For the GOMP interface with a serialized parallel region, we rely on
  // __kmpc_end_serialized_parallel to call the hooks for the OMPT
  // end-implicit-task and end-parallel events.
2292   if (ompt_enabled.enabled &&
2293       !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2294     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2295   }
2296 #endif
2297 
2298 #if KMP_DEBUG
2299   if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2300     KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2301                   "th_task_team = %p\n",
2302                   __kmp_gtid_from_thread(master_th), team,
2303                   team->t.t_task_team[master_th->th.th_task_state],
2304                   master_th->th.th_task_team));
2305     KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2306                      team->t.t_task_team[master_th->th.th_task_state]);
2307   }
2308 #endif
2309 
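  // A serialized team has no workers to join: adjust the teams-related
  // nesting counters if necessary and unwind via
  // __kmpc_end_serialized_parallel.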
2310   if (team->t.t_serialized) {
2311     if (master_th->th.th_teams_microtask) {
      // We are in a teams construct
2313       int level = team->t.t_level;
2314       int tlevel = master_th->th.th_teams_level;
2315       if (level == tlevel) {
        // AC: we haven't incremented it earlier at the start of the teams
        //     construct, so do it here - at the end of the teams construct
2318         team->t.t_level++;
2319       } else if (level == tlevel + 1) {
        // AC: we are exiting a parallel region inside the teams construct, so
        // increment the serialization count in order to restore it in the
        // next call to __kmpc_end_serialized_parallel
2323         team->t.t_serialized++;
2324       }
2325     }
2326     __kmpc_end_serialized_parallel(loc, gtid);
2327 
2328 #if OMPT_SUPPORT
2329     if (ompt_enabled.enabled) {
2330       __kmp_join_restore_state(master_th, parent_team);
2331     }
2332 #endif
2333 
2334     return;
2335   }
2336 
2337   master_active = team->t.t_master_active;
2338 
2339   if (!exit_teams) {
    // AC: No barrier for internal teams at exit from the teams construct,
    //     but there is a barrier for the external team (the league).
2342     __kmp_internal_join(loc, gtid, team);
2343 #if USE_ITT_BUILD
2344     if (__itt_stack_caller_create_ptr) {
2345       KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
2346       // destroy the stack stitching id after join barrier
2347       __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2348       team->t.t_stack_id = NULL;
2349     }
2350 #endif
2351   } else {
2352     master_th->th.th_task_state =
2353         0; // AC: no tasking in teams (out of any parallel)
2354 #if USE_ITT_BUILD
2355     if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
2356       KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
2357       // destroy the stack stitching id on exit from the teams construct
2358       // if parent_team is active, then the id will be destroyed later on
2359       // by master of the league of teams
2360       __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
2361       parent_team->t.t_stack_id = NULL;
2362     }
2363 #endif
2364   }
2365 
2366   KMP_MB();
2367 
2368 #if OMPT_SUPPORT
2369   ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2370   void *codeptr = team->t.ompt_team_info.master_return_address;
2371 #endif
2372 
2373 #if USE_ITT_BUILD
2374   // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2375   if (team->t.t_active_level == 1 &&
2376       (!master_th->th.th_teams_microtask || /* not in teams construct */
2377        master_th->th.th_teams_size.nteams == 1)) {
2378     master_th->th.th_ident = loc;
2379     // only one notification scheme (either "submit" or "forking/joined", not
2380     // both)
2381     if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2382         __kmp_forkjoin_frames_mode == 3)
2383       __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2384                              master_th->th.th_frame_time, 0, loc,
2385                              master_th->th.th_team_nproc, 1);
2386     else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2387              !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2388       __kmp_itt_region_joined(gtid);
2389   } // active_level == 1
2390 #endif /* USE_ITT_BUILD */
2391 
2392   if (master_th->th.th_teams_microtask && !exit_teams &&
2393       team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2394       team->t.t_level == master_th->th.th_teams_level + 1) {
// AC: We need to leave the team structure intact at the end of a parallel
// region inside the teams construct, so that the same (hot) team works for
// the next parallel region; only adjust the nesting levels
2398 #if OMPT_SUPPORT
2399     ompt_data_t ompt_parallel_data = ompt_data_none;
2400     if (ompt_enabled.enabled) {
2401       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2402       if (ompt_enabled.ompt_callback_implicit_task) {
2403         int ompt_team_size = team->t.t_nproc;
2404         ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2405             ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2406             OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2407       }
2408       task_info->frame.exit_frame = ompt_data_none;
2409       task_info->task_data = ompt_data_none;
2410       ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2411       __ompt_lw_taskteam_unlink(master_th);
2412     }
2413 #endif
2414     /* Decrement our nested depth level */
2415     team->t.t_level--;
2416     team->t.t_active_level--;
2417     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2418 
2419     // Restore number of threads in the team if needed. This code relies on
2420     // the proper adjustment of th_teams_size.nth after the fork in
2421     // __kmp_teams_master on each teams primary thread in the case that
2422     // __kmp_reserve_threads reduced it.
2423     if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2424       int old_num = master_th->th.th_team_nproc;
2425       int new_num = master_th->th.th_teams_size.nth;
2426       kmp_info_t **other_threads = team->t.t_threads;
2427       team->t.t_nproc = new_num;
2428       for (int i = 0; i < old_num; ++i) {
2429         other_threads[i]->th.th_team_nproc = new_num;
2430       }
      // Adjust the state of the previously unused threads of the team
2432       for (int i = old_num; i < new_num; ++i) {
2433         // Re-initialize thread's barrier data.
2434         KMP_DEBUG_ASSERT(other_threads[i]);
2435         kmp_balign_t *balign = other_threads[i]->th.th_bar;
2436         for (int b = 0; b < bs_last_barrier; ++b) {
2437           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2438           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2439 #if USE_DEBUGGER
2440           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2441 #endif
2442         }
2443         if (__kmp_tasking_mode != tskm_immediate_exec) {
2444           // Synchronize thread's task state
2445           other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2446         }
2447       }
2448     }
2449 
2450 #if OMPT_SUPPORT
2451     if (ompt_enabled.enabled) {
2452       __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2453                       OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2454     }
2455 #endif
2456 
2457     return;
2458   }
2459 
2460   /* do cleanup and restore the parent team */
2461   master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2462   master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2463 
2464   master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2465 
2466   /* jc: The following lock has instructions with REL and ACQ semantics,
2467      separating the parallel user code called in this parallel region
2468      from the serial user code called after this function returns. */
2469   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2470 
2471   if (!master_th->th.th_teams_microtask ||
2472       team->t.t_level > master_th->th.th_teams_level) {
2473     /* Decrement our nested depth level */
2474     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2475   }
2476   KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2477 
2478 #if OMPT_SUPPORT
2479   if (ompt_enabled.enabled) {
2480     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2481     if (ompt_enabled.ompt_callback_implicit_task) {
2482       int flags = (team_microtask == (void *)__kmp_teams_master)
2483                       ? ompt_task_initial
2484                       : ompt_task_implicit;
2485       int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2486       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2487           ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2488           OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2489     }
2490     task_info->frame.exit_frame = ompt_data_none;
2491     task_info->task_data = ompt_data_none;
2492   }
2493 #endif
2494 
2495   KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2496                 master_th, team));
2497   __kmp_pop_current_task_from_thread(master_th);
2498 
2499 #if KMP_AFFINITY_SUPPORTED
2500   // Restore master thread's partition.
2501   master_th->th.th_first_place = team->t.t_first_place;
2502   master_th->th.th_last_place = team->t.t_last_place;
2503 #endif // KMP_AFFINITY_SUPPORTED
2504   master_th->th.th_def_allocator = team->t.t_def_allocator;
2505 
2506 #if OMPD_SUPPORT
2507   if (ompd_state & OMPD_ENABLE_BP)
2508     ompd_bp_parallel_end();
2509 #endif
2510   updateHWFPControl(team);
2511 
2512   if (root->r.r_active != master_active)
2513     root->r.r_active = master_active;
2514 
2515   __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2516                             master_th)); // this will free worker threads
2517 
  /* This race was fun to find. Make sure the following is inside the critical
     region; otherwise assertions may fail occasionally since the old team may
     be reallocated and the hierarchy would appear inconsistent. It is actually
     safe to run and won't cause any bugs, but it will cause those assertion
     failures. It's only one deref & assign, so we might as well keep it in the
     critical region. */
2523   master_th->th.th_team = parent_team;
2524   master_th->th.th_team_nproc = parent_team->t.t_nproc;
2525   master_th->th.th_team_master = parent_team->t.t_threads[0];
2526   master_th->th.th_team_serialized = parent_team->t.t_serialized;
2527 
  /* restore the serialized team, if needed */
2529   if (parent_team->t.t_serialized &&
2530       parent_team != master_th->th.th_serial_team &&
2531       parent_team != root->r.r_root_team) {
2532     __kmp_free_team(root,
2533                     master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2534     master_th->th.th_serial_team = parent_team;
2535   }
2536 
2537   if (__kmp_tasking_mode != tskm_immediate_exec) {
2538     if (master_th->th.th_task_state_top >
2539         0) { // Restore task state from memo stack
2540       KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2541       // Remember primary thread's state if we re-use this nested hot team
2542       master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2543           master_th->th.th_task_state;
2544       --master_th->th.th_task_state_top; // pop
2545       // Now restore state at this level
2546       master_th->th.th_task_state =
2547           master_th->th
2548               .th_task_state_memo_stack[master_th->th.th_task_state_top];
2549     }
2550     // Copy the task team from the parent team to the primary thread
2551     master_th->th.th_task_team =
2552         parent_team->t.t_task_team[master_th->th.th_task_state];
2553     KA_TRACE(20,
2554              ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
2555               __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2556               parent_team));
2557   }
2558 
2559   // TODO: GEH - cannot do this assertion because root thread not set up as
2560   // executing
2561   // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2562   master_th->th.th_current_task->td_flags.executing = 1;
2563 
2564   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2565 
2566 #if OMPT_SUPPORT
2567   int flags =
2568       OMPT_INVOKER(fork_context) |
2569       ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2570                                                       : ompt_parallel_team);
2571   if (ompt_enabled.enabled) {
2572     __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2573                     codeptr);
2574   }
2575 #endif
2576 
2577   KMP_MB();
2578   KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2579 }
2580 
2581 /* Check whether we should push an internal control record onto the
2582    serial team stack.  If so, do it.  */
2583 void __kmp_save_internal_controls(kmp_info_t *thread) {
2584 
2585   if (thread->th.th_team != thread->th.th_serial_team) {
2586     return;
2587   }
2588   if (thread->th.th_team->t.t_serialized > 1) {
2589     int push = 0;
2590 
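    // Push at most one record per serialized nesting level: push only if the
    // stack is empty or its top was recorded at a different nesting depth.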
2591     if (thread->th.th_team->t.t_control_stack_top == NULL) {
2592       push = 1;
2593     } else {
2594       if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2595           thread->th.th_team->t.t_serialized) {
2596         push = 1;
2597       }
2598     }
2599     if (push) { /* push a record on the serial team's stack */
2600       kmp_internal_control_t *control =
2601           (kmp_internal_control_t *)__kmp_allocate(
2602               sizeof(kmp_internal_control_t));
2603 
2604       copy_icvs(control, &thread->th.th_current_task->td_icvs);
2605 
2606       control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2607 
2608       control->next = thread->th.th_team->t.t_control_stack_top;
2609       thread->th.th_team->t.t_control_stack_top = control;
2610     }
2611   }
2612 }
2613 
2614 /* Changes set_nproc */
2615 void __kmp_set_num_threads(int new_nth, int gtid) {
2616   kmp_info_t *thread;
2617   kmp_root_t *root;
2618 
2619   KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2620   KMP_DEBUG_ASSERT(__kmp_init_serial);
2621 
2622   if (new_nth < 1)
2623     new_nth = 1;
2624   else if (new_nth > __kmp_max_nth)
2625     new_nth = __kmp_max_nth;
2626 
2627   KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2628   thread = __kmp_threads[gtid];
2629   if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2630     return; // nothing to do
2631 
2632   __kmp_save_internal_controls(thread);
2633 
2634   set__nproc(thread, new_nth);
2635 
2636   // If this omp_set_num_threads() call will cause the hot team size to be
2637   // reduced (in the absence of a num_threads clause), then reduce it now,
2638   // rather than waiting for the next parallel region.
2639   root = thread->th.th_root;
2640   if (__kmp_init_parallel && (!root->r.r_active) &&
2641       (root->r.r_hot_team->t.t_nproc > new_nth)
2642 #if KMP_NESTED_HOT_TEAMS
2643       && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2644 #endif
2645   ) {
2646     kmp_team_t *hot_team = root->r.r_hot_team;
2647     int f;
2648 
2649     __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2650 
2651     // Release the extra threads we don't need any more.
2652     for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2653       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2654       if (__kmp_tasking_mode != tskm_immediate_exec) {
        // When decreasing the team size, threads no longer in the team should
        // unreference the task team.
2657         hot_team->t.t_threads[f]->th.th_task_team = NULL;
2658       }
2659       __kmp_free_thread(hot_team->t.t_threads[f]);
2660       hot_team->t.t_threads[f] = NULL;
2661     }
2662     hot_team->t.t_nproc = new_nth;
2663 #if KMP_NESTED_HOT_TEAMS
2664     if (thread->th.th_hot_teams) {
2665       KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2666       thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2667     }
2668 #endif
2669 
2670     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2671 
2672     // Update the t_nproc field in the threads that are still active.
2673     for (f = 0; f < new_nth; f++) {
2674       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2675       hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2676     }
    // Special flag to record that omp_set_num_threads() changed the team size
2678     hot_team->t.t_size_changed = -1;
2679   }
2680 }
2681 
2682 /* Changes max_active_levels */
2683 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2684   kmp_info_t *thread;
2685 
2686   KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2687                 "%d = (%d)\n",
2688                 gtid, max_active_levels));
2689   KMP_DEBUG_ASSERT(__kmp_init_serial);
2690 
2691   // validate max_active_levels
2692   if (max_active_levels < 0) {
2693     KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2694     // We ignore this call if the user has specified a negative value.
2695     // The current setting won't be changed. The last valid setting will be
2696     // used. A warning will be issued (if warnings are allowed as controlled by
2697     // the KMP_WARNINGS env var).
2698     KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2699                   "max_active_levels for thread %d = (%d)\n",
2700                   gtid, max_active_levels));
2701     return;
2702   }
2703   if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
    // OK: max_active_levels is within the valid range
    // [0; KMP_MAX_ACTIVE_LEVELS_LIMIT].
    // A zero value is allowed (implementation-defined behavior).
2707   } else {
2708     KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2709                 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2710     max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
    // The current upper limit is MAX_INT (implementation-defined behavior).
    // If the input exceeds the upper limit, it is clamped to the upper limit
    // (implementation-defined behavior). In practice this branch is
    // unreachable as long as the limit stays at MAX_INT.
2715   }
2716   KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2717                 "max_active_levels for thread %d = (%d)\n",
2718                 gtid, max_active_levels));
2719 
2720   thread = __kmp_threads[gtid];
2721 
2722   __kmp_save_internal_controls(thread);
2723 
2724   set__max_active_levels(thread, max_active_levels);
2725 }
2726 
2727 /* Gets max_active_levels */
2728 int __kmp_get_max_active_levels(int gtid) {
2729   kmp_info_t *thread;
2730 
2731   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2732   KMP_DEBUG_ASSERT(__kmp_init_serial);
2733 
2734   thread = __kmp_threads[gtid];
2735   KMP_DEBUG_ASSERT(thread->th.th_current_task);
2736   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2737                 "curtask_maxaclevel=%d\n",
2738                 gtid, thread->th.th_current_task,
2739                 thread->th.th_current_task->td_icvs.max_active_levels));
2740   return thread->th.th_current_task->td_icvs.max_active_levels;
2741 }
2742 
2743 // nteams-var per-device ICV
2744 void __kmp_set_num_teams(int num_teams) {
2745   if (num_teams > 0)
2746     __kmp_nteams = num_teams;
2747 }
2748 int __kmp_get_max_teams(void) { return __kmp_nteams; }
2749 // teams-thread-limit-var per-device ICV
2750 void __kmp_set_teams_thread_limit(int limit) {
2751   if (limit > 0)
2752     __kmp_teams_thread_limit = limit;
2753 }
2754 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
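
/* Illustrative sketch (not part of the runtime): these per-device ICVs are the
   backing store for the OpenMP 5.1 teams routines. Minimal usage, assuming a
   host-only program:

     #include <omp.h>
     #include <stdio.h>

     int main(void) {
       omp_set_num_teams(4);          // sets nteams-var (only positive values stick)
       omp_set_teams_thread_limit(8); // sets teams-thread-limit-var
       printf("%d %d\n", omp_get_max_teams(), omp_get_teams_thread_limit());
       return 0;
     }
*/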
2755 
2756 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2757 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2758 
2759 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2760 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2761   kmp_info_t *thread;
2762   kmp_sched_t orig_kind;
2763   //    kmp_team_t *team;
2764 
2765   KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2766                 gtid, (int)kind, chunk));
2767   KMP_DEBUG_ASSERT(__kmp_init_serial);
2768 
2769   // Check if the kind parameter is valid, correct if needed.
2770   // Valid parameters should fit in one of two intervals - standard or extended:
2771   //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2772   // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
2773   orig_kind = kind;
2774   kind = __kmp_sched_without_mods(kind);
2775 
2776   if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2777       (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2778     // TODO: Hint needs attention in case we change the default schedule.
2779     __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2780               KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2781               __kmp_msg_null);
2782     kind = kmp_sched_default;
2783     chunk = 0; // ignore chunk value in case of bad kind
2784   }
2785 
2786   thread = __kmp_threads[gtid];
2787 
2788   __kmp_save_internal_controls(thread);
2789 
2790   if (kind < kmp_sched_upper_std) {
2791     if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
      // differentiate static chunked vs. unchunked: an invalid chunk value
      // indicates the unchunked schedule (which is the default)
2794       thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2795     } else {
2796       thread->th.th_current_task->td_icvs.sched.r_sched_type =
2797           __kmp_sch_map[kind - kmp_sched_lower - 1];
2798     }
2799   } else {
2800     //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2801     //    kmp_sched_lower - 2 ];
2802     thread->th.th_current_task->td_icvs.sched.r_sched_type =
2803         __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2804                       kmp_sched_lower - 2];
2805   }
2806   __kmp_sched_apply_mods_intkind(
2807       orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2808   if (kind == kmp_sched_auto || chunk < 1) {
2809     // ignore parameter chunk for schedule auto
2810     thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2811   } else {
2812     thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2813   }
2814 }
2815 
2816 /* Gets def_sched_var ICV values */
2817 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2818   kmp_info_t *thread;
2819   enum sched_type th_type;
2820 
2821   KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2822   KMP_DEBUG_ASSERT(__kmp_init_serial);
2823 
2824   thread = __kmp_threads[gtid];
2825 
2826   th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2827   switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2828   case kmp_sch_static:
2829   case kmp_sch_static_greedy:
2830   case kmp_sch_static_balanced:
2831     *kind = kmp_sched_static;
2832     __kmp_sched_apply_mods_stdkind(kind, th_type);
2833     *chunk = 0; // chunk was not set, try to show this fact via zero value
2834     return;
2835   case kmp_sch_static_chunked:
2836     *kind = kmp_sched_static;
2837     break;
2838   case kmp_sch_dynamic_chunked:
2839     *kind = kmp_sched_dynamic;
2840     break;
2841   case kmp_sch_guided_chunked:
2842   case kmp_sch_guided_iterative_chunked:
2843   case kmp_sch_guided_analytical_chunked:
2844     *kind = kmp_sched_guided;
2845     break;
2846   case kmp_sch_auto:
2847     *kind = kmp_sched_auto;
2848     break;
2849   case kmp_sch_trapezoidal:
2850     *kind = kmp_sched_trapezoidal;
2851     break;
2852 #if KMP_STATIC_STEAL_ENABLED
2853   case kmp_sch_static_steal:
2854     *kind = kmp_sched_static_steal;
2855     break;
2856 #endif
2857   default:
2858     KMP_FATAL(UnknownSchedulingType, th_type);
2859   }
2860 
2861   __kmp_sched_apply_mods_stdkind(kind, th_type);
2862   *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2863 }
2864 
2865 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2866 
2867   int ii, dd;
2868   kmp_team_t *team;
2869   kmp_info_t *thr;
2870 
2871   KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2872   KMP_DEBUG_ASSERT(__kmp_init_serial);
2873 
2874   // validate level
2875   if (level == 0)
2876     return 0;
2877   if (level < 0)
2878     return -1;
2879   thr = __kmp_threads[gtid];
2880   team = thr->th.th_team;
2881   ii = team->t.t_level;
2882   if (level > ii)
2883     return -1;
2884 
2885   if (thr->th.th_teams_microtask) {
    // AC: we are in a teams region where multiple nested teams have the same
    // level
2887     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2888     if (level <=
2889         tlevel) { // otherwise usual algorithm works (will not touch the teams)
2890       KMP_DEBUG_ASSERT(ii >= tlevel);
2891       // AC: As we need to pass by the teams league, we need to artificially
2892       // increase ii
2893       if (ii == tlevel) {
2894         ii += 2; // three teams have same level
2895       } else {
2896         ii++; // two teams have same level
2897       }
2898     }
2899   }
2900 
2901   if (ii == level)
2902     return __kmp_tid_from_gtid(gtid);
2903 
2904   dd = team->t.t_serialized;
2905   level++;
2906   while (ii > level) {
2907     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2908     }
2909     if ((team->t.t_serialized) && (!dd)) {
2910       team = team->t.t_parent;
2911       continue;
2912     }
2913     if (ii > level) {
2914       team = team->t.t_parent;
2915       dd = team->t.t_serialized;
2916       ii--;
2917     }
2918   }
2919 
2920   return (dd > 1) ? (0) : (team->t.t_master_tid);
2921 }
2922 
2923 int __kmp_get_team_size(int gtid, int level) {
2924 
2925   int ii, dd;
2926   kmp_team_t *team;
2927   kmp_info_t *thr;
2928 
2929   KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2930   KMP_DEBUG_ASSERT(__kmp_init_serial);
2931 
2932   // validate level
2933   if (level == 0)
2934     return 1;
2935   if (level < 0)
2936     return -1;
2937   thr = __kmp_threads[gtid];
2938   team = thr->th.th_team;
2939   ii = team->t.t_level;
2940   if (level > ii)
2941     return -1;
2942 
2943   if (thr->th.th_teams_microtask) {
    // AC: we are in a teams region where multiple nested teams have the same
    // level
2945     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2946     if (level <=
2947         tlevel) { // otherwise usual algorithm works (will not touch the teams)
2948       KMP_DEBUG_ASSERT(ii >= tlevel);
2949       // AC: As we need to pass by the teams league, we need to artificially
2950       // increase ii
2951       if (ii == tlevel) {
2952         ii += 2; // three teams have same level
2953       } else {
2954         ii++; // two teams have same level
2955       }
2956     }
2957   }
2958 
2959   while (ii > level) {
2960     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2961     }
2962     if (team->t.t_serialized && (!dd)) {
2963       team = team->t.t_parent;
2964       continue;
2965     }
2966     if (ii > level) {
2967       team = team->t.t_parent;
2968       ii--;
2969     }
2970   }
2971 
2972   return team->t.t_nproc;
2973 }
2974 
2975 kmp_r_sched_t __kmp_get_schedule_global() {
  // This routine exists because the pairs (__kmp_sched, __kmp_chunk) and
  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
  // independently, so the up-to-date schedule can be obtained here.
2979 
2980   kmp_r_sched_t r_sched;
2981 
2982   // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
2983   // __kmp_guided. __kmp_sched should keep original value, so that user can set
2984   // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
2985   // different roots (even in OMP 2.5)
2986   enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
2987   enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
2988   if (s == kmp_sch_static) {
2989     // replace STATIC with more detailed schedule (balanced or greedy)
2990     r_sched.r_sched_type = __kmp_static;
2991   } else if (s == kmp_sch_guided_chunked) {
2992     // replace GUIDED with more detailed schedule (iterative or analytical)
2993     r_sched.r_sched_type = __kmp_guided;
2994   } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2995     r_sched.r_sched_type = __kmp_sched;
2996   }
2997   SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
2998 
2999   if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
    // __kmp_chunk may be wrong here (if it was never set)
3001     r_sched.chunk = KMP_DEFAULT_CHUNK;
3002   } else {
3003     r_sched.chunk = __kmp_chunk;
3004   }
3005 
3006   return r_sched;
3007 }
3008 
/* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
3010    at least argc number of *t_argv entries for the requested team. */
3011 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3012 
3013   KMP_DEBUG_ASSERT(team);
3014   if (!realloc || argc > team->t.t_max_argc) {
3015 
3016     KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3017                    "current entries=%d\n",
3018                    team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3019     /* if previously allocated heap space for args, free them */
3020     if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3021       __kmp_free((void *)team->t.t_argv);
3022 
3023     if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3024       /* use unused space in the cache line for arguments */
3025       team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3026       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3027                      "argv entries\n",
3028                      team->t.t_id, team->t.t_max_argc));
3029       team->t.t_argv = &team->t.t_inline_argv[0];
3030       if (__kmp_storage_map) {
3031         __kmp_print_storage_map_gtid(
3032             -1, &team->t.t_inline_argv[0],
3033             &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3034             (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3035             team->t.t_id);
3036       }
3037     } else {
3038       /* allocate space for arguments in the heap */
3039       team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3040                                ? KMP_MIN_MALLOC_ARGV_ENTRIES
3041                                : 2 * argc;
3042       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3043                      "argv entries\n",
3044                      team->t.t_id, team->t.t_max_argc));
3045       team->t.t_argv =
3046           (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3047       if (__kmp_storage_map) {
3048         __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3049                                      &team->t.t_argv[team->t.t_max_argc],
3050                                      sizeof(void *) * team->t.t_max_argc,
3051                                      "team_%d.t_argv", team->t.t_id);
3052       }
3053     }
3054   }
3055 }
3056 
3057 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3058   int i;
3059   int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3060   team->t.t_threads =
3061       (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3062   team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3063       sizeof(dispatch_shared_info_t) * num_disp_buff);
3064   team->t.t_dispatch =
3065       (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3066   team->t.t_implicit_task_taskdata =
3067       (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3068   team->t.t_max_nproc = max_nth;
3069 
3070   /* setup dispatch buffers */
3071   for (i = 0; i < num_disp_buff; ++i) {
3072     team->t.t_disp_buffer[i].buffer_index = i;
3073     team->t.t_disp_buffer[i].doacross_buf_idx = i;
3074   }
3075 }
3076 
3077 static void __kmp_free_team_arrays(kmp_team_t *team) {
3078   /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3079   int i;
3080   for (i = 0; i < team->t.t_max_nproc; ++i) {
3081     if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3082       __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3083       team->t.t_dispatch[i].th_disp_buffer = NULL;
3084     }
3085   }
3086 #if KMP_USE_HIER_SCHED
3087   __kmp_dispatch_free_hierarchies(team);
3088 #endif
3089   __kmp_free(team->t.t_threads);
3090   __kmp_free(team->t.t_disp_buffer);
3091   __kmp_free(team->t.t_dispatch);
3092   __kmp_free(team->t.t_implicit_task_taskdata);
3093   team->t.t_threads = NULL;
3094   team->t.t_disp_buffer = NULL;
3095   team->t.t_dispatch = NULL;
3096   team->t.t_implicit_task_taskdata = 0;
3097 }
3098 
3099 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3100   kmp_info_t **oldThreads = team->t.t_threads;
3101 
3102   __kmp_free(team->t.t_disp_buffer);
3103   __kmp_free(team->t.t_dispatch);
3104   __kmp_free(team->t.t_implicit_task_taskdata);
3105   __kmp_allocate_team_arrays(team, max_nth);
3106 
3107   KMP_MEMCPY(team->t.t_threads, oldThreads,
3108              team->t.t_nproc * sizeof(kmp_info_t *));
3109 
3110   __kmp_free(oldThreads);
3111 }
3112 
3113 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3114 
3115   kmp_r_sched_t r_sched =
3116       __kmp_get_schedule_global(); // get current state of scheduling globals
3117 
3118   KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3119 
3120   kmp_internal_control_t g_icvs = {
3121     0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3122     (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3123     // adjustment of threads (per thread)
3124     (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3125     // whether blocktime is explicitly set
3126     __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3127 #if KMP_USE_MONITOR
3128     __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3129 // intervals
3130 #endif
3131     __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3132     // next parallel region (per thread)
3133     // (use a max ub on value if __kmp_parallel_initialize not called yet)
3134     __kmp_cg_max_nth, // int thread_limit;
3135     __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3136     // for max_active_levels
3137     r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3138     // {sched,chunk} pair
3139     __kmp_nested_proc_bind.bind_types[0],
3140     __kmp_default_device,
3141     NULL // struct kmp_internal_control *next;
3142   };
3143 
3144   return g_icvs;
3145 }
3146 
3147 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3148 
3149   kmp_internal_control_t gx_icvs;
3150   gx_icvs.serial_nesting_level =
      0; // probably = team->t.t_serialized, as in __kmp_save_internal_controls
3152   copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3153   gx_icvs.next = NULL;
3154 
3155   return gx_icvs;
3156 }
3157 
3158 static void __kmp_initialize_root(kmp_root_t *root) {
3159   int f;
3160   kmp_team_t *root_team;
3161   kmp_team_t *hot_team;
3162   int hot_team_max_nth;
3163   kmp_r_sched_t r_sched =
3164       __kmp_get_schedule_global(); // get current state of scheduling globals
3165   kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3166   KMP_DEBUG_ASSERT(root);
3167   KMP_ASSERT(!root->r.r_begin);
3168 
3169   /* setup the root state structure */
3170   __kmp_init_lock(&root->r.r_begin_lock);
3171   root->r.r_begin = FALSE;
3172   root->r.r_active = FALSE;
3173   root->r.r_in_parallel = 0;
3174   root->r.r_blocktime = __kmp_dflt_blocktime;
3175 #if KMP_AFFINITY_SUPPORTED
3176   root->r.r_affinity_assigned = FALSE;
3177 #endif
3178 
3179   /* setup the root team for this task */
3180   /* allocate the root team structure */
3181   KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3182 
3183   root_team =
3184       __kmp_allocate_team(root,
3185                           1, // new_nproc
3186                           1, // max_nproc
3187 #if OMPT_SUPPORT
3188                           ompt_data_none, // root parallel id
3189 #endif
3190                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3191                           0 // argc
3192                           USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3193                           );
3194 #if USE_DEBUGGER
3195   // Non-NULL value should be assigned to make the debugger display the root
3196   // team.
3197   TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3198 #endif
3199 
3200   KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3201 
3202   root->r.r_root_team = root_team;
3203   root_team->t.t_control_stack_top = NULL;
3204 
3205   /* initialize root team */
3206   root_team->t.t_threads[0] = NULL;
3207   root_team->t.t_nproc = 1;
3208   root_team->t.t_serialized = 1;
3209   // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3210   root_team->t.t_sched.sched = r_sched.sched;
3211   KA_TRACE(
3212       20,
3213       ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3214        root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3215 
3216   /* setup the  hot team for this task */
3217   /* allocate the hot team structure */
3218   KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3219 
3220   hot_team =
3221       __kmp_allocate_team(root,
3222                           1, // new_nproc
3223                           __kmp_dflt_team_nth_ub * 2, // max_nproc
3224 #if OMPT_SUPPORT
3225                           ompt_data_none, // root parallel id
3226 #endif
3227                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3228                           0 // argc
3229                           USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3230                           );
3231   KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3232 
3233   root->r.r_hot_team = hot_team;
3234   root_team->t.t_control_stack_top = NULL;
3235 
3236   /* first-time initialization */
3237   hot_team->t.t_parent = root_team;
3238 
3239   /* initialize hot team */
3240   hot_team_max_nth = hot_team->t.t_max_nproc;
3241   for (f = 0; f < hot_team_max_nth; ++f) {
3242     hot_team->t.t_threads[f] = NULL;
3243   }
3244   hot_team->t.t_nproc = 1;
3245   // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3246   hot_team->t.t_sched.sched = r_sched.sched;
3247   hot_team->t.t_size_changed = 0;
3248 }
3249 
3250 #ifdef KMP_DEBUG
3251 
3252 typedef struct kmp_team_list_item {
3253   kmp_team_p const *entry;
3254   struct kmp_team_list_item *next;
3255 } kmp_team_list_item_t;
3256 typedef kmp_team_list_item_t *kmp_team_list_t;
3257 
3258 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3259     kmp_team_list_t list, // List of teams.
3260     kmp_team_p const *team // Team to add.
3261 ) {
3262 
3263   // List must terminate with item where both entry and next are NULL.
3264   // Team is added to the list only once.
3265   // List is sorted in ascending order by team id.
3266   // Team id is *not* a key.
3267 
3268   kmp_team_list_t l;
3269 
3270   KMP_DEBUG_ASSERT(list != NULL);
3271   if (team == NULL) {
3272     return;
3273   }
3274 
3275   __kmp_print_structure_team_accum(list, team->t.t_parent);
3276   __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3277 
3278   // Search list for the team.
3279   l = list;
3280   while (l->next != NULL && l->entry != team) {
3281     l = l->next;
3282   }
3283   if (l->next != NULL) {
3284     return; // Team has been added before, exit.
3285   }
3286 
3287   // Team is not found. Search list again for insertion point.
3288   l = list;
3289   while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3290     l = l->next;
3291   }
3292 
3293   // Insert team.
3294   {
3295     kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3296         sizeof(kmp_team_list_item_t));
3297     *item = *l;
3298     l->entry = team;
3299     l->next = item;
3300   }
3301 }
3302 
3303 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team
3304 
3305 ) {
3306   __kmp_printf("%s", title);
3307   if (team != NULL) {
3308     __kmp_printf("%2x %p\n", team->t.t_id, team);
3309   } else {
3310     __kmp_printf(" - (nil)\n");
3311   }
3312 }
3313 
3314 static void __kmp_print_structure_thread(char const *title,
3315                                          kmp_info_p const *thread) {
3316   __kmp_printf("%s", title);
3317   if (thread != NULL) {
3318     __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3319   } else {
3320     __kmp_printf(" - (nil)\n");
3321   }
3322 }
3323 
3324 void __kmp_print_structure(void) {
3325 
3326   kmp_team_list_t list;
3327 
3328   // Initialize list of teams.
3329   list =
3330       (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3331   list->entry = NULL;
3332   list->next = NULL;
3333 
3334   __kmp_printf("\n------------------------------\nGlobal Thread "
3335                "Table\n------------------------------\n");
3336   {
3337     int gtid;
3338     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3339       __kmp_printf("%2d", gtid);
3340       if (__kmp_threads != NULL) {
3341         __kmp_printf(" %p", __kmp_threads[gtid]);
3342       }
3343       if (__kmp_root != NULL) {
3344         __kmp_printf(" %p", __kmp_root[gtid]);
3345       }
3346       __kmp_printf("\n");
3347     }
3348   }
3349 
3350   // Print out __kmp_threads array.
3351   __kmp_printf("\n------------------------------\nThreads\n--------------------"
3352                "----------\n");
3353   if (__kmp_threads != NULL) {
3354     int gtid;
3355     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3356       kmp_info_t const *thread = __kmp_threads[gtid];
3357       if (thread != NULL) {
3358         __kmp_printf("GTID %2d %p:\n", gtid, thread);
3359         __kmp_printf("    Our Root:        %p\n", thread->th.th_root);
3360         __kmp_print_structure_team("    Our Team:     ", thread->th.th_team);
3361         __kmp_print_structure_team("    Serial Team:  ",
3362                                    thread->th.th_serial_team);
3363         __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
3364         __kmp_print_structure_thread("    Primary:      ",
3365                                      thread->th.th_team_master);
3366         __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
3367         __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
3368         __kmp_printf("    Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3369         __kmp_print_structure_thread("    Next in pool: ",
3370                                      thread->th.th_next_pool);
3371         __kmp_printf("\n");
3372         __kmp_print_structure_team_accum(list, thread->th.th_team);
3373         __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3374       }
3375     }
3376   } else {
3377     __kmp_printf("Threads array is not allocated.\n");
3378   }
3379 
3380   // Print out __kmp_root array.
3381   __kmp_printf("\n------------------------------\nUbers\n----------------------"
3382                "--------\n");
3383   if (__kmp_root != NULL) {
3384     int gtid;
3385     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3386       kmp_root_t const *root = __kmp_root[gtid];
3387       if (root != NULL) {
3388         __kmp_printf("GTID %2d %p:\n", gtid, root);
3389         __kmp_print_structure_team("    Root Team:    ", root->r.r_root_team);
3390         __kmp_print_structure_team("    Hot Team:     ", root->r.r_hot_team);
3391         __kmp_print_structure_thread("    Uber Thread:  ",
3392                                      root->r.r_uber_thread);
3393         __kmp_printf("    Active?:      %2d\n", root->r.r_active);
3394         __kmp_printf("    In Parallel:  %2d\n",
3395                      KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3396         __kmp_printf("\n");
3397         __kmp_print_structure_team_accum(list, root->r.r_root_team);
3398         __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3399       }
3400     }
3401   } else {
3402     __kmp_printf("Ubers array is not allocated.\n");
3403   }
3404 
3405   __kmp_printf("\n------------------------------\nTeams\n----------------------"
3406                "--------\n");
3407   while (list->next != NULL) {
3408     kmp_team_p const *team = list->entry;
3409     int i;
3410     __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3411     __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
3412     __kmp_printf("    Primary TID:      %2d\n", team->t.t_master_tid);
3413     __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
3414     __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
3415     __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
3416     for (i = 0; i < team->t.t_nproc; ++i) {
3417       __kmp_printf("    Thread %2d:      ", i);
3418       __kmp_print_structure_thread("", team->t.t_threads[i]);
3419     }
3420     __kmp_print_structure_team("    Next in pool:     ", team->t.t_next_pool);
3421     __kmp_printf("\n");
3422     list = list->next;
3423   }
3424 
3425   // Print out __kmp_thread_pool and __kmp_team_pool.
3426   __kmp_printf("\n------------------------------\nPools\n----------------------"
3427                "--------\n");
3428   __kmp_print_structure_thread("Thread pool:          ",
3429                                CCAST(kmp_info_t *, __kmp_thread_pool));
3430   __kmp_print_structure_team("Team pool:            ",
3431                              CCAST(kmp_team_t *, __kmp_team_pool));
3432   __kmp_printf("\n");
3433 
3434   // Free team list.
3435   while (list != NULL) {
3436     kmp_team_list_item_t *item = list;
3437     list = list->next;
3438     KMP_INTERNAL_FREE(item);
3439   }
3440 }
3441 
3442 #endif
3443 
3444 //---------------------------------------------------------------------------
3445 //  Stuff for per-thread fast random number generator
3446 //  Table of primes
3447 static const unsigned __kmp_primes[] = {
3448     0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3449     0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3450     0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3451     0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3452     0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3453     0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3454     0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3455     0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3456     0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3457     0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3458     0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3459 
3460 //---------------------------------------------------------------------------
3461 //  __kmp_get_random: Get a random number using a linear congruential method.
3462 unsigned short __kmp_get_random(kmp_info_t *thread) {
3463   unsigned x = thread->th.th_x;
3464   unsigned short r = (unsigned short)(x >> 16);
3465 
3466   thread->th.th_x = x * thread->th.th_a + 1;
3467 
3468   KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3469                 thread->th.th_info.ds.ds_tid, r));
3470 
3471   return r;
3472 }
3473 //--------------------------------------------------------
3474 // __kmp_init_random: Initialize a random number generator
3475 void __kmp_init_random(kmp_info_t *thread) {
3476   unsigned seed = thread->th.th_info.ds.ds_tid;
3477 
3478   thread->th.th_a =
3479       __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3480   thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3481   KA_TRACE(30,
3482            ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3483 }
3484 
3485 #if KMP_OS_WINDOWS
3486 /* reclaim array entries for root threads that are already dead, returns number
3487  * reclaimed */
3488 static int __kmp_reclaim_dead_roots(void) {
3489   int i, r = 0;
3490 
3491   for (i = 0; i < __kmp_threads_capacity; ++i) {
3492     if (KMP_UBER_GTID(i) &&
3493         !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3494         !__kmp_root[i]
3495              ->r.r_active) { // AC: reclaim only roots died in non-active state
3496       r += __kmp_unregister_root_other_thread(i);
3497     }
3498   }
3499   return r;
3500 }
3501 #endif
3502 
3503 /* This function attempts to create free entries in __kmp_threads and
3504    __kmp_root, and returns the number of free entries generated.
3505 
3506    For Windows* OS static library, the first mechanism used is to reclaim array
3507    entries for root threads that are already dead.
3508 
   On all platforms, expansion is attempted on the arrays __kmp_threads and
   __kmp_root, with an appropriate update to __kmp_threads_capacity. Array
3511    capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3512    threadprivate cache array has been created. Synchronization with
3513    __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3514 
3515    After any dead root reclamation, if the clipping value allows array expansion
3516    to result in the generation of a total of nNeed free slots, the function does
3517    that expansion. If not, nothing is done beyond the possible initial root
3518    thread reclamation.
3519 
3520    If any argument is negative, the behavior is undefined. */
3521 static int __kmp_expand_threads(int nNeed) {
3522   int added = 0;
3523   int minimumRequiredCapacity;
3524   int newCapacity;
3525   kmp_info_t **newThreads;
3526   kmp_root_t **newRoot;
3527 
3528   // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3529   // resizing __kmp_threads does not need additional protection if foreign
3530   // threads are present
3531 
3532 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3533   /* only for Windows static library */
3534   /* reclaim array entries for root threads that are already dead */
3535   added = __kmp_reclaim_dead_roots();
3536 
3537   if (nNeed) {
3538     nNeed -= added;
3539     if (nNeed < 0)
3540       nNeed = 0;
3541   }
3542 #endif
3543   if (nNeed <= 0)
3544     return added;
3545 
3546   // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3547   // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3548   // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3549   // > __kmp_max_nth in one of two ways:
3550   //
3551   // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
3552   //    may not be reused by another thread, so we may need to increase
3553   //    __kmp_threads_capacity to __kmp_max_nth + 1.
3554   //
3555   // 2) New foreign root(s) are encountered.  We always register new foreign
3556   //    roots. This may cause a smaller # of threads to be allocated at
3557   //    subsequent parallel regions, but the worker threads hang around (and
3558   //    eventually go to sleep) and need slots in the __kmp_threads[] array.
3559   //
3560   // Anyway, that is the reason for moving the check to see if
3561   // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3562   // instead of having it performed here. -BB
3563 
3564   KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3565 
3566   /* compute expansion headroom to check if we can expand */
3567   if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3568     /* possible expansion too small -- give up */
3569     return added;
3570   }
3571   minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3572 
3573   newCapacity = __kmp_threads_capacity;
3574   do {
3575     newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3576                                                           : __kmp_sys_max_nth;
3577   } while (newCapacity < minimumRequiredCapacity);
3578   newThreads = (kmp_info_t **)__kmp_allocate(
3579       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3580   newRoot =
3581       (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3582   KMP_MEMCPY(newThreads, __kmp_threads,
3583              __kmp_threads_capacity * sizeof(kmp_info_t *));
3584   KMP_MEMCPY(newRoot, __kmp_root,
3585              __kmp_threads_capacity * sizeof(kmp_root_t *));
3586 
3587   kmp_info_t **temp_threads = __kmp_threads;
3588   *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3589   *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3590   __kmp_free(temp_threads);
3591   added += newCapacity - __kmp_threads_capacity;
3592   *(volatile int *)&__kmp_threads_capacity = newCapacity;
3593 
3594   if (newCapacity > __kmp_tp_capacity) {
3595     __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3596     if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3597       __kmp_threadprivate_resize_cache(newCapacity);
3598     } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3599       *(volatile int *)&__kmp_tp_capacity = newCapacity;
3600     }
3601     __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3602   }
3603 
3604   return added;
3605 }
3606 
3607 /* Register the current thread as a root thread and obtain our gtid. We must
3608    have the __kmp_initz_lock held at this point. Argument TRUE only if are the
3609    thread that calls from __kmp_do_serial_initialize() */
3610 int __kmp_register_root(int initial_thread) {
3611   kmp_info_t *root_thread;
3612   kmp_root_t *root;
3613   int gtid;
3614   int capacity;
3615   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3616   KA_TRACE(20, ("__kmp_register_root: entered\n"));
3617   KMP_MB();
3618 
3619   /* 2007-03-02:
3620      If initial thread did not invoke OpenMP RTL yet, and this thread is not an
3621      initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not
3622      work as expected -- it may return false (that means there is at least one
3623      empty slot in __kmp_threads array), but it is possible the only free slot
3624      is #0, which is reserved for initial thread and so cannot be used for this
3625      one. Following code workarounds this bug.
3626 
3627      However, right solution seems to be not reserving slot #0 for initial
3628      thread because:
3629      (1) there is no magic in slot #0,
3630      (2) we cannot detect initial thread reliably (the first thread which does
3631         serial initialization may be not a real initial thread).
3632   */
3633   capacity = __kmp_threads_capacity;
3634   if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3635     --capacity;
3636   }
3637 
3638   // If it is not for initializing the hidden helper team, we need to take
3639   // __kmp_hidden_helper_threads_num out of the capacity because it is included
3640   // in __kmp_threads_capacity.
3641   if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
3642     capacity -= __kmp_hidden_helper_threads_num;
3643   }
3644 
3645   /* see if there are too many threads */
3646   if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3647     if (__kmp_tp_cached) {
3648       __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3649                   KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3650                   KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3651     } else {
3652       __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3653                   __kmp_msg_null);
3654     }
3655   }
3656 
3657   // When hidden helper task is enabled, __kmp_threads is organized as follows:
3658   // 0: initial thread, also a regular OpenMP thread.
3659   // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3660   // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3661   // regular OpenMP threads.
3662   if (TCR_4(__kmp_init_hidden_helper_threads)) {
3663     // Find an available thread slot for hidden helper thread. Slots for hidden
3664     // helper threads start from 1 to __kmp_hidden_helper_threads_num.
3665     for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3666                    gtid <= __kmp_hidden_helper_threads_num;
3667          gtid++)
3668       ;
3669     KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3670     KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3671                  "hidden helper thread: T#%d\n",
3672                  gtid));
3673   } else {
3674     /* find an available thread slot */
3675     // Don't reassign the zero slot since we need that to only be used by
3676     // initial thread. Slots for hidden helper threads should also be skipped.
3677     if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3678       gtid = 0;
3679     } else {
3680       for (gtid = __kmp_hidden_helper_threads_num + 1;
3681            TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3682         ;
3683     }
3684     KA_TRACE(
3685         1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3686     KMP_ASSERT(gtid < __kmp_threads_capacity);
3687   }
3688 
3689   /* update global accounting */
3690   __kmp_all_nth++;
3691   TCW_4(__kmp_nth, __kmp_nth + 1);
3692 
3693   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3694   // numbers of procs, and method #2 (keyed API call) for higher numbers.
3695   if (__kmp_adjust_gtid_mode) {
3696     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3697       if (TCR_4(__kmp_gtid_mode) != 2) {
3698         TCW_4(__kmp_gtid_mode, 2);
3699       }
3700     } else {
3701       if (TCR_4(__kmp_gtid_mode) != 1) {
3702         TCW_4(__kmp_gtid_mode, 1);
3703       }
3704     }
3705   }
3706 
3707 #ifdef KMP_ADJUST_BLOCKTIME
3708   /* Adjust blocktime to zero if necessary            */
3709   /* Middle initialization might not have occurred yet */
3710   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3711     if (__kmp_nth > __kmp_avail_proc) {
3712       __kmp_zero_bt = TRUE;
3713     }
3714   }
3715 #endif /* KMP_ADJUST_BLOCKTIME */
3716 
3717   /* setup this new hierarchy */
3718   if (!(root = __kmp_root[gtid])) {
3719     root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3720     KMP_DEBUG_ASSERT(!root->r.r_root_team);
3721   }
3722 
3723 #if KMP_STATS_ENABLED
3724   // Initialize stats as soon as possible (right after gtid assignment).
3725   __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3726   __kmp_stats_thread_ptr->startLife();
3727   KMP_SET_THREAD_STATE(SERIAL_REGION);
3728   KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3729 #endif
3730   __kmp_initialize_root(root);
3731 
3732   /* setup new root thread structure */
3733   if (root->r.r_uber_thread) {
3734     root_thread = root->r.r_uber_thread;
3735   } else {
3736     root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3737     if (__kmp_storage_map) {
3738       __kmp_print_thread_storage_map(root_thread, gtid);
3739     }
3740     root_thread->th.th_info.ds.ds_gtid = gtid;
3741 #if OMPT_SUPPORT
3742     root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3743 #endif
3744     root_thread->th.th_root = root;
3745     if (__kmp_env_consistency_check) {
3746       root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3747     }
3748 #if USE_FAST_MEMORY
3749     __kmp_initialize_fast_memory(root_thread);
3750 #endif /* USE_FAST_MEMORY */
3751 
3752 #if KMP_USE_BGET
3753     KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3754     __kmp_initialize_bget(root_thread);
3755 #endif
3756     __kmp_init_random(root_thread); // Initialize random number generator
3757   }
3758 
3759   /* setup the serial team held in reserve by the root thread */
3760   if (!root_thread->th.th_serial_team) {
3761     kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3762     KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3763     root_thread->th.th_serial_team = __kmp_allocate_team(
3764         root, 1, 1,
3765 #if OMPT_SUPPORT
3766         ompt_data_none, // root parallel id
3767 #endif
3768         proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3769   }
3770   KMP_ASSERT(root_thread->th.th_serial_team);
3771   KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3772                 root_thread->th.th_serial_team));
3773 
3774   /* drop root_thread into place */
3775   TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3776 
3777   root->r.r_root_team->t.t_threads[0] = root_thread;
3778   root->r.r_hot_team->t.t_threads[0] = root_thread;
3779   root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
  // AC: the team is created in reserve, not for execution (it is unused for now).
3781   root_thread->th.th_serial_team->t.t_serialized = 0;
3782   root->r.r_uber_thread = root_thread;
3783 
3784   /* initialize the thread, get it ready to go */
3785   __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3786   TCW_4(__kmp_init_gtid, TRUE);
3787 
3788   /* prepare the primary thread for get_gtid() */
3789   __kmp_gtid_set_specific(gtid);
3790 
3791 #if USE_ITT_BUILD
3792   __kmp_itt_thread_name(gtid);
3793 #endif /* USE_ITT_BUILD */
3794 
3795 #ifdef KMP_TDATA_GTID
3796   __kmp_gtid = gtid;
3797 #endif
3798   __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3799   KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3800 
3801   KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3802                 "plain=%u\n",
3803                 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3804                 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3805                 KMP_INIT_BARRIER_STATE));
3806   { // Initialize barrier data.
3807     int b;
3808     for (b = 0; b < bs_last_barrier; ++b) {
3809       root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3810 #if USE_DEBUGGER
3811       root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3812 #endif
3813     }
3814   }
3815   KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3816                    KMP_INIT_BARRIER_STATE);
3817 
3818 #if KMP_AFFINITY_SUPPORTED
3819   root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3820   root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3821   root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3822   root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3823 #endif /* KMP_AFFINITY_SUPPORTED */
3824   root_thread->th.th_def_allocator = __kmp_def_allocator;
3825   root_thread->th.th_prev_level = 0;
3826   root_thread->th.th_prev_num_threads = 1;
3827 
3828   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3829   tmp->cg_root = root_thread;
3830   tmp->cg_thread_limit = __kmp_cg_max_nth;
3831   tmp->cg_nthreads = 1;
3832   KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3833                  " cg_nthreads init to 1\n",
3834                  root_thread, tmp));
3835   tmp->up = NULL;
3836   root_thread->th.th_cg_roots = tmp;
3837 
3838   __kmp_root_counter++;
3839 
3840 #if OMPT_SUPPORT
3841   if (!initial_thread && ompt_enabled.enabled) {
3842 
3843     kmp_info_t *root_thread = ompt_get_thread();
3844 
3845     ompt_set_thread_state(root_thread, ompt_state_overhead);
3846 
3847     if (ompt_enabled.ompt_callback_thread_begin) {
3848       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3849           ompt_thread_initial, __ompt_get_thread_data_internal());
3850     }
3851     ompt_data_t *task_data;
3852     ompt_data_t *parallel_data;
3853     __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
3854                                   NULL);
3855     if (ompt_enabled.ompt_callback_implicit_task) {
3856       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3857           ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3858     }
3859 
3860     ompt_set_thread_state(root_thread, ompt_state_work_serial);
3861   }
3862 #endif
3863 #if OMPD_SUPPORT
3864   if (ompd_state & OMPD_ENABLE_BP)
3865     ompd_bp_thread_begin();
3866 #endif
3867 
3868   KMP_MB();
3869   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3870 
3871   return gtid;
3872 }
3873 
3874 #if KMP_NESTED_HOT_TEAMS
3875 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3876                                 const int max_level) {
3877   int i, n, nth;
3878   kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3879   if (!hot_teams || !hot_teams[level].hot_team) {
3880     return 0;
3881   }
3882   KMP_DEBUG_ASSERT(level < max_level);
3883   kmp_team_t *team = hot_teams[level].hot_team;
3884   nth = hot_teams[level].hot_team_nth;
3885   n = nth - 1; // primary thread is not freed
3886   if (level < max_level - 1) {
3887     for (i = 0; i < nth; ++i) {
3888       kmp_info_t *th = team->t.t_threads[i];
3889       n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3890       if (i > 0 && th->th.th_hot_teams) {
3891         __kmp_free(th->th.th_hot_teams);
3892         th->th.th_hot_teams = NULL;
3893       }
3894     }
3895   }
3896   __kmp_free_team(root, team, NULL);
3897   return n;
3898 }
3899 #endif
3900 
// Resets a root thread and clears its root and hot teams.
3902 // Returns the number of __kmp_threads entries directly and indirectly freed.
3903 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3904   kmp_team_t *root_team = root->r.r_root_team;
3905   kmp_team_t *hot_team = root->r.r_hot_team;
3906   int n = hot_team->t.t_nproc;
3907   int i;
3908 
3909   KMP_DEBUG_ASSERT(!root->r.r_active);
3910 
3911   root->r.r_root_team = NULL;
3912   root->r.r_hot_team = NULL;
3913   // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3914   // before call to __kmp_free_team().
3915   __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3916 #if KMP_NESTED_HOT_TEAMS
3917   if (__kmp_hot_teams_max_level >
3918       0) { // need to free nested hot teams and their threads if any
3919     for (i = 0; i < hot_team->t.t_nproc; ++i) {
3920       kmp_info_t *th = hot_team->t.t_threads[i];
3921       if (__kmp_hot_teams_max_level > 1) {
3922         n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3923       }
3924       if (th->th.th_hot_teams) {
3925         __kmp_free(th->th.th_hot_teams);
3926         th->th.th_hot_teams = NULL;
3927       }
3928     }
3929   }
3930 #endif
3931   __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3932 
3933   // Before we can reap the thread, we need to make certain that all other
3934   // threads in the teams that had this root as ancestor have stopped trying to
3935   // steal tasks.
3936   if (__kmp_tasking_mode != tskm_immediate_exec) {
3937     __kmp_wait_to_unref_task_teams();
3938   }
3939 
3940 #if KMP_OS_WINDOWS
3941   /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3942   KA_TRACE(
3943       10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3944            "\n",
3945            (LPVOID) & (root->r.r_uber_thread->th),
3946            root->r.r_uber_thread->th.th_info.ds.ds_thread));
3947   __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3948 #endif /* KMP_OS_WINDOWS */
3949 
3950 #if OMPD_SUPPORT
3951   if (ompd_state & OMPD_ENABLE_BP)
3952     ompd_bp_thread_end();
3953 #endif
3954 
3955 #if OMPT_SUPPORT
3956   ompt_data_t *task_data;
3957   ompt_data_t *parallel_data;
3958   __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
3959                                 NULL);
3960   if (ompt_enabled.ompt_callback_implicit_task) {
3961     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3962         ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
3963   }
3964   if (ompt_enabled.ompt_callback_thread_end) {
3965     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3966         &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3967   }
3968 #endif
3969 
3970   TCW_4(__kmp_nth,
3971         __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3972   i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
3973   KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
3974                  " to %d\n",
3975                  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
3976                  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
3977   if (i == 1) {
3978     // need to free contention group structure
3979     KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
3980                      root->r.r_uber_thread->th.th_cg_roots->cg_root);
3981     KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
3982     __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
3983     root->r.r_uber_thread->th.th_cg_roots = NULL;
3984   }
3985   __kmp_reap_thread(root->r.r_uber_thread, 1);
3986 
  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
  // it instead of freeing it.
3989   root->r.r_uber_thread = NULL;
3990   /* mark root as no longer in use */
3991   root->r.r_begin = FALSE;
3992 
3993   return n;
3994 }
3995 
3996 void __kmp_unregister_root_current_thread(int gtid) {
3997   KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
3998   /* this lock should be ok, since unregister_root_current_thread is never
3999      called during an abort, only during a normal close. furthermore, if you
4000      have the forkjoin lock, you should never try to get the initz lock */
4001   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
4002   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
4003     KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
4004                   "exiting T#%d\n",
4005                   gtid));
4006     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4007     return;
4008   }
4009   kmp_root_t *root = __kmp_root[gtid];
4010 
4011   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4012   KMP_ASSERT(KMP_UBER_GTID(gtid));
4013   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4014   KMP_ASSERT(root->r.r_active == FALSE);
4015 
4016   KMP_MB();
4017 
4018   kmp_info_t *thread = __kmp_threads[gtid];
4019   kmp_team_t *team = thread->th.th_team;
4020   kmp_task_team_t *task_team = thread->th.th_task_team;
4021 
4022   // we need to wait for the proxy tasks before finishing the thread
4023   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
4024 #if OMPT_SUPPORT
4025     // the runtime is shutting down so we won't report any events
4026     thread->th.ompt_thread_info.state = ompt_state_undefined;
4027 #endif
4028     __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4029   }
4030 
4031   __kmp_reset_root(gtid, root);
4032 
4033   KMP_MB();
4034   KC_TRACE(10,
4035            ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4036 
4037   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4038 }
4039 
4040 #if KMP_OS_WINDOWS
4041 /* __kmp_forkjoin_lock must be already held
4042    Unregisters a root thread that is not the current thread.  Returns the number
4043    of __kmp_threads entries freed as a result. */
4044 static int __kmp_unregister_root_other_thread(int gtid) {
4045   kmp_root_t *root = __kmp_root[gtid];
4046   int r;
4047 
4048   KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4049   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4050   KMP_ASSERT(KMP_UBER_GTID(gtid));
4051   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4052   KMP_ASSERT(root->r.r_active == FALSE);
4053 
4054   r = __kmp_reset_root(gtid, root);
4055   KC_TRACE(10,
4056            ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4057   return r;
4058 }
4059 #endif
4060 
4061 #if KMP_DEBUG
4062 void __kmp_task_info() {
4063 
4064   kmp_int32 gtid = __kmp_entry_gtid();
4065   kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4066   kmp_info_t *this_thr = __kmp_threads[gtid];
4067   kmp_team_t *steam = this_thr->th.th_serial_team;
4068   kmp_team_t *team = this_thr->th.th_team;
4069 
4070   __kmp_printf(
4071       "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4072       "ptask=%p\n",
4073       gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4074       team->t.t_implicit_task_taskdata[tid].td_parent);
4075 }
4076 #endif // KMP_DEBUG
4077 
4078 /* TODO optimize with one big memclr, take out what isn't needed, split
4079    responsibility to workers as much as possible, and delay initialization of
4080    features as much as possible  */
4081 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4082                                   int tid, int gtid) {
4083   /* this_thr->th.th_info.ds.ds_gtid is setup in
4084      kmp_allocate_thread/create_worker.
4085      this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4086   KMP_DEBUG_ASSERT(this_thr != NULL);
4087   KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4088   KMP_DEBUG_ASSERT(team);
4089   KMP_DEBUG_ASSERT(team->t.t_threads);
4090   KMP_DEBUG_ASSERT(team->t.t_dispatch);
4091   kmp_info_t *master = team->t.t_threads[0];
4092   KMP_DEBUG_ASSERT(master);
4093   KMP_DEBUG_ASSERT(master->th.th_root);
4094 
4095   KMP_MB();
4096 
4097   TCW_SYNC_PTR(this_thr->th.th_team, team);
4098 
4099   this_thr->th.th_info.ds.ds_tid = tid;
4100   this_thr->th.th_set_nproc = 0;
4101   if (__kmp_tasking_mode != tskm_immediate_exec)
4102     // When tasking is possible, threads are not safe to reap until they are
4103     // done tasking; this will be set when tasking code is exited in wait
4104     this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4105   else // no tasking --> always safe to reap
4106     this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4107   this_thr->th.th_set_proc_bind = proc_bind_default;
4108 #if KMP_AFFINITY_SUPPORTED
4109   this_thr->th.th_new_place = this_thr->th.th_current_place;
4110 #endif
4111   this_thr->th.th_root = master->th.th_root;
4112 
4113   /* setup the thread's cache of the team structure */
4114   this_thr->th.th_team_nproc = team->t.t_nproc;
4115   this_thr->th.th_team_master = master;
4116   this_thr->th.th_team_serialized = team->t.t_serialized;
4117   TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4118 
4119   KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4120 
4121   KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4122                 tid, gtid, this_thr, this_thr->th.th_current_task));
4123 
4124   __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4125                            team, tid, TRUE);
4126 
4127   KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4128                 tid, gtid, this_thr, this_thr->th.th_current_task));
4129   // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4130   // __kmp_initialize_team()?
4131 
4132   /* TODO no worksharing in speculative threads */
4133   this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4134 
4135   this_thr->th.th_local.this_construct = 0;
4136 
4137   if (!this_thr->th.th_pri_common) {
4138     this_thr->th.th_pri_common =
4139         (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4140     if (__kmp_storage_map) {
4141       __kmp_print_storage_map_gtid(
4142           gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4143           sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4144     }
4145     this_thr->th.th_pri_head = NULL;
4146   }
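  // Note (informal): th_pri_common backs the threadprivate machinery (see
  // kmp_threadprivate.cpp). It is allocated lazily the first time this thread
  // is initialized for a team and then reused, which is why it is only
  // allocated here when still NULL.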
4147 
4148   if (this_thr != master && // Primary thread's CG root is initialized elsewhere
4149       this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4150     // Make new thread's CG root same as primary thread's
4151     KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4152     kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4153     if (tmp) {
4154       // worker changes CG, need to check if old CG should be freed
4155       int i = tmp->cg_nthreads--;
4156       KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4157                      " on node %p of thread %p to %d\n",
4158                      this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4159       if (i == 1) {
4160         __kmp_free(tmp); // last thread left CG --> free it
4161       }
4162     }
4163     this_thr->th.th_cg_roots = master->th.th_cg_roots;
4164     // Increment new thread's CG root's counter to add the new thread
4165     this_thr->th.th_cg_roots->cg_nthreads++;
4166     KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4167                    " node %p of thread %p to %d\n",
4168                    this_thr, this_thr->th.th_cg_roots,
4169                    this_thr->th.th_cg_roots->cg_root,
4170                    this_thr->th.th_cg_roots->cg_nthreads));
4171     this_thr->th.th_current_task->td_icvs.thread_limit =
4172         this_thr->th.th_cg_roots->cg_thread_limit;
4173   }
4174 
4175   /* Initialize dynamic dispatch */
4176   {
4177     volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4178     // Use team max_nproc since this will never change for the team.
4179     size_t disp_size =
4180         sizeof(dispatch_private_info_t) *
4181         (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
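    // Sizing rationale (informal): a serialized team (t_max_nproc == 1) never
    // has concurrent worksharing constructs in flight, so one
    // dispatch_private_info_t is enough; other teams get
    // __kmp_dispatch_num_buffers entries, which the dispatch code cycles
    // through so that consecutive loops do not clobber a buffer other threads
    // may still be reading.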
4182     KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4183                   team->t.t_max_nproc));
4184     KMP_ASSERT(dispatch);
4185     KMP_DEBUG_ASSERT(team->t.t_dispatch);
4186     KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4187 
4188     dispatch->th_disp_index = 0;
4189     dispatch->th_doacross_buf_idx = 0;
4190     if (!dispatch->th_disp_buffer) {
4191       dispatch->th_disp_buffer =
4192           (dispatch_private_info_t *)__kmp_allocate(disp_size);
4193 
4194       if (__kmp_storage_map) {
4195         __kmp_print_storage_map_gtid(
4196             gtid, &dispatch->th_disp_buffer[0],
4197             &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4198                                           ? 1
4199                                           : __kmp_dispatch_num_buffers],
4200             disp_size,
4201             "th_%d.th_dispatch.th_disp_buffer "
4202             "(team_%d.t_dispatch[%d].th_disp_buffer)",
4203             gtid, team->t.t_id, gtid);
4204       }
4205     } else {
4206       memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4207     }
4208 
4209     dispatch->th_dispatch_pr_current = 0;
4210     dispatch->th_dispatch_sh_current = 0;
4211 
4212     dispatch->th_deo_fcn = 0; /* ORDERED     */
4213     dispatch->th_dxo_fcn = 0; /* END ORDERED */
4214   }
4215 
4216   this_thr->th.th_next_pool = NULL;
4217 
4218   if (!this_thr->th.th_task_state_memo_stack) {
4219     size_t i;
4220     this_thr->th.th_task_state_memo_stack =
4221         (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4222     this_thr->th.th_task_state_top = 0;
4223     this_thr->th.th_task_state_stack_sz = 4;
4224     for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4225          ++i) // zero init the stack
4226       this_thr->th.th_task_state_memo_stack[i] = 0;
4227   }
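  // The memo stack saves th_task_state per nesting level so a thread returning
  // to an outer (hot) team can restore the task-team state it had there; it
  // starts at a small fixed size (4 entries) and is grown elsewhere (in the
  // fork path) if nesting goes deeper.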
4228 
4229   KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4230   KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4231 
4232   KMP_MB();
4233 }
4234 
4235 /* allocate a new thread for the requesting team. this is only called from
4236    within a forkjoin critical section. we will first try to get an available
4237    thread from the thread pool; if none is available, we will fork a new one,
4238    assuming we are able to create one. the caller is expected to have already
4239    checked that this is possible. */
4240 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4241                                   int new_tid) {
4242   kmp_team_t *serial_team;
4243   kmp_info_t *new_thr;
4244   int new_gtid;
4245 
4246   KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4247   KMP_DEBUG_ASSERT(root && team);
4248 #if !KMP_NESTED_HOT_TEAMS
4249   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4250 #endif
4251   KMP_MB();
4252 
4253   /* first, try to get one from the thread pool */
4254   if (__kmp_thread_pool) {
4255     new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4256     __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4257     if (new_thr == __kmp_thread_pool_insert_pt) {
4258       __kmp_thread_pool_insert_pt = NULL;
4259     }
4260     TCW_4(new_thr->th.th_in_pool, FALSE);
4261     __kmp_suspend_initialize_thread(new_thr);
4262     __kmp_lock_suspend_mx(new_thr);
4263     if (new_thr->th.th_active_in_pool == TRUE) {
4264       KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4265       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4266       new_thr->th.th_active_in_pool = FALSE;
4267     }
4268     __kmp_unlock_suspend_mx(new_thr);
4269 
4270     KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4271                   __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4272     KMP_ASSERT(!new_thr->th.th_team);
4273     KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4274 
4275     /* setup the thread structure */
4276     __kmp_initialize_info(new_thr, team, new_tid,
4277                           new_thr->th.th_info.ds.ds_gtid);
4278     KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4279 
4280     TCW_4(__kmp_nth, __kmp_nth + 1);
4281 
4282     new_thr->th.th_task_state = 0;
4283     new_thr->th.th_task_state_top = 0;
4284     new_thr->th.th_task_state_stack_sz = 4;
4285 
4286 #ifdef KMP_ADJUST_BLOCKTIME
4287     /* Adjust blocktime back to zero if necessary */
4288     /* Middle initialization might not have occurred yet */
4289     if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4290       if (__kmp_nth > __kmp_avail_proc) {
4291         __kmp_zero_bt = TRUE;
4292       }
4293     }
4294 #endif /* KMP_ADJUST_BLOCKTIME */
4295 
4296 #if KMP_DEBUG
4297     // If the thread entered the pool via __kmp_free_thread, wait_flag should
4298     // not equal KMP_BARRIER_PARENT_FLAG.
4299     int b;
4300     kmp_balign_t *balign = new_thr->th.th_bar;
4301     for (b = 0; b < bs_last_barrier; ++b)
4302       KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4303 #endif
4304 
4305     KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4306                   __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4307 
4308     KMP_MB();
4309     return new_thr;
4310   }
4311 
4312   /* no, we'll fork a new one */
4313   KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4314   KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4315 
4316 #if KMP_USE_MONITOR
4317   // If this is the first worker thread the RTL is creating, then also
4318   // launch the monitor thread.  We try to do this as early as possible.
4319   if (!TCR_4(__kmp_init_monitor)) {
4320     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4321     if (!TCR_4(__kmp_init_monitor)) {
4322       KF_TRACE(10, ("before __kmp_create_monitor\n"));
4323       TCW_4(__kmp_init_monitor, 1);
4324       __kmp_create_monitor(&__kmp_monitor);
4325       KF_TRACE(10, ("after __kmp_create_monitor\n"));
4326 #if KMP_OS_WINDOWS
4327       // AC: wait until the monitor has started. This is a fix for CQ232808.
4328       // The reason is that if the library is loaded/unloaded in a loop with
4329       // small (parallel) work in between, then there is a high probability
4330       // that the monitor thread starts only after the library has shut down.
4331       // At shutdown it is too late to cope with the problem, because when the
4332       // primary thread is in DllMain (process detach) the monitor has no
4333       // chance to start (it is blocked), and the primary thread has no means
4334       // to inform the monitor that the library has gone, because all the
4335       // memory which the monitor can access is going to be released/reset.
4336       while (TCR_4(__kmp_init_monitor) < 2) {
4337         KMP_YIELD(TRUE);
4338       }
4339       KF_TRACE(10, ("after monitor thread has started\n"));
4340 #endif
4341     }
4342     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4343   }
4344 #endif
4345 
4346   KMP_MB();
4347 
4348   {
4349     int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4350                              ? 1
4351                              : __kmp_hidden_helper_threads_num + 1;
4352 
4353     for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4354          ++new_gtid) {
4355       KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4356     }
4357 
4358     if (TCR_4(__kmp_init_hidden_helper_threads)) {
4359       KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4360     }
4361   }
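  // gtid numbering assumed here: gtid 0 is the initial root, gtids
  // 1..__kmp_hidden_helper_threads_num are reserved for hidden helper threads,
  // and regular workers come after that range -- hence the search starts at 1
  // while the hidden helper team is being built and at
  // __kmp_hidden_helper_threads_num + 1 afterwards.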
4362 
4363   /* allocate space for it. */
4364   new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4365 
4366   TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4367 
4368 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4369   // suppress race conditions detection on synchronization flags in debug mode
4370   // this helps to analyze library internals eliminating false positives
4371   __itt_suppress_mark_range(
4372       __itt_suppress_range, __itt_suppress_threading_errors,
4373       &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4374   __itt_suppress_mark_range(
4375       __itt_suppress_range, __itt_suppress_threading_errors,
4376       &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4377 #if KMP_OS_WINDOWS
4378   __itt_suppress_mark_range(
4379       __itt_suppress_range, __itt_suppress_threading_errors,
4380       &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4381 #else
4382   __itt_suppress_mark_range(__itt_suppress_range,
4383                             __itt_suppress_threading_errors,
4384                             &new_thr->th.th_suspend_init_count,
4385                             sizeof(new_thr->th.th_suspend_init_count));
4386 #endif
4387   // TODO: check if we need to also suppress b_arrived flags
4388   __itt_suppress_mark_range(__itt_suppress_range,
4389                             __itt_suppress_threading_errors,
4390                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4391                             sizeof(new_thr->th.th_bar[0].bb.b_go));
4392   __itt_suppress_mark_range(__itt_suppress_range,
4393                             __itt_suppress_threading_errors,
4394                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4395                             sizeof(new_thr->th.th_bar[1].bb.b_go));
4396   __itt_suppress_mark_range(__itt_suppress_range,
4397                             __itt_suppress_threading_errors,
4398                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4399                             sizeof(new_thr->th.th_bar[2].bb.b_go));
4400 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4401   if (__kmp_storage_map) {
4402     __kmp_print_thread_storage_map(new_thr, new_gtid);
4403   }
4404 
4405   // add the reserve serialized team, initialized from the team's primary thread
4406   {
4407     kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4408     KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4409     new_thr->th.th_serial_team = serial_team =
4410         (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4411 #if OMPT_SUPPORT
4412                                           ompt_data_none, // root parallel id
4413 #endif
4414                                           proc_bind_default, &r_icvs,
4415                                           0 USE_NESTED_HOT_ARG(NULL));
4416   }
4417   KMP_ASSERT(serial_team);
4418   serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not
4419   // for execution (it is unused for now).
4420   serial_team->t.t_threads[0] = new_thr;
4421   KF_TRACE(10,
4422            ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4423             new_thr));
4424 
4425   /* setup the thread structures */
4426   __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4427 
4428 #if USE_FAST_MEMORY
4429   __kmp_initialize_fast_memory(new_thr);
4430 #endif /* USE_FAST_MEMORY */
4431 
4432 #if KMP_USE_BGET
4433   KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4434   __kmp_initialize_bget(new_thr);
4435 #endif
4436 
4437   __kmp_init_random(new_thr); // Initialize random number generator
4438 
4439   /* Initialize these only once when thread is grabbed for a team allocation */
4440   KA_TRACE(20,
4441            ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4442             __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4443 
4444   int b;
4445   kmp_balign_t *balign = new_thr->th.th_bar;
4446   for (b = 0; b < bs_last_barrier; ++b) {
4447     balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4448     balign[b].bb.team = NULL;
4449     balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4450     balign[b].bb.use_oncore_barrier = 0;
4451   }
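  // th_bar[] keeps per-barrier-type state (plain, fork/join and, when enabled,
  // reduction barriers); b_go is the release flag a sleeping worker waits on,
  // so it must start at KMP_INIT_BARRIER_STATE before the thread joins a team.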
4452 
4453   new_thr->th.th_spin_here = FALSE;
4454   new_thr->th.th_next_waiting = 0;
4455 #if KMP_OS_UNIX
4456   new_thr->th.th_blocking = false;
4457 #endif
4458 
4459 #if KMP_AFFINITY_SUPPORTED
4460   new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4461   new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4462   new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4463   new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4464 #endif
4465   new_thr->th.th_def_allocator = __kmp_def_allocator;
4466   new_thr->th.th_prev_level = 0;
4467   new_thr->th.th_prev_num_threads = 1;
4468 
4469   TCW_4(new_thr->th.th_in_pool, FALSE);
4470   new_thr->th.th_active_in_pool = FALSE;
4471   TCW_4(new_thr->th.th_active, TRUE);
4472 
4473   /* adjust the global counters */
4474   __kmp_all_nth++;
4475   __kmp_nth++;
4476 
4477   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4478   // numbers of procs, and method #2 (keyed API call) for higher numbers.
4479   if (__kmp_adjust_gtid_mode) {
4480     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4481       if (TCR_4(__kmp_gtid_mode) != 2) {
4482         TCW_4(__kmp_gtid_mode, 2);
4483       }
4484     } else {
4485       if (TCR_4(__kmp_gtid_mode) != 1) {
4486         TCW_4(__kmp_gtid_mode, 1);
4487       }
4488     }
4489   }
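  // Rationale (informal): method #1 resolves the gtid by searching thread
  // stack addresses, which degrades as the thread count grows; method #2 uses
  // the thread-specific-data key and stays cheap, so we switch over once
  // __kmp_all_nth reaches __kmp_tls_gtid_min.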
4490 
4491 #ifdef KMP_ADJUST_BLOCKTIME
4492   /* Adjust blocktime back to zero if necessary       */
4493   /* Middle initialization might not have occurred yet */
4494   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4495     if (__kmp_nth > __kmp_avail_proc) {
4496       __kmp_zero_bt = TRUE;
4497     }
4498   }
4499 #endif /* KMP_ADJUST_BLOCKTIME */
4500 
4501   /* actually fork it and create the new worker thread */
4502   KF_TRACE(
4503       10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4504   __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4505   KF_TRACE(10,
4506            ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4507 
4508   KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4509                 new_gtid));
4510   KMP_MB();
4511   return new_thr;
4512 }
4513 
4514 /* Reinitialize team for reuse.
4515    The hot team code calls this routine at every fork barrier, so the EPCC
4516    barrier tests are extremely sensitive to changes in it, especially writes
4517    to the team struct, which cause a cache invalidation in all threads.
4518    IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4519 static void __kmp_reinitialize_team(kmp_team_t *team,
4520                                     kmp_internal_control_t *new_icvs,
4521                                     ident_t *loc) {
4522   KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4523                 team->t.t_threads[0], team));
4524   KMP_DEBUG_ASSERT(team && new_icvs);
4525   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4526   KMP_CHECK_UPDATE(team->t.t_ident, loc);
4527 
4528   KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4529   // Copy ICVs to the primary thread's implicit taskdata
4530   __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4531   copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4532 
4533   KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4534                 team->t.t_threads[0], team));
4535 }
4536 
4537 /* Initialize the team data structure.
4538    This assumes the t_threads and t_max_nproc are already set.
4539    Also, we don't touch the arguments */
4540 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4541                                   kmp_internal_control_t *new_icvs,
4542                                   ident_t *loc) {
4543   KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4544 
4545   /* verify */
4546   KMP_DEBUG_ASSERT(team);
4547   KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4548   KMP_DEBUG_ASSERT(team->t.t_threads);
4549   KMP_MB();
4550 
4551   team->t.t_master_tid = 0; /* not needed */
4552   /* team->t.t_master_bar;        not needed */
4553   team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4554   team->t.t_nproc = new_nproc;
4555 
4556   /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
4557   team->t.t_next_pool = NULL;
4558   /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4559    * up hot team */
4560 
4561   TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4562   team->t.t_invoke = NULL; /* not needed */
4563 
4564   // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
4565   team->t.t_sched.sched = new_icvs->sched.sched;
4566 
4567 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4568   team->t.t_fp_control_saved = FALSE; /* not needed */
4569   team->t.t_x87_fpu_control_word = 0; /* not needed */
4570   team->t.t_mxcsr = 0; /* not needed */
4571 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4572 
4573   team->t.t_construct = 0;
4574 
4575   team->t.t_ordered.dt.t_value = 0;
4576   team->t.t_master_active = FALSE;
4577 
4578 #ifdef KMP_DEBUG
4579   team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4580 #endif
4581 #if KMP_OS_WINDOWS
4582   team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4583 #endif
4584 
4585   team->t.t_control_stack_top = NULL;
4586 
4587   __kmp_reinitialize_team(team, new_icvs, loc);
4588 
4589   KMP_MB();
4590   KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4591 }
4592 
4593 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
4594 /* Sets full mask for thread; stores old mask in *old_mask (if non-NULL). */
4595 static void
4596 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4597   if (KMP_AFFINITY_CAPABLE()) {
4598     int status;
4599     if (old_mask != NULL) {
4600       status = __kmp_get_system_affinity(old_mask, TRUE);
4601       int error = errno;
4602       if (status != 0) {
4603         __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4604                     __kmp_msg_null);
4605       }
4606     }
4607     __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4608   }
4609 }
4610 #endif
4611 
4612 #if KMP_AFFINITY_SUPPORTED
4613 
4614 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4615 // It calculates the worker + primary thread's partition based upon the parent
4616 // thread's partition, and binds each worker to a place in its partition.
4617 // The primary thread's partition should already include its current binding.
4618 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4619   // Do not partition places for the hidden helper team
4620   if (KMP_HIDDEN_HELPER_TEAM(team))
4621     return;
4622   // Copy the primary thread's place partition to the team struct
4623   kmp_info_t *master_th = team->t.t_threads[0];
4624   KMP_DEBUG_ASSERT(master_th != NULL);
4625   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4626   int first_place = master_th->th.th_first_place;
4627   int last_place = master_th->th.th_last_place;
4628   int masters_place = master_th->th.th_current_place;
4629   team->t.t_first_place = first_place;
4630   team->t.t_last_place = last_place;
4631 
4632   KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4633                 "bound to place %d partition = [%d,%d]\n",
4634                 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4635                 team->t.t_id, masters_place, first_place, last_place));
4636 
4637   switch (proc_bind) {
4638 
4639   case proc_bind_default:
4640     // Serial teams might have the proc_bind policy set to proc_bind_default.
4641     // Not an issue -- we don't rebind primary thread for any proc_bind policy.
4642     KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4643     break;
4644 
4645   case proc_bind_primary: {
4646     int f;
4647     int n_th = team->t.t_nproc;
4648     for (f = 1; f < n_th; f++) {
4649       kmp_info_t *th = team->t.t_threads[f];
4650       KMP_DEBUG_ASSERT(th != NULL);
4651       th->th.th_first_place = first_place;
4652       th->th.th_last_place = last_place;
4653       th->th.th_new_place = masters_place;
4654       if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4655           team->t.t_display_affinity != 1) {
4656         team->t.t_display_affinity = 1;
4657       }
4658 
4659       KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
4660                      "partition = [%d,%d]\n",
4661                      __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4662                      f, masters_place, first_place, last_place));
4663     }
4664   } break;
4665 
4666   case proc_bind_close: {
4667     int f;
4668     int n_th = team->t.t_nproc;
4669     int n_places;
4670     if (first_place <= last_place) {
4671       n_places = last_place - first_place + 1;
4672     } else {
4673       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4674     }
4675     if (n_th <= n_places) {
4676       int place = masters_place;
4677       for (f = 1; f < n_th; f++) {
4678         kmp_info_t *th = team->t.t_threads[f];
4679         KMP_DEBUG_ASSERT(th != NULL);
4680 
4681         if (place == last_place) {
4682           place = first_place;
4683         } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4684           place = 0;
4685         } else {
4686           place++;
4687         }
4688         th->th.th_first_place = first_place;
4689         th->th.th_last_place = last_place;
4690         th->th.th_new_place = place;
4691         if (__kmp_display_affinity && place != th->th.th_current_place &&
4692             team->t.t_display_affinity != 1) {
4693           team->t.t_display_affinity = 1;
4694         }
4695 
4696         KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4697                        "partition = [%d,%d]\n",
4698                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4699                        team->t.t_id, f, place, first_place, last_place));
4700       }
4701     } else {
4702       int S, rem, gap, s_count;
4703       S = n_th / n_places;
4704       s_count = 0;
4705       rem = n_th - (S * n_places);
4706       gap = rem > 0 ? n_places / rem : n_places;
4707       int place = masters_place;
4708       int gap_ct = gap;
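      // Illustrative example: n_th = 10 threads over n_places = 4 places gives
      // S = 2, rem = 2, gap = 2, so every second place receives one extra
      // thread and the resulting distribution is 3,2,3,2 starting from the
      // primary thread's place.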
4709       for (f = 0; f < n_th; f++) {
4710         kmp_info_t *th = team->t.t_threads[f];
4711         KMP_DEBUG_ASSERT(th != NULL);
4712 
4713         th->th.th_first_place = first_place;
4714         th->th.th_last_place = last_place;
4715         th->th.th_new_place = place;
4716         if (__kmp_display_affinity && place != th->th.th_current_place &&
4717             team->t.t_display_affinity != 1) {
4718           team->t.t_display_affinity = 1;
4719         }
4720         s_count++;
4721 
4722         if ((s_count == S) && rem && (gap_ct == gap)) {
4723           // do nothing, add an extra thread to place on next iteration
4724         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4725           // we added an extra thread to this place; move to next place
4726           if (place == last_place) {
4727             place = first_place;
4728           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4729             place = 0;
4730           } else {
4731             place++;
4732           }
4733           s_count = 0;
4734           gap_ct = 1;
4735           rem--;
4736         } else if (s_count == S) { // place full; don't add extra
4737           if (place == last_place) {
4738             place = first_place;
4739           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4740             place = 0;
4741           } else {
4742             place++;
4743           }
4744           gap_ct++;
4745           s_count = 0;
4746         }
4747 
4748         KA_TRACE(100,
4749                  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4750                   "partition = [%d,%d]\n",
4751                   __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4752                   th->th.th_new_place, first_place, last_place));
4753       }
4754       KMP_DEBUG_ASSERT(place == masters_place);
4755     }
4756   } break;
4757 
4758   case proc_bind_spread: {
4759     int f;
4760     int n_th = team->t.t_nproc;
4761     int n_places;
4762     int thidx;
4763     if (first_place <= last_place) {
4764       n_places = last_place - first_place + 1;
4765     } else {
4766       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4767     }
4768     if (n_th <= n_places) {
4769       int place = -1;
4770 
4771       if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4772         int S = n_places / n_th;
4773         int s_count, rem, gap, gap_ct;
4774 
4775         place = masters_place;
4776         rem = n_places - n_th * S;
4777         gap = rem ? n_th / rem : 1;
4778         gap_ct = gap;
4779         thidx = n_th;
4780         if (update_master_only == 1)
4781           thidx = 1;
4782         for (f = 0; f < thidx; f++) {
4783           kmp_info_t *th = team->t.t_threads[f];
4784           KMP_DEBUG_ASSERT(th != NULL);
4785 
4786           th->th.th_first_place = place;
4787           th->th.th_new_place = place;
4788           if (__kmp_display_affinity && place != th->th.th_current_place &&
4789               team->t.t_display_affinity != 1) {
4790             team->t.t_display_affinity = 1;
4791           }
4792           s_count = 1;
4793           while (s_count < S) {
4794             if (place == last_place) {
4795               place = first_place;
4796             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4797               place = 0;
4798             } else {
4799               place++;
4800             }
4801             s_count++;
4802           }
4803           if (rem && (gap_ct == gap)) {
4804             if (place == last_place) {
4805               place = first_place;
4806             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4807               place = 0;
4808             } else {
4809               place++;
4810             }
4811             rem--;
4812             gap_ct = 0;
4813           }
4814           th->th.th_last_place = place;
4815           gap_ct++;
4816 
4817           if (place == last_place) {
4818             place = first_place;
4819           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4820             place = 0;
4821           } else {
4822             place++;
4823           }
4824 
4825           KA_TRACE(100,
4826                    ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4827                     "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4828                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4829                     f, th->th.th_new_place, th->th.th_first_place,
4830                     th->th.th_last_place, __kmp_affinity_num_masks));
4831         }
4832       } else {
4833         /* Given a uniform space of available computation places, we can
4834            create T partitions of roughly P/T places each and put each thread
4835            into the first place of its partition. */
4836         double current = static_cast<double>(masters_place);
4837         double spacing =
4838             (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
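        // Illustrative example: with n_places = 8 (the full place list),
        // n_th = 4 and the primary thread on place 0, spacing = 9/4 = 2.25 and
        // the partitions come out as [0,1], [2,3], [4,5], [6,7], each thread
        // bound to the first place of its partition.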
4839         int first, last;
4840         kmp_info_t *th;
4841 
4842         thidx = n_th + 1;
4843         if (update_master_only == 1)
4844           thidx = 1;
4845         for (f = 0; f < thidx; f++) {
4846           first = static_cast<int>(current);
4847           last = static_cast<int>(current + spacing) - 1;
4848           KMP_DEBUG_ASSERT(last >= first);
4849           if (first >= n_places) {
4850             if (masters_place) {
4851               first -= n_places;
4852               last -= n_places;
4853               if (first == (masters_place + 1)) {
4854                 KMP_DEBUG_ASSERT(f == n_th);
4855                 first--;
4856               }
4857               if (last == masters_place) {
4858                 KMP_DEBUG_ASSERT(f == (n_th - 1));
4859                 last--;
4860               }
4861             } else {
4862               KMP_DEBUG_ASSERT(f == n_th);
4863               first = 0;
4864               last = 0;
4865             }
4866           }
4867           if (last >= n_places) {
4868             last = (n_places - 1);
4869           }
4870           place = first;
4871           current += spacing;
4872           if (f < n_th) {
4873             KMP_DEBUG_ASSERT(0 <= first);
4874             KMP_DEBUG_ASSERT(n_places > first);
4875             KMP_DEBUG_ASSERT(0 <= last);
4876             KMP_DEBUG_ASSERT(n_places > last);
4877             KMP_DEBUG_ASSERT(last_place >= first_place);
4878             th = team->t.t_threads[f];
4879             KMP_DEBUG_ASSERT(th);
4880             th->th.th_first_place = first;
4881             th->th.th_new_place = place;
4882             th->th.th_last_place = last;
4883             if (__kmp_display_affinity && place != th->th.th_current_place &&
4884                 team->t.t_display_affinity != 1) {
4885               team->t.t_display_affinity = 1;
4886             }
4887             KA_TRACE(100,
4888                      ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4889                       "partition = [%d,%d], spacing = %.4f\n",
4890                       __kmp_gtid_from_thread(team->t.t_threads[f]),
4891                       team->t.t_id, f, th->th.th_new_place,
4892                       th->th.th_first_place, th->th.th_last_place, spacing));
4893           }
4894         }
4895       }
4896       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4897     } else {
4898       int S, rem, gap, s_count;
4899       S = n_th / n_places;
4900       s_count = 0;
4901       rem = n_th - (S * n_places);
4902       gap = rem > 0 ? n_places / rem : n_places;
4903       int place = masters_place;
4904       int gap_ct = gap;
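      // Same S/rem/gap scheme as the oversubscribed proc_bind_close case
      // above, except each thread's partition is narrowed to the single place
      // it lands on (th_first_place == th_last_place), so regions nested
      // inside that thread inherit only that one place.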
4905       thidx = n_th;
4906       if (update_master_only == 1)
4907         thidx = 1;
4908       for (f = 0; f < thidx; f++) {
4909         kmp_info_t *th = team->t.t_threads[f];
4910         KMP_DEBUG_ASSERT(th != NULL);
4911 
4912         th->th.th_first_place = place;
4913         th->th.th_last_place = place;
4914         th->th.th_new_place = place;
4915         if (__kmp_display_affinity && place != th->th.th_current_place &&
4916             team->t.t_display_affinity != 1) {
4917           team->t.t_display_affinity = 1;
4918         }
4919         s_count++;
4920 
4921         if ((s_count == S) && rem && (gap_ct == gap)) {
4922           // do nothing, add an extra thread to place on next iteration
4923         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4924           // we added an extra thread to this place; move on to next place
4925           if (place == last_place) {
4926             place = first_place;
4927           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4928             place = 0;
4929           } else {
4930             place++;
4931           }
4932           s_count = 0;
4933           gap_ct = 1;
4934           rem--;
4935         } else if (s_count == S) { // place is full; don't add extra thread
4936           if (place == last_place) {
4937             place = first_place;
4938           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4939             place = 0;
4940           } else {
4941             place++;
4942           }
4943           gap_ct++;
4944           s_count = 0;
4945         }
4946 
4947         KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4948                        "partition = [%d,%d]\n",
4949                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4950                        team->t.t_id, f, th->th.th_new_place,
4951                        th->th.th_first_place, th->th.th_last_place));
4952       }
4953       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4954     }
4955   } break;
4956 
4957   default:
4958     break;
4959   }
4960 
4961   KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4962 }
4963 
4964 #endif // KMP_AFFINITY_SUPPORTED
4965 
4966 /* allocate a new team data structure to use.  take one off of the free pool if
4967    available */
4968 kmp_team_t *
4969 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4970 #if OMPT_SUPPORT
4971                     ompt_data_t ompt_parallel_data,
4972 #endif
4973                     kmp_proc_bind_t new_proc_bind,
4974                     kmp_internal_control_t *new_icvs,
4975                     int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4976   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4977   int f;
4978   kmp_team_t *team;
4979   int use_hot_team = !root->r.r_active;
4980   int level = 0;
4981 
4982   KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4983   KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4984   KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4985   KMP_MB();
4986 
4987 #if KMP_NESTED_HOT_TEAMS
4988   kmp_hot_team_ptr_t *hot_teams;
4989   if (master) {
4990     team = master->th.th_team;
4991     level = team->t.t_active_level;
4992     if (master->th.th_teams_microtask) { // in teams construct?
4993       if (master->th.th_teams_size.nteams > 1 &&
4994           ( // #teams > 1
4995               team->t.t_pkfn ==
4996                   (microtask_t)__kmp_teams_master || // inner fork of the teams
4997               master->th.th_teams_level <
4998                   team->t.t_level)) { // or nested parallel inside the teams
4999         ++level; // do not increment if #teams==1 or for the outer fork of
5000         // the teams construct; increment otherwise
5001       }
5002     }
5003     hot_teams = master->th.th_hot_teams;
5004     if (level < __kmp_hot_teams_max_level && hot_teams &&
5005         hot_teams[level].hot_team) {
5006       // hot team has already been allocated for given level
5007       use_hot_team = 1;
5008     } else {
5009       use_hot_team = 0;
5010     }
5011   } else {
5012     // check we won't access uninitialized hot_teams, just in case
5013     KMP_DEBUG_ASSERT(new_nproc == 1);
5014   }
5015 #endif
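  // Background: a hot team keeps the threads of a completed parallel region
  // attached to the team so the next fork can reuse them without
  // re-allocation. With KMP_NESTED_HOT_TEAMS, th_hot_teams caches one hot team
  // per nesting level (up to __kmp_hot_teams_max_level); the level computed
  // above selects which cached team, if any, may be reused below.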
5016   // Optimization to use a "hot" team
5017   if (use_hot_team && new_nproc > 1) {
5018     KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
5019 #if KMP_NESTED_HOT_TEAMS
5020     team = hot_teams[level].hot_team;
5021 #else
5022     team = root->r.r_hot_team;
5023 #endif
5024 #if KMP_DEBUG
5025     if (__kmp_tasking_mode != tskm_immediate_exec) {
5026       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5027                     "task_team[1] = %p before reinit\n",
5028                     team->t.t_task_team[0], team->t.t_task_team[1]));
5029     }
5030 #endif
5031 
5032     // Has the number of threads changed?
5033     /* Let's assume the most common case is that the number of threads is
5034        unchanged, and put that case first. */
5035     if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5036       KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5037       // This case can mean that omp_set_num_threads() was called and the hot
5038       // team size was already reduced, so we check the special flag
5039       if (team->t.t_size_changed == -1) {
5040         team->t.t_size_changed = 1;
5041       } else {
5042         KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5043       }
5044 
5045       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5046       kmp_r_sched_t new_sched = new_icvs->sched;
5047       // set primary thread's schedule as new run-time schedule
5048       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5049 
5050       __kmp_reinitialize_team(team, new_icvs,
5051                               root->r.r_uber_thread->th.th_ident);
5052 
5053       KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5054                     team->t.t_threads[0], team));
5055       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5056 
5057 #if KMP_AFFINITY_SUPPORTED
5058       if ((team->t.t_size_changed == 0) &&
5059           (team->t.t_proc_bind == new_proc_bind)) {
5060         if (new_proc_bind == proc_bind_spread) {
5061           __kmp_partition_places(
5062               team, 1); // add flag to update only master for spread
5063         }
5064         KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5065                        "proc_bind = %d, partition = [%d,%d]\n",
5066                        team->t.t_id, new_proc_bind, team->t.t_first_place,
5067                        team->t.t_last_place));
5068       } else {
5069         KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5070         __kmp_partition_places(team);
5071       }
5072 #else
5073       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5074 #endif /* KMP_AFFINITY_SUPPORTED */
5075     } else if (team->t.t_nproc > new_nproc) {
5076       KA_TRACE(20,
5077                ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5078                 new_nproc));
5079 
5080       team->t.t_size_changed = 1;
5081 #if KMP_NESTED_HOT_TEAMS
5082       if (__kmp_hot_teams_mode == 0) {
5083         // AC: saved number of threads should correspond to team's value in this
5084         // mode, can be bigger in mode 1, when hot team has threads in reserve
5085         KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5086         hot_teams[level].hot_team_nth = new_nproc;
5087 #endif // KMP_NESTED_HOT_TEAMS
5088         /* release the extra threads we don't need any more */
5089         for (f = new_nproc; f < team->t.t_nproc; f++) {
5090           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5091           if (__kmp_tasking_mode != tskm_immediate_exec) {
5092             // When decreasing team size, threads no longer in the team should
5093             // unref task team.
5094             team->t.t_threads[f]->th.th_task_team = NULL;
5095           }
5096           __kmp_free_thread(team->t.t_threads[f]);
5097           team->t.t_threads[f] = NULL;
5098         }
5099 #if KMP_NESTED_HOT_TEAMS
5100       } // (__kmp_hot_teams_mode == 0)
5101       else {
5102         // When keeping extra threads in team, switch threads to wait on own
5103         // b_go flag
5104         for (f = new_nproc; f < team->t.t_nproc; ++f) {
5105           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5106           kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5107           for (int b = 0; b < bs_last_barrier; ++b) {
5108             if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5109               balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5110             }
5111             KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5112           }
5113         }
5114       }
5115 #endif // KMP_NESTED_HOT_TEAMS
5116       team->t.t_nproc = new_nproc;
5117       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5118       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5119       __kmp_reinitialize_team(team, new_icvs,
5120                               root->r.r_uber_thread->th.th_ident);
5121 
5122       // Update remaining threads
5123       for (f = 0; f < new_nproc; ++f) {
5124         team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5125       }
5126 
5127       // restore the current task state of the primary thread: should be the
5128       // implicit task
5129       KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5130                     team->t.t_threads[0], team));
5131 
5132       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5133 
5134 #ifdef KMP_DEBUG
5135       for (f = 0; f < team->t.t_nproc; f++) {
5136         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5137                          team->t.t_threads[f]->th.th_team_nproc ==
5138                              team->t.t_nproc);
5139       }
5140 #endif
5141 
5142       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5143 #if KMP_AFFINITY_SUPPORTED
5144       __kmp_partition_places(team);
5145 #endif
5146     } else { // team->t.t_nproc < new_nproc
5147 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5148       kmp_affin_mask_t *old_mask;
5149       if (KMP_AFFINITY_CAPABLE()) {
5150         KMP_CPU_ALLOC(old_mask);
5151       }
5152 #endif
5153 
5154       KA_TRACE(20,
5155                ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5156                 new_nproc));
5157 
5158       team->t.t_size_changed = 1;
5159 
5160 #if KMP_NESTED_HOT_TEAMS
5161       int avail_threads = hot_teams[level].hot_team_nth;
5162       if (new_nproc < avail_threads)
5163         avail_threads = new_nproc;
5164       kmp_info_t **other_threads = team->t.t_threads;
5165       for (f = team->t.t_nproc; f < avail_threads; ++f) {
5166         // Adjust barrier data of reserved threads (if any) of the team
5167         // Other data will be set in __kmp_initialize_info() below.
5168         int b;
5169         kmp_balign_t *balign = other_threads[f]->th.th_bar;
5170         for (b = 0; b < bs_last_barrier; ++b) {
5171           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5172           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5173 #if USE_DEBUGGER
5174           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5175 #endif
5176         }
5177       }
5178       if (hot_teams[level].hot_team_nth >= new_nproc) {
5179         // we have all needed threads in reserve, no need to allocate any
5180         // this is only possible in mode 1; cannot have reserved threads in mode 0
5181         KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5182         team->t.t_nproc = new_nproc; // just get reserved threads involved
5183       } else {
5184         // we may have some threads in reserve, but not enough
5185         team->t.t_nproc =
5186             hot_teams[level]
5187                 .hot_team_nth; // get reserved threads involved if any
5188         hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5189 #endif // KMP_NESTED_HOT_TEAMS
5190         if (team->t.t_max_nproc < new_nproc) {
5191           /* reallocate larger arrays */
5192           __kmp_reallocate_team_arrays(team, new_nproc);
5193           __kmp_reinitialize_team(team, new_icvs, NULL);
5194         }
5195 
5196 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5197         /* Temporarily set the full mask for the primary thread before the
5198            workers are created. Workers inherit the affinity from the primary
5199            thread, so if many workers are created quickly on a single core,
5200            they would not get a chance to set their own affinity for a long
5201            time. */
5202         __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5203 #endif
5204 
5205         /* allocate new threads for the hot team */
5206         for (f = team->t.t_nproc; f < new_nproc; f++) {
5207           kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5208           KMP_DEBUG_ASSERT(new_worker);
5209           team->t.t_threads[f] = new_worker;
5210 
5211           KA_TRACE(20,
5212                    ("__kmp_allocate_team: team %d init T#%d(%d:%d) arrived: "
5213                     "join=%llu, plain=%llu\n",
5214                     team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5215                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5216                     team->t.t_bar[bs_plain_barrier].b_arrived));
5217 
5218           { // Initialize barrier data for new threads.
5219             int b;
5220             kmp_balign_t *balign = new_worker->th.th_bar;
5221             for (b = 0; b < bs_last_barrier; ++b) {
5222               balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5223               KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5224                                KMP_BARRIER_PARENT_FLAG);
5225 #if USE_DEBUGGER
5226               balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5227 #endif
5228             }
5229           }
5230         }
5231 
5232 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5233         if (KMP_AFFINITY_CAPABLE()) {
5234           /* Restore initial primary thread's affinity mask */
5235           __kmp_set_system_affinity(old_mask, TRUE);
5236           KMP_CPU_FREE(old_mask);
5237         }
5238 #endif
5239 #if KMP_NESTED_HOT_TEAMS
5240       } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5241 #endif // KMP_NESTED_HOT_TEAMS
5242       /* make sure everyone is synchronized */
5243       int old_nproc = team->t.t_nproc; // save old value and use to update only
5244       // new threads below
5245       __kmp_initialize_team(team, new_nproc, new_icvs,
5246                             root->r.r_uber_thread->th.th_ident);
5247 
5248       /* reinitialize the threads */
5249       KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5250       for (f = 0; f < team->t.t_nproc; ++f)
5251         __kmp_initialize_info(team->t.t_threads[f], team, f,
5252                               __kmp_gtid_from_tid(f, team));
5253 
5254       if (level) { // set th_task_state for new threads in nested hot team
5255         // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5256         // only need to set the th_task_state for the new threads. th_task_state
5257         // for primary thread will not be accurate until after this in
5258         // __kmp_fork_call(), so we look to the primary thread's memo_stack to
5259         // get the correct value.
5260         for (f = old_nproc; f < team->t.t_nproc; ++f)
5261           team->t.t_threads[f]->th.th_task_state =
5262               team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5263       } else { // set th_task_state for new threads in non-nested hot team
5264         // copy primary thread's state
5265         kmp_uint8 old_state = team->t.t_threads[0]->th.th_task_state;
5266         for (f = old_nproc; f < team->t.t_nproc; ++f)
5267           team->t.t_threads[f]->th.th_task_state = old_state;
5268       }
5269 
5270 #ifdef KMP_DEBUG
5271       for (f = 0; f < team->t.t_nproc; ++f) {
5272         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5273                          team->t.t_threads[f]->th.th_team_nproc ==
5274                              team->t.t_nproc);
5275       }
5276 #endif
5277 
5278       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5279 #if KMP_AFFINITY_SUPPORTED
5280       __kmp_partition_places(team);
5281 #endif
5282     } // Check changes in number of threads
5283 
5284     kmp_info_t *master = team->t.t_threads[0];
5285     if (master->th.th_teams_microtask) {
5286       for (f = 1; f < new_nproc; ++f) {
5287         // propagate teams construct specific info to workers
5288         kmp_info_t *thr = team->t.t_threads[f];
5289         thr->th.th_teams_microtask = master->th.th_teams_microtask;
5290         thr->th.th_teams_level = master->th.th_teams_level;
5291         thr->th.th_teams_size = master->th.th_teams_size;
5292       }
5293     }
5294 #if KMP_NESTED_HOT_TEAMS
5295     if (level) {
5296       // Sync barrier state for nested hot teams, not needed for outermost hot
5297       // team.
5298       for (f = 1; f < new_nproc; ++f) {
5299         kmp_info_t *thr = team->t.t_threads[f];
5300         int b;
5301         kmp_balign_t *balign = thr->th.th_bar;
5302         for (b = 0; b < bs_last_barrier; ++b) {
5303           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5304           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5305 #if USE_DEBUGGER
5306           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5307 #endif
5308         }
5309       }
5310     }
5311 #endif // KMP_NESTED_HOT_TEAMS
5312 
5313     /* reallocate space for arguments if necessary */
5314     __kmp_alloc_argv_entries(argc, team, TRUE);
5315     KMP_CHECK_UPDATE(team->t.t_argc, argc);
5316     // The hot team re-uses the previous task team,
5317     // if untouched during the previous release->gather phase.
5318 
5319     KF_TRACE(10, (" hot_team = %p\n", team));
5320 
5321 #if KMP_DEBUG
5322     if (__kmp_tasking_mode != tskm_immediate_exec) {
5323       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5324                     "task_team[1] = %p after reinit\n",
5325                     team->t.t_task_team[0], team->t.t_task_team[1]));
5326     }
5327 #endif
5328 
5329 #if OMPT_SUPPORT
5330     __ompt_team_assign_id(team, ompt_parallel_data);
5331 #endif
5332 
5333     KMP_MB();
5334 
5335     return team;
5336   }
5337 
5338   /* next, let's try to take one from the team pool */
5339   KMP_MB();
5340   for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5341     /* TODO: consider resizing undersized teams instead of reaping them, now
5342        that we have a resizing mechanism */
5343     if (team->t.t_max_nproc >= max_nproc) {
5344       /* take this team from the team pool */
5345       __kmp_team_pool = team->t.t_next_pool;
5346 
5347       /* setup the team for fresh use */
5348       __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5349 
5350       KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5351                     "task_team[1] %p to NULL\n",
5352                     &team->t.t_task_team[0], &team->t.t_task_team[1]));
5353       team->t.t_task_team[0] = NULL;
5354       team->t.t_task_team[1] = NULL;
5355 
5356       /* reallocate space for arguments if necessary */
5357       __kmp_alloc_argv_entries(argc, team, TRUE);
5358       KMP_CHECK_UPDATE(team->t.t_argc, argc);
5359 
5360       KA_TRACE(
5361           20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5362                team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5363       { // Initialize barrier data.
5364         int b;
5365         for (b = 0; b < bs_last_barrier; ++b) {
5366           team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5367 #if USE_DEBUGGER
5368           team->t.t_bar[b].b_master_arrived = 0;
5369           team->t.t_bar[b].b_team_arrived = 0;
5370 #endif
5371         }
5372       }
5373 
5374       team->t.t_proc_bind = new_proc_bind;
5375 
5376       KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5377                     team->t.t_id));
5378 
5379 #if OMPT_SUPPORT
5380       __ompt_team_assign_id(team, ompt_parallel_data);
5381 #endif
5382 
5383       KMP_MB();
5384 
5385       return team;
5386     }
5387 
5388     /* reap team if it is too small, then loop back and check the next one */
5389     // not sure if this is wise, but it will be redone during the hot-teams
5390     // rewrite.
5391     /* TODO: Use technique to find the right size hot-team, don't reap them */
5392     team = __kmp_reap_team(team);
5393     __kmp_team_pool = team;
5394   }
5395 
5396   /* nothing available in the pool, no matter, make a new team! */
5397   KMP_MB();
5398   team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5399 
5400   /* and set it up */
5401   team->t.t_max_nproc = max_nproc;
5402   /* NOTE well: for some reason, allocating one big buffer and dividing it up
5403      seems to really hurt performance on the P4, so let's not do that. */
5404   __kmp_allocate_team_arrays(team, max_nproc);
5405 
5406   KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5407   __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5408 
5409   KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5410                 "%p to NULL\n",
5411                 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5412   team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5413   // memory, no need to duplicate
5414   team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5415   // memory, no need to duplicate
5416 
5417   if (__kmp_storage_map) {
5418     __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5419   }
5420 
5421   /* allocate space for arguments */
5422   __kmp_alloc_argv_entries(argc, team, FALSE);
5423   team->t.t_argc = argc;
5424 
5425   KA_TRACE(20,
5426            ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5427             team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5428   { // Initialize barrier data.
5429     int b;
5430     for (b = 0; b < bs_last_barrier; ++b) {
5431       team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5432 #if USE_DEBUGGER
5433       team->t.t_bar[b].b_master_arrived = 0;
5434       team->t.t_bar[b].b_team_arrived = 0;
5435 #endif
5436     }
5437   }
5438 
5439   team->t.t_proc_bind = new_proc_bind;
5440 
5441 #if OMPT_SUPPORT
5442   __ompt_team_assign_id(team, ompt_parallel_data);
5443   team->t.ompt_serialized_team_info = NULL;
5444 #endif
5445 
5446   KMP_MB();
5447 
5448   KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5449                 team->t.t_id));
5450 
5451   return team;
5452 }
5453 
5454 /* TODO implement hot-teams at all levels */
5455 /* TODO implement lazy thread release on demand (disband request) */
5456 
5457 /* free the team.  return it to the team pool.  release all the threads
5458  * associated with it */
5459 void __kmp_free_team(kmp_root_t *root,
5460                      kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5461   int f;
5462   KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5463                 team->t.t_id));
5464 
5465   /* verify state */
5466   KMP_DEBUG_ASSERT(root);
5467   KMP_DEBUG_ASSERT(team);
5468   KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5469   KMP_DEBUG_ASSERT(team->t.t_threads);
5470 
5471   int use_hot_team = team == root->r.r_hot_team;
5472 #if KMP_NESTED_HOT_TEAMS
5473   int level;
5474   kmp_hot_team_ptr_t *hot_teams;
5475   if (master) {
5476     level = team->t.t_active_level - 1;
5477     if (master->th.th_teams_microtask) { // in teams construct?
5478       if (master->th.th_teams_size.nteams > 1) {
5479         ++level; // level was not increased in teams construct for
5480         // team_of_masters
5481       }
5482       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5483           master->th.th_teams_level == team->t.t_level) {
5484         ++level; // level was not increased in teams construct for
5485         // team_of_workers before the parallel
5486       } // team->t.t_level will be increased inside parallel
5487     }
5488     hot_teams = master->th.th_hot_teams;
5489     if (level < __kmp_hot_teams_max_level) {
5490       KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5491       use_hot_team = 1;
5492     }
5493   }
5494 #endif // KMP_NESTED_HOT_TEAMS
5495 
5496   /* team is done working */
5497   TCW_SYNC_PTR(team->t.t_pkfn,
5498                NULL); // Important for Debugging Support Library.
5499 #if KMP_OS_WINDOWS
5500   team->t.t_copyin_counter = 0; // init counter for possible reuse
5501 #endif
5502   // Do not reset pointer to parent team to NULL for hot teams.
5503 
5504   /* if this is not a hot team, release our threads */
5505   if (!use_hot_team) {
5506     if (__kmp_tasking_mode != tskm_immediate_exec) {
5507       // Wait for threads to reach reapable state
5508       for (f = 1; f < team->t.t_nproc; ++f) {
5509         KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5510         kmp_info_t *th = team->t.t_threads[f];
5511         volatile kmp_uint32 *state = &th->th.th_reap_state;
5512         while (*state != KMP_SAFE_TO_REAP) {
5513 #if KMP_OS_WINDOWS
5514           // On Windows a thread can be killed at any time, check this
5515           DWORD ecode;
5516           if (!__kmp_is_thread_alive(th, &ecode)) {
5517             *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5518             break;
5519           }
5520 #endif
5521           // first check if thread is sleeping
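          // If the worker is sleeping on its fork/join b_go flag it cannot mark
          // itself KMP_SAFE_TO_REAP, so wake it up and keep spinning until it
          // reaches that state.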
5522           kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5523           if (fl.is_sleeping())
5524             fl.resume(__kmp_gtid_from_thread(th));
5525           KMP_CPU_PAUSE();
5526         }
5527       }
5528 
5529       // Delete task teams
5530       int tt_idx;
5531       for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5532         kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5533         if (task_team != NULL) {
5534           for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5535             KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5536             team->t.t_threads[f]->th.th_task_team = NULL;
5537           }
5538           KA_TRACE(
5539               20,
5540               ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5541                __kmp_get_gtid(), task_team, team->t.t_id));
5542 #if KMP_NESTED_HOT_TEAMS
5543           __kmp_free_task_team(master, task_team);
5544 #endif
5545           team->t.t_task_team[tt_idx] = NULL;
5546         }
5547       }
5548     }
5549 
5550     // Reset pointer to parent team only for non-hot teams.
5551     team->t.t_parent = NULL;
5552     team->t.t_level = 0;
5553     team->t.t_active_level = 0;
5554 
5555     /* free the worker threads */
5556     for (f = 1; f < team->t.t_nproc; ++f) {
5557       KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5558       __kmp_free_thread(team->t.t_threads[f]);
5559       team->t.t_threads[f] = NULL;
5560     }
5561 
5562     /* put the team back in the team pool */
5563     /* TODO limit size of team pool, call reap_team if pool too large */
5564     team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5565     __kmp_team_pool = (volatile kmp_team_t *)team;
5566   } else { // Check if team was created for primary threads in teams construct
5567     // See if first worker is a CG root
5568     KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5569                      team->t.t_threads[1]->th.th_cg_roots);
5570     if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5571       // Clean up the CG root nodes on workers so that this team can be re-used
5572       for (f = 1; f < team->t.t_nproc; ++f) {
5573         kmp_info_t *thr = team->t.t_threads[f];
5574         KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5575                          thr->th.th_cg_roots->cg_root == thr);
5576         // Pop current CG root off list
5577         kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5578         thr->th.th_cg_roots = tmp->up;
5579         KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5580                        " up to node %p. cg_nthreads was %d\n",
5581                        thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
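        // Post-decrement: 'i' captures the count before this thread left the
        // contention group, so i == 1 means it was the last member.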
5582         int i = tmp->cg_nthreads--;
5583         if (i == 1) {
5584           __kmp_free(tmp); // free CG if we are the last thread in it
5585         }
5586         // Restore current task's thread_limit from CG root
5587         if (thr->th.th_cg_roots)
5588           thr->th.th_current_task->td_icvs.thread_limit =
5589               thr->th.th_cg_roots->cg_thread_limit;
5590       }
5591     }
5592   }
5593 
5594   KMP_MB();
5595 }
5596 
5597 /* reap the team.  destroy it, reclaim all its resources and free its memory */
5598 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5599   kmp_team_t *next_pool = team->t.t_next_pool;
5600 
5601   KMP_DEBUG_ASSERT(team);
5602   KMP_DEBUG_ASSERT(team->t.t_dispatch);
5603   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5604   KMP_DEBUG_ASSERT(team->t.t_threads);
5605   KMP_DEBUG_ASSERT(team->t.t_argv);
5606 
5607   /* TODO clean the threads that are a part of this? */
5608 
5609   /* free stuff */
5610   __kmp_free_team_arrays(team);
5611   if (team->t.t_argv != &team->t.t_inline_argv[0])
5612     __kmp_free((void *)team->t.t_argv);
5613   __kmp_free(team);
5614 
5615   KMP_MB();
5616   return next_pool;
5617 }
5618 
5619 // Free the thread.  Don't reap it, just place it on the pool of available
5620 // threads.
5621 //
5622 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5623 // binding for the affinity mechanism to be useful.
5624 //
5625 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5626 // However, we want to avoid a potential performance problem by always
5627 // scanning through the list to find the correct point at which to insert
5628 // the thread (potential N**2 behavior).  To do this we keep track of the
5629 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5630 // With single-level parallelism, threads will always be added to the tail
5631 // of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
5632 // parallelism, all bets are off and we may need to scan through the entire
5633 // free list.
5634 //
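// For example, with T#2 -> T#5 -> T#9 already in the pool and
// __kmp_thread_pool_insert_pt pointing at T#5, freeing T#7 starts the scan at
// T#5 and splices the new entry in between T#5 and T#9:
//   __kmp_thread_pool -> T#2 -> T#5 -> T#7 -> T#9 -> NULL
//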
5635 // This change also has a potentially large performance benefit, for some
5636 // applications.  Previously, as threads were freed from the hot team, they
5637 // would be placed back on the free list in inverse order.  If the hot team
5638 // grew back to its original size, then the freed threads would be placed
5639 // back on the hot team in reverse order.  This could cause bad cache
5640 // locality problems on programs where the size of the hot team regularly
5641 // grew and shrunk.
5642 //
5643 // Now, for single-level parallelism, the OMP tid is always == gtid.
5644 void __kmp_free_thread(kmp_info_t *this_th) {
5645   int gtid;
5646   kmp_info_t **scan;
5647 
5648   KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5649                 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5650 
5651   KMP_DEBUG_ASSERT(this_th);
5652 
5653   // When moving the thread to the pool, switch it to wait on its own b_go
5654   // flag, with no team attached (NULL team).
5655   int b;
5656   kmp_balign_t *balign = this_th->th.th_bar;
5657   for (b = 0; b < bs_last_barrier; ++b) {
5658     if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5659       balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5660     balign[b].bb.team = NULL;
5661     balign[b].bb.leaf_kids = 0;
5662   }
5663   this_th->th.th_task_state = 0;
5664   this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5665 
5666   /* put thread back on the free pool */
5667   TCW_PTR(this_th->th.th_team, NULL);
5668   TCW_PTR(this_th->th.th_root, NULL);
5669   TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5670 
5671   while (this_th->th.th_cg_roots) {
5672     this_th->th.th_cg_roots->cg_nthreads--;
5673     KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5674                    " %p of thread  %p to %d\n",
5675                    this_th, this_th->th.th_cg_roots,
5676                    this_th->th.th_cg_roots->cg_root,
5677                    this_th->th.th_cg_roots->cg_nthreads));
5678     kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5679     if (tmp->cg_root == this_th) { // Thread is a cg_root
5680       KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5681       KA_TRACE(
5682           5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5683       this_th->th.th_cg_roots = tmp->up;
5684       __kmp_free(tmp);
5685     } else { // Worker thread
5686       if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5687         __kmp_free(tmp);
5688       }
5689       this_th->th.th_cg_roots = NULL;
5690       break;
5691     }
5692   }
5693 
5694   /* If the implicit task assigned to this thread can be used by other threads
5695    * -> multiple threads can share the data and try to free the task at
5696    * __kmp_reap_thread at exit. This duplicate use of the task data can happen
5697    * with higher probability when the hot team is disabled, but it can occur
5698    * even when the hot team is enabled */
5699   __kmp_free_implicit_task(this_th);
5700   this_th->th.th_current_task = NULL;
5701 
5702   // If the __kmp_thread_pool_insert_pt is already past the new insert
5703   // point, then we need to re-scan the entire list.
5704   gtid = this_th->th.th_info.ds.ds_gtid;
5705   if (__kmp_thread_pool_insert_pt != NULL) {
5706     KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5707     if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5708       __kmp_thread_pool_insert_pt = NULL;
5709     }
5710   }
5711 
5712   // Scan down the list to find the place to insert the thread.
5713   // scan is the address of a link in the list, possibly the address of
5714   // __kmp_thread_pool itself.
5715   //
5716   // In the absence of nested parallelism, the for loop will have 0 iterations.
5717   if (__kmp_thread_pool_insert_pt != NULL) {
5718     scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5719   } else {
5720     scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5721   }
5722   for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5723        scan = &((*scan)->th.th_next_pool))
5724     ;
5725 
5726   // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5727   // to its address.
5728   TCW_PTR(this_th->th.th_next_pool, *scan);
5729   __kmp_thread_pool_insert_pt = *scan = this_th;
5730   KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5731                    (this_th->th.th_info.ds.ds_gtid <
5732                     this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5733   TCW_4(this_th->th.th_in_pool, TRUE);
5734   __kmp_suspend_initialize_thread(this_th);
5735   __kmp_lock_suspend_mx(this_th);
5736   if (this_th->th.th_active == TRUE) {
5737     KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5738     this_th->th.th_active_in_pool = TRUE;
5739   }
5740 #if KMP_DEBUG
5741   else {
5742     KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5743   }
5744 #endif
5745   __kmp_unlock_suspend_mx(this_th);
5746 
5747   TCW_4(__kmp_nth, __kmp_nth - 1);
5748 
5749 #ifdef KMP_ADJUST_BLOCKTIME
5750   /* Adjust blocktime back to user setting or default if necessary */
5751   /* Middle initialization might never have occurred                */
5752   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5753     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5754     if (__kmp_nth <= __kmp_avail_proc) {
5755       __kmp_zero_bt = FALSE;
5756     }
5757   }
5758 #endif /* KMP_ADJUST_BLOCKTIME */
5759 
5760   KMP_MB();
5761 }
5762 
5763 /* ------------------------------------------------------------------------ */
5764 
5765 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5766 #if OMP_PROFILING_SUPPORT
5767   ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
5768   // TODO: add a configuration option for time granularity
5769   if (ProfileTraceFile)
5770     llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
5771 #endif
5772 
5773   int gtid = this_thr->th.th_info.ds.ds_gtid;
5774   /*    void                 *stack_data;*/
5775   kmp_team_t **volatile pteam;
5776 
5777   KMP_MB();
5778   KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5779 
5780   if (__kmp_env_consistency_check) {
5781     this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5782   }
5783 
5784 #if OMPD_SUPPORT
5785   if (ompd_state & OMPD_ENABLE_BP)
5786     ompd_bp_thread_begin();
5787 #endif
5788 
5789 #if OMPT_SUPPORT
5790   ompt_data_t *thread_data = nullptr;
5791   if (ompt_enabled.enabled) {
5792     thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5793     *thread_data = ompt_data_none;
5794 
5795     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5796     this_thr->th.ompt_thread_info.wait_id = 0;
5797     this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5798     this_thr->th.ompt_thread_info.parallel_flags = 0;
5799     if (ompt_enabled.ompt_callback_thread_begin) {
5800       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5801           ompt_thread_worker, thread_data);
5802     }
5803     this_thr->th.ompt_thread_info.state = ompt_state_idle;
5804   }
5805 #endif
5806 
5807   /* This is the place where threads wait for work */
5808   while (!TCR_4(__kmp_global.g.g_done)) {
5809     KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5810     KMP_MB();
5811 
5812     /* wait for work to do */
5813     KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5814 
5815     /* No tid yet since not part of a team */
5816     __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5817 
5818 #if OMPT_SUPPORT
5819     if (ompt_enabled.enabled) {
5820       this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5821     }
5822 #endif
5823 
5824     pteam = &this_thr->th.th_team;
5825 
5826     /* have we been allocated? */
5827     if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5828       /* we were just woken up, so run our new task */
5829       if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5830         int rc;
5831         KA_TRACE(20,
5832                  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5833                   gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5834                   (*pteam)->t.t_pkfn));
5835 
5836         updateHWFPControl(*pteam);
5837 
5838 #if OMPT_SUPPORT
5839         if (ompt_enabled.enabled) {
5840           this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5841         }
5842 #endif
5843 
5844         rc = (*pteam)->t.t_invoke(gtid);
5845         KMP_ASSERT(rc);
5846 
5847         KMP_MB();
5848         KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5849                       gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5850                       (*pteam)->t.t_pkfn));
5851       }
5852 #if OMPT_SUPPORT
5853       if (ompt_enabled.enabled) {
5854         /* no frame set while outside task */
5855         __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
5856 
5857         this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5858       }
5859 #endif
5860       /* join barrier after parallel region */
5861       __kmp_join_barrier(gtid);
5862     }
5863   }
5864   TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5865 
5866 #if OMPD_SUPPORT
5867   if (ompd_state & OMPD_ENABLE_BP)
5868     ompd_bp_thread_end();
5869 #endif
5870 
5871 #if OMPT_SUPPORT
5872   if (ompt_enabled.ompt_callback_thread_end) {
5873     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5874   }
5875 #endif
5876 
5877   this_thr->th.th_task_team = NULL;
5878   /* run the destructors for the threadprivate data for this thread */
5879   __kmp_common_destroy_gtid(gtid);
5880 
5881   KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5882   KMP_MB();
5883 
5884 #if OMP_PROFILING_SUPPORT
5885   llvm::timeTraceProfilerFinishThread();
5886 #endif
5887   return this_thr;
5888 }
5889 
5890 /* ------------------------------------------------------------------------ */
5891 
5892 void __kmp_internal_end_dest(void *specific_gtid) {
5893   // Make sure no significant bits are lost
5894   int gtid;
5895   __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
5896 
5897   KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5898   /* NOTE: the gtid is stored as gtid+1 in the thread-local-storage
5899    * this is because 0 is reserved for the nothing-stored case */
5900 
5901   __kmp_internal_end_thread(gtid);
5902 }
5903 
5904 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5905 
5906 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5907   __kmp_internal_end_atexit();
5908 }
5909 
5910 #endif
5911 
5912 /* [Windows] josh: when the atexit handler is called, there may still be more
5913    than one thread alive */
5914 void __kmp_internal_end_atexit(void) {
5915   KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5916   /* [Windows]
5917      josh: ideally, we want to completely shutdown the library in this atexit
5918      handler, but stat code that depends on thread specific data for gtid fails
5919      because that data becomes unavailable at some point during the shutdown, so
5920      we call __kmp_internal_end_thread instead. We should eventually remove the
5921      dependency on __kmp_get_specific_gtid in the stat code and use
5922      __kmp_internal_end_library to cleanly shutdown the library.
5923 
5924      // TODO: Can some of this comment about GVS be removed?
5925      I suspect that the offending stat code is executed when the calling thread
5926      tries to clean up a dead root thread's data structures, resulting in GVS
5927      code trying to close the GVS structures for that thread, but since the stat
5928      code uses __kmp_get_specific_gtid to get the gtid with the assumption that
5929      the calling thread is cleaning up itself instead of another thread, it gets
5930      confused. This happens because allowing a thread to unregister and clean up
5931      another thread is a recent modification for addressing an issue.
5932      Based on the current design (20050722), a thread may end up
5933      trying to unregister another thread only if thread death does not trigger
5934      the calling of __kmp_internal_end_thread.  For Linux* OS, there is the
5935      thread specific data destructor function to detect thread death. For
5936      Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5937      is nothing.  Thus, the workaround is applicable only for Windows static
5938      stat library. */
5939   __kmp_internal_end_library(-1);
5940 #if KMP_OS_WINDOWS
5941   __kmp_close_console();
5942 #endif
5943 }
5944 
5945 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5946   // It is assumed __kmp_forkjoin_lock is acquired.
5947 
5948   int gtid;
5949 
5950   KMP_DEBUG_ASSERT(thread != NULL);
5951 
5952   gtid = thread->th.th_info.ds.ds_gtid;
5953 
5954   if (!is_root) {
5955     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5956       /* Assume the threads are at the fork barrier here */
5957       KA_TRACE(
5958           20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5959                gtid));
5960       /* Need release fence here to prevent seg faults for tree forkjoin barrier
5961        * (GEH) */
5962       ANNOTATE_HAPPENS_BEFORE(thread);
5963       kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
5964                          thread);
5965       __kmp_release_64(&flag);
5966     }
5967 
5968     // Terminate OS thread.
5969     __kmp_reap_worker(thread);
5970 
5971     // The thread was killed asynchronously.  If it was actively
5972     // spinning in the thread pool, decrement the global count.
5973     //
5974     // There is a small timing hole here - if the worker thread was just waking
5975     // up after sleeping in the pool, had reset its th_active_in_pool flag but
5976     // not decremented the global counter __kmp_thread_pool_active_nth yet, then
5977     // the global counter might not get updated.
5978     //
5979     // Currently, this can only happen as the library is unloaded,
5980     // so there are no harmful side effects.
5981     if (thread->th.th_active_in_pool) {
5982       thread->th.th_active_in_pool = FALSE;
5983       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
5984       KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
5985     }
5986   }
5987 
5988   __kmp_free_implicit_task(thread);
5989 
5990 // Free the fast memory for tasking
5991 #if USE_FAST_MEMORY
5992   __kmp_free_fast_memory(thread);
5993 #endif /* USE_FAST_MEMORY */
5994 
5995   __kmp_suspend_uninitialize_thread(thread);
5996 
5997   KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5998   TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5999 
6000   --__kmp_all_nth;
6001   // __kmp_nth was decremented when thread is added to the pool.
6002 
6003 #ifdef KMP_ADJUST_BLOCKTIME
6004   /* Adjust blocktime back to user setting or default if necessary */
6005   /* Middle initialization might never have occurred                */
6006   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6007     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6008     if (__kmp_nth <= __kmp_avail_proc) {
6009       __kmp_zero_bt = FALSE;
6010     }
6011   }
6012 #endif /* KMP_ADJUST_BLOCKTIME */
6013 
6014   /* free the memory being used */
6015   if (__kmp_env_consistency_check) {
6016     if (thread->th.th_cons) {
6017       __kmp_free_cons_stack(thread->th.th_cons);
6018       thread->th.th_cons = NULL;
6019     }
6020   }
6021 
6022   if (thread->th.th_pri_common != NULL) {
6023     __kmp_free(thread->th.th_pri_common);
6024     thread->th.th_pri_common = NULL;
6025   }
6026 
6027   if (thread->th.th_task_state_memo_stack != NULL) {
6028     __kmp_free(thread->th.th_task_state_memo_stack);
6029     thread->th.th_task_state_memo_stack = NULL;
6030   }
6031 
6032 #if KMP_USE_BGET
6033   if (thread->th.th_local.bget_data != NULL) {
6034     __kmp_finalize_bget(thread);
6035   }
6036 #endif
6037 
6038 #if KMP_AFFINITY_SUPPORTED
6039   if (thread->th.th_affin_mask != NULL) {
6040     KMP_CPU_FREE(thread->th.th_affin_mask);
6041     thread->th.th_affin_mask = NULL;
6042   }
6043 #endif /* KMP_AFFINITY_SUPPORTED */
6044 
6045 #if KMP_USE_HIER_SCHED
6046   if (thread->th.th_hier_bar_data != NULL) {
6047     __kmp_free(thread->th.th_hier_bar_data);
6048     thread->th.th_hier_bar_data = NULL;
6049   }
6050 #endif
6051 
6052   __kmp_reap_team(thread->th.th_serial_team);
6053   thread->th.th_serial_team = NULL;
6054   __kmp_free(thread);
6055 
6056   KMP_MB();
6057 
6058 } // __kmp_reap_thread
6059 
6060 static void __kmp_internal_end(void) {
6061   int i;
6062 
6063   /* First, unregister the library */
6064   __kmp_unregister_library();
6065 
6066 #if KMP_OS_WINDOWS
6067   /* In Win static library, we can't tell when a root actually dies, so we
6068      reclaim the data structures for any root threads that have died but not
6069      unregistered themselves, in order to shut down cleanly.
6070      In Win dynamic library we also can't tell when a thread dies.  */
6071   __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6072 // dead roots
6073 #endif
6074 
6075   for (i = 0; i < __kmp_threads_capacity; i++)
6076     if (__kmp_root[i])
6077       if (__kmp_root[i]->r.r_active)
6078         break;
6079   KMP_MB(); /* Flush all pending memory write invalidates.  */
6080   TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
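  // g_done is the global shutdown flag that workers poll in
  // __kmp_launch_thread; setting it tells them to exit their wait-for-work
  // loop.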
6081 
6082   if (i < __kmp_threads_capacity) {
6083 #if KMP_USE_MONITOR
6084     // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6085     KMP_MB(); /* Flush all pending memory write invalidates.  */
6086 
6087     // Need to check that monitor was initialized before reaping it. If we are
6088     // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6089     // __kmp_monitor will appear to contain valid data, but it is only valid in
6090     // the parent process, not the child.
6091     // New behavior (201008): instead of keying off of the flag
6092     // __kmp_init_parallel, the monitor thread creation is keyed off
6093     // of the new flag __kmp_init_monitor.
6094     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6095     if (TCR_4(__kmp_init_monitor)) {
6096       __kmp_reap_monitor(&__kmp_monitor);
6097       TCW_4(__kmp_init_monitor, 0);
6098     }
6099     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6100     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6101 #endif // KMP_USE_MONITOR
6102   } else {
6103 /* TODO move this to cleanup code */
6104 #ifdef KMP_DEBUG
6105     /* make sure that everything has properly ended */
6106     for (i = 0; i < __kmp_threads_capacity; i++) {
6107       if (__kmp_root[i]) {
6108         //                    KMP_ASSERT( ! KMP_UBER_GTID( i ) );         // AC:
6109         //                    there can be uber threads alive here
6110         KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6111       }
6112     }
6113 #endif
6114 
6115     KMP_MB();
6116 
6117     // Reap the worker threads.
6118     // This is valid for now, but be careful if threads are reaped sooner.
6119     while (__kmp_thread_pool != NULL) { // Loop thru all the threads in the pool.
6120       // Get the next thread from the pool.
6121       kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6122       __kmp_thread_pool = thread->th.th_next_pool;
6123       // Reap it.
6124       KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6125       thread->th.th_next_pool = NULL;
6126       thread->th.th_in_pool = FALSE;
6127       __kmp_reap_thread(thread, 0);
6128     }
6129     __kmp_thread_pool_insert_pt = NULL;
6130 
6131     // Reap teams.
6132     while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
6133       // Get the next team from the pool.
6134       kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6135       __kmp_team_pool = team->t.t_next_pool;
6136       // Reap it.
6137       team->t.t_next_pool = NULL;
6138       __kmp_reap_team(team);
6139     }
6140 
6141     __kmp_reap_task_teams();
6142 
6143 #if KMP_OS_UNIX
6144     // Threads that are not reaped should not access any resources since they
6145     // are going to be deallocated soon, so the shutdown sequence should wait
6146     // until all threads either exit the final spin-waiting loop or begin
6147     // sleeping after the given blocktime.
6148     for (i = 0; i < __kmp_threads_capacity; i++) {
6149       kmp_info_t *thr = __kmp_threads[i];
6150       while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6151         KMP_CPU_PAUSE();
6152     }
6153 #endif
6154 
6155     for (i = 0; i < __kmp_threads_capacity; ++i) {
6156       // TBD: Add some checking...
6157       // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6158     }
6159 
6160     /* Make sure all threadprivate destructors get run by joining with all
6161        worker threads before resetting this flag */
6162     TCW_SYNC_4(__kmp_init_common, FALSE);
6163 
6164     KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6165     KMP_MB();
6166 
6167 #if KMP_USE_MONITOR
6168     // See note above: One of the possible fixes for CQ138434 / CQ140126
6169     //
6170     // FIXME: push both code fragments down and CSE them?
6171     // push them into __kmp_cleanup() ?
6172     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6173     if (TCR_4(__kmp_init_monitor)) {
6174       __kmp_reap_monitor(&__kmp_monitor);
6175       TCW_4(__kmp_init_monitor, 0);
6176     }
6177     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6178     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6179 #endif
6180   } /* else !__kmp_global.t_active */
6181   TCW_4(__kmp_init_gtid, FALSE);
6182   KMP_MB(); /* Flush all pending memory write invalidates.  */
6183 
6184   __kmp_cleanup();
6185 #if OMPT_SUPPORT
6186   ompt_fini();
6187 #endif
6188 }
6189 
6190 void __kmp_internal_end_library(int gtid_req) {
6191   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6192   /* this shouldn't be a race condition because __kmp_internal_end() is the
6193      only place to clear __kmp_serial_init */
6194   /* we'll check this later too, after we get the lock */
6195   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6196   // redundant, because the next check will work in any case.
6197   if (__kmp_global.g.g_abort) {
6198     KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6199     /* TODO abort? */
6200     return;
6201   }
6202   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6203     KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6204     return;
6205   }
6206 
6207   KMP_MB(); /* Flush all pending memory write invalidates.  */
6208   /* find out who we are and what we should do */
6209   {
6210     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6211     KA_TRACE(
6212         10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req));
6213     if (gtid == KMP_GTID_SHUTDOWN) {
6214       KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6215                     "already shutdown\n"));
6216       return;
6217     } else if (gtid == KMP_GTID_MONITOR) {
6218       KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6219                     "registered, or system shutdown\n"));
6220       return;
6221     } else if (gtid == KMP_GTID_DNE) {
6222       KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6223                     "shutdown\n"));
6224       /* we don't know who we are, but we may still shutdown the library */
6225     } else if (KMP_UBER_GTID(gtid)) {
6226       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6227       if (__kmp_root[gtid]->r.r_active) {
6228         __kmp_global.g.g_abort = -1;
6229         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6230         __kmp_unregister_library();
6231         KA_TRACE(10,
6232                  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6233                   gtid));
6234         return;
6235       } else {
6236         KA_TRACE(
6237             10,
6238             ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6239         __kmp_unregister_root_current_thread(gtid);
6240       }
6241     } else {
6242 /* worker threads may call this function through the atexit handler, if they
6243  * call exit() */
6244 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6245    TODO: do a thorough shutdown instead */
6246 #ifdef DUMP_DEBUG_ON_EXIT
6247       if (__kmp_debug_buf)
6248         __kmp_dump_debug_buffer();
6249 #endif
6250       // Unregister the library here now that we use shared memory (shm) on
6251       // Linux; otherwise exiting would leave stale files behind in /dev/shm.
6252       // Clean up the shared memory file before exiting.
6253       __kmp_unregister_library();
6254       return;
6255     }
6256   }
6257   /* synchronize the termination process */
6258   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6259 
6260   /* have we already finished */
6261   if (__kmp_global.g.g_abort) {
6262     KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6263     /* TODO abort? */
6264     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6265     return;
6266   }
6267   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6268     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6269     return;
6270   }
6271 
6272   /* We need this lock to enforce mutex between this reading of
6273      __kmp_threads_capacity and the writing by __kmp_register_root.
6274      Alternatively, we can use a counter of roots that is atomically updated by
6275      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6276      __kmp_internal_end_*.  */
6277   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6278 
6279   /* now we can safely conduct the actual termination */
6280   __kmp_internal_end();
6281 
6282   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6283   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6284 
6285   KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6286 
6287 #ifdef DUMP_DEBUG_ON_EXIT
6288   if (__kmp_debug_buf)
6289     __kmp_dump_debug_buffer();
6290 #endif
6291 
6292 #if KMP_OS_WINDOWS
6293   __kmp_close_console();
6294 #endif
6295 
6296   __kmp_fini_allocator();
6297 
6298 } // __kmp_internal_end_library
6299 
6300 void __kmp_internal_end_thread(int gtid_req) {
6301   int i;
6302 
6303   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6304   /* this shouldn't be a race condition because __kmp_internal_end() is the
6305    * only place to clear __kmp_serial_init */
6306   /* we'll check this later too, after we get the lock */
6307   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6308   // redundant, because the next check will work in any case.
6309   if (__kmp_global.g.g_abort) {
6310     KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6311     /* TODO abort? */
6312     return;
6313   }
6314   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6315     KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6316     return;
6317   }
6318 
6319   // If hidden helper team has been initialized, we need to deinit it
6320   if (TCR_4(__kmp_init_hidden_helper)) {
6321     TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6322     // First release the main thread to let it continue its work
6323     __kmp_hidden_helper_main_thread_release();
6324     // Wait until the hidden helper team has been destroyed
6325     __kmp_hidden_helper_threads_deinitz_wait();
6326   }
6327 
6328   KMP_MB(); /* Flush all pending memory write invalidates.  */
6329 
6330   /* find out who we are and what we should do */
6331   {
6332     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6333     KA_TRACE(10,
6334              ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req));
6335     if (gtid == KMP_GTID_SHUTDOWN) {
6336       KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6337                     "already shutdown\n"));
6338       return;
6339     } else if (gtid == KMP_GTID_MONITOR) {
6340       KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6341                     "registered, or system shutdown\n"));
6342       return;
6343     } else if (gtid == KMP_GTID_DNE) {
6344       KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6345                     "shutdown\n"));
6346       return;
6347       /* we don't know who we are */
6348     } else if (KMP_UBER_GTID(gtid)) {
6349       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6350       if (__kmp_root[gtid]->r.r_active) {
6351         __kmp_global.g.g_abort = -1;
6352         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6353         KA_TRACE(10,
6354                  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6355                   gtid));
6356         return;
6357       } else {
6358         KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6359                       gtid));
6360         __kmp_unregister_root_current_thread(gtid);
6361       }
6362     } else {
6363       /* just a worker thread, let's leave */
6364       KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6365 
6366       if (gtid >= 0) {
6367         __kmp_threads[gtid]->th.th_task_team = NULL;
6368       }
6369 
6370       KA_TRACE(10,
6371                ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6372                 gtid));
6373       return;
6374     }
6375   }
6376 #if KMP_DYNAMIC_LIB
6377   if (__kmp_pause_status != kmp_hard_paused)
6378   // AC: let's not shut down the dynamic library at the exit of an uber thread;
6379   // it is better to shut down later, in the library destructor.
6380   {
6381     KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6382     return;
6383   }
6384 #endif
6385   /* synchronize the termination process */
6386   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6387 
6388   /* have we already finished */
6389   if (__kmp_global.g.g_abort) {
6390     KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6391     /* TODO abort? */
6392     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6393     return;
6394   }
6395   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6396     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6397     return;
6398   }
6399 
6400   /* We need this lock to enforce mutex between this reading of
6401      __kmp_threads_capacity and the writing by __kmp_register_root.
6402      Alternatively, we can use a counter of roots that is atomically updated by
6403      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6404      __kmp_internal_end_*.  */
6405 
6406   /* should we finish the run-time?  are all siblings done? */
6407   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6408 
6409   for (i = 0; i < __kmp_threads_capacity; ++i) {
6410     if (KMP_UBER_GTID(i)) {
6411       KA_TRACE(
6412           10,
6413           ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6414       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6415       __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6416       return;
6417     }
6418   }
6419 
6420   /* now we can safely conduct the actual termination */
6421 
6422   __kmp_internal_end();
6423 
6424   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6425   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6426 
6427   KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6428 
6429 #ifdef DUMP_DEBUG_ON_EXIT
6430   if (__kmp_debug_buf)
6431     __kmp_dump_debug_buffer();
6432 #endif
6433 } // __kmp_internal_end_thread
6434 
6435 // -----------------------------------------------------------------------------
6436 // Library registration stuff.
6437 
6438 static long __kmp_registration_flag = 0;
6439 // Random value used to indicate library initialization.
6440 static char *__kmp_registration_str = NULL;
6441 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6442 
6443 static inline char *__kmp_reg_status_name() {
6444 /* On RHEL 3u5 if linked statically, getpid() returns different values in
6445    each thread. If registration and unregistration go in different threads
6446    (omp_misc_other_root_exit.cpp test case), the registered_lib_env env var
6447    cannot be found because its name will contain a different pid. */
6448 // macOS* complains that the name is too long when getuid() is also appended.
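// The resulting name therefore looks like "__KMP_REGISTERED_LIB_<pid>_<uid>"
// for non-macOS Unix dynamic builds and "__KMP_REGISTERED_LIB_<pid>" otherwise.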
6449 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6450   return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6451                           (int)getuid());
6452 #else
6453   return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6454 #endif
6455 } // __kmp_reg_status_name
6456 
6457 void __kmp_register_library_startup(void) {
6458 
6459   char *name = __kmp_reg_status_name(); // Name of the environment variable.
6460   int done = 0;
6461   union {
6462     double dtime;
6463     long ltime;
6464   } time;
6465 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6466   __kmp_initialize_system_tick();
6467 #endif
6468   __kmp_read_system_time(&time.dtime);
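  // Build a per-run "unique enough" flag value: a fixed 0xCAFE tag in the
  // upper 16 bits combined with the low 16 bits of the current system time.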
6469   __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6470   __kmp_registration_str =
6471       __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6472                        __kmp_registration_flag, KMP_LIBRARY_FILE);
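  // The registration string has the form "<flag address>-<flag value>-<library
  // file name>", e.g. "0x7f0a12345678-cafe1234-libomp.so" (address and value
  // here are purely illustrative); it is split back into these fields below
  // when a pre-existing registration is found.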
6473 
6474   KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6475                 __kmp_registration_str));
6476 
6477   while (!done) {
6478 
6479     char *value = NULL; // Actual value of the environment variable.
6480 
6481 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
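    // Shared-memory registration protocol: try to create the segment
    // exclusively; if it already exists, open it read-write instead. The
    // creator sizes the segment and writes the registration string, then every
    // contender reads the contents back to see which runtime copy won.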
6482     char *shm_name = __kmp_str_format("/%s", name);
6483     int shm_preexist = 0;
6484     char *data1;
6485     int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6486     if ((fd1 == -1) && (errno == EEXIST)) {
6487       // file didn't open because it already exists.
6488       // try opening existing file
6489       fd1 = shm_open(shm_name, O_RDWR, 0666);
6490       if (fd1 == -1) { // file didn't open
6491         // error out here
6492         __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0),
6493                     __kmp_msg_null);
6494       } else {
6495         // able to open existing file
6496         shm_preexist = 1;
6497       }
6498     } else if (fd1 == -1) { // SHM didn't open; it was due to error other than
6499       // already exists.
6500       // error out here.
6501       __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno),
6502                   __kmp_msg_null);
6503     }
6504     if (shm_preexist == 0) {
6505       // we created the SHM; now set its size
6506       if (ftruncate(fd1, SHM_SIZE) == -1) {
6507         // an error occurred while setting the size
6508         __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"),
6509                     KMP_ERR(errno), __kmp_msg_null);
6510       }
6511     }
6512     data1 =
6513         (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
6514     if (data1 == MAP_FAILED) {
6515       // failed to map shared memory
6516       __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno),
6517                   __kmp_msg_null);
6518     }
6519     if (shm_preexist == 0) { // set data to SHM, set value
6520       KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6521     }
6522     // Read value from either what we just wrote or existing file.
6523     value = __kmp_str_format("%s", data1); // read value from SHM
6524     munmap(data1, SHM_SIZE);
6525     close(fd1);
6526 #else // Windows and unix with static library
6527     // Set the environment variable, but do not overwrite it if it exists.
6528     __kmp_env_set(name, __kmp_registration_str, 0);
6529     // read value to see if it got set
6530     value = __kmp_env_get(name);
6531 #endif
6532 
6533     if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6534       done = 1; // Ok, environment variable set successfully, exit the loop.
6535     } else {
6536       // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6537       // Check whether it is alive or dead.
6538       int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6539       char *tail = value;
6540       char *flag_addr_str = NULL;
6541       char *flag_val_str = NULL;
6542       char const *file_name = NULL;
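      // The value should have the "<flag addr>-<flag val>-<library file>"
      // layout produced by __kmp_register_library_startup; split it apart.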
6543       __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6544       __kmp_str_split(tail, '-', &flag_val_str, &tail);
6545       file_name = tail;
6546       if (tail != NULL) {
6547         long *flag_addr = 0;
6548         unsigned long flag_val = 0;
6549         KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6550         KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6551         if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6552           // First, check whether environment-encoded address is mapped into
6553           // addr space.
6554           // If so, dereference it to see if it still has the right value.
6555           if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6556             neighbor = 1;
6557           } else {
6558             // If not, then we know the other copy of the library is no longer
6559             // running.
6560             neighbor = 2;
6561           }
6562         }
6563       }
6564       switch (neighbor) {
6565       case 0: // Cannot parse environment variable -- neighbor status unknown.
6566         // Assume it is the incompatible format of a future version of the
6567         // library. Assume the other library is alive.
6568         // WARN( ... ); // TODO: Issue a warning.
6569         file_name = "unknown library";
6570         KMP_FALLTHROUGH();
6571       // Attention! Falling through to the next case. That's intentional.
6572       case 1: { // Neighbor is alive.
6573         // Check it is allowed.
6574         char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6575         if (!__kmp_str_match_true(duplicate_ok)) {
6576           // That's not allowed. Issue fatal error.
6577           __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6578                       KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6579         }
6580         KMP_INTERNAL_FREE(duplicate_ok);
6581         __kmp_duplicate_library_ok = 1;
6582         done = 1; // Exit the loop.
6583       } break;
6584       case 2: { // Neighbor is dead.
6585 
6586 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6587         // close shared memory.
6588         shm_unlink(shm_name); // this removes file in /dev/shm
6589 #else
6590         // Clear the variable and try to register library again.
6591         __kmp_env_unset(name);
6592 #endif
6593       } break;
6594       default: {
6595         KMP_DEBUG_ASSERT(0);
6596       } break;
6597       }
6598     }
6599     KMP_INTERNAL_FREE((void *)value);
6600 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6601     KMP_INTERNAL_FREE((void *)shm_name);
6602 #endif
6603   } // while
6604   KMP_INTERNAL_FREE((void *)name);
6605 
6606 } // func __kmp_register_library_startup
6607 
6608 void __kmp_unregister_library(void) {
6609 
6610   char *name = __kmp_reg_status_name();
6611   char *value = NULL;
6612 
6613 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6614   char *shm_name = __kmp_str_format("/%s", name);
6615   int fd1 = shm_open(shm_name, O_RDONLY, 0666);
6616   if (fd1 == -1) {
6617     // file did not open. return.
6618     return;
6619   }
6620   char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6621   if (data1 != MAP_FAILED) {
6622     value = __kmp_str_format("%s", data1); // read value from SHM
6623     munmap(data1, SHM_SIZE);
6624   }
6625   close(fd1);
6626 #else
6627   value = __kmp_env_get(name);
6628 #endif
6629 
6630   KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6631   KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6632   if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6633 //  Ok, this is our variable. Delete it.
6634 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6635     shm_unlink(shm_name); // this removes file in /dev/shm
6636 #else
6637     __kmp_env_unset(name);
6638 #endif
6639   }
6640 
6641 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6642   KMP_INTERNAL_FREE(shm_name);
6643 #endif
6644 
6645   KMP_INTERNAL_FREE(__kmp_registration_str);
6646   KMP_INTERNAL_FREE(value);
6647   KMP_INTERNAL_FREE(name);
6648 
6649   __kmp_registration_flag = 0;
6650   __kmp_registration_str = NULL;
6651 
6652 } // __kmp_unregister_library
6653 
6654 // End of Library registration stuff.
6655 // -----------------------------------------------------------------------------
6656 
6657 #if KMP_MIC_SUPPORTED
6658 
6659 static void __kmp_check_mic_type() {
6660   kmp_cpuid_t cpuid_state = {0};
6661   kmp_cpuid_t *cs_p = &cpuid_state;
6662   __kmp_x86_cpuid(1, 0, cs_p);
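  // CPUID leaf 1: EAX carries the processor family/model/stepping signature
  // that is matched against the known Intel(R) Xeon Phi(TM) signatures below.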
6663   // We don't support mic1 at the moment
6664   if ((cs_p->eax & 0xff0) == 0xB10) {
6665     __kmp_mic_type = mic2;
6666   } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6667     __kmp_mic_type = mic3;
6668   } else {
6669     __kmp_mic_type = non_mic;
6670   }
6671 }
6672 
6673 #endif /* KMP_MIC_SUPPORTED */
6674 
6675 #if KMP_HAVE_UMWAIT
6676 static void __kmp_user_level_mwait_init() {
6677   struct kmp_cpuid buf;
6678   __kmp_x86_cpuid(7, 0, &buf);
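  // CPUID leaf 7, sub-leaf 0: ECX bit 5 is the WAITPKG feature bit
  // (umonitor/umwait/tpause), which user-level mwait support requires.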
6679   __kmp_umwait_enabled = ((buf.ecx >> 5) & 1) && __kmp_user_level_mwait;
6680   KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
6681                 __kmp_umwait_enabled));
6682 }
6683 #elif KMP_HAVE_MWAIT
6684 #ifndef AT_INTELPHIUSERMWAIT
6685 // Spurious, non-existent value that should always fail to return anything.
6686 // Will be replaced with the correct value once we know it.
6687 #define AT_INTELPHIUSERMWAIT 10000
6688 #endif
6689 // getauxval() function is available in RHEL7 and SLES12. If a system with an
6690 // earlier OS is used to build the RTL, we'll use the following internal
6691 // function when the entry is not found.
6692 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
6693 unsigned long getauxval(unsigned long) { return 0; }
6694 
6695 static void __kmp_user_level_mwait_init() {
6696   // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available
6697   // use them to find if the user-level mwait is enabled. Otherwise, forcibly
6698   // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable
6699   // KMP_USER_LEVEL_MWAIT was set to TRUE.
6700   if (__kmp_mic_type == mic3) {
6701     unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
6702     if ((res & 0x1) || __kmp_user_level_mwait) {
6703       __kmp_mwait_enabled = TRUE;
6704       if (__kmp_user_level_mwait) {
6705         KMP_INFORM(EnvMwaitWarn);
6706       }
6707     } else {
6708       __kmp_mwait_enabled = FALSE;
6709     }
6710   }
6711   KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
6712                 "__kmp_mwait_enabled = %d\n",
6713                 __kmp_mic_type, __kmp_mwait_enabled));
6714 }
6715 #endif /* KMP_HAVE_UMWAIT */
6716 
6717 static void __kmp_do_serial_initialize(void) {
6718   int i, gtid;
6719   size_t size;
6720 
6721   KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6722 
6723   KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6724   KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6725   KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6726   KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6727   KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6728 
6729 #if OMPT_SUPPORT
6730   ompt_pre_init();
6731 #endif
6732 #if OMPD_SUPPORT
6733   __kmp_env_dump();
6734   ompd_init();
6735 #endif
6736 
6737   __kmp_validate_locks();
6738 
6739   /* Initialize internal memory allocator */
6740   __kmp_init_allocator();
6741 
6742   /* Register the library startup via an environment variable and check to see
6743      whether another copy of the library is already registered. */
6744 
6745   __kmp_register_library_startup();
6746 
6747   /* TODO reinitialization of library */
6748   if (TCR_4(__kmp_global.g.g_done)) {
6749     KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6750   }
6751 
6752   __kmp_global.g.g_abort = 0;
6753   TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6754 
6755 /* initialize the locks */
6756 #if KMP_USE_ADAPTIVE_LOCKS
6757 #if KMP_DEBUG_ADAPTIVE_LOCKS
6758   __kmp_init_speculative_stats();
6759 #endif
6760 #endif
6761 #if KMP_STATS_ENABLED
6762   __kmp_stats_init();
6763 #endif
6764   __kmp_init_lock(&__kmp_global_lock);
6765   __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6766   __kmp_init_lock(&__kmp_debug_lock);
6767   __kmp_init_atomic_lock(&__kmp_atomic_lock);
6768   __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6769   __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6770   __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6771   __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6772   __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6773   __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6774   __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6775   __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6776   __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6777   __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6778   __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6779   __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6780   __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6781   __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6782 #if KMP_USE_MONITOR
6783   __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6784 #endif
6785   __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6786 
6787   /* conduct initialization and initial setup of configuration */
6788 
6789   __kmp_runtime_initialize();
6790 
6791 #if KMP_MIC_SUPPORTED
6792   __kmp_check_mic_type();
6793 #endif
6794 
6795 // Some global variable initialization moved here from kmp_env_initialize()
6796 #ifdef KMP_DEBUG
6797   kmp_diag = 0;
6798 #endif
6799   __kmp_abort_delay = 0;
6800 
6801   // From __kmp_init_dflt_team_nth()
6802   /* assume the entire machine will be used */
6803   __kmp_dflt_team_nth_ub = __kmp_xproc;
6804   if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6805     __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6806   }
6807   if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6808     __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6809   }
6810   __kmp_max_nth = __kmp_sys_max_nth;
6811   __kmp_cg_max_nth = __kmp_sys_max_nth;
6812   __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6813   if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6814     __kmp_teams_max_nth = __kmp_sys_max_nth;
6815   }
6816 
6817   // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6818   // part
6819   __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6820 #if KMP_USE_MONITOR
6821   __kmp_monitor_wakeups =
6822       KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6823   __kmp_bt_intervals =
6824       KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6825 #endif
6826   // From "KMP_LIBRARY" part of __kmp_env_initialize()
6827   __kmp_library = library_throughput;
6828   // From KMP_SCHEDULE initialization
6829   __kmp_static = kmp_sch_static_balanced;
6830 // AC: do not use analytical here, because it is non-monotonic
6831 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6832 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6833 // need to repeat assignment
// Barrier initialization. Moved here from the __kmp_env_initialize() barrier
// branch bit control and barrier method control parts.
6836 #if KMP_FAST_REDUCTION_BARRIER
6837 #define kmp_reduction_barrier_gather_bb ((int)1)
6838 #define kmp_reduction_barrier_release_bb ((int)1)
6839 #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6840 #define kmp_reduction_barrier_release_pat bp_hyper_bar
6841 #endif // KMP_FAST_REDUCTION_BARRIER
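  // Note: the branch bits are the log2 branching factor of the gather/release
  // trees (2^bits children per node), so the values above request a binary
  // hyper barrier tree for the reduction barrier.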
6842   for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6843     __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6844     __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6845     __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6846     __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6847 #if KMP_FAST_REDUCTION_BARRIER
    if (i == bs_reduction_barrier) {
      // Tested and confirmed on ALTIX (lin_64) only: hyper,1
6850       __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6851       __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6852       __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6853       __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6854     }
6855 #endif // KMP_FAST_REDUCTION_BARRIER
6856   }
6857 #if KMP_FAST_REDUCTION_BARRIER
6858 #undef kmp_reduction_barrier_release_pat
6859 #undef kmp_reduction_barrier_gather_pat
6860 #undef kmp_reduction_barrier_release_bb
6861 #undef kmp_reduction_barrier_gather_bb
6862 #endif // KMP_FAST_REDUCTION_BARRIER
6863 #if KMP_MIC_SUPPORTED
6864   if (__kmp_mic_type == mic2) { // KNC
    // AC: plain=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6866     __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6867     __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6868         1; // forkjoin release
6869     __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6870     __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6871   }
6872 #if KMP_FAST_REDUCTION_BARRIER
6873   if (__kmp_mic_type == mic2) { // KNC
6874     __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6875     __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6876   }
6877 #endif // KMP_FAST_REDUCTION_BARRIER
6878 #endif // KMP_MIC_SUPPORTED
6879 
6880 // From KMP_CHECKS initialization
6881 #ifdef KMP_DEBUG
6882   __kmp_env_checks = TRUE; /* development versions have the extra checks */
6883 #else
  __kmp_env_checks = FALSE; /* release builds do not have the extra checks */
6885 #endif
6886 
6887   // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6888   __kmp_foreign_tp = TRUE;
6889 
6890   __kmp_global.g.g_dynamic = FALSE;
6891   __kmp_global.g.g_dynamic_mode = dynamic_default;
6892 
6893   __kmp_init_nesting_mode();
6894 
6895   __kmp_env_initialize(NULL);
6896 
6897 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
6898   __kmp_user_level_mwait_init();
6899 #endif
6900 // Print all messages in message catalog for testing purposes.
6901 #ifdef KMP_DEBUG
6902   char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6903   if (__kmp_str_match_true(val)) {
6904     kmp_str_buf_t buffer;
6905     __kmp_str_buf_init(&buffer);
6906     __kmp_i18n_dump_catalog(&buffer);
6907     __kmp_printf("%s", buffer.str);
6908     __kmp_str_buf_free(&buffer);
6909   }
6910   __kmp_env_free(&val);
6911 #endif
6912 
6913   __kmp_threads_capacity =
6914       __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6915   // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6916   __kmp_tp_capacity = __kmp_default_tp_capacity(
6917       __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6918 
6919   // If the library is shut down properly, both pools must be NULL. Just in
6920   // case, set them to NULL -- some memory may leak, but subsequent code will
6921   // work even if pools are not freed.
6922   KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6923   KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6924   KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6925   __kmp_thread_pool = NULL;
6926   __kmp_thread_pool_insert_pt = NULL;
6927   __kmp_team_pool = NULL;
6928 
6929   /* Allocate all of the variable sized records */
6930   /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6931    * expandable */
6932   /* Since allocation is cache-aligned, just add extra padding at the end */
6933   size =
6934       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6935       CACHE_LINE;
6936   __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6937   __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6938                                sizeof(kmp_info_t *) * __kmp_threads_capacity);
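  // Note: __kmp_root points into the tail of the same allocation as
  // __kmp_threads, so __kmp_cleanup() frees only __kmp_threads.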
6939 
6940   /* init thread counts */
  // These asserts fail if the library is reinitializing and something went
  // wrong during termination.
  KMP_DEBUG_ASSERT(__kmp_all_nth == 0);
  KMP_DEBUG_ASSERT(__kmp_nth == 0);
6944   __kmp_all_nth = 0;
6945   __kmp_nth = 0;
6946 
6947   /* setup the uber master thread and hierarchy */
6948   gtid = __kmp_register_root(TRUE);
6949   KA_TRACE(10, ("__kmp_do_serial_initialize  T#%d\n", gtid));
6950   KMP_ASSERT(KMP_UBER_GTID(gtid));
6951   KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6952 
6953   KMP_MB(); /* Flush all pending memory write invalidates.  */
6954 
6955   __kmp_common_initialize();
6956 
6957 #if KMP_OS_UNIX
  /* register the fork handlers */
6959   __kmp_register_atfork();
6960 #endif
6961 
6962 #if !KMP_DYNAMIC_LIB
6963   {
6964     /* Invoke the exit handler when the program finishes, only for static
6965        library. For dynamic library, we already have _fini and DllMain. */
6966     int rc = atexit(__kmp_internal_end_atexit);
6967     if (rc != 0) {
6968       __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6969                   __kmp_msg_null);
6970     }
6971   }
6972 #endif
6973 
6974 #if KMP_HANDLE_SIGNALS
6975 #if KMP_OS_UNIX
6976   /* NOTE: make sure that this is called before the user installs their own
6977      signal handlers so that the user handlers are called first. this way they
6978      can return false, not call our handler, avoid terminating the library, and
6979      continue execution where they left off. */
6980   __kmp_install_signals(FALSE);
6981 #endif /* KMP_OS_UNIX */
6982 #if KMP_OS_WINDOWS
6983   __kmp_install_signals(TRUE);
6984 #endif /* KMP_OS_WINDOWS */
6985 #endif
6986 
6987   /* we have finished the serial initialization */
6988   __kmp_init_counter++;
6989 
6990   __kmp_init_serial = TRUE;
6991 
6992   if (__kmp_settings) {
6993     __kmp_env_print();
6994   }
6995 
6996   if (__kmp_display_env || __kmp_display_env_verbose) {
6997     __kmp_env_print_2();
6998   }
6999 
7000 #if OMPT_SUPPORT
7001   ompt_post_init();
7002 #endif
7003 
7004   KMP_MB();
7005 
7006   KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
7007 }
7008 
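/* Thin wrapper that serializes library initialization: check the init flag,
   take the bootstrap lock, re-check under the lock, then do the real work.
   The same double-checked pattern is used by the middle, parallel and hidden
   helper initializers below. */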
7009 void __kmp_serial_initialize(void) {
7010   if (__kmp_init_serial) {
7011     return;
7012   }
7013   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7014   if (__kmp_init_serial) {
7015     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7016     return;
7017   }
7018   __kmp_do_serial_initialize();
7019   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7020 }
7021 
7022 static void __kmp_do_middle_initialize(void) {
7023   int i, j;
7024   int prev_dflt_team_nth;
7025 
7026   if (!__kmp_init_serial) {
7027     __kmp_do_serial_initialize();
7028   }
7029 
7030   KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
7031 
7032   // Save the previous value for the __kmp_dflt_team_nth so that
7033   // we can avoid some reinitialization if it hasn't changed.
7034   prev_dflt_team_nth = __kmp_dflt_team_nth;
7035 
7036 #if KMP_AFFINITY_SUPPORTED
7037   // __kmp_affinity_initialize() will try to set __kmp_ncores to the
7038   // number of cores on the machine.
7039   __kmp_affinity_initialize();
7040 
7041 #endif /* KMP_AFFINITY_SUPPORTED */
7042 
7043   KMP_ASSERT(__kmp_xproc > 0);
7044   if (__kmp_avail_proc == 0) {
7045     __kmp_avail_proc = __kmp_xproc;
7046   }
7047 
7048   // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
7049   // correct them now
7050   j = 0;
7051   while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
7052     __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7053         __kmp_avail_proc;
7054     j++;
7055   }
7056 
7057   if (__kmp_dflt_team_nth == 0) {
7058 #ifdef KMP_DFLT_NTH_CORES
7059     // Default #threads = #cores
7060     __kmp_dflt_team_nth = __kmp_ncores;
7061     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7062                   "__kmp_ncores (%d)\n",
7063                   __kmp_dflt_team_nth));
7064 #else
7065     // Default #threads = #available OS procs
7066     __kmp_dflt_team_nth = __kmp_avail_proc;
7067     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7068                   "__kmp_avail_proc(%d)\n",
7069                   __kmp_dflt_team_nth));
7070 #endif /* KMP_DFLT_NTH_CORES */
7071   }
7072 
7073   if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7074     __kmp_dflt_team_nth = KMP_MIN_NTH;
7075   }
7076   if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7077     __kmp_dflt_team_nth = __kmp_sys_max_nth;
7078   }
7079 
7080   if (__kmp_nesting_mode > 0)
7081     __kmp_set_nesting_mode_threads();
7082 
7083   // There's no harm in continuing if the following check fails,
7084   // but it indicates an error in the previous logic.
7085   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7086 
7087   if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7088     // Run through the __kmp_threads array and set the num threads icv for each
7089     // root thread that is currently registered with the RTL (which has not
7090     // already explicitly set its nthreads-var with a call to
7091     // omp_set_num_threads()).
7092     for (i = 0; i < __kmp_threads_capacity; i++) {
7093       kmp_info_t *thread = __kmp_threads[i];
7094       if (thread == NULL)
7095         continue;
7096       if (thread->th.th_current_task->td_icvs.nproc != 0)
7097         continue;
7098 
7099       set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7100     }
7101   }
7102   KA_TRACE(
7103       20,
7104       ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7105        __kmp_dflt_team_nth));
7106 
7107 #ifdef KMP_ADJUST_BLOCKTIME
  /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
7109   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7110     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7111     if (__kmp_nth > __kmp_avail_proc) {
7112       __kmp_zero_bt = TRUE;
7113     }
7114   }
7115 #endif /* KMP_ADJUST_BLOCKTIME */
7116 
7117   /* we have finished middle initialization */
7118   TCW_SYNC_4(__kmp_init_middle, TRUE);
7119 
7120   KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7121 }
7122 
7123 void __kmp_middle_initialize(void) {
7124   if (__kmp_init_middle) {
7125     return;
7126   }
7127   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7128   if (__kmp_init_middle) {
7129     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7130     return;
7131   }
7132   __kmp_do_middle_initialize();
7133   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7134 }
7135 
7136 void __kmp_parallel_initialize(void) {
7137   int gtid = __kmp_entry_gtid(); // this might be a new root
7138 
7139   /* synchronize parallel initialization (for sibling) */
7140   if (TCR_4(__kmp_init_parallel))
7141     return;
7142   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7143   if (TCR_4(__kmp_init_parallel)) {
7144     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7145     return;
7146   }
7147 
7148   /* TODO reinitialization after we have already shut down */
7149   if (TCR_4(__kmp_global.g.g_done)) {
7150     KA_TRACE(
7151         10,
7152         ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7153     __kmp_infinite_loop();
7154   }
7155 
7156   /* jc: The lock __kmp_initz_lock is already held, so calling
7157      __kmp_serial_initialize would cause a deadlock.  So we call
7158      __kmp_do_serial_initialize directly. */
7159   if (!__kmp_init_middle) {
7160     __kmp_do_middle_initialize();
7161   }
7162   __kmp_assign_root_init_mask();
7163   __kmp_resume_if_hard_paused();
7164 
7165   /* begin initialization */
7166   KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7167   KMP_ASSERT(KMP_UBER_GTID(gtid));
7168 
7169 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7170   // Save the FP control regs.
7171   // Worker threads will set theirs to these values at thread startup.
7172   __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7173   __kmp_store_mxcsr(&__kmp_init_mxcsr);
7174   __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7175 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7176 
7177 #if KMP_OS_UNIX
7178 #if KMP_HANDLE_SIGNALS
7179   /*  must be after __kmp_serial_initialize  */
7180   __kmp_install_signals(TRUE);
7181 #endif
7182 #endif
7183 
7184   __kmp_suspend_initialize();
7185 
7186 #if defined(USE_LOAD_BALANCE)
7187   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7188     __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7189   }
7190 #else
7191   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7192     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7193   }
7194 #endif
7195 
7196   if (__kmp_version) {
7197     __kmp_print_version_2();
7198   }
7199 
7200   /* we have finished parallel initialization */
7201   TCW_SYNC_4(__kmp_init_parallel, TRUE);
7202 
7203   KMP_MB();
7204   KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7205 
7206   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7207 }
7208 
7209 void __kmp_hidden_helper_initialize() {
7210   if (TCR_4(__kmp_init_hidden_helper))
7211     return;
7212 
7213   // __kmp_parallel_initialize is required before we initialize hidden helper
7214   if (!TCR_4(__kmp_init_parallel))
7215     __kmp_parallel_initialize();
7216 
  // Double check. Note that this double check should not be placed before
  // __kmp_parallel_initialize as it would cause a deadlock.
7219   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7220   if (TCR_4(__kmp_init_hidden_helper)) {
7221     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7222     return;
7223   }
7224 
7225   // Set the count of hidden helper tasks to be executed to zero
7226   KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7227 
7228   // Set the global variable indicating that we're initializing hidden helper
7229   // team/threads
7230   TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7231 
7232   // Platform independent initialization
7233   __kmp_do_initialize_hidden_helper_threads();
7234 
7235   // Wait here for the finish of initialization of hidden helper teams
7236   __kmp_hidden_helper_threads_initz_wait();
7237 
7238   // We have finished hidden helper initialization
7239   TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7240 
7241   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7242 }
7243 
7244 /* ------------------------------------------------------------------------ */
7245 
7246 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7247                                    kmp_team_t *team) {
7248   kmp_disp_t *dispatch;
7249 
7250   KMP_MB();
7251 
7252   /* none of the threads have encountered any constructs, yet. */
7253   this_thr->th.th_local.this_construct = 0;
7254 #if KMP_CACHE_MANAGE
7255   KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7256 #endif /* KMP_CACHE_MANAGE */
7257   dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7258   KMP_DEBUG_ASSERT(dispatch);
7259   KMP_DEBUG_ASSERT(team->t.t_dispatch);
7260   // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7261   // this_thr->th.th_info.ds.ds_tid ] );
7262 
7263   dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7264   dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7265   if (__kmp_env_consistency_check)
7266     __kmp_push_parallel(gtid, team->t.t_ident);
7267 
7268   KMP_MB(); /* Flush all pending memory write invalidates.  */
7269 }
7270 
7271 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7272                                   kmp_team_t *team) {
7273   if (__kmp_env_consistency_check)
7274     __kmp_pop_parallel(gtid, team->t.t_ident);
7275 
7276   __kmp_finish_implicit_task(this_thr);
7277 }
7278 
7279 int __kmp_invoke_task_func(int gtid) {
7280   int rc;
7281   int tid = __kmp_tid_from_gtid(gtid);
7282   kmp_info_t *this_thr = __kmp_threads[gtid];
7283   kmp_team_t *team = this_thr->th.th_team;
7284 
7285   __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7286 #if USE_ITT_BUILD
7287   if (__itt_stack_caller_create_ptr) {
7288     // inform ittnotify about entering user's code
7289     if (team->t.t_stack_id != NULL) {
7290       __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id);
7291     } else {
7292       KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7293       __kmp_itt_stack_callee_enter(
7294           (__itt_caller)team->t.t_parent->t.t_stack_id);
7295     }
7296   }
7297 #endif /* USE_ITT_BUILD */
7298 #if INCLUDE_SSC_MARKS
7299   SSC_MARK_INVOKING();
7300 #endif
7301 
7302 #if OMPT_SUPPORT
7303   void *dummy;
7304   void **exit_frame_p;
7305   ompt_data_t *my_task_data;
7306   ompt_data_t *my_parallel_data;
7307   int ompt_team_size;
7308 
7309   if (ompt_enabled.enabled) {
7310     exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
7311                          .ompt_task_info.frame.exit_frame.ptr);
7312   } else {
7313     exit_frame_p = &dummy;
7314   }
7315 
7316   my_task_data =
7317       &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7318   my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7319   if (ompt_enabled.ompt_callback_implicit_task) {
7320     ompt_team_size = team->t.t_nproc;
7321     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7322         ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7323         __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7324     OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7325   }
7326 #endif
7327 
7328 #if KMP_STATS_ENABLED
7329   stats_state_e previous_state = KMP_GET_THREAD_STATE();
7330   if (previous_state == stats_state_e::TEAMS_REGION) {
7331     KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7332   } else {
7333     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7334   }
7335   KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7336 #endif
7337 
7338   rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7339                               tid, (int)team->t.t_argc, (void **)team->t.t_argv
7340 #if OMPT_SUPPORT
7341                               ,
7342                               exit_frame_p
7343 #endif
7344   );
7345 #if OMPT_SUPPORT
7346   *exit_frame_p = NULL;
7347   this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7348 #endif
7349 
7350 #if KMP_STATS_ENABLED
7351   if (previous_state == stats_state_e::TEAMS_REGION) {
7352     KMP_SET_THREAD_STATE(previous_state);
7353   }
7354   KMP_POP_PARTITIONED_TIMER();
7355 #endif
7356 
7357 #if USE_ITT_BUILD
7358   if (__itt_stack_caller_create_ptr) {
7359     // inform ittnotify about leaving user's code
7360     if (team->t.t_stack_id != NULL) {
7361       __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id);
7362     } else {
7363       KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7364       __kmp_itt_stack_callee_leave(
7365           (__itt_caller)team->t.t_parent->t.t_stack_id);
7366     }
7367   }
7368 #endif /* USE_ITT_BUILD */
7369   __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7370 
7371   return rc;
7372 }
7373 
7374 void __kmp_teams_master(int gtid) {
7375   // This routine is called by all primary threads in teams construct
7376   kmp_info_t *thr = __kmp_threads[gtid];
7377   kmp_team_t *team = thr->th.th_team;
7378   ident_t *loc = team->t.t_ident;
7379   thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7380   KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7381   KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7382   KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7383                 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7384 
7385   // This thread is a new CG root.  Set up the proper variables.
7386   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7387   tmp->cg_root = thr; // Make thr the CG root
7388   // Init to thread limit stored when league primary threads were forked
7389   tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7390   tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7391   KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7392                  " cg_nthreads to 1\n",
7393                  thr, tmp));
7394   tmp->up = thr->th.th_cg_roots;
7395   thr->th.th_cg_roots = tmp;
7396 
// Launch the league of teams now, but do not let workers execute yet
// (they wait on the fork barrier until the next parallel region)
7399 #if INCLUDE_SSC_MARKS
7400   SSC_MARK_FORKING();
7401 #endif
7402   __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7403                   (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7404                   VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7405 #if INCLUDE_SSC_MARKS
7406   SSC_MARK_JOINING();
7407 #endif
7408   // If the team size was reduced from the limit, set it to the new size
7409   if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7410     thr->th.th_teams_size.nth = thr->th.th_team_nproc;
  // AC: last parameter "1" eliminates the join barrier, which won't work
  // because worker threads are in a fork barrier waiting for more parallel
  // regions
7413   __kmp_join_call(loc, gtid
7414 #if OMPT_SUPPORT
7415                   ,
7416                   fork_context_intel
7417 #endif
7418                   ,
7419                   1);
7420 }
7421 
7422 int __kmp_invoke_teams_master(int gtid) {
7423   kmp_info_t *this_thr = __kmp_threads[gtid];
7424   kmp_team_t *team = this_thr->th.th_team;
7425 #if KMP_DEBUG
7426   if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7427     KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7428                      (void *)__kmp_teams_master);
7429 #endif
7430   __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7431 #if OMPT_SUPPORT
7432   int tid = __kmp_tid_from_gtid(gtid);
7433   ompt_data_t *task_data =
7434       &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7435   ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7436   if (ompt_enabled.ompt_callback_implicit_task) {
7437     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7438         ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7439         ompt_task_initial);
7440     OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7441   }
7442 #endif
7443   __kmp_teams_master(gtid);
7444 #if OMPT_SUPPORT
7445   this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7446 #endif
7447   __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7448   return 1;
7449 }
7450 
/* This sets the requested number of threads for the next parallel region
   encountered by this team. Since this should be enclosed in the fork/join
   critical section, it avoids race conditions with asymmetrical nested
   parallelism. */
7455 
7456 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7457   kmp_info_t *thr = __kmp_threads[gtid];
7458 
7459   if (num_threads > 0)
7460     thr->th.th_set_nproc = num_threads;
7461 }
7462 
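/* Compute the per-team thread limit (th_teams_size.nth) for a teams construct:
   start from the thread_limit clause value if one was given, otherwise from
   KMP_TEAMS_THREAD_LIMIT or __kmp_avail_proc / num_teams, then clamp against
   nthreads-var, thread-limit-var (when no clause was given) and
   __kmp_teams_max_nth. */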
7463 static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
7464                                     int num_threads) {
7465   KMP_DEBUG_ASSERT(thr);
7466   // Remember the number of threads for inner parallel regions
7467   if (!TCR_4(__kmp_init_middle))
7468     __kmp_middle_initialize(); // get internal globals calculated
7469   __kmp_assign_root_init_mask();
7470   KMP_DEBUG_ASSERT(__kmp_avail_proc);
7471   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7472 
7473   if (num_threads == 0) {
7474     if (__kmp_teams_thread_limit > 0) {
7475       num_threads = __kmp_teams_thread_limit;
7476     } else {
7477       num_threads = __kmp_avail_proc / num_teams;
7478     }
    // Adjust num_threads without a warning since it is not a user setting.
    // num_threads = min(num_threads, nthreads-var, thread-limit-var)
    // No thread_limit clause specified - do not change thread-limit-var ICV
7482     if (num_threads > __kmp_dflt_team_nth) {
7483       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7484     }
7485     if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7486       num_threads = thr->th.th_current_task->td_icvs.thread_limit;
    } // prevent the team size from exceeding thread-limit-var
7488     if (num_teams * num_threads > __kmp_teams_max_nth) {
7489       num_threads = __kmp_teams_max_nth / num_teams;
7490     }
7491     if (num_threads == 0) {
7492       num_threads = 1;
7493     }
7494   } else {
    // This thread will be the primary thread of the league's primary threads.
    // Store the new thread limit; the old limit is saved in the th_cg_roots
    // list.
7497     thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7498     // num_threads = min(num_threads, nthreads-var)
7499     if (num_threads > __kmp_dflt_team_nth) {
7500       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7501     }
7502     if (num_teams * num_threads > __kmp_teams_max_nth) {
7503       int new_threads = __kmp_teams_max_nth / num_teams;
7504       if (new_threads == 0) {
7505         new_threads = 1;
7506       }
7507       if (new_threads != num_threads) {
7508         if (!__kmp_reserve_warn) { // user asked for too many threads
7509           __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7510           __kmp_msg(kmp_ms_warning,
7511                     KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7512                     KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7513         }
7514       }
7515       num_threads = new_threads;
7516     }
7517   }
7518   thr->th.th_teams_size.nth = num_threads;
7519 }
7520 
/* This sets the requested number of teams for the teams region and/or
   the number of threads for the next parallel region encountered. */
7523 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7524                           int num_threads) {
7525   kmp_info_t *thr = __kmp_threads[gtid];
7526   KMP_DEBUG_ASSERT(num_teams >= 0);
7527   KMP_DEBUG_ASSERT(num_threads >= 0);
7528 
7529   if (num_teams == 0) {
7530     if (__kmp_nteams > 0) {
7531       num_teams = __kmp_nteams;
7532     } else {
7533       num_teams = 1; // default number of teams is 1.
7534     }
7535   }
  if (num_teams > __kmp_teams_max_nth) { // too many teams requested?
7537     if (!__kmp_reserve_warn) {
7538       __kmp_reserve_warn = 1;
7539       __kmp_msg(kmp_ms_warning,
7540                 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7541                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7542     }
7543     num_teams = __kmp_teams_max_nth;
7544   }
7545   // Set number of teams (number of threads in the outer "parallel" of the
7546   // teams)
7547   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7548 
7549   __kmp_push_thread_limit(thr, num_teams, num_threads);
7550 }
7551 
7552 /* This sets the requested number of teams for the teams region and/or
7553    the number of threads for the next parallel region encountered  */
7554 void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
7555                              int num_teams_ub, int num_threads) {
7556   kmp_info_t *thr = __kmp_threads[gtid];
7557   KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
7558   KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
7559   KMP_DEBUG_ASSERT(num_threads >= 0);
7560 
7561   if (num_teams_lb > num_teams_ub) {
7562     __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
7563                 KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
7564   }
7565 
  int num_teams = 1; // default number of teams is 1.
7567 
7568   if (num_teams_lb == 0 && num_teams_ub > 0)
7569     num_teams_lb = num_teams_ub;
7570 
7571   if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
7572     num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
7573     if (num_teams > __kmp_teams_max_nth) {
7574       if (!__kmp_reserve_warn) {
7575         __kmp_reserve_warn = 1;
7576         __kmp_msg(kmp_ms_warning,
7577                   KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7578                   KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7579       }
7580       num_teams = __kmp_teams_max_nth;
7581     }
7582   } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
7583     num_teams = num_teams_ub;
7584   } else { // num_teams_lb <= num_teams <= num_teams_ub
7585     if (num_threads == 0) {
7586       if (num_teams_ub > __kmp_teams_max_nth) {
7587         num_teams = num_teams_lb;
7588       } else {
7589         num_teams = num_teams_ub;
7590       }
7591     } else {
7592       num_teams = (num_threads > __kmp_teams_max_nth)
7593                       ? num_teams
7594                       : __kmp_teams_max_nth / num_threads;
7595       if (num_teams < num_teams_lb) {
7596         num_teams = num_teams_lb;
7597       } else if (num_teams > num_teams_ub) {
7598         num_teams = num_teams_ub;
7599       }
7600     }
7601   }
7602   // Set number of teams (number of threads in the outer "parallel" of the
7603   // teams)
7604   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7605 
7606   __kmp_push_thread_limit(thr, num_teams, num_threads);
7607 }
7608 
7609 // Set the proc_bind var to use in the following parallel region.
7610 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7611   kmp_info_t *thr = __kmp_threads[gtid];
7612   thr->th.th_set_proc_bind = proc_bind;
7613 }
7614 
7615 /* Launch the worker threads into the microtask. */
7616 
7617 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7618   kmp_info_t *this_thr = __kmp_threads[gtid];
7619 
7620 #ifdef KMP_DEBUG
7621   int f;
7622 #endif /* KMP_DEBUG */
7623 
7624   KMP_DEBUG_ASSERT(team);
7625   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7626   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7627   KMP_MB(); /* Flush all pending memory write invalidates.  */
7628 
7629   team->t.t_construct = 0; /* no single directives seen yet */
7630   team->t.t_ordered.dt.t_value =
7631       0; /* thread 0 enters the ordered section first */
7632 
7633   /* Reset the identifiers on the dispatch buffer */
7634   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7635   if (team->t.t_max_nproc > 1) {
7636     int i;
7637     for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7638       team->t.t_disp_buffer[i].buffer_index = i;
7639       team->t.t_disp_buffer[i].doacross_buf_idx = i;
7640     }
7641   } else {
7642     team->t.t_disp_buffer[0].buffer_index = 0;
7643     team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7644   }
7645 
7646   KMP_MB(); /* Flush all pending memory write invalidates.  */
7647   KMP_ASSERT(this_thr->th.th_team == team);
7648 
7649 #ifdef KMP_DEBUG
7650   for (f = 0; f < team->t.t_nproc; f++) {
7651     KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7652                      team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7653   }
7654 #endif /* KMP_DEBUG */
7655 
7656   /* release the worker threads so they may begin working */
7657   __kmp_fork_barrier(gtid, 0);
7658 }
7659 
7660 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7661   kmp_info_t *this_thr = __kmp_threads[gtid];
7662 
7663   KMP_DEBUG_ASSERT(team);
7664   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7665   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7666   KMP_MB(); /* Flush all pending memory write invalidates.  */
7667 
7668   /* Join barrier after fork */
7669 
7670 #ifdef KMP_DEBUG
7671   if (__kmp_threads[gtid] &&
7672       __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7673     __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7674                  __kmp_threads[gtid]);
7675     __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7676                  "team->t.t_nproc=%d\n",
7677                  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7678                  team->t.t_nproc);
7679     __kmp_print_structure();
7680   }
7681   KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7682                    __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7683 #endif /* KMP_DEBUG */
7684 
7685   __kmp_join_barrier(gtid); /* wait for everyone */
7686 #if OMPT_SUPPORT
7687   if (ompt_enabled.enabled &&
7688       this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7689     int ds_tid = this_thr->th.th_info.ds.ds_tid;
7690     ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7691     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7692 #if OMPT_OPTIONAL
7693     void *codeptr = NULL;
7694     if (KMP_MASTER_TID(ds_tid) &&
7695         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7696          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7697       codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7698 
7699     if (ompt_enabled.ompt_callback_sync_region_wait) {
7700       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7701           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7702           codeptr);
7703     }
7704     if (ompt_enabled.ompt_callback_sync_region) {
7705       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7706           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7707           codeptr);
7708     }
7709 #endif
7710     if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7711       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7712           ompt_scope_end, NULL, task_data, 0, ds_tid,
7713           ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7714     }
7715   }
7716 #endif
7717 
7718   KMP_MB(); /* Flush all pending memory write invalidates.  */
7719   KMP_ASSERT(this_thr->th.th_team == team);
7720 }
7721 
7722 /* ------------------------------------------------------------------------ */
7723 
7724 #ifdef USE_LOAD_BALANCE
7725 
// Return the number of worker threads actively spinning in the hot team, if
// we are at the outermost level of parallelism. Otherwise, return 0.
7728 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7729   int i;
7730   int retval;
7731   kmp_team_t *hot_team;
7732 
7733   if (root->r.r_active) {
7734     return 0;
7735   }
7736   hot_team = root->r.r_hot_team;
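  // With "infinite" blocktime workers never sleep, so all of them count as
  // actively spinning.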
7737   if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7738     return hot_team->t.t_nproc - 1; // Don't count primary thread
7739   }
7740 
7741   // Skip the primary thread - it is accounted for elsewhere.
7742   retval = 0;
7743   for (i = 1; i < hot_team->t.t_nproc; i++) {
7744     if (hot_team->t.t_threads[i]->th.th_active) {
7745       retval++;
7746     }
7747   }
7748   return retval;
7749 }
7750 
7751 // Perform an automatic adjustment to the number of
7752 // threads used by the next parallel region.
7753 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7754   int retval;
7755   int pool_active;
7756   int hot_team_active;
7757   int team_curr_active;
7758   int system_active;
7759 
7760   KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7761                 set_nproc));
7762   KMP_DEBUG_ASSERT(root);
7763   KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7764                        ->th.th_current_task->td_icvs.dynamic == TRUE);
7765   KMP_DEBUG_ASSERT(set_nproc > 1);
7766 
7767   if (set_nproc == 1) {
7768     KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7769     return 1;
7770   }
7771 
  // Threads that are active in the thread pool, active in the hot team for
  // this particular root (if we are at the outermost parallel level), and the
  // currently executing thread (to become the primary thread) are available
  // to add to the new team, but are currently contributing to the system
  // load, and must be accounted for.
7777   pool_active = __kmp_thread_pool_active_nth;
7778   hot_team_active = __kmp_active_hot_team_nproc(root);
7779   team_curr_active = pool_active + hot_team_active + 1;
7780 
7781   // Check the system load.
7782   system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7783   KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7784                 "hot team active = %d\n",
7785                 system_active, pool_active, hot_team_active));
7786 
7787   if (system_active < 0) {
7788     // There was an error reading the necessary info from /proc, so use the
7789     // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7790     // = dynamic_thread_limit, we shouldn't wind up getting back here.
7791     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7792     KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7793 
7794     // Make this call behave like the thread limit algorithm.
7795     retval = __kmp_avail_proc - __kmp_nth +
7796              (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7797     if (retval > set_nproc) {
7798       retval = set_nproc;
7799     }
7800     if (retval < KMP_MIN_NTH) {
7801       retval = KMP_MIN_NTH;
7802     }
7803 
7804     KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7805                   retval));
7806     return retval;
7807   }
7808 
  // There is a slight delay in the load balance algorithm in detecting new
  // running procs. The real system load at this instant should be at least as
  // large as the number of active OMP threads available to add to the team.
7812   if (system_active < team_curr_active) {
7813     system_active = team_curr_active;
7814   }
7815   retval = __kmp_avail_proc - system_active + team_curr_active;
7816   if (retval > set_nproc) {
7817     retval = set_nproc;
7818   }
7819   if (retval < KMP_MIN_NTH) {
7820     retval = KMP_MIN_NTH;
7821   }
7822 
7823   KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7824   return retval;
7825 } // __kmp_load_balance_nproc()
7826 
7827 #endif /* USE_LOAD_BALANCE */
7828 
7829 /* ------------------------------------------------------------------------ */
7830 
7831 /* NOTE: this is called with the __kmp_init_lock held */
7832 void __kmp_cleanup(void) {
7833   int f;
7834 
7835   KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7836 
7837   if (TCR_4(__kmp_init_parallel)) {
7838 #if KMP_HANDLE_SIGNALS
7839     __kmp_remove_signals();
7840 #endif
7841     TCW_4(__kmp_init_parallel, FALSE);
7842   }
7843 
7844   if (TCR_4(__kmp_init_middle)) {
7845 #if KMP_AFFINITY_SUPPORTED
7846     __kmp_affinity_uninitialize();
7847 #endif /* KMP_AFFINITY_SUPPORTED */
7848     __kmp_cleanup_hierarchy();
7849     TCW_4(__kmp_init_middle, FALSE);
7850   }
7851 
7852   KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7853 
7854   if (__kmp_init_serial) {
7855     __kmp_runtime_destroy();
7856     __kmp_init_serial = FALSE;
7857   }
7858 
7859   __kmp_cleanup_threadprivate_caches();
7860 
7861   for (f = 0; f < __kmp_threads_capacity; f++) {
7862     if (__kmp_root[f] != NULL) {
7863       __kmp_free(__kmp_root[f]);
7864       __kmp_root[f] = NULL;
7865     }
7866   }
7867   __kmp_free(__kmp_threads);
  // __kmp_threads and __kmp_root were allocated at once, as a single block, so
  // there is no need to free __kmp_root separately.
7870   __kmp_threads = NULL;
7871   __kmp_root = NULL;
7872   __kmp_threads_capacity = 0;
7873 
7874 #if KMP_USE_DYNAMIC_LOCK
7875   __kmp_cleanup_indirect_user_locks();
7876 #else
7877   __kmp_cleanup_user_locks();
7878 #endif
7879 #if OMPD_SUPPORT
7880   if (ompd_state) {
7881     __kmp_free(ompd_env_block);
7882     ompd_env_block = NULL;
7883     ompd_env_block_size = 0;
7884   }
7885 #endif
7886 
7887 #if KMP_AFFINITY_SUPPORTED
7888   KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7889   __kmp_cpuinfo_file = NULL;
7890 #endif /* KMP_AFFINITY_SUPPORTED */
7891 
7892 #if KMP_USE_ADAPTIVE_LOCKS
7893 #if KMP_DEBUG_ADAPTIVE_LOCKS
7894   __kmp_print_speculative_stats();
7895 #endif
7896 #endif
7897   KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7898   __kmp_nested_nth.nth = NULL;
7899   __kmp_nested_nth.size = 0;
7900   __kmp_nested_nth.used = 0;
7901   KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7902   __kmp_nested_proc_bind.bind_types = NULL;
7903   __kmp_nested_proc_bind.size = 0;
7904   __kmp_nested_proc_bind.used = 0;
7905   if (__kmp_affinity_format) {
7906     KMP_INTERNAL_FREE(__kmp_affinity_format);
7907     __kmp_affinity_format = NULL;
7908   }
7909 
7910   __kmp_i18n_catclose();
7911 
7912 #if KMP_USE_HIER_SCHED
7913   __kmp_hier_scheds.deallocate();
7914 #endif
7915 
7916 #if KMP_STATS_ENABLED
7917   __kmp_stats_fini();
7918 #endif
7919 
7920   KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7921 }
7922 
7923 /* ------------------------------------------------------------------------ */
7924 
7925 int __kmp_ignore_mppbeg(void) {
7926   char *env;
7927 
7928   if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7929     if (__kmp_str_match_false(env))
7930       return FALSE;
7931   }
  // By default __kmpc_begin() is a no-op.
7933   return TRUE;
7934 }
7935 
7936 int __kmp_ignore_mppend(void) {
7937   char *env;
7938 
7939   if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7940     if (__kmp_str_match_false(env))
7941       return FALSE;
7942   }
  // By default __kmpc_end() is a no-op.
7944   return TRUE;
7945 }
7946 
7947 void __kmp_internal_begin(void) {
7948   int gtid;
7949   kmp_root_t *root;
7950 
  /* This is a very important step as it will register new sibling threads
     and assign these new uber threads a new gtid. */
7953   gtid = __kmp_entry_gtid();
7954   root = __kmp_threads[gtid]->th.th_root;
7955   KMP_ASSERT(KMP_UBER_GTID(gtid));
7956 
7957   if (root->r.r_begin)
7958     return;
7959   __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7960   if (root->r.r_begin) {
7961     __kmp_release_lock(&root->r.r_begin_lock, gtid);
7962     return;
7963   }
7964 
7965   root->r.r_begin = TRUE;
7966 
7967   __kmp_release_lock(&root->r.r_begin_lock, gtid);
7968 }
7969 
7970 /* ------------------------------------------------------------------------ */
7971 
7972 void __kmp_user_set_library(enum library_type arg) {
7973   int gtid;
7974   kmp_root_t *root;
7975   kmp_info_t *thread;
7976 
7977   /* first, make sure we are initialized so we can get our gtid */
7978 
7979   gtid = __kmp_entry_gtid();
7980   thread = __kmp_threads[gtid];
7981 
7982   root = thread->th.th_root;
7983 
7984   KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7985                 library_serial));
7986   if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7987                                   thread */
7988     KMP_WARNING(SetLibraryIncorrectCall);
7989     return;
7990   }
7991 
7992   switch (arg) {
7993   case library_serial:
7994     thread->th.th_set_nproc = 0;
7995     set__nproc(thread, 1);
7996     break;
7997   case library_turnaround:
7998     thread->th.th_set_nproc = 0;
7999     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8000                                            : __kmp_dflt_team_nth_ub);
8001     break;
8002   case library_throughput:
8003     thread->th.th_set_nproc = 0;
8004     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8005                                            : __kmp_dflt_team_nth_ub);
8006     break;
8007   default:
8008     KMP_FATAL(UnknownLibraryType, arg);
8009   }
8010 
8011   __kmp_aux_set_library(arg);
8012 }
8013 
8014 void __kmp_aux_set_stacksize(size_t arg) {
8015   if (!__kmp_init_serial)
8016     __kmp_serial_initialize();
8017 
8018 #if KMP_OS_DARWIN
8019   if (arg & (0x1000 - 1)) {
8020     arg &= ~(0x1000 - 1);
8021     if (arg + 0x1000) /* check for overflow if we round up */
8022       arg += 0x1000;
8023   }
8024 #endif
8025   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8026 
8027   /* only change the default stacksize before the first parallel region */
8028   if (!TCR_4(__kmp_init_parallel)) {
8029     size_t value = arg; /* argument is in bytes */
8030 
8031     if (value < __kmp_sys_min_stksize)
8032       value = __kmp_sys_min_stksize;
8033     else if (value > KMP_MAX_STKSIZE)
8034       value = KMP_MAX_STKSIZE;
8035 
8036     __kmp_stksize = value;
8037 
8038     __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
8039   }
8040 
8041   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8042 }
8043 
8044 /* set the behaviour of the runtime library */
8045 /* TODO this can cause some odd behaviour with sibling parallelism... */
8046 void __kmp_aux_set_library(enum library_type arg) {
8047   __kmp_library = arg;
8048 
8049   switch (__kmp_library) {
8050   case library_serial: {
8051     KMP_INFORM(LibraryIsSerial);
8052   } break;
8053   case library_turnaround:
8054     if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
8055       __kmp_use_yield = 2; // only yield when oversubscribed
8056     break;
8057   case library_throughput:
8058     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
8059       __kmp_dflt_blocktime = 200;
8060     break;
8061   default:
8062     KMP_FATAL(UnknownLibraryType, arg);
8063   }
8064 }
8065 
/* Get team information common to all teams-related API routines */
// Returns NULL if not in a teams construct
8068 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
8069   kmp_info_t *thr = __kmp_entry_thread();
8070   teams_serialized = 0;
8071   if (thr->th.th_teams_microtask) {
8072     kmp_team_t *team = thr->th.th_team;
8073     int tlevel = thr->th.th_teams_level; // the level of the teams construct
8074     int ii = team->t.t_level;
8075     teams_serialized = team->t.t_serialized;
8076     int level = tlevel + 1;
8077     KMP_DEBUG_ASSERT(ii >= tlevel);
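    // Walk up from the current nesting level (ii) toward level tlevel + 1,
    // stepping over serialized regions; the resulting team is the one whose
    // master tid is the team number and whose parent is the league.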
8078     while (ii > level) {
8079       for (teams_serialized = team->t.t_serialized;
8080            (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
8081       }
8082       if (team->t.t_serialized && (!teams_serialized)) {
8083         team = team->t.t_parent;
8084         continue;
8085       }
8086       if (ii > level) {
8087         team = team->t.t_parent;
8088         ii--;
8089       }
8090     }
8091     return team;
8092   }
8093   return NULL;
8094 }
8095 
8096 int __kmp_aux_get_team_num() {
8097   int serialized;
8098   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8099   if (team) {
8100     if (serialized > 1) {
8101       return 0; // teams region is serialized ( 1 team of 1 thread ).
8102     } else {
8103       return team->t.t_master_tid;
8104     }
8105   }
8106   return 0;
8107 }
8108 
8109 int __kmp_aux_get_num_teams() {
8110   int serialized;
8111   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8112   if (team) {
8113     if (serialized > 1) {
8114       return 1;
8115     } else {
8116       return team->t.t_parent->t.t_nproc;
8117     }
8118   }
8119   return 1;
8120 }
8121 
8122 /* ------------------------------------------------------------------------ */
8123 
8124 /*
8125  * Affinity Format Parser
8126  *
 * A field has the form: %[[[0].]size]type
8128  * % and type are required (%% means print a literal '%')
8129  * type is either single char or long name surrounded by {},
8130  * e.g., N or {num_threads}
8131  * 0 => leading zeros
8132  * . => right justified when size is specified
8133  * by default output is left justified
8134  * size is the *minimum* field length
8135  * All other characters are printed as is
8136  *
8137  * Available field types:
 * t {team_num}          - omp_get_team_num()
 * T {num_teams}         - omp_get_num_teams()
 * L {nesting_level}     - omp_get_level()
 * n {thread_num}        - omp_get_thread_num()
 * N {num_threads}       - omp_get_num_threads()
 * a {ancestor_tnum}     - omp_get_ancestor_thread_num(omp_get_level()-1)
 * H {host}              - name of host machine
 * P {process_id}        - process id (integer)
 * i {native_thread_id}  - native thread identifier (integer)
 * A {thread_affinity}   - comma separated list of integers or integer ranges
 *                         (values of affinity mask)
8147  *
8148  * Implementation-specific field types can be added
8149  * If a type is unknown, print "undefined"
8150  */
8151 
8152 // Structure holding the short name, long name, and corresponding data type
8153 // for snprintf.  A table of these will represent the entire valid keyword
8154 // field types.
8155 typedef struct kmp_affinity_format_field_t {
8156   char short_name; // from spec e.g., L -> thread level
8157   const char *long_name; // from spec thread_level -> thread level
  char field_format; // data type for snprintf (typically 'd' or 's'
                     // for integer or string)
8160 } kmp_affinity_format_field_t;
8161 
8162 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
8163 #if KMP_AFFINITY_SUPPORTED
8164     {'A', "thread_affinity", 's'},
8165 #endif
8166     {'t', "team_num", 'd'},
8167     {'T', "num_teams", 'd'},
8168     {'L', "nesting_level", 'd'},
8169     {'n', "thread_num", 'd'},
8170     {'N', "num_threads", 'd'},
8171     {'a', "ancestor_tnum", 'd'},
8172     {'H', "host", 's'},
8173     {'P', "process_id", 'd'},
8174     {'i', "native_thread_id", 'd'}};
8175 
// Return the number of characters it takes to hold the field
8177 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8178                                             const char **ptr,
8179                                             kmp_str_buf_t *field_buffer) {
8180   int rc, format_index, field_value;
8181   const char *width_left, *width_right;
8182   bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8183   static const int FORMAT_SIZE = 20;
8184   char format[FORMAT_SIZE] = {0};
8185   char absolute_short_name = 0;
8186 
8187   KMP_DEBUG_ASSERT(gtid >= 0);
8188   KMP_DEBUG_ASSERT(th);
8189   KMP_DEBUG_ASSERT(**ptr == '%');
8190   KMP_DEBUG_ASSERT(field_buffer);
8191 
8192   __kmp_str_buf_clear(field_buffer);
8193 
8194   // Skip the initial %
8195   (*ptr)++;
8196 
8197   // Check for %% first
8198   if (**ptr == '%') {
8199     __kmp_str_buf_cat(field_buffer, "%", 1);
8200     (*ptr)++; // skip over the second %
8201     return 1;
8202   }
8203 
8204   // Parse field modifiers if they are present
8205   pad_zeros = false;
8206   if (**ptr == '0') {
8207     pad_zeros = true;
8208     (*ptr)++; // skip over 0
8209   }
8210   right_justify = false;
8211   if (**ptr == '.') {
8212     right_justify = true;
8213     (*ptr)++; // skip over .
8214   }
8215   // Parse width of field: [width_left, width_right)
8216   width_left = width_right = NULL;
8217   if (**ptr >= '0' && **ptr <= '9') {
8218     width_left = *ptr;
8219     SKIP_DIGITS(*ptr);
8220     width_right = *ptr;
8221   }
8222 
8223   // Create the format for KMP_SNPRINTF based on flags parsed above
8224   format_index = 0;
8225   format[format_index++] = '%';
8226   if (!right_justify)
8227     format[format_index++] = '-';
8228   if (pad_zeros)
8229     format[format_index++] = '0';
8230   if (width_left && width_right) {
8231     int i = 0;
    // Only allow widths of up to 8 digits.
    // This also prevents overflowing the format buffer.
8234     while (i < 8 && width_left < width_right) {
8235       format[format_index++] = *width_left;
8236       width_left++;
8237       i++;
8238     }
8239   }
8240 
8241   // Parse a name (long or short)
8242   // Canonicalize the name into absolute_short_name
8243   found_valid_name = false;
8244   parse_long_name = (**ptr == '{');
8245   if (parse_long_name)
8246     (*ptr)++; // skip initial left brace
8247   for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8248                              sizeof(__kmp_affinity_format_table[0]);
8249        ++i) {
8250     char short_name = __kmp_affinity_format_table[i].short_name;
8251     const char *long_name = __kmp_affinity_format_table[i].long_name;
8252     char field_format = __kmp_affinity_format_table[i].field_format;
8253     if (parse_long_name) {
8254       size_t length = KMP_STRLEN(long_name);
8255       if (strncmp(*ptr, long_name, length) == 0) {
8256         found_valid_name = true;
8257         (*ptr) += length; // skip the long name
8258       }
8259     } else if (**ptr == short_name) {
8260       found_valid_name = true;
8261       (*ptr)++; // skip the short name
8262     }
8263     if (found_valid_name) {
8264       format[format_index++] = field_format;
8265       format[format_index++] = '\0';
8266       absolute_short_name = short_name;
8267       break;
8268     }
8269   }
8270   if (parse_long_name) {
8271     if (**ptr != '}') {
8272       absolute_short_name = 0;
8273     } else {
8274       (*ptr)++; // skip over the right brace
8275     }
8276   }
8277 
8278   // Attempt to fill the buffer with the requested
8279   // value using snprintf within __kmp_str_buf_print()
8280   switch (absolute_short_name) {
8281   case 't':
8282     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8283     break;
8284   case 'T':
8285     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8286     break;
8287   case 'L':
8288     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8289     break;
8290   case 'n':
8291     rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8292     break;
8293   case 'H': {
8294     static const int BUFFER_SIZE = 256;
8295     char buf[BUFFER_SIZE];
8296     __kmp_expand_host_name(buf, BUFFER_SIZE);
8297     rc = __kmp_str_buf_print(field_buffer, format, buf);
8298   } break;
8299   case 'P':
8300     rc = __kmp_str_buf_print(field_buffer, format, getpid());
8301     break;
8302   case 'i':
8303     rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8304     break;
8305   case 'N':
8306     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8307     break;
8308   case 'a':
8309     field_value =
8310         __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8311     rc = __kmp_str_buf_print(field_buffer, format, field_value);
8312     break;
8313 #if KMP_AFFINITY_SUPPORTED
8314   case 'A': {
8315     kmp_str_buf_t buf;
8316     __kmp_str_buf_init(&buf);
8317     __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8318     rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8319     __kmp_str_buf_free(&buf);
8320   } break;
8321 #endif
8322   default:
    // According to the spec, if an implementation does not have info for a
    // field type, then "undefined" is printed.
8325     rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8326     // Skip the field
8327     if (parse_long_name) {
8328       SKIP_TOKEN(*ptr);
8329       if (**ptr == '}')
8330         (*ptr)++;
8331     } else {
8332       (*ptr)++;
8333     }
8334   }
8335 
8336   KMP_ASSERT(format_index <= FORMAT_SIZE);
8337   return rc;
8338 }
8339 
8340 /*
8341  * Return number of characters needed to hold the affinity string
8342  * (not including null byte character)
8343  * The resultant string is printed to buffer, which the caller can then
8344  * handle afterwards
8345  */
8346 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8347                                   kmp_str_buf_t *buffer) {
8348   const char *parse_ptr;
8349   size_t retval;
8350   const kmp_info_t *th;
8351   kmp_str_buf_t field;
8352 
8353   KMP_DEBUG_ASSERT(buffer);
8354   KMP_DEBUG_ASSERT(gtid >= 0);
8355 
8356   __kmp_str_buf_init(&field);
8357   __kmp_str_buf_clear(buffer);
8358 
8359   th = __kmp_threads[gtid];
8360   retval = 0;
8361 
  // If format is NULL or a zero-length string, then we use the
  // affinity-format-var ICV
8364   parse_ptr = format;
8365   if (parse_ptr == NULL || *parse_ptr == '\0') {
8366     parse_ptr = __kmp_affinity_format;
8367   }
8368   KMP_DEBUG_ASSERT(parse_ptr);
8369 
8370   while (*parse_ptr != '\0') {
8371     // Parse a field
8372     if (*parse_ptr == '%') {
8373       // Put field in the buffer
8374       int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8375       __kmp_str_buf_catbuf(buffer, &field);
8376       retval += rc;
8377     } else {
8378       // Put literal character in buffer
8379       __kmp_str_buf_cat(buffer, parse_ptr, 1);
8380       retval++;
8381       parse_ptr++;
8382     }
8383   }
8384   __kmp_str_buf_free(&field);
8385   return retval;
8386 }
8387 
8388 // Displays the affinity string to stdout
8389 void __kmp_aux_display_affinity(int gtid, const char *format) {
8390   kmp_str_buf_t buf;
8391   __kmp_str_buf_init(&buf);
8392   __kmp_aux_capture_affinity(gtid, format, &buf);
8393   __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8394   __kmp_str_buf_free(&buf);
8395 }
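
// Illustrative usage sketch (not part of the runtime; assumes it runs on an
// OpenMP thread with a valid gtid, and the format string is only an example;
// %A assumes KMP_AFFINITY_SUPPORTED is enabled):
//
//   kmp_str_buf_t buf;
//   __kmp_str_buf_init(&buf);
//   size_t needed =
//       __kmp_aux_capture_affinity(gtid, "pid %P tid %n bound to %A", &buf);
//   __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
//   __kmp_str_buf_free(&buf);
//
// Passing format == NULL (or "") makes the capture fall back to the
// affinity-format-var ICV stored in __kmp_affinity_format.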
8396 
8397 /* ------------------------------------------------------------------------ */
8398 
8399 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8400   int blocktime = arg; /* argument is in milliseconds */
8401 #if KMP_USE_MONITOR
8402   int bt_intervals;
8403 #endif
8404   kmp_int8 bt_set;
8405 
8406   __kmp_save_internal_controls(thread);
8407 
8408   /* Normalize and set blocktime for the teams */
8409   if (blocktime < KMP_MIN_BLOCKTIME)
8410     blocktime = KMP_MIN_BLOCKTIME;
8411   else if (blocktime > KMP_MAX_BLOCKTIME)
8412     blocktime = KMP_MAX_BLOCKTIME;
8413 
8414   set__blocktime_team(thread->th.th_team, tid, blocktime);
8415   set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8416 
8417 #if KMP_USE_MONITOR
8418   /* Calculate and set blocktime intervals for the teams */
8419   bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8420 
8421   set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8422   set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8423 #endif
8424 
  /* Record that blocktime has been explicitly set, i.e. bt_set == TRUE */
8426   bt_set = TRUE;
8427 
8428   set__bt_set_team(thread->th.th_team, tid, bt_set);
8429   set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8430 #if KMP_USE_MONITOR
8431   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8432                 "bt_intervals=%d, monitor_updates=%d\n",
8433                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8434                 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8435                 __kmp_monitor_wakeups));
8436 #else
8437   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8438                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8439                 thread->th.th_team->t.t_id, tid, blocktime));
8440 #endif
8441 }
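
// Worked example (values illustrative): a call such as kmp_set_blocktime(200)
// from user code ends up here; the 200 ms argument is clamped into
// [KMP_MIN_BLOCKTIME, KMP_MAX_BLOCKTIME], stored in the blocktime ICV of both
// the current team slot and the serial team, and bt_set is raised so later
// code knows the value was chosen explicitly rather than defaulted.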
8442 
8443 void __kmp_aux_set_defaults(char const *str, size_t len) {
8444   if (!__kmp_init_serial) {
8445     __kmp_serial_initialize();
8446   }
8447   __kmp_env_initialize(str);
8448 
8449   if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8450     __kmp_env_print();
8451   }
8452 } // __kmp_aux_set_defaults
8453 
8454 /* ------------------------------------------------------------------------ */
8455 /* internal fast reduction routines */
8456 
8457 PACKED_REDUCTION_METHOD_T
8458 __kmp_determine_reduction_method(
8459     ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8460     void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8461     kmp_critical_name *lck) {
8462 
  // Default reduction method: critical construct ( lck != NULL, like in
  // current PAROPT )
  // If ( reduce_data != NULL && reduce_func != NULL ): the tree-reduction
  // method can be selected by the RTL
  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
  // can be selected by the RTL
  // Finally, it is up to the OpenMP RTL to decide which method to select
  // among those generated by PAROPT.
8471 
8472   PACKED_REDUCTION_METHOD_T retval;
8473 
8474   int team_size;
8475 
8476   KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8477   KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8478 
8479 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
8480   ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
8481 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8482 
8483   retval = critical_reduce_block;
8484 
  // Another choice of getting a team size (with 1 dynamic dereference) is
  // slower
8486   team_size = __kmp_get_team_num_threads(global_tid);
8487   if (team_size == 1) {
8488 
8489     retval = empty_reduce_block;
8490 
8491   } else {
8492 
8493     int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8494 
8495 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 ||                   \
8496     KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64
8497 
8498 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
8499     KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8500 
8501     int teamsize_cutoff = 4;
8502 
8503 #if KMP_MIC_SUPPORTED
8504     if (__kmp_mic_type != non_mic) {
8505       teamsize_cutoff = 8;
8506     }
8507 #endif
8508     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8509     if (tree_available) {
8510       if (team_size <= teamsize_cutoff) {
8511         if (atomic_available) {
8512           retval = atomic_reduce_block;
8513         }
8514       } else {
8515         retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8516       }
8517     } else if (atomic_available) {
8518       retval = atomic_reduce_block;
8519     }
8520 #else
8521 #error "Unknown or unsupported OS"
8522 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8523        // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8524 
8525 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8526 
8527 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD
8528 
8529     // basic tuning
8530 
8531     if (atomic_available) {
8532       if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8533         retval = atomic_reduce_block;
8534       }
8535     } // otherwise: use critical section
8536 
8537 #elif KMP_OS_DARWIN
8538 
8539     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8540     if (atomic_available && (num_vars <= 3)) {
8541       retval = atomic_reduce_block;
8542     } else if (tree_available) {
8543       if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8544           (reduce_size < (2000 * sizeof(kmp_real64)))) {
8545         retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8546       }
8547     } // otherwise: use critical section
8548 
8549 #else
8550 #error "Unknown or unsupported OS"
8551 #endif
8552 
8553 #else
8554 #error "Unknown or unsupported architecture"
8555 #endif
8556   }
8557 
8558   // KMP_FORCE_REDUCTION
8559 
8560   // If the team is serialized (team_size == 1), ignore the forced reduction
8561   // method and stay with the unsynchronized method (empty_reduce_block)
8562   if (__kmp_force_reduction_method != reduction_method_not_defined &&
8563       team_size != 1) {
8564 
8565     PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8566 
8567     int atomic_available, tree_available;
8568 
8569     switch ((forced_retval = __kmp_force_reduction_method)) {
8570     case critical_reduce_block:
8571       KMP_ASSERT(lck); // lck should be != 0
8572       break;
8573 
8574     case atomic_reduce_block:
8575       atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8576       if (!atomic_available) {
8577         KMP_WARNING(RedMethodNotSupported, "atomic");
8578         forced_retval = critical_reduce_block;
8579       }
8580       break;
8581 
8582     case tree_reduce_block:
8583       tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8584       if (!tree_available) {
8585         KMP_WARNING(RedMethodNotSupported, "tree");
8586         forced_retval = critical_reduce_block;
8587       } else {
8588 #if KMP_FAST_REDUCTION_BARRIER
8589         forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8590 #endif
8591       }
8592       break;
8593 
8594     default:
8595       KMP_ASSERT(0); // "unsupported method specified"
8596     }
8597 
8598     retval = forced_retval;
8599   }
8600 
8601   KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8602 
8603 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
8604 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8605 
8606   return (retval);
8607 }
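
// Illustrative (not exhaustive) decision trace for the common x86_64/Linux
// path above, with no KMP_FORCE_REDUCTION override (cutoff is 4, or 8 on MIC):
//   team_size == 1                               -> empty_reduce_block
//   tree + atomic generated, team_size <= cutoff -> atomic_reduce_block
//   tree generated,          team_size >  cutoff -> tree reduce with
//                                                   reduction barrier
//   only atomic generated                        -> atomic_reduce_block
//   neither generated                            -> critical_reduce_block
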
8608 // this function is for testing set/get/determine reduce method
8609 kmp_int32 __kmp_get_reduce_method(void) {
8610   return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8611 }
8612 
8613 // Soft pause sets up threads to ignore blocktime and just go to sleep.
8614 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
8615 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8616 
8617 // Hard pause shuts down the runtime completely.  Resume happens naturally when
8618 // OpenMP is used subsequently.
8619 void __kmp_hard_pause() {
8620   __kmp_pause_status = kmp_hard_paused;
8621   __kmp_internal_end_thread(-1);
8622 }
8623 
8624 // Soft resume sets __kmp_pause_status, and wakes up all threads.
8625 void __kmp_resume_if_soft_paused() {
8626   if (__kmp_pause_status == kmp_soft_paused) {
8627     __kmp_pause_status = kmp_not_paused;
8628 
8629     for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8630       kmp_info_t *thread = __kmp_threads[gtid];
8631       if (thread) { // Wake it if sleeping
8632         kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
8633                          thread);
8634         if (fl.is_sleeping())
8635           fl.resume(gtid);
8636         else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8637           __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8638         } else { // thread holds the lock and may sleep soon
8639           do { // until either the thread sleeps, or we can get the lock
8640             if (fl.is_sleeping()) {
8641               fl.resume(gtid);
8642               break;
8643             } else if (__kmp_try_suspend_mx(thread)) {
8644               __kmp_unlock_suspend_mx(thread);
8645               break;
8646             }
8647           } while (1);
8648         }
8649       }
8650     }
8651   }
8652 }
8653 
8654 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
8655 // TODO: add warning messages
8656 int __kmp_pause_resource(kmp_pause_status_t level) {
8657   if (level == kmp_not_paused) { // requesting resume
8658     if (__kmp_pause_status == kmp_not_paused) {
8659       // error message about runtime not being paused, so can't resume
8660       return 1;
8661     } else {
8662       KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
8663                        __kmp_pause_status == kmp_hard_paused);
8664       __kmp_pause_status = kmp_not_paused;
8665       return 0;
8666     }
8667   } else if (level == kmp_soft_paused) { // requesting soft pause
8668     if (__kmp_pause_status != kmp_not_paused) {
8669       // error message about already being paused
8670       return 1;
8671     } else {
8672       __kmp_soft_pause();
8673       return 0;
8674     }
8675   } else if (level == kmp_hard_paused) { // requesting hard pause
8676     if (__kmp_pause_status != kmp_not_paused) {
8677       // error message about already being paused
8678       return 1;
8679     } else {
8680       __kmp_hard_pause();
8681       return 0;
8682     }
8683   } else {
8684     // error message about invalid level
8685     return 1;
8686   }
8687 }
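
// Illustrative mapping (assuming the standard omp_pause_resource() API reaches
// this routine through __kmpc_pause_resource() for the host device):
//   omp_pause_resource(omp_pause_soft, dev) -> __kmp_pause_resource(kmp_soft_paused)
//   omp_pause_resource(omp_pause_hard, dev) -> __kmp_pause_resource(kmp_hard_paused)
//   a later resume request                  -> __kmp_pause_resource(kmp_not_paused)
// Each call returns 0 on success and 1 otherwise (already paused, not paused,
// or an invalid level).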
8688 
8689 void __kmp_omp_display_env(int verbose) {
8690   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8691   if (__kmp_init_serial == 0)
8692     __kmp_do_serial_initialize();
8693   __kmp_display_env_impl(!verbose, verbose);
8694   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8695 }
8696 
8697 // Globals and functions for hidden helper task
8698 kmp_info_t **__kmp_hidden_helper_threads;
8699 kmp_info_t *__kmp_hidden_helper_main_thread;
8700 kmp_int32 __kmp_hidden_helper_threads_num = 8;
8701 std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
8702 #if KMP_OS_LINUX
8703 kmp_int32 __kmp_enable_hidden_helper = TRUE;
8704 #else
8705 kmp_int32 __kmp_enable_hidden_helper = FALSE;
8706 #endif
8707 
8708 namespace {
8709 std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
8710 
8711 void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
  // This is an explicit synchronization of all hidden helper threads, in case
  // a regular thread pushes a hidden helper task to a hidden helper thread
  // that has not been awakened even once since the threads were released by
  // the main thread after creating the team.
8716   KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
8717   while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
8718          __kmp_hidden_helper_threads_num)
8719     ;
8720 
8721   // If main thread, then wait for signal
8722   if (__kmpc_master(nullptr, *gtid)) {
8723     // First, unset the initial state and release the initial thread
8724     TCW_4(__kmp_init_hidden_helper_threads, FALSE);
8725     __kmp_hidden_helper_initz_release();
8726     __kmp_hidden_helper_main_thread_wait();
8727     // Now wake up all worker threads
8728     for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
8729       __kmp_hidden_helper_worker_thread_signal();
8730     }
8731   }
8732 }
8733 } // namespace
8734 
8735 void __kmp_hidden_helper_threads_initz_routine() {
8736   // Create a new root for hidden helper team/threads
8737   const int gtid = __kmp_register_root(TRUE);
8738   __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
8739   __kmp_hidden_helper_threads = &__kmp_threads[gtid];
8740   __kmp_hidden_helper_main_thread->th.th_set_nproc =
8741       __kmp_hidden_helper_threads_num;
8742 
8743   KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
8744 
8745   __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
8746 
8747   // Set the initialization flag to FALSE
8748   TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
8749 
8750   __kmp_hidden_helper_threads_deinitz_release();
8751 }
8752 
8753 /* Nesting Mode:
8754    Set via KMP_NESTING_MODE, which takes an integer.
8755    Note: we skip duplicate topology levels, and skip levels with only
8756       one entity.
8757    KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
8758    KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
8759       in the topology, and initializes the number of threads at each of those
8760       levels to the number of entities at each level, respectively, below the
8761       entity at the parent level.
8762    KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
8763       but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
      the user to turn nesting on explicitly. This is an even more experimental
      option within this experimental feature, and it may change or go away in
      the future.
8767 */
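
// Illustrative example (hypothetical machine: 2 sockets x 8 cores x 2 HW
// threads): with KMP_NESTING_MODE=1 the topology walk in
// __kmp_set_nesting_mode_threads() records nesting_nth_level = {2, 8, 2}
// (levels with only one entity are skipped), so nested parallel regions
// default to 2, then 8, then 2 threads at successive nesting levels.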
8768 
8769 // Allocate space to store nesting levels
8770 void __kmp_init_nesting_mode() {
8771   int levels = KMP_HW_LAST;
8772   __kmp_nesting_mode_nlevels = levels;
8773   __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
8774   for (int i = 0; i < levels; ++i)
8775     __kmp_nesting_nth_level[i] = 0;
8776   if (__kmp_nested_nth.size < levels) {
8777     __kmp_nested_nth.nth =
8778         (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
8779     __kmp_nested_nth.size = levels;
8780   }
8781 }
8782 
// Set # threads for top levels of nesting; must be called after the topology
// has been set
8784 void __kmp_set_nesting_mode_threads() {
8785   kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];
8786 
8787   if (__kmp_nesting_mode == 1)
8788     __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
8789   else if (__kmp_nesting_mode > 1)
8790     __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
8791 
8792   if (__kmp_topology) { // use topology info
8793     int loc, hw_level;
8794     for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
8795                                 loc < __kmp_nesting_mode_nlevels;
8796          loc++, hw_level++) {
8797       __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
8798       if (__kmp_nesting_nth_level[loc] == 1)
8799         loc--;
8800     }
8801     // Make sure all cores are used
8802     if (__kmp_nesting_mode > 1 && loc > 1) {
8803       int core_level = __kmp_topology->get_level(KMP_HW_CORE);
8804       int num_cores = __kmp_topology->get_count(core_level);
8805       int upper_levels = 1;
8806       for (int level = 0; level < loc - 1; ++level)
8807         upper_levels *= __kmp_nesting_nth_level[level];
8808       if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
8809         __kmp_nesting_nth_level[loc - 1] =
8810             num_cores / __kmp_nesting_nth_level[loc - 2];
8811     }
8812     __kmp_nesting_mode_nlevels = loc;
8813     __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
8814   } else { // no topology info available; provide a reasonable guesstimation
8815     if (__kmp_avail_proc >= 4) {
8816       __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
8817       __kmp_nesting_nth_level[1] = 2;
8818       __kmp_nesting_mode_nlevels = 2;
8819     } else {
8820       __kmp_nesting_nth_level[0] = __kmp_avail_proc;
8821       __kmp_nesting_mode_nlevels = 1;
8822     }
8823     __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
8824   }
8825   for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
8826     __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
8827   }
8828   set__nproc(thread, __kmp_nesting_nth_level[0]);
8829   if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
8830     __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
8831   if (get__max_active_levels(thread) > 1) {
8832     // if max levels was set, set nesting mode levels to same
8833     __kmp_nesting_mode_nlevels = get__max_active_levels(thread);
8834   }
8835   if (__kmp_nesting_mode == 1) // turn on nesting for this case only
8836     set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
8837 }
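
// Worked example of the no-topology fallback above (__kmp_avail_proc == 16,
// values illustrative): two nesting levels are guessed, {8, 2}, i.e. an outer
// region of avail_proc/2 threads, each of which may nest one more level of 2;
// with fewer than 4 available procs a single level of __kmp_avail_proc
// threads is used instead.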
8838