1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
29 #endif
30 
31 #if OMPT_SUPPORT
32 #include "ompt-specific.h"
33 #endif
34 #if OMPD_SUPPORT
35 #include "ompd-specific.h"
36 #endif
37 
38 #if OMP_PROFILING_SUPPORT
39 #include "llvm/Support/TimeProfiler.h"
40 static char *ProfileTraceFile = nullptr;
41 #endif
42 
43 /* these are temporary issues to be dealt with */
44 #define KMP_USE_PRCTL 0
45 
46 #if KMP_OS_WINDOWS
47 #include <process.h>
48 #endif
49 
50 #if KMP_OS_WINDOWS
51 // Windows does not need these include files because it does not use shared memory
52 #else
53 #include <sys/mman.h>
54 #include <sys/stat.h>
55 #include <fcntl.h>
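// Size in bytes of the shared-memory mapping used later in this file for
// inter-process library registration.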
56 #define SHM_SIZE 1024
57 #endif
58 
59 #if defined(KMP_GOMP_COMPAT)
60 char const __kmp_version_alt_comp[] =
61     KMP_VERSION_PREFIX "alternative compiler support: yes";
62 #endif /* defined(KMP_GOMP_COMPAT) */
63 
64 char const __kmp_version_omp_api[] =
65     KMP_VERSION_PREFIX "API version: 5.0 (201611)";
66 
67 #ifdef KMP_DEBUG
68 char const __kmp_version_lock[] =
69     KMP_VERSION_PREFIX "lock type: run time selectable";
70 #endif /* KMP_DEBUG */
71 
72 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
73 
74 /* ------------------------------------------------------------------------ */
75 
76 #if KMP_USE_MONITOR
77 kmp_info_t __kmp_monitor;
78 #endif
79 
80 /* Forward declarations */
81 
82 void __kmp_cleanup(void);
83 
84 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
85                                   int gtid);
86 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
87                                   kmp_internal_control_t *new_icvs,
88                                   ident_t *loc);
89 #if KMP_AFFINITY_SUPPORTED
90 static void __kmp_partition_places(kmp_team_t *team,
91                                    int update_master_only = 0);
92 #endif
93 static void __kmp_do_serial_initialize(void);
94 void __kmp_fork_barrier(int gtid, int tid);
95 void __kmp_join_barrier(int gtid);
96 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
97                           kmp_internal_control_t *new_icvs, ident_t *loc);
98 
99 #ifdef USE_LOAD_BALANCE
100 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
101 #endif
102 
103 static int __kmp_expand_threads(int nNeed);
104 #if KMP_OS_WINDOWS
105 static int __kmp_unregister_root_other_thread(int gtid);
106 #endif
107 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
108 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
109 
110 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
111                                int new_nthreads);
112 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads);
113 
114 /* Calculate the identifier of the current thread. */
115 /* Fast (and somewhat portable) way to get a unique identifier for the
116    executing thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
117 int __kmp_get_global_thread_id() {
118   int i;
119   kmp_info_t **other_threads;
120   size_t stack_data;
121   char *stack_addr;
122   size_t stack_size;
123   char *stack_base;
124 
125   KA_TRACE(
126       1000,
127       ("*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
128        __kmp_nth, __kmp_all_nth));
129 
130   /* JPH - to handle the case where __kmpc_end(0) is called immediately prior
131      to a parallel region, this returns KMP_GTID_DNE to force the caller to run
132      serial_initialize. Every call site must handle KMP_GTID_DNE, or else
133      __kmp_init_gtid must be guaranteed, for this to work. */
134 
135   if (!TCR_4(__kmp_init_gtid))
136     return KMP_GTID_DNE;
137 
138 #ifdef KMP_TDATA_GTID
139   if (TCR_4(__kmp_gtid_mode) >= 3) {
140     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
141     return __kmp_gtid;
142   }
143 #endif
144   if (TCR_4(__kmp_gtid_mode) >= 2) {
145     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
146     return __kmp_gtid_get_specific();
147   }
148   KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
149 
150   stack_addr = (char *)&stack_data;
151   other_threads = __kmp_threads;
152 
153   /* ATT: The code below is a source of potential bugs due to unsynchronized
154      access to __kmp_threads array. For example:
155      1. Current thread loads other_threads[i] to thr and checks it, it is
156         non-NULL.
157      2. Current thread is suspended by OS.
158      3. Another thread unregisters and finishes (debug versions of free()
159         may fill memory with something like 0xEF).
160      4. Current thread is resumed.
161      5. Current thread reads junk from *thr.
162      TODO: Fix it.  --ln  */
163 
164   for (i = 0; i < __kmp_threads_capacity; i++) {
165 
166     kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
167     if (!thr)
168       continue;
169 
170     stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
171     stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
172 
173     /* stack grows down -- search through all of the active threads */
174 
175     if (stack_addr <= stack_base) {
176       size_t stack_diff = stack_base - stack_addr;
177 
178       if (stack_diff <= stack_size) {
179         /* The only way we can be closer than the allocated */
180         /* stack size is if we are running on this thread. */
181         KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
182         return i;
183       }
184     }
185   }
186 
187   /* fall back to thread-specific storage (get_specific) to try to determine our gtid */
188   KA_TRACE(1000,
189            ("*** __kmp_get_global_thread_id: internal alg. failed to find "
190             "thread, using TLS\n"));
191   i = __kmp_gtid_get_specific();
192 
193   /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
194 
195   /* if we haven't been assigned a gtid, then return the error code */
196   if (i < 0)
197     return i;
198 
199   /* dynamically updated stack window for uber threads to avoid get_specific
200      call */
201   if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
202     KMP_FATAL(StackOverflow, i);
203   }
204 
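  /* Refine the recorded stack window so it covers the current stack address:
     raise ds_stackbase if stack_addr is above it, otherwise extend
     ds_stacksize downward to reach stack_addr. */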
205   stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
206   if (stack_addr > stack_base) {
207     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
208     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
209             other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
210                 stack_base);
211   } else {
212     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
213             stack_base - stack_addr);
214   }
215 
216   /* Reprint stack bounds for ubermaster since they have been refined */
217   if (__kmp_storage_map) {
218     char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
219     char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
220     __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
221                                  other_threads[i]->th.th_info.ds.ds_stacksize,
222                                  "th_%d stack (refinement)", i);
223   }
224   return i;
225 }
226 
227 int __kmp_get_global_thread_id_reg() {
228   int gtid;
229 
230   if (!__kmp_init_serial) {
231     gtid = KMP_GTID_DNE;
232   } else
233 #ifdef KMP_TDATA_GTID
234       if (TCR_4(__kmp_gtid_mode) >= 3) {
235     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
236     gtid = __kmp_gtid;
237   } else
238 #endif
239       if (TCR_4(__kmp_gtid_mode) >= 2) {
240     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
241     gtid = __kmp_gtid_get_specific();
242   } else {
243     KA_TRACE(1000,
244              ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
245     gtid = __kmp_get_global_thread_id();
246   }
247 
248   /* we must be a new uber master sibling thread */
249   if (gtid == KMP_GTID_DNE) {
250     KA_TRACE(10,
251              ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
252               "Registering a new gtid.\n"));
253     __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
254     if (!__kmp_init_serial) {
255       __kmp_do_serial_initialize();
256       gtid = __kmp_gtid_get_specific();
257     } else {
258       gtid = __kmp_register_root(FALSE);
259     }
260     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
261     /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
262   }
263 
264   KMP_DEBUG_ASSERT(gtid >= 0);
265 
266   return gtid;
267 }
268 
269 /* caller must hold forkjoin_lock */
270 void __kmp_check_stack_overlap(kmp_info_t *th) {
271   int f;
272   char *stack_beg = NULL;
273   char *stack_end = NULL;
274   int gtid;
275 
276   KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
277   if (__kmp_storage_map) {
278     stack_end = (char *)th->th.th_info.ds.ds_stackbase;
279     stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
280 
281     gtid = __kmp_gtid_from_thread(th);
282 
283     if (gtid == KMP_GTID_MONITOR) {
284       __kmp_print_storage_map_gtid(
285           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
286           "th_%s stack (%s)", "mon",
287           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
288     } else {
289       __kmp_print_storage_map_gtid(
290           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
291           "th_%d stack (%s)", gtid,
292           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
293     }
294   }
295 
296   /* No point in checking ubermaster threads since they use refinement and
297    * cannot overlap */
298   gtid = __kmp_gtid_from_thread(th);
299   if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
300     KA_TRACE(10,
301              ("__kmp_check_stack_overlap: performing extensive checking\n"));
302     if (stack_beg == NULL) {
303       stack_end = (char *)th->th.th_info.ds.ds_stackbase;
304       stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
305     }
306 
307     for (f = 0; f < __kmp_threads_capacity; f++) {
308       kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
309 
310       if (f_th && f_th != th) {
311         char *other_stack_end =
312             (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
313         char *other_stack_beg =
314             other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
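        // Overlap occurs if either end of this thread's stack lies strictly
        // inside the other thread's stack range.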
315         if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
316             (stack_end > other_stack_beg && stack_end < other_stack_end)) {
317 
318           /* Print the other stack values before the abort */
319           if (__kmp_storage_map)
320             __kmp_print_storage_map_gtid(
321                 -1, other_stack_beg, other_stack_end,
322                 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
323                 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
324 
325           __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
326                       __kmp_msg_null);
327         }
328       }
329     }
330   }
331   KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
332 }
333 
334 /* ------------------------------------------------------------------------ */
335 
336 void __kmp_infinite_loop(void) {
337   static int done = FALSE;
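  // 'done' is never set by the runtime itself; the thread simply yields in
  // place indefinitely (presumably so a debugger can flip it to break out).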
338 
339   while (!done) {
340     KMP_YIELD(TRUE);
341   }
342 }
343 
344 #define MAX_MESSAGE 512
345 
346 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
347                                   char const *format, ...) {
348   char buffer[MAX_MESSAGE];
349   va_list ap;
350 
351   va_start(ap, format);
352   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
353                p2, (unsigned long)size, format);
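  // buffer now holds the fixed prefix plus the caller's format text;
  // __kmp_vprintf below expands the caller's conversion specifiers using ap.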
354   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
355   __kmp_vprintf(kmp_err, buffer, ap);
356 #if KMP_PRINT_DATA_PLACEMENT
357   int node;
358   if (gtid >= 0) {
359     if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
360       if (__kmp_storage_map_verbose) {
361         node = __kmp_get_host_node(p1);
362         if (node < 0) /* doesn't work, so don't try this next time */
363           __kmp_storage_map_verbose = FALSE;
364         else {
365           char *last;
366           int lastNode;
367           int localProc = __kmp_get_cpu_from_gtid(gtid);
368 
369           const int page_size = KMP_GET_PAGE_SIZE();
370 
371           p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
372           p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
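          // p1 is rounded down to the start of its page; p2 becomes the start
          // of the page containing its last byte.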
373           if (localProc >= 0)
374             __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
375                                  localProc >> 1);
376           else
377             __kmp_printf_no_lock("  GTID %d\n", gtid);
378 #if KMP_USE_PRCTL
379           /* The more elaborate format is disabled for now because of the prctl
380            * hanging bug. */
381           do {
382             last = p1;
383             lastNode = node;
384             /* This loop collates adjacent pages with the same host node. */
385             do {
386               (char *)p1 += page_size;
387             } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
388             __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
389                                  lastNode);
390           } while (p1 <= p2);
391 #else
392           __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
393                                (char *)p1 + (page_size - 1),
394                                __kmp_get_host_node(p1));
395           if (p1 < p2) {
396             __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
397                                  (char *)p2 + (page_size - 1),
398                                  __kmp_get_host_node(p2));
399           }
400 #endif
401         }
402       }
403     } else
404       __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
405   }
406 #endif /* KMP_PRINT_DATA_PLACEMENT */
407   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
408 }
409 
410 void __kmp_warn(char const *format, ...) {
411   char buffer[MAX_MESSAGE];
412   va_list ap;
413 
414   if (__kmp_generate_warnings == kmp_warnings_off) {
415     return;
416   }
417 
418   va_start(ap, format);
419 
420   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
421   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
422   __kmp_vprintf(kmp_err, buffer, ap);
423   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
424 
425   va_end(ap);
426 }
427 
428 void __kmp_abort_process() {
429   // Later threads may stall here, but that's ok because abort() will kill them.
430   __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
431 
432   if (__kmp_debug_buf) {
433     __kmp_dump_debug_buffer();
434   }
435 
436   if (KMP_OS_WINDOWS) {
437     // Let other threads know of abnormal termination and prevent deadlock
438     // if abort happened during library initialization or shutdown
439     __kmp_global.g.g_abort = SIGABRT;
440 
441     /* On Windows* OS, abort() by default causes a pop-up error box, which
442        stalls nightly testing. Unfortunately, we cannot reliably suppress
443        pop-up error boxes. _set_abort_behavior() works well, but this function
444        is not available in VS7 (this is not a problem for the DLL, but it is a
445        problem for the static OpenMP RTL). SetErrorMode (and so the timelimit
446        utility) does not help, at least in some versions of the MS C RTL.
447 
448        The following sequence seems to be the only way to simulate abort() and
449        avoid the pop-up error box. */
450     raise(SIGABRT);
451     _exit(3); // Just in case, if signal ignored, exit anyway.
452   } else {
453     __kmp_unregister_library();
454     abort();
455   }
456 
457   __kmp_infinite_loop();
458   __kmp_release_bootstrap_lock(&__kmp_exit_lock);
459 
460 } // __kmp_abort_process
461 
462 void __kmp_abort_thread(void) {
463   // TODO: Eliminate g_abort global variable and this function.
464   // In case of abort just call abort(), it will kill all the threads.
465   __kmp_infinite_loop();
466 } // __kmp_abort_thread
467 
468 /* Print out the storage map for the major kmp_info_t thread data structures
469    that are allocated together. */
470 
471 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
472   __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
473                                gtid);
474 
475   __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
476                                sizeof(kmp_desc_t), "th_%d.th_info", gtid);
477 
478   __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
479                                sizeof(kmp_local_t), "th_%d.th_local", gtid);
480 
481   __kmp_print_storage_map_gtid(
482       gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
483       sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
484 
485   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
486                                &thr->th.th_bar[bs_plain_barrier + 1],
487                                sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
488                                gtid);
489 
490   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
491                                &thr->th.th_bar[bs_forkjoin_barrier + 1],
492                                sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
493                                gtid);
494 
495 #if KMP_FAST_REDUCTION_BARRIER
496   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
497                                &thr->th.th_bar[bs_reduction_barrier + 1],
498                                sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
499                                gtid);
500 #endif // KMP_FAST_REDUCTION_BARRIER
501 }
502 
503 /* Print out the storage map for the major kmp_team_t team data structures
504    that are allocated together. */
505 
506 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
507                                          int team_id, int num_thr) {
508   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
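  // Teams limited to a single thread keep only 2 dispatch buffers; larger
  // teams get the full __kmp_dispatch_num_buffers.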
509   __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
510                                header, team_id);
511 
512   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
513                                &team->t.t_bar[bs_last_barrier],
514                                sizeof(kmp_balign_team_t) * bs_last_barrier,
515                                "%s_%d.t_bar", header, team_id);
516 
517   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
518                                &team->t.t_bar[bs_plain_barrier + 1],
519                                sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
520                                header, team_id);
521 
522   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
523                                &team->t.t_bar[bs_forkjoin_barrier + 1],
524                                sizeof(kmp_balign_team_t),
525                                "%s_%d.t_bar[forkjoin]", header, team_id);
526 
527 #if KMP_FAST_REDUCTION_BARRIER
528   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
529                                &team->t.t_bar[bs_reduction_barrier + 1],
530                                sizeof(kmp_balign_team_t),
531                                "%s_%d.t_bar[reduction]", header, team_id);
532 #endif // KMP_FAST_REDUCTION_BARRIER
533 
534   __kmp_print_storage_map_gtid(
535       -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
536       sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
537 
538   __kmp_print_storage_map_gtid(
539       -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
540       sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
541 
542   __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
543                                &team->t.t_disp_buffer[num_disp_buff],
544                                sizeof(dispatch_shared_info_t) * num_disp_buff,
545                                "%s_%d.t_disp_buffer", header, team_id);
546 }
547 
548 static void __kmp_init_allocator() {
549   __kmp_init_memkind();
550   __kmp_init_target_mem();
551 }
552 static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
553 
554 /* ------------------------------------------------------------------------ */
555 
556 #if KMP_DYNAMIC_LIB
557 #if KMP_OS_WINDOWS
558 
559 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
560   //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
561 
562   switch (fdwReason) {
563 
564   case DLL_PROCESS_ATTACH:
565     KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
566 
567     return TRUE;
568 
569   case DLL_PROCESS_DETACH:
570     KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
571 
572     // According to Windows* documentation for DllMain entry point:
573     // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
574     //   lpReserved == NULL when FreeLibrary() is called,
575     //   lpReserved != NULL when the process is terminated.
576     // When FreeLibrary() is called, worker threads remain alive. So the
577     // runtime's state is consistent and executing proper shutdown is OK.
578     // When the process is terminated, worker threads have exited or been
579     // forcefully terminated by the OS and only the shutdown thread remains.
580     // This can leave the runtime in an inconsistent state.
581     // Hence, only attempt proper cleanup when FreeLibrary() is called.
582     // Otherwise, rely on OS to reclaim resources.
583     if (lpReserved == NULL)
584       __kmp_internal_end_library(__kmp_gtid_get_specific());
585 
586     return TRUE;
587 
588   case DLL_THREAD_ATTACH:
589     KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
590 
591     /* if we want to register new siblings all the time, call
592      * __kmp_get_gtid() here */
593     return TRUE;
594 
595   case DLL_THREAD_DETACH:
596     KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
597 
598     __kmp_internal_end_thread(__kmp_gtid_get_specific());
599     return TRUE;
600   }
601 
602   return TRUE;
603 }
604 
605 #endif /* KMP_OS_WINDOWS */
606 #endif /* KMP_DYNAMIC_LIB */
607 
608 /* __kmp_parallel_deo -- Wait until it's our turn. */
609 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
610   int gtid = *gtid_ref;
611 #ifdef BUILD_PARALLEL_ORDERED
612   kmp_team_t *team = __kmp_team_from_gtid(gtid);
613 #endif /* BUILD_PARALLEL_ORDERED */
614 
615   if (__kmp_env_consistency_check) {
616     if (__kmp_threads[gtid]->th.th_root->r.r_active)
617 #if KMP_USE_DYNAMIC_LOCK
618       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
619 #else
620       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
621 #endif
622   }
623 #ifdef BUILD_PARALLEL_ORDERED
624   if (!team->t.t_serialized) {
625     KMP_MB();
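    // Spin until the team's ordered counter equals this thread's tid, i.e.
    // until it is our turn.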
626     KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
627              NULL);
628     KMP_MB();
629   }
630 #endif /* BUILD_PARALLEL_ORDERED */
631 }
632 
633 /* __kmp_parallel_dxo -- Signal the next task. */
634 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
635   int gtid = *gtid_ref;
636 #ifdef BUILD_PARALLEL_ORDERED
637   int tid = __kmp_tid_from_gtid(gtid);
638   kmp_team_t *team = __kmp_team_from_gtid(gtid);
639 #endif /* BUILD_PARALLEL_ORDERED */
640 
641   if (__kmp_env_consistency_check) {
642     if (__kmp_threads[gtid]->th.th_root->r.r_active)
643       __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
644   }
645 #ifdef BUILD_PARALLEL_ORDERED
646   if (!team->t.t_serialized) {
647     KMP_MB(); /* Flush all pending memory write invalidates.  */
648 
649     /* use the tid of the next thread in this team */
650     /* TODO replace with general release procedure */
651     team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
652 
653     KMP_MB(); /* Flush all pending memory write invalidates.  */
654   }
655 #endif /* BUILD_PARALLEL_ORDERED */
656 }
657 
658 /* ------------------------------------------------------------------------ */
659 /* The BARRIER for a SINGLE process section is always explicit   */
660 
661 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
662   int status;
663   kmp_info_t *th;
664   kmp_team_t *team;
665 
666   if (!TCR_4(__kmp_init_parallel))
667     __kmp_parallel_initialize();
668   __kmp_resume_if_soft_paused();
669 
670   th = __kmp_threads[gtid];
671   team = th->th.th_team;
672   status = 0;
673 
674   th->th.th_ident = id_ref;
675 
676   if (team->t.t_serialized) {
677     status = 1;
678   } else {
679     kmp_int32 old_this = th->th.th_local.this_construct;
680 
681     ++th->th.th_local.this_construct;
682     /* try to atomically advance the team's construct count to this thread's
683        construct count -- success means this thread got the single block */
684     /* TODO: Should this be acquire or release? */
685     if (team->t.t_construct == old_this) {
686       status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
687                                               th->th.th_local.this_construct);
688     }
689 #if USE_ITT_BUILD
690     if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
691         KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
692         team->t.t_active_level == 1) {
693       // Only report metadata by primary thread of active team at level 1
694       __kmp_itt_metadata_single(id_ref);
695     }
696 #endif /* USE_ITT_BUILD */
697   }
698 
699   if (__kmp_env_consistency_check) {
700     if (status && push_ws) {
701       __kmp_push_workshare(gtid, ct_psingle, id_ref);
702     } else {
703       __kmp_check_workshare(gtid, ct_psingle, id_ref);
704     }
705   }
706 #if USE_ITT_BUILD
707   if (status) {
708     __kmp_itt_single_start(gtid);
709   }
710 #endif /* USE_ITT_BUILD */
711   return status;
712 }
713 
714 void __kmp_exit_single(int gtid) {
715 #if USE_ITT_BUILD
716   __kmp_itt_single_end(gtid);
717 #endif /* USE_ITT_BUILD */
718   if (__kmp_env_consistency_check)
719     __kmp_pop_workshare(gtid, ct_psingle, NULL);
720 }
721 
722 /* Determine whether we can go parallel or must use a serialized parallel
723  * region, and how many threads we can use.
724  * set_nthreads is the number of threads requested for the team.
725  * Returns 1 if we should serialize or use only one thread,
726  * otherwise the number of threads to use.
727  * The forkjoin lock is held by the caller. */
728 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
729                                  int master_tid, int set_nthreads,
730                                  int enter_teams) {
731   int capacity;
732   int new_nthreads;
733   KMP_DEBUG_ASSERT(__kmp_init_serial);
734   KMP_DEBUG_ASSERT(root && parent_team);
735   kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
736 
737   // If dyn-var is set, dynamically adjust the number of desired threads,
738   // according to the method specified by dynamic_mode.
739   new_nthreads = set_nthreads;
740   if (!get__dynamic_2(parent_team, master_tid)) {
741     ;
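    // dyn-var is off: leave the requested thread count unchanged here; the
    // hard limits below still apply.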
742   }
743 #ifdef USE_LOAD_BALANCE
744   else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
745     new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
746     if (new_nthreads == 1) {
747       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
748                     "reservation to 1 thread\n",
749                     master_tid));
750       return 1;
751     }
752     if (new_nthreads < set_nthreads) {
753       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
754                     "reservation to %d threads\n",
755                     master_tid, new_nthreads));
756     }
757   }
758 #endif /* USE_LOAD_BALANCE */
759   else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
760     new_nthreads = __kmp_avail_proc - __kmp_nth +
761                    (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
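    // Estimate how many threads fit under the processor limit: available
    // processors minus live threads, adding back this root's reusable threads
    // (just the root if it is active, otherwise its whole hot team).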
762     if (new_nthreads <= 1) {
763       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
764                     "reservation to 1 thread\n",
765                     master_tid));
766       return 1;
767     }
768     if (new_nthreads < set_nthreads) {
769       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
770                     "reservation to %d threads\n",
771                     master_tid, new_nthreads));
772     } else {
773       new_nthreads = set_nthreads;
774     }
775   } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
776     if (set_nthreads > 2) {
777       new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
778       new_nthreads = (new_nthreads % set_nthreads) + 1;
779       if (new_nthreads == 1) {
780         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
781                       "reservation to 1 thread\n",
782                       master_tid));
783         return 1;
784       }
785       if (new_nthreads < set_nthreads) {
786         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
787                       "reservation to %d threads\n",
788                       master_tid, new_nthreads));
789       }
790     }
791   } else {
792     KMP_ASSERT(0);
793   }
794 
795   // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
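  // The "(root active ? 1 : hot team size)" term discounts threads that will
  // be reused from the hot team rather than newly created.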
796   if (__kmp_nth + new_nthreads -
797           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
798       __kmp_max_nth) {
799     int tl_nthreads = __kmp_max_nth - __kmp_nth +
800                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
801     if (tl_nthreads <= 0) {
802       tl_nthreads = 1;
803     }
804 
805     // If dyn-var is false, emit a 1-time warning.
806     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
807       __kmp_reserve_warn = 1;
808       __kmp_msg(kmp_ms_warning,
809                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
810                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
811     }
812     if (tl_nthreads == 1) {
813       KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
814                     "reduced reservation to 1 thread\n",
815                     master_tid));
816       return 1;
817     }
818     KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
819                   "reservation to %d threads\n",
820                   master_tid, tl_nthreads));
821     new_nthreads = tl_nthreads;
822   }
823 
824   // Respect OMP_THREAD_LIMIT
825   int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
826   int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
827   if (cg_nthreads + new_nthreads -
828           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
829       max_cg_threads) {
830     int tl_nthreads = max_cg_threads - cg_nthreads +
831                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
832     if (tl_nthreads <= 0) {
833       tl_nthreads = 1;
834     }
835 
836     // If dyn-var is false, emit a 1-time warning.
837     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
838       __kmp_reserve_warn = 1;
839       __kmp_msg(kmp_ms_warning,
840                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
841                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
842     }
843     if (tl_nthreads == 1) {
844       KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
845                     "reduced reservation to 1 thread\n",
846                     master_tid));
847       return 1;
848     }
849     KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
850                   "reservation to %d threads\n",
851                   master_tid, tl_nthreads));
852     new_nthreads = tl_nthreads;
853   }
854 
855   // Check if the threads array is large enough, or needs expanding.
856   // See comment in __kmp_register_root() about the adjustment if
857   // __kmp_threads[0] == NULL.
858   capacity = __kmp_threads_capacity;
859   if (TCR_PTR(__kmp_threads[0]) == NULL) {
860     --capacity;
861   }
862   // If it is not for initializing the hidden helper team, we need to take
863   // __kmp_hidden_helper_threads_num out of the capacity because it is included
864   // in __kmp_threads_capacity.
865   if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
866     capacity -= __kmp_hidden_helper_threads_num;
867   }
868   if (__kmp_nth + new_nthreads -
869           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
870       capacity) {
871     // Expand the threads array.
872     int slotsRequired = __kmp_nth + new_nthreads -
873                         (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
874                         capacity;
875     int slotsAdded = __kmp_expand_threads(slotsRequired);
876     if (slotsAdded < slotsRequired) {
877       // The threads array was not expanded enough.
878       new_nthreads -= (slotsRequired - slotsAdded);
879       KMP_ASSERT(new_nthreads >= 1);
880 
881       // If dyn-var is false, emit a 1-time warning.
882       if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
883         __kmp_reserve_warn = 1;
884         if (__kmp_tp_cached) {
885           __kmp_msg(kmp_ms_warning,
886                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
887                     KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
888                     KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
889         } else {
890           __kmp_msg(kmp_ms_warning,
891                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
892                     KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
893         }
894       }
895     }
896   }
897 
898 #ifdef KMP_DEBUG
899   if (new_nthreads == 1) {
900     KC_TRACE(10,
901              ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
902               "dead roots and rechecking; requested %d threads\n",
903               __kmp_get_gtid(), set_nthreads));
904   } else {
905     KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
906                   " %d threads\n",
907                   __kmp_get_gtid(), new_nthreads, set_nthreads));
908   }
909 #endif // KMP_DEBUG
910   return new_nthreads;
911 }
912 
913 /* Allocate threads from the thread pool and assign them to the new team. We
914    are assured that there are enough threads available, because we checked
915    earlier while holding the forkjoin lock. */
916 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
917                                     kmp_info_t *master_th, int master_gtid,
918                                     int fork_teams_workers) {
919   int i;
920   int use_hot_team;
921 
922   KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
923   KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
924   KMP_MB();
925 
926   /* first, let's setup the primary thread */
927   master_th->th.th_info.ds.ds_tid = 0;
928   master_th->th.th_team = team;
929   master_th->th.th_team_nproc = team->t.t_nproc;
930   master_th->th.th_team_master = master_th;
931   master_th->th.th_team_serialized = FALSE;
932   master_th->th.th_dispatch = &team->t.t_dispatch[0];
933 
934 /* make sure we are not the optimized hot team */
935 #if KMP_NESTED_HOT_TEAMS
936   use_hot_team = 0;
937   kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
938   if (hot_teams) { // hot teams array is not allocated if
939     // KMP_HOT_TEAMS_MAX_LEVEL=0
940     int level = team->t.t_active_level - 1; // index in array of hot teams
941     if (master_th->th.th_teams_microtask) { // are we inside the teams?
942       if (master_th->th.th_teams_size.nteams > 1) {
943         ++level; // level was not increased in teams construct for
944         // team_of_masters
945       }
946       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
947           master_th->th.th_teams_level == team->t.t_level) {
948         ++level; // level was not increased in teams construct for
949         // team_of_workers before the parallel
950       } // team->t.t_level will be increased inside parallel
951     }
952     if (level < __kmp_hot_teams_max_level) {
953       if (hot_teams[level].hot_team) {
954         // hot team has already been allocated for given level
955         KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
956         use_hot_team = 1; // the team is ready to use
957       } else {
958         use_hot_team = 0; // AC: threads are not allocated yet
959         hot_teams[level].hot_team = team; // remember new hot team
960         hot_teams[level].hot_team_nth = team->t.t_nproc;
961       }
962     } else {
963       use_hot_team = 0;
964     }
965   }
966 #else
967   use_hot_team = team == root->r.r_hot_team;
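  // Without nested hot teams, the only reusable (hot) team is the root's hot
  // team.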
968 #endif
969   if (!use_hot_team) {
970 
971     /* install the primary thread */
972     team->t.t_threads[0] = master_th;
973     __kmp_initialize_info(master_th, team, 0, master_gtid);
974 
975     /* now, install the worker threads */
976     for (i = 1; i < team->t.t_nproc; i++) {
977 
978       /* fork or reallocate a new thread and install it in team */
979       kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
980       team->t.t_threads[i] = thr;
981       KMP_DEBUG_ASSERT(thr);
982       KMP_DEBUG_ASSERT(thr->th.th_team == team);
983       /* align team and thread arrived states */
984       KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
985                     "T#%d(%d:%d) join =%llu, plain=%llu\n",
986                     __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
987                     __kmp_gtid_from_tid(i, team), team->t.t_id, i,
988                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
989                     team->t.t_bar[bs_plain_barrier].b_arrived));
990       thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
991       thr->th.th_teams_level = master_th->th.th_teams_level;
992       thr->th.th_teams_size = master_th->th.th_teams_size;
993       { // Initialize threads' barrier data.
994         int b;
995         kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
996         for (b = 0; b < bs_last_barrier; ++b) {
997           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
998           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
999 #if USE_DEBUGGER
1000           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1001 #endif
1002         }
1003       }
1004     }
1005 
1006 #if KMP_AFFINITY_SUPPORTED
1007     // Do not partition the places list for teams construct workers who
1008     // haven't actually been forked to do real work yet. This partitioning
1009     // will take place in the parallel region nested within the teams construct.
1010     if (!fork_teams_workers) {
1011       __kmp_partition_places(team);
1012     }
1013 #endif
1014   }
1015 
1016   if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1017     for (i = 0; i < team->t.t_nproc; i++) {
1018       kmp_info_t *thr = team->t.t_threads[i];
1019       if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1020           thr->th.th_prev_level != team->t.t_level) {
1021         team->t.t_display_affinity = 1;
1022         break;
1023       }
1024     }
1025   }
1026 
1027   KMP_MB();
1028 }
1029 
1030 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1031 // Propagate any changes to the floating point control registers out to the team
1032 // We try to avoid unnecessary writes to the relevant cache line in the team
1033 // structure, so we don't make changes unless they are needed.
1034 inline static void propagateFPControl(kmp_team_t *team) {
1035   if (__kmp_inherit_fp_control) {
1036     kmp_int16 x87_fpu_control_word;
1037     kmp_uint32 mxcsr;
1038 
1039     // Get primary thread's values of FPU control flags (both X87 and vector)
1040     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1041     __kmp_store_mxcsr(&mxcsr);
1042     mxcsr &= KMP_X86_MXCSR_MASK;
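    // Ignore the MXCSR status flags; only the control bits are compared and
    // propagated to the team.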
1043 
1044     // There is no point looking at t_fp_control_saved here.
1045     // If it is TRUE, we still have to update the values if they are different
1046     // from those we now have. If it is FALSE we didn't save anything yet, but
1047     // our objective is the same. We have to ensure that the values in the team
1048     // are the same as those we have.
1049     // So, this code achieves what we need whether or not t_fp_control_saved is
1050     // true. By checking whether the value needs updating we avoid unnecessary
1051     // writes that would put the cache-line into a written state, causing all
1052     // threads in the team to have to read it again.
1053     KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1054     KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1055     // Although we don't use this value, other code in the runtime wants to know
1056     // whether it should restore them. So we must ensure it is correct.
1057     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1058   } else {
1059     // Similarly here. Don't write to this cache-line in the team structure
1060     // unless we have to.
1061     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1062   }
1063 }
1064 
1065 // Do the opposite, setting the hardware registers to the updated values from
1066 // the team.
1067 inline static void updateHWFPControl(kmp_team_t *team) {
1068   if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1069     // Only reset the fp control regs if they have been changed in the team
1070     // by the parallel region that we are exiting.
1071     kmp_int16 x87_fpu_control_word;
1072     kmp_uint32 mxcsr;
1073     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1074     __kmp_store_mxcsr(&mxcsr);
1075     mxcsr &= KMP_X86_MXCSR_MASK;
1076 
1077     if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1078       __kmp_clear_x87_fpu_status_word();
1079       __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1080     }
1081 
1082     if (team->t.t_mxcsr != mxcsr) {
1083       __kmp_load_mxcsr(&team->t.t_mxcsr);
1084     }
1085   }
1086 }
1087 #else
1088 #define propagateFPControl(x) ((void)0)
1089 #define updateHWFPControl(x) ((void)0)
1090 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1091 
1092 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1093                                      int realloc); // forward declaration
1094 
1095 /* Run a parallel region that has been serialized, so it runs only in a team
1096    consisting of the single primary thread. */
1097 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1098   kmp_info_t *this_thr;
1099   kmp_team_t *serial_team;
1100 
1101   KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1102 
1103   /* Skip all this code for autopar serialized loops since it results in
1104      unacceptable overhead */
1105   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1106     return;
1107 
1108   if (!TCR_4(__kmp_init_parallel))
1109     __kmp_parallel_initialize();
1110   __kmp_resume_if_soft_paused();
1111 
1112   this_thr = __kmp_threads[global_tid];
1113   serial_team = this_thr->th.th_serial_team;
1114 
1115   /* utilize the serialized team held by this thread */
1116   KMP_DEBUG_ASSERT(serial_team);
1117   KMP_MB();
1118 
1119   if (__kmp_tasking_mode != tskm_immediate_exec) {
1120     KMP_DEBUG_ASSERT(
1121         this_thr->th.th_task_team ==
1122         this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1123     KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1124                      NULL);
1125     KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1126                   "team %p, new task_team = NULL\n",
1127                   global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1128     this_thr->th.th_task_team = NULL;
1129   }
1130 
1131   kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1132   if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1133     proc_bind = proc_bind_false;
1134   } else if (proc_bind == proc_bind_default) {
1135     // No proc_bind clause was specified, so use the current value
1136     // of proc-bind-var for this parallel region.
1137     proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1138   }
1139   // Reset for next parallel region
1140   this_thr->th.th_set_proc_bind = proc_bind_default;
1141 
1142 #if OMPT_SUPPORT
1143   ompt_data_t ompt_parallel_data = ompt_data_none;
1144   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1145   if (ompt_enabled.enabled &&
1146       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1147 
1148     ompt_task_info_t *parent_task_info;
1149     parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1150 
1151     parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1152     if (ompt_enabled.ompt_callback_parallel_begin) {
1153       int team_size = 1;
1154 
1155       ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1156           &(parent_task_info->task_data), &(parent_task_info->frame),
1157           &ompt_parallel_data, team_size,
1158           ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1159     }
1160   }
1161 #endif // OMPT_SUPPORT
1162 
1163   if (this_thr->th.th_team != serial_team) {
1164     // Nested level will be an index in the nested nthreads array
1165     int level = this_thr->th.th_team->t.t_level;
1166 
1167     if (serial_team->t.t_serialized) {
1168       /* this serial team was already used
1169          TODO: increase performance by making these locks more specific */
1170       kmp_team_t *new_team;
1171 
1172       __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1173 
1174       new_team =
1175           __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1176 #if OMPT_SUPPORT
1177                               ompt_parallel_data,
1178 #endif
1179                               proc_bind, &this_thr->th.th_current_task->td_icvs,
1180                               0 USE_NESTED_HOT_ARG(NULL));
1181       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1182       KMP_ASSERT(new_team);
1183 
1184       /* setup new serialized team and install it */
1185       new_team->t.t_threads[0] = this_thr;
1186       new_team->t.t_parent = this_thr->th.th_team;
1187       serial_team = new_team;
1188       this_thr->th.th_serial_team = serial_team;
1189 
1190       KF_TRACE(
1191           10,
1192           ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1193            global_tid, serial_team));
1194 
1195       /* TODO the above breaks the requirement that if we run out of resources,
1196          then we can still guarantee that serialized teams are ok, since we may
1197          need to allocate a new one */
1198     } else {
1199       KF_TRACE(
1200           10,
1201           ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1202            global_tid, serial_team));
1203     }
1204 
1205     /* we have to initialize this serial team */
1206     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1207     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1208     KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1209     serial_team->t.t_ident = loc;
1210     serial_team->t.t_serialized = 1;
1211     serial_team->t.t_nproc = 1;
1212     serial_team->t.t_parent = this_thr->th.th_team;
1213     serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1214     this_thr->th.th_team = serial_team;
1215     serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1216 
1217     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1218                   this_thr->th.th_current_task));
1219     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1220     this_thr->th.th_current_task->td_flags.executing = 0;
1221 
1222     __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1223 
1224     /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1225        implicit task for each serialized task represented by
1226        team->t.t_serialized? */
1227     copy_icvs(&this_thr->th.th_current_task->td_icvs,
1228               &this_thr->th.th_current_task->td_parent->td_icvs);
1229 
1230     // Thread value exists in the nested nthreads array for the next nested
1231     // level
1232     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1233       this_thr->th.th_current_task->td_icvs.nproc =
1234           __kmp_nested_nth.nth[level + 1];
1235     }
1236 
1237     if (__kmp_nested_proc_bind.used &&
1238         (level + 1 < __kmp_nested_proc_bind.used)) {
1239       this_thr->th.th_current_task->td_icvs.proc_bind =
1240           __kmp_nested_proc_bind.bind_types[level + 1];
1241     }
1242 
1243 #if USE_DEBUGGER
1244     serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1245 #endif
1246     this_thr->th.th_info.ds.ds_tid = 0;
1247 
1248     /* set thread cache values */
1249     this_thr->th.th_team_nproc = 1;
1250     this_thr->th.th_team_master = this_thr;
1251     this_thr->th.th_team_serialized = 1;
1252 
1253     serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1254     serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1255     serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1256 
1257     propagateFPControl(serial_team);
1258 
1259     /* check if we need to allocate dispatch buffers stack */
1260     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1261     if (!serial_team->t.t_dispatch->th_disp_buffer) {
1262       serial_team->t.t_dispatch->th_disp_buffer =
1263           (dispatch_private_info_t *)__kmp_allocate(
1264               sizeof(dispatch_private_info_t));
1265     }
1266     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1267 
1268     KMP_MB();
1269 
1270   } else {
1271     /* this serialized team is already being used,
1272      * that's fine, just add another nested level */
1273     KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1274     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1275     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1276     ++serial_team->t.t_serialized;
1277     this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1278 
1279     // Nested level will be an index in the nested nthreads array
1280     int level = this_thr->th.th_team->t.t_level;
1281     // Thread value exists in the nested nthreads array for the next nested
1282     // level
1283     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1284       this_thr->th.th_current_task->td_icvs.nproc =
1285           __kmp_nested_nth.nth[level + 1];
1286     }
1287     serial_team->t.t_level++;
1288     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1289                   "of serial team %p to %d\n",
1290                   global_tid, serial_team, serial_team->t.t_level));
1291 
1292     /* allocate/push dispatch buffers stack */
1293     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1294     {
1295       dispatch_private_info_t *disp_buffer =
1296           (dispatch_private_info_t *)__kmp_allocate(
1297               sizeof(dispatch_private_info_t));
1298       disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1299       serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1300     }
1301     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1302 
1303     KMP_MB();
1304   }
1305   KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1306 
1307   // Perform the display affinity functionality for
1308   // serialized parallel regions
1309   if (__kmp_display_affinity) {
1310     if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1311         this_thr->th.th_prev_num_threads != 1) {
1312       // NULL means use the affinity-format-var ICV
1313       __kmp_aux_display_affinity(global_tid, NULL);
1314       this_thr->th.th_prev_level = serial_team->t.t_level;
1315       this_thr->th.th_prev_num_threads = 1;
1316     }
1317   }
1318 
1319   if (__kmp_env_consistency_check)
1320     __kmp_push_parallel(global_tid, NULL);
1321 #if OMPT_SUPPORT
1322   serial_team->t.ompt_team_info.master_return_address = codeptr;
1323   if (ompt_enabled.enabled &&
1324       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1325     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1326         OMPT_GET_FRAME_ADDRESS(0);
1327 
1328     ompt_lw_taskteam_t lw_taskteam;
1329     __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1330                             &ompt_parallel_data, codeptr);
1331 
1332     __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1333     // don't use lw_taskteam after linking. Its content was swapped.
1334 
1335     /* OMPT implicit task begin */
1336     if (ompt_enabled.ompt_callback_implicit_task) {
1337       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1338           ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1339           OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
1340           ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1341       OMPT_CUR_TASK_INFO(this_thr)->thread_num =
1342           __kmp_tid_from_gtid(global_tid);
1343     }
1344 
1345     /* OMPT state */
1346     this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1347     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1348         OMPT_GET_FRAME_ADDRESS(0);
1349   }
1350 #endif
1351 }
1352 
1353 /* most of the work for a fork */
1354 /* return true if we really went parallel, false if serialized */
1355 int __kmp_fork_call(ident_t *loc, int gtid,
1356                     enum fork_context_e call_context, // Intel, GNU, ...
1357                     kmp_int32 argc, microtask_t microtask, launch_t invoker,
1358                     kmp_va_list ap) {
1359   void **argv;
1360   int i;
1361   int master_tid;
1362   int master_this_cons;
1363   kmp_team_t *team;
1364   kmp_team_t *parent_team;
1365   kmp_info_t *master_th;
1366   kmp_root_t *root;
1367   int nthreads;
1368   int master_active;
1369   int master_set_numthreads;
1370   int level;
1371   int active_level;
1372   int teams_level;
1373 #if KMP_NESTED_HOT_TEAMS
1374   kmp_hot_team_ptr_t **p_hot_teams;
1375 #endif
1376   { // KMP_TIME_BLOCK
1377     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1378     KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1379 
1380     KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1381     if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1382       /* Some systems prefer the stack for the root thread(s) to start with */
1383       /* some gap from the parent stack to prevent false sharing. */
1384       void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1385       /* These 2 lines below are so this does not get optimized out */
1386       if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1387         __kmp_stkpadding += (short)((kmp_int64)dummy);
1388     }
1389 
1390     /* initialize if needed */
1391     KMP_DEBUG_ASSERT(
1392         __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1393     if (!TCR_4(__kmp_init_parallel))
1394       __kmp_parallel_initialize();
1395     __kmp_resume_if_soft_paused();
1396 
1397     /* setup current data */
1398     master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1399     // shutdown
1400     parent_team = master_th->th.th_team;
1401     master_tid = master_th->th.th_info.ds.ds_tid;
1402     master_this_cons = master_th->th.th_local.this_construct;
1403     root = master_th->th.th_root;
1404     master_active = root->r.r_active;
1405     master_set_numthreads = master_th->th.th_set_nproc;
1406 
1407 #if OMPT_SUPPORT
1408     ompt_data_t ompt_parallel_data = ompt_data_none;
1409     ompt_data_t *parent_task_data;
1410     ompt_frame_t *ompt_frame;
1411     ompt_data_t *implicit_task_data;
1412     void *return_address = NULL;
1413 
1414     if (ompt_enabled.enabled) {
1415       __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1416                                     NULL, NULL);
1417       return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1418     }
1419 #endif
1420 
1421     // Assign affinity to root thread if it hasn't happened yet
1422     __kmp_assign_root_init_mask();
1423 
1424     // Nested level will be an index in the nested nthreads array
1425     level = parent_team->t.t_level;
1426     // used to launch non-serial teams even if nested is not allowed
1427     active_level = parent_team->t.t_active_level;
1428     // needed to check nesting inside the teams
1429     teams_level = master_th->th.th_teams_level;
1430 #if KMP_NESTED_HOT_TEAMS
1431     p_hot_teams = &master_th->th.th_hot_teams;
1432     if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1433       *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1434           sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1435       (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
      // it is either the actual hot team or not needed (when active_level > 0)
1437       (*p_hot_teams)[0].hot_team_nth = 1;
1438     }
1439 #endif
1440 
1441 #if OMPT_SUPPORT
1442     if (ompt_enabled.enabled) {
1443       if (ompt_enabled.ompt_callback_parallel_begin) {
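        // Report the team size from the num_threads clause if one was given;
        // otherwise use the nproc ICV of the parent team.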
1444         int team_size = master_set_numthreads
1445                             ? master_set_numthreads
1446                             : get__nproc_2(parent_team, master_tid);
1447         int flags = OMPT_INVOKER(call_context) |
1448                     ((microtask == (microtask_t)__kmp_teams_master)
1449                          ? ompt_parallel_league
1450                          : ompt_parallel_team);
1451         ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1452             parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1453             return_address);
1454       }
1455       master_th->th.ompt_thread_info.state = ompt_state_overhead;
1456     }
1457 #endif
1458 
1459     master_th->th.th_ident = loc;
1460 
1461     if (master_th->th.th_teams_microtask && ap &&
1462         microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
      // AC: This is the start of a parallel region nested inside a teams
      // construct. The team is actual (hot) and all workers are waiting at the
      // fork barrier. No lock is needed to initialize the team a bit, then
      // release the workers.
1466       parent_team->t.t_ident = loc;
1467       __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1468       parent_team->t.t_argc = argc;
1469       argv = (void **)parent_team->t.t_argv;
1470       for (i = argc - 1; i >= 0; --i)
1471         *argv++ = va_arg(kmp_va_deref(ap), void *);
      // Increment our nested depth level, but do not increase the serialization
1473       if (parent_team == master_th->th.th_serial_team) {
1474         // AC: we are in serialized parallel
1475         __kmpc_serialized_parallel(loc, gtid);
1476         KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1477 
1478         if (call_context == fork_context_gnu) {
1479           // AC: need to decrement t_serialized for enquiry functions to work
1480           // correctly, will restore at join time
1481           parent_team->t.t_serialized--;
1482           return TRUE;
1483         }
1484 
1485 #if OMPD_SUPPORT
1486         parent_team->t.t_pkfn = microtask;
1487 #endif
1488 
1489 #if OMPT_SUPPORT
1490         void *dummy;
1491         void **exit_frame_p;
1492 
1493         ompt_lw_taskteam_t lw_taskteam;
1494 
1495         if (ompt_enabled.enabled) {
1496           __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1497                                   &ompt_parallel_data, return_address);
1498           exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1499 
1500           __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
          // Don't use lw_taskteam after linking; its contents were swapped.
1502 
1503           /* OMPT implicit task begin */
1504           implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1505           if (ompt_enabled.ompt_callback_implicit_task) {
1506             OMPT_CUR_TASK_INFO(master_th)->thread_num =
1507                 __kmp_tid_from_gtid(gtid);
1508             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1509                 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1510                 implicit_task_data, 1,
1511                 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1512           }
1513 
1514           /* OMPT state */
1515           master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1516         } else {
1517           exit_frame_p = &dummy;
1518         }
1519 #endif
1520         // AC: need to decrement t_serialized for enquiry functions to work
1521         // correctly, will restore at join time
1522         parent_team->t.t_serialized--;
1523 
1524         {
1525           KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1526           KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1527           __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1528 #if OMPT_SUPPORT
1529                                  ,
1530                                  exit_frame_p
1531 #endif
1532           );
1533         }
1534 
1535 #if OMPT_SUPPORT
1536         if (ompt_enabled.enabled) {
1537           *exit_frame_p = NULL;
1538           OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1539           if (ompt_enabled.ompt_callback_implicit_task) {
1540             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1541                 ompt_scope_end, NULL, implicit_task_data, 1,
1542                 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1543           }
1544           ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1545           __ompt_lw_taskteam_unlink(master_th);
1546           if (ompt_enabled.ompt_callback_parallel_end) {
1547             ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1548                 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1549                 OMPT_INVOKER(call_context) | ompt_parallel_team,
1550                 return_address);
1551           }
1552           master_th->th.ompt_thread_info.state = ompt_state_overhead;
1553         }
1554 #endif
1555         return TRUE;
1556       }
1557 
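      // Reuse the parent (hot) team for this parallel region nested in teams:
      // record the outlined function and invoker, and bump the nesting counters
      // on the parent team.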
1558       parent_team->t.t_pkfn = microtask;
1559       parent_team->t.t_invoke = invoker;
1560       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1561       parent_team->t.t_active_level++;
1562       parent_team->t.t_level++;
1563       parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1564 
1565 #if OMPT_SUPPORT
1566       if (ompt_enabled.enabled) {
1567         ompt_lw_taskteam_t lw_taskteam;
1568         __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1569                                 &ompt_parallel_data, return_address);
1570         __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1571       }
1572 #endif
1573 
1574       /* Change number of threads in the team if requested */
1575       if (master_set_numthreads) { // The parallel has num_threads clause
1576         if (master_set_numthreads <= master_th->th.th_teams_size.nth) {
          // AC: we can only reduce the number of threads dynamically; we
          // cannot increase it
1578           kmp_info_t **other_threads = parent_team->t.t_threads;
1579           // NOTE: if using distributed barrier, we need to run this code block
1580           // even when the team size appears not to have changed from the max.
1581           int old_proc = master_th->th.th_teams_size.nth;
1582           if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
1583               bp_dist_bar) {
1584             __kmp_resize_dist_barrier(parent_team, old_proc,
1585                                       master_set_numthreads);
1586             __kmp_add_threads_to_team(parent_team, master_set_numthreads);
1587           }
1588           parent_team->t.t_nproc = master_set_numthreads;
1589           for (i = 0; i < master_set_numthreads; ++i) {
1590             other_threads[i]->th.th_team_nproc = master_set_numthreads;
1591           }
1592         }
1593         // Keep extra threads hot in the team for possible next parallels
1594         master_th->th.th_set_nproc = 0;
1595       }
1596 
1597 #if USE_DEBUGGER
1598       if (__kmp_debugging) { // Let debugger override number of threads.
1599         int nth = __kmp_omp_num_threads(loc);
1600         if (nth > 0) { // 0 means debugger doesn't want to change num threads
1601           master_set_numthreads = nth;
1602         }
1603       }
1604 #endif
1605 
1606       // Figure out the proc_bind policy for the nested parallel within teams
1607       kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1608       // proc_bind_default means don't update
1609       kmp_proc_bind_t proc_bind_icv = proc_bind_default;
1610       if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1611         proc_bind = proc_bind_false;
1612       } else {
1613         // No proc_bind clause specified; use current proc-bind-var
1614         if (proc_bind == proc_bind_default) {
1615           proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1616         }
        /* else: The proc_bind policy was specified explicitly on the parallel
           clause. This overrides proc-bind-var for this parallel region, but
           does not change proc-bind-var. */
1621         // Figure the value of proc-bind-var for the child threads.
1622         if ((level + 1 < __kmp_nested_proc_bind.used) &&
1623             (__kmp_nested_proc_bind.bind_types[level + 1] !=
1624              master_th->th.th_current_task->td_icvs.proc_bind)) {
1625           proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1626         }
1627       }
1628       KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind);
1629       // Need to change the bind-var ICV to correct value for each implicit task
1630       if (proc_bind_icv != proc_bind_default &&
1631           master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) {
1632         kmp_info_t **other_threads = parent_team->t.t_threads;
1633         for (i = 0; i < master_th->th.th_team_nproc; ++i) {
1634           other_threads[i]->th.th_current_task->td_icvs.proc_bind =
1635               proc_bind_icv;
1636         }
1637       }
1638       // Reset for next parallel region
1639       master_th->th.th_set_proc_bind = proc_bind_default;
1640 
1641 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1642       if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1643            KMP_ITT_DEBUG) &&
1644           __kmp_forkjoin_frames_mode == 3 &&
1645           parent_team->t.t_active_level == 1 // only report frames at level 1
1646           && master_th->th.th_teams_size.nteams == 1) {
1647         kmp_uint64 tmp_time = __itt_get_timestamp();
1648         master_th->th.th_frame_time = tmp_time;
1649         parent_team->t.t_region_time = tmp_time;
1650       }
1651       if (__itt_stack_caller_create_ptr) {
1652         KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
1653         // create new stack stitching id before entering fork barrier
1654         parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1655       }
1656 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1657 #if KMP_AFFINITY_SUPPORTED
1658       __kmp_partition_places(parent_team);
1659 #endif
1660 
1661       KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1662                     "master_th=%p, gtid=%d\n",
1663                     root, parent_team, master_th, gtid));
1664       __kmp_internal_fork(loc, gtid, parent_team);
1665       KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1666                     "master_th=%p, gtid=%d\n",
1667                     root, parent_team, master_th, gtid));
1668 
1669       if (call_context == fork_context_gnu)
1670         return TRUE;
1671 
1672       /* Invoke microtask for PRIMARY thread */
1673       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1674                     parent_team->t.t_id, parent_team->t.t_pkfn));
1675 
1676       if (!parent_team->t.t_invoke(gtid)) {
1677         KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
1678       }
1679       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1680                     parent_team->t.t_id, parent_team->t.t_pkfn));
1681       KMP_MB(); /* Flush all pending memory write invalidates.  */
1682 
1683       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1684 
1685       return TRUE;
1686     } // Parallel closely nested in teams construct
1687 
1688 #if KMP_DEBUG
1689     if (__kmp_tasking_mode != tskm_immediate_exec) {
1690       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1691                        parent_team->t.t_task_team[master_th->th.th_task_state]);
1692     }
1693 #endif
1694 
1695     // Need this to happen before we determine the number of threads, not while
1696     // we are allocating the team
1697     //__kmp_push_current_task_to_thread(master_th, parent_team, 0);
1698     int enter_teams = 0;
1699     if (parent_team->t.t_active_level >=
1700         master_th->th.th_current_task->td_icvs.max_active_levels) {
1701       nthreads = 1;
1702     } else {
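      // enter_teams is TRUE when forking the league of a teams construct at
      // the top level (ap == NULL), or a parallel region nested directly inside
      // a teams construct at the teams level.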
1703       enter_teams = ((ap == NULL && active_level == 0) ||
1704                      (ap && teams_level > 0 && teams_level == level));
1705       nthreads = master_set_numthreads
1706                      ? master_set_numthreads
1707                      // TODO: get nproc directly from current task
1708                      : get__nproc_2(parent_team, master_tid);
      // Check whether we need to take the forkjoin lock (no need for a
      // serialized parallel outside of a teams construct). This code was moved
      // here from __kmp_reserve_threads() to speed up nested serialized
      // parallels.
1712       if (nthreads > 1) {
1713         if ((get__max_active_levels(master_th) == 1 &&
1714              (root->r.r_in_parallel && !enter_teams)) ||
1715             (__kmp_library == library_serial)) {
1716           KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1717                         " threads\n",
1718                         gtid, nthreads));
1719           nthreads = 1;
1720         }
1721       }
1722       if (nthreads > 1) {
1723         /* determine how many new threads we can use */
1724         __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
        /* AC: If we execute teams from a parallel region (on the host), then
           the teams should be created, but each can have only 1 thread if
           nesting is disabled. If teams is called from a serial region, then the
           teams and their threads should be created regardless of the nesting
           setting. */
1729         nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
1730                                          nthreads, enter_teams);
1731         if (nthreads == 1) {
          // Free the lock for single-threaded execution here; for
          // multi-threaded execution it will be freed later, after the team of
          // threads has been created and initialized
1735           __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1736         }
1737       }
1738     }
1739     KMP_DEBUG_ASSERT(nthreads > 0);
1740 
1741     // If we temporarily changed the set number of threads then restore it now
1742     master_th->th.th_set_nproc = 0;
1743 
1744     /* create a serialized parallel region? */
1745     if (nthreads == 1) {
1746 /* josh todo: hypothetical question: what do we do for OS X*? */
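      // Stack-allocate space for the outlined function's arguments: a VLA on
      // Linux x86/ARM targets, KMP_ALLOCA elsewhere.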
1747 #if KMP_OS_LINUX &&                                                            \
1748     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1749       void *args[argc];
1750 #else
1751       void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1752 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1753           KMP_ARCH_AARCH64) */
1754 
1755       KA_TRACE(20,
1756                ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1757 
1758       __kmpc_serialized_parallel(loc, gtid);
1759 
1760 #if OMPD_SUPPORT
1761       master_th->th.th_serial_team->t.t_pkfn = microtask;
1762 #endif
1763 
1764       if (call_context == fork_context_intel) {
        /* TODO: have the compiler pass the args directly instead of copying
           them here */
1766         master_th->th.th_serial_team->t.t_ident = loc;
1767         if (!ap) {
1768           // revert change made in __kmpc_serialized_parallel()
1769           master_th->th.th_serial_team->t.t_level--;
1770           // Get args from parent team for teams construct
1771 
1772 #if OMPT_SUPPORT
1773           void *dummy;
1774           void **exit_frame_p;
1775           ompt_task_info_t *task_info;
1776 
1777           ompt_lw_taskteam_t lw_taskteam;
1778 
1779           if (ompt_enabled.enabled) {
1780             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1781                                     &ompt_parallel_data, return_address);
1782 
1783             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
            // Don't use lw_taskteam after linking; its contents were swapped.
1785 
1786             task_info = OMPT_CUR_TASK_INFO(master_th);
1787             exit_frame_p = &(task_info->frame.exit_frame.ptr);
1788             if (ompt_enabled.ompt_callback_implicit_task) {
1789               OMPT_CUR_TASK_INFO(master_th)->thread_num =
1790                   __kmp_tid_from_gtid(gtid);
1791               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1792                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1793                   &(task_info->task_data), 1,
1794                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1795                   ompt_task_implicit);
1796             }
1797 
1798             /* OMPT state */
1799             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1800           } else {
1801             exit_frame_p = &dummy;
1802           }
1803 #endif
1804 
1805           {
1806             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1807             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1808             __kmp_invoke_microtask(microtask, gtid, 0, argc,
1809                                    parent_team->t.t_argv
1810 #if OMPT_SUPPORT
1811                                    ,
1812                                    exit_frame_p
1813 #endif
1814             );
1815           }
1816 
1817 #if OMPT_SUPPORT
1818           if (ompt_enabled.enabled) {
1819             *exit_frame_p = NULL;
1820             if (ompt_enabled.ompt_callback_implicit_task) {
1821               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1822                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1823                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1824                   ompt_task_implicit);
1825             }
1826             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1827             __ompt_lw_taskteam_unlink(master_th);
1828             if (ompt_enabled.ompt_callback_parallel_end) {
1829               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1830                   &ompt_parallel_data, parent_task_data,
1831                   OMPT_INVOKER(call_context) | ompt_parallel_team,
1832                   return_address);
1833             }
1834             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1835           }
1836 #endif
1837         } else if (microtask == (microtask_t)__kmp_teams_master) {
1838           KMP_DEBUG_ASSERT(master_th->th.th_team ==
1839                            master_th->th.th_serial_team);
1840           team = master_th->th.th_team;
1841           // team->t.t_pkfn = microtask;
1842           team->t.t_invoke = invoker;
1843           __kmp_alloc_argv_entries(argc, team, TRUE);
1844           team->t.t_argc = argc;
1845           argv = (void **)team->t.t_argv;
1846           if (ap) {
1847             for (i = argc - 1; i >= 0; --i)
1848               *argv++ = va_arg(kmp_va_deref(ap), void *);
1849           } else {
1850             for (i = 0; i < argc; ++i)
1851               // Get args from parent team for teams construct
1852               argv[i] = parent_team->t.t_argv[i];
1853           }
1854           // AC: revert change made in __kmpc_serialized_parallel()
1855           //     because initial code in teams should have level=0
1856           team->t.t_level--;
1857           // AC: call special invoker for outer "parallel" of teams construct
1858           invoker(gtid);
1859 #if OMPT_SUPPORT
1860           if (ompt_enabled.enabled) {
1861             ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1862             if (ompt_enabled.ompt_callback_implicit_task) {
1863               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1864                   ompt_scope_end, NULL, &(task_info->task_data), 0,
1865                   OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1866             }
1867             if (ompt_enabled.ompt_callback_parallel_end) {
1868               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1869                   &ompt_parallel_data, parent_task_data,
1870                   OMPT_INVOKER(call_context) | ompt_parallel_league,
1871                   return_address);
1872             }
1873             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1874           }
1875 #endif
1876         } else {
1877           argv = args;
1878           for (i = argc - 1; i >= 0; --i)
1879             *argv++ = va_arg(kmp_va_deref(ap), void *);
1880           KMP_MB();
1881 
1882 #if OMPT_SUPPORT
1883           void *dummy;
1884           void **exit_frame_p;
1885           ompt_task_info_t *task_info;
1886 
1887           ompt_lw_taskteam_t lw_taskteam;
1888 
1889           if (ompt_enabled.enabled) {
1890             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1891                                     &ompt_parallel_data, return_address);
1892             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
            // Don't use lw_taskteam after linking; its contents were swapped.
1894             task_info = OMPT_CUR_TASK_INFO(master_th);
1895             exit_frame_p = &(task_info->frame.exit_frame.ptr);
1896 
1897             /* OMPT implicit task begin */
1898             implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1899             if (ompt_enabled.ompt_callback_implicit_task) {
1900               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1901                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1902                   implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1903                   ompt_task_implicit);
1904               OMPT_CUR_TASK_INFO(master_th)->thread_num =
1905                   __kmp_tid_from_gtid(gtid);
1906             }
1907 
1908             /* OMPT state */
1909             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1910           } else {
1911             exit_frame_p = &dummy;
1912           }
1913 #endif
1914 
1915           {
1916             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1917             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1918             __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1919 #if OMPT_SUPPORT
1920                                    ,
1921                                    exit_frame_p
1922 #endif
1923             );
1924           }
1925 
1926 #if OMPT_SUPPORT
1927           if (ompt_enabled.enabled) {
1928             *exit_frame_p = NULL;
1929             if (ompt_enabled.ompt_callback_implicit_task) {
1930               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1931                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1932                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1933                   ompt_task_implicit);
1934             }
1935 
1936             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1937             __ompt_lw_taskteam_unlink(master_th);
1938             if (ompt_enabled.ompt_callback_parallel_end) {
1939               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1940                   &ompt_parallel_data, parent_task_data,
1941                   OMPT_INVOKER(call_context) | ompt_parallel_team,
1942                   return_address);
1943             }
1944             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1945           }
1946 #endif
1947         }
1948       } else if (call_context == fork_context_gnu) {
1949 #if OMPT_SUPPORT
1950         ompt_lw_taskteam_t lwt;
1951         __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1952                                 return_address);
1953 
1954         lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1955         __ompt_lw_taskteam_link(&lwt, master_th, 1);
// Don't use lw_taskteam after linking; its contents were swapped.
1957 #endif
1958 
1959         // we were called from GNU native code
1960         KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1961         return FALSE;
1962       } else {
1963         KMP_ASSERT2(call_context < fork_context_last,
1964                     "__kmp_fork_call: unknown fork_context parameter");
1965       }
1966 
1967       KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1968       KMP_MB();
1969       return FALSE;
1970     } // if (nthreads == 1)
1971 
    // GEH: only modify the executing flag when the region is not serialized;
    //      the serialized case is handled in __kmpc_serialized_parallel
1974     KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1975                   "curtask=%p, curtask_max_aclevel=%d\n",
1976                   parent_team->t.t_active_level, master_th,
1977                   master_th->th.th_current_task,
1978                   master_th->th.th_current_task->td_icvs.max_active_levels));
1979     // TODO: GEH - cannot do this assertion because root thread not set up as
1980     // executing
1981     // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
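    // Mark the primary thread's current (parent) task as no longer executing
    // while the new parallel region runs.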
1982     master_th->th.th_current_task->td_flags.executing = 0;
1983 
1984     if (!master_th->th.th_teams_microtask || level > teams_level) {
1985       /* Increment our nested depth level */
1986       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1987     }
1988 
1989     // See if we need to make a copy of the ICVs.
1990     int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1991     if ((level + 1 < __kmp_nested_nth.used) &&
1992         (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1993       nthreads_icv = __kmp_nested_nth.nth[level + 1];
1994     } else {
1995       nthreads_icv = 0; // don't update
1996     }
1997 
1998     // Figure out the proc_bind_policy for the new team.
1999     kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
2000     // proc_bind_default means don't update
2001     kmp_proc_bind_t proc_bind_icv = proc_bind_default;
2002     if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
2003       proc_bind = proc_bind_false;
2004     } else {
2005       // No proc_bind clause specified; use current proc-bind-var for this
2006       // parallel region
2007       if (proc_bind == proc_bind_default) {
2008         proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
2009       }
2010       // Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND
2011       if (master_th->th.th_teams_microtask &&
2012           microtask == (microtask_t)__kmp_teams_master) {
2013         proc_bind = __kmp_teams_proc_bind;
2014       }
      /* else: The proc_bind policy was specified explicitly on the parallel
         clause. This overrides proc-bind-var for this parallel region, but does
         not change proc-bind-var. */
2018       // Figure the value of proc-bind-var for the child threads.
2019       if ((level + 1 < __kmp_nested_proc_bind.used) &&
2020           (__kmp_nested_proc_bind.bind_types[level + 1] !=
2021            master_th->th.th_current_task->td_icvs.proc_bind)) {
        // Do not modify the proc-bind ICV for the two teams-construct forks;
        // they just let the proc-bind ICV pass through.
2024         if (!master_th->th.th_teams_microtask ||
2025             !(microtask == (microtask_t)__kmp_teams_master || ap == NULL))
2026           proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2027       }
2028     }
2029 
2030     // Reset for next parallel region
2031     master_th->th.th_set_proc_bind = proc_bind_default;
2032 
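    // If either the nproc or the proc-bind ICV needs a different value for the
    // child team, build a fresh ICV set from the current task's ICVs; otherwise
    // allocate the team with the primary thread's current ICVs directly.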
2033     if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
2034       kmp_internal_control_t new_icvs;
2035       copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2036       new_icvs.next = NULL;
2037       if (nthreads_icv > 0) {
2038         new_icvs.nproc = nthreads_icv;
2039       }
2040       if (proc_bind_icv != proc_bind_default) {
2041         new_icvs.proc_bind = proc_bind_icv;
2042       }
2043 
2044       /* allocate a new parallel team */
2045       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2046       team = __kmp_allocate_team(root, nthreads, nthreads,
2047 #if OMPT_SUPPORT
2048                                  ompt_parallel_data,
2049 #endif
2050                                  proc_bind, &new_icvs,
2051                                  argc USE_NESTED_HOT_ARG(master_th));
2052       if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2053         copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs);
2054     } else {
2055       /* allocate a new parallel team */
2056       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2057       team = __kmp_allocate_team(root, nthreads, nthreads,
2058 #if OMPT_SUPPORT
2059                                  ompt_parallel_data,
2060 #endif
2061                                  proc_bind,
2062                                  &master_th->th.th_current_task->td_icvs,
2063                                  argc USE_NESTED_HOT_ARG(master_th));
2064       if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2065         copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs,
2066                   &master_th->th.th_current_task->td_icvs);
2067     }
2068     KF_TRACE(
2069         10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2070 
2071     /* setup the new team */
2072     KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2073     KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2074     KMP_CHECK_UPDATE(team->t.t_ident, loc);
2075     KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2076     KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2077 #if OMPT_SUPPORT
2078     KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2079                           return_address);
2080 #endif
2081     KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2082     // TODO: parent_team->t.t_level == INT_MAX ???
2083     if (!master_th->th.th_teams_microtask || level > teams_level) {
2084       int new_level = parent_team->t.t_level + 1;
2085       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2086       new_level = parent_team->t.t_active_level + 1;
2087       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2088     } else {
2089       // AC: Do not increase parallel level at start of the teams construct
2090       int new_level = parent_team->t.t_level;
2091       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2092       new_level = parent_team->t.t_active_level;
2093       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2094     }
2095     kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2096     // set primary thread's schedule as new run-time schedule
2097     KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2098 
2099     KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2100     KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2101 
2102     // Update the floating point rounding in the team if required.
2103     propagateFPControl(team);
2104 #if OMPD_SUPPORT
2105     if (ompd_state & OMPD_ENABLE_BP)
2106       ompd_bp_parallel_begin();
2107 #endif
2108 
2109     if (__kmp_tasking_mode != tskm_immediate_exec) {
      // Set the primary thread's task team to the team's task team. Unless
      // this is a hot team, it should be NULL.
2112       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2113                        parent_team->t.t_task_team[master_th->th.th_task_state]);
2114       KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team "
2115                     "%p, new task_team %p / team %p\n",
2116                     __kmp_gtid_from_thread(master_th),
2117                     master_th->th.th_task_team, parent_team,
2118                     team->t.t_task_team[master_th->th.th_task_state], team));
2119 
2120       if (active_level || master_th->th.th_task_team) {
2121         // Take a memo of primary thread's task_state
2122         KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2123         if (master_th->th.th_task_state_top >=
2124             master_th->th.th_task_state_stack_sz) { // increase size
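          // Double the memo stack, copy the existing entries, zero-initialize
          // the new tail, then free the old stack.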
2125           kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2126           kmp_uint8 *old_stack, *new_stack;
2127           kmp_uint32 i;
2128           new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2129           for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2130             new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2131           }
2132           for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2133                ++i) { // zero-init rest of stack
2134             new_stack[i] = 0;
2135           }
2136           old_stack = master_th->th.th_task_state_memo_stack;
2137           master_th->th.th_task_state_memo_stack = new_stack;
2138           master_th->th.th_task_state_stack_sz = new_size;
2139           __kmp_free(old_stack);
2140         }
2141         // Store primary thread's task_state on stack
2142         master_th->th
2143             .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2144             master_th->th.th_task_state;
2145         master_th->th.th_task_state_top++;
2146 #if KMP_NESTED_HOT_TEAMS
2147         if (master_th->th.th_hot_teams &&
2148             active_level < __kmp_hot_teams_max_level &&
2149             team == master_th->th.th_hot_teams[active_level].hot_team) {
2150           // Restore primary thread's nested state if nested hot team
2151           master_th->th.th_task_state =
2152               master_th->th
2153                   .th_task_state_memo_stack[master_th->th.th_task_state_top];
2154         } else {
2155 #endif
2156           master_th->th.th_task_state = 0;
2157 #if KMP_NESTED_HOT_TEAMS
2158         }
2159 #endif
2160       }
2161 #if !KMP_NESTED_HOT_TEAMS
2162       KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2163                        (team == root->r.r_hot_team));
2164 #endif
2165     }
2166 
2167     KA_TRACE(
2168         20,
2169         ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2170          gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2171          team->t.t_nproc));
2172     KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2173                      (team->t.t_master_tid == 0 &&
2174                       (team->t.t_parent == root->r.r_root_team ||
2175                        team->t.t_parent->t.t_serialized)));
2176     KMP_MB();
2177 
2178     /* now, setup the arguments */
2179     argv = (void **)team->t.t_argv;
2180     if (ap) {
2181       for (i = argc - 1; i >= 0; --i) {
2182         void *new_argv = va_arg(kmp_va_deref(ap), void *);
2183         KMP_CHECK_UPDATE(*argv, new_argv);
2184         argv++;
2185       }
2186     } else {
2187       for (i = 0; i < argc; ++i) {
2188         // Get args from parent team for teams construct
2189         KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2190       }
2191     }
2192 
2193     /* now actually fork the threads */
2194     KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2195     if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2196       root->r.r_active = TRUE;
2197 
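    // Assign worker threads to the new team and set up the ICV copy for them;
    // the final !ap argument is TRUE only when forking a teams construct.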
2198     __kmp_fork_team_threads(root, team, master_th, gtid, !ap);
2199     __kmp_setup_icv_copy(team, nthreads,
2200                          &master_th->th.th_current_task->td_icvs, loc);
2201 
2202 #if OMPT_SUPPORT
2203     master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2204 #endif
2205 
2206     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2207 
2208 #if USE_ITT_BUILD
2209     if (team->t.t_active_level == 1 // only report frames at level 1
2210         && !master_th->th.th_teams_microtask) { // not in teams construct
2211 #if USE_ITT_NOTIFY
2212       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2213           (__kmp_forkjoin_frames_mode == 3 ||
2214            __kmp_forkjoin_frames_mode == 1)) {
2215         kmp_uint64 tmp_time = 0;
2216         if (__itt_get_timestamp_ptr)
2217           tmp_time = __itt_get_timestamp();
2218         // Internal fork - report frame begin
2219         master_th->th.th_frame_time = tmp_time;
2220         if (__kmp_forkjoin_frames_mode == 3)
2221           team->t.t_region_time = tmp_time;
2222       } else
2223 // only one notification scheme (either "submit" or "forking/joined", not both)
2224 #endif /* USE_ITT_NOTIFY */
2225         if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2226             __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2227           // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2228           __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2229         }
2230     }
2231 #endif /* USE_ITT_BUILD */
2232 
2233     /* now go on and do the work */
2234     KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2235     KMP_MB();
2236     KF_TRACE(10,
2237              ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2238               root, team, master_th, gtid));
2239 
2240 #if USE_ITT_BUILD
2241     if (__itt_stack_caller_create_ptr) {
2242       // create new stack stitching id before entering fork barrier
2243       if (!enter_teams) {
2244         KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
2245         team->t.t_stack_id = __kmp_itt_stack_caller_create();
2246       } else if (parent_team->t.t_serialized) {
2247         // keep stack stitching id in the serialized parent_team;
2248         // current team will be used for parallel inside the teams;
2249         // if parent_team is active, then it already keeps stack stitching id
2250         // for the league of teams
2251         KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
2252         parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
2253       }
2254     }
2255 #endif /* USE_ITT_BUILD */
2256 
    // AC: skip __kmp_internal_fork at the teams construct; let only the
    // primary threads execute
2259     if (ap) {
2260       __kmp_internal_fork(loc, gtid, team);
2261       KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2262                     "master_th=%p, gtid=%d\n",
2263                     root, team, master_th, gtid));
2264     }
2265 
2266     if (call_context == fork_context_gnu) {
2267       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2268       return TRUE;
2269     }
2270 
2271     /* Invoke microtask for PRIMARY thread */
2272     KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2273                   team->t.t_id, team->t.t_pkfn));
2274   } // END of timer KMP_fork_call block
2275 
2276 #if KMP_STATS_ENABLED
2277   // If beginning a teams construct, then change thread state
2278   stats_state_e previous_state = KMP_GET_THREAD_STATE();
2279   if (!ap) {
2280     KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2281   }
2282 #endif
2283 
2284   if (!team->t.t_invoke(gtid)) {
2285     KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
2286   }
2287 
2288 #if KMP_STATS_ENABLED
  // If this was the beginning of a teams construct, then reset the thread state
2290   if (!ap) {
2291     KMP_SET_THREAD_STATE(previous_state);
2292   }
2293 #endif
2294 
2295   KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2296                 team->t.t_id, team->t.t_pkfn));
2297   KMP_MB(); /* Flush all pending memory write invalidates.  */
2298 
2299   KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2300 #if OMPT_SUPPORT
2301   if (ompt_enabled.enabled) {
2302     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2303   }
2304 #endif
2305 
2306   return TRUE;
2307 }
2308 
2309 #if OMPT_SUPPORT
2310 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2311                                             kmp_team_t *team) {
2312   // restore state outside the region
2313   thread->th.ompt_thread_info.state =
2314       ((team->t.t_serialized) ? ompt_state_work_serial
2315                               : ompt_state_work_parallel);
2316 }
2317 
2318 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2319                                    kmp_team_t *team, ompt_data_t *parallel_data,
2320                                    int flags, void *codeptr) {
2321   ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2322   if (ompt_enabled.ompt_callback_parallel_end) {
2323     ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2324         parallel_data, &(task_info->task_data), flags, codeptr);
2325   }
2326 
2327   task_info->frame.enter_frame = ompt_data_none;
2328   __kmp_join_restore_state(thread, team);
2329 }
2330 #endif
2331 
2332 void __kmp_join_call(ident_t *loc, int gtid
2333 #if OMPT_SUPPORT
2334                      ,
2335                      enum fork_context_e fork_context
2336 #endif
2337                      ,
2338                      int exit_teams) {
2339   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2340   kmp_team_t *team;
2341   kmp_team_t *parent_team;
2342   kmp_info_t *master_th;
2343   kmp_root_t *root;
2344   int master_active;
2345 
2346   KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2347 
2348   /* setup current data */
2349   master_th = __kmp_threads[gtid];
2350   root = master_th->th.th_root;
2351   team = master_th->th.th_team;
2352   parent_team = team->t.t_parent;
2353 
2354   master_th->th.th_ident = loc;
2355 
2356 #if OMPT_SUPPORT
2357   void *team_microtask = (void *)team->t.t_pkfn;
  // For the GOMP interface with a serialized parallel region, we need
  // __kmpc_end_serialized_parallel to call the hooks for the OMPT
  // end-implicit-task and end-parallel events.
2361   if (ompt_enabled.enabled &&
2362       !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2363     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2364   }
2365 #endif
2366 
2367 #if KMP_DEBUG
2368   if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2369     KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2370                   "th_task_team = %p\n",
2371                   __kmp_gtid_from_thread(master_th), team,
2372                   team->t.t_task_team[master_th->th.th_task_state],
2373                   master_th->th.th_task_team));
2374     KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2375                      team->t.t_task_team[master_th->th.th_task_state]);
2376   }
2377 #endif
2378 
2379   if (team->t.t_serialized) {
2380     if (master_th->th.th_teams_microtask) {
2381       // We are in teams construct
2382       int level = team->t.t_level;
2383       int tlevel = master_th->th.th_teams_level;
2384       if (level == tlevel) {
2385         // AC: we haven't incremented it earlier at start of teams construct,
2386         //     so do it here - at the end of teams construct
2387         team->t.t_level++;
2388       } else if (level == tlevel + 1) {
2389         // AC: we are exiting parallel inside teams, need to increment
2390         // serialization in order to restore it in the next call to
2391         // __kmpc_end_serialized_parallel
2392         team->t.t_serialized++;
2393       }
2394     }
2395     __kmpc_end_serialized_parallel(loc, gtid);
2396 
2397 #if OMPT_SUPPORT
2398     if (ompt_enabled.enabled) {
2399       __kmp_join_restore_state(master_th, parent_team);
2400     }
2401 #endif
2402 
2403     return;
2404   }
2405 
2406   master_active = team->t.t_master_active;
2407 
2408   if (!exit_teams) {
    // AC: No barrier for the internal teams at exit from the teams construct,
    //     but there is a barrier for the external team (league).
2411     __kmp_internal_join(loc, gtid, team);
2412 #if USE_ITT_BUILD
2413     if (__itt_stack_caller_create_ptr) {
2414       KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
2415       // destroy the stack stitching id after join barrier
2416       __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2417       team->t.t_stack_id = NULL;
2418     }
2419 #endif
2420   } else {
2421     master_th->th.th_task_state =
2422         0; // AC: no tasking in teams (out of any parallel)
2423 #if USE_ITT_BUILD
2424     if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
2425       KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
      // Destroy the stack stitching id on exit from the teams construct.
      // If parent_team is active, then the id will be destroyed later on
      // by the master of the league of teams.
2429       __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
2430       parent_team->t.t_stack_id = NULL;
2431     }
2432 #endif
2433 
2434     if (team->t.t_nproc > 1 &&
2435         __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2436       team->t.b->update_num_threads(team->t.t_nproc);
2437       __kmp_add_threads_to_team(team, team->t.t_nproc);
2438     }
2439   }
2440 
2441   KMP_MB();
2442 
2443 #if OMPT_SUPPORT
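  // Save the team's OMPT parallel data and master return address now so the
  // end-of-parallel callbacks issued after the join can still reference them.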
2444   ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2445   void *codeptr = team->t.ompt_team_info.master_return_address;
2446 #endif
2447 
2448 #if USE_ITT_BUILD
2449   // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2450   if (team->t.t_active_level == 1 &&
2451       (!master_th->th.th_teams_microtask || /* not in teams construct */
2452        master_th->th.th_teams_size.nteams == 1)) {
2453     master_th->th.th_ident = loc;
2454     // only one notification scheme (either "submit" or "forking/joined", not
2455     // both)
2456     if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2457         __kmp_forkjoin_frames_mode == 3)
2458       __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2459                              master_th->th.th_frame_time, 0, loc,
2460                              master_th->th.th_team_nproc, 1);
2461     else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2462              !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2463       __kmp_itt_region_joined(gtid);
2464   } // active_level == 1
2465 #endif /* USE_ITT_BUILD */
2466 
2467 #if KMP_AFFINITY_SUPPORTED
2468   if (!exit_teams) {
2469     // Restore master thread's partition.
2470     master_th->th.th_first_place = team->t.t_first_place;
2471     master_th->th.th_last_place = team->t.t_last_place;
2472   }
2473 #endif // KMP_AFFINITY_SUPPORTED
2474 
2475   if (master_th->th.th_teams_microtask && !exit_teams &&
2476       team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2477       team->t.t_level == master_th->th.th_teams_level + 1) {
// AC: We need to leave the team structure intact at the end of a parallel
// region inside the teams construct, so that the same (hot) team works at the
// next parallel region; only adjust the nesting levels.
2481 #if OMPT_SUPPORT
2482     ompt_data_t ompt_parallel_data = ompt_data_none;
2483     if (ompt_enabled.enabled) {
2484       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2485       if (ompt_enabled.ompt_callback_implicit_task) {
2486         int ompt_team_size = team->t.t_nproc;
2487         ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2488             ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2489             OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2490       }
2491       task_info->frame.exit_frame = ompt_data_none;
2492       task_info->task_data = ompt_data_none;
2493       ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2494       __ompt_lw_taskteam_unlink(master_th);
2495     }
2496 #endif
2497     /* Decrement our nested depth level */
2498     team->t.t_level--;
2499     team->t.t_active_level--;
2500     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2501 
2502     // Restore number of threads in the team if needed. This code relies on
2503     // the proper adjustment of th_teams_size.nth after the fork in
2504     // __kmp_teams_master on each teams primary thread in the case that
2505     // __kmp_reserve_threads reduced it.
2506     if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2507       int old_num = master_th->th.th_team_nproc;
2508       int new_num = master_th->th.th_teams_size.nth;
2509       kmp_info_t **other_threads = team->t.t_threads;
2510       team->t.t_nproc = new_num;
2511       for (int i = 0; i < old_num; ++i) {
2512         other_threads[i]->th.th_team_nproc = new_num;
2513       }
      // Adjust the state of the team's unused threads
2515       for (int i = old_num; i < new_num; ++i) {
2516         // Re-initialize thread's barrier data.
2517         KMP_DEBUG_ASSERT(other_threads[i]);
2518         kmp_balign_t *balign = other_threads[i]->th.th_bar;
2519         for (int b = 0; b < bs_last_barrier; ++b) {
2520           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2521           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2522 #if USE_DEBUGGER
2523           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2524 #endif
2525         }
2526         if (__kmp_tasking_mode != tskm_immediate_exec) {
2527           // Synchronize thread's task state
2528           other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2529         }
2530       }
2531     }
2532 
2533 #if OMPT_SUPPORT
2534     if (ompt_enabled.enabled) {
2535       __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2536                       OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2537     }
2538 #endif
2539 
2540     return;
2541   }
2542 
2543   /* do cleanup and restore the parent team */
2544   master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2545   master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2546 
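  // Point the primary thread's dispatch buffer back at its slot in the parent
  // team.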
2547   master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2548 
2549   /* jc: The following lock has instructions with REL and ACQ semantics,
2550      separating the parallel user code called in this parallel region
2551      from the serial user code called after this function returns. */
2552   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2553 
2554   if (!master_th->th.th_teams_microtask ||
2555       team->t.t_level > master_th->th.th_teams_level) {
2556     /* Decrement our nested depth level */
2557     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2558   }
2559   KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2560 
2561 #if OMPT_SUPPORT
2562   if (ompt_enabled.enabled) {
2563     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2564     if (ompt_enabled.ompt_callback_implicit_task) {
2565       int flags = (team_microtask == (void *)__kmp_teams_master)
2566                       ? ompt_task_initial
2567                       : ompt_task_implicit;
2568       int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2569       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2570           ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2571           OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2572     }
2573     task_info->frame.exit_frame = ompt_data_none;
2574     task_info->task_data = ompt_data_none;
2575   }
2576 #endif
2577 
2578   KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2579                 master_th, team));
2580   __kmp_pop_current_task_from_thread(master_th);
2581 
2582   master_th->th.th_def_allocator = team->t.t_def_allocator;
2583 
2584 #if OMPD_SUPPORT
2585   if (ompd_state & OMPD_ENABLE_BP)
2586     ompd_bp_parallel_end();
2587 #endif
2588   updateHWFPControl(team);
2589 
2590   if (root->r.r_active != master_active)
2591     root->r.r_active = master_active;
2592 
2593   __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2594                             master_th)); // this will free worker threads
2595 
  /* This race was subtle to find. Make sure the following stays inside the
     critical region; otherwise assertions may fail occasionally because the old
     team may be reallocated and the hierarchy appears inconsistent. It is
     actually safe to run and will not cause any bugs, but it will cause those
     assertion failures. It is only one dereference and assignment, so it might
     as well stay in the critical region. */
2601   master_th->th.th_team = parent_team;
2602   master_th->th.th_team_nproc = parent_team->t.t_nproc;
2603   master_th->th.th_team_master = parent_team->t.t_threads[0];
2604   master_th->th.th_team_serialized = parent_team->t.t_serialized;
2605 
2606   /* restore serialized team, if need be */
2607   if (parent_team->t.t_serialized &&
2608       parent_team != master_th->th.th_serial_team &&
2609       parent_team != root->r.r_root_team) {
2610     __kmp_free_team(root,
2611                     master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2612     master_th->th.th_serial_team = parent_team;
2613   }
2614 
2615   if (__kmp_tasking_mode != tskm_immediate_exec) {
2616     if (master_th->th.th_task_state_top >
2617         0) { // Restore task state from memo stack
2618       KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2619       // Remember primary thread's state if we re-use this nested hot team
2620       master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2621           master_th->th.th_task_state;
2622       --master_th->th.th_task_state_top; // pop
2623       // Now restore state at this level
2624       master_th->th.th_task_state =
2625           master_th->th
2626               .th_task_state_memo_stack[master_th->th.th_task_state_top];
2627     }
2628     // Copy the task team from the parent team to the primary thread
2629     master_th->th.th_task_team =
2630         parent_team->t.t_task_team[master_th->th.th_task_state];
2631     KA_TRACE(20,
2632              ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
2633               __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2634               parent_team));
2635   }
2636 
2637   // TODO: GEH - cannot do this assertion because root thread not set up as
2638   // executing
2639   // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2640   master_th->th.th_current_task->td_flags.executing = 1;
2641 
2642   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2643 
2644 #if OMPT_SUPPORT
2645   int flags =
2646       OMPT_INVOKER(fork_context) |
2647       ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2648                                                       : ompt_parallel_team);
2649   if (ompt_enabled.enabled) {
2650     __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2651                     codeptr);
2652   }
2653 #endif
2654 
2655   KMP_MB();
2656   KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2657 }
2658 
2659 /* Check whether we should push an internal control record onto the
2660    serial team stack.  If so, do it.  */
2661 void __kmp_save_internal_controls(kmp_info_t *thread) {
2662 
2663   if (thread->th.th_team != thread->th.th_serial_team) {
2664     return;
2665   }
2666   if (thread->th.th_team->t.t_serialized > 1) {
2667     int push = 0;
2668 
2669     if (thread->th.th_team->t.t_control_stack_top == NULL) {
2670       push = 1;
2671     } else {
2672       if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2673           thread->th.th_team->t.t_serialized) {
2674         push = 1;
2675       }
2676     }
2677     if (push) { /* push a record on the serial team's stack */
2678       kmp_internal_control_t *control =
2679           (kmp_internal_control_t *)__kmp_allocate(
2680               sizeof(kmp_internal_control_t));
2681 
2682       copy_icvs(control, &thread->th.th_current_task->td_icvs);
2683 
2684       control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2685 
2686       control->next = thread->th.th_team->t.t_control_stack_top;
2687       thread->th.th_team->t.t_control_stack_top = control;
2688     }
2689   }
2690 }
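// Illustrative sketch (comment only, not compiled): a record is pushed only
// when the calling thread runs inside its serial team at a serialized nesting
// depth greater than one, so ICV changes made at that depth can be undone when
// the serialized region ends. Roughly, assuming the inner region is in fact
// serialized:
//
//   #pragma omp parallel num_threads(1)     // t_serialized == 1: no push
//   {
//     #pragma omp parallel num_threads(1)   // t_serialized == 2
//     {
//       omp_set_num_threads(4);             // __kmp_save_internal_controls()
//                                           // pushes a record for level 2
//     }
//   }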
2691 
2692 /* Changes set_nproc */
2693 void __kmp_set_num_threads(int new_nth, int gtid) {
2694   kmp_info_t *thread;
2695   kmp_root_t *root;
2696 
2697   KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2698   KMP_DEBUG_ASSERT(__kmp_init_serial);
2699 
2700   if (new_nth < 1)
2701     new_nth = 1;
2702   else if (new_nth > __kmp_max_nth)
2703     new_nth = __kmp_max_nth;
2704 
2705   KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2706   thread = __kmp_threads[gtid];
2707   if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2708     return; // nothing to do
2709 
2710   __kmp_save_internal_controls(thread);
2711 
2712   set__nproc(thread, new_nth);
2713 
2714   // If this omp_set_num_threads() call will cause the hot team size to be
2715   // reduced (in the absence of a num_threads clause), then reduce it now,
2716   // rather than waiting for the next parallel region.
2717   root = thread->th.th_root;
2718   if (__kmp_init_parallel && (!root->r.r_active) &&
2719       (root->r.r_hot_team->t.t_nproc > new_nth)
2720 #if KMP_NESTED_HOT_TEAMS
2721       && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2722 #endif
2723   ) {
2724     kmp_team_t *hot_team = root->r.r_hot_team;
2725     int f;
2726 
2727     __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2728 
2729     if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2730       __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth);
2731     }
2732     // Release the extra threads we don't need any more.
2733     for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2734       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2735       if (__kmp_tasking_mode != tskm_immediate_exec) {
2736         // When decreasing team size, threads no longer in the team should unref
2737         // task team.
2738         hot_team->t.t_threads[f]->th.th_task_team = NULL;
2739       }
2740       __kmp_free_thread(hot_team->t.t_threads[f]);
2741       hot_team->t.t_threads[f] = NULL;
2742     }
2743     hot_team->t.t_nproc = new_nth;
2744 #if KMP_NESTED_HOT_TEAMS
2745     if (thread->th.th_hot_teams) {
2746       KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2747       thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2748     }
2749 #endif
2750 
2751     if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2752       hot_team->t.b->update_num_threads(new_nth);
2753       __kmp_add_threads_to_team(hot_team, new_nth);
2754     }
2755 
2756     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2757 
2758     // Update the t_nproc field in the threads that are still active.
2759     for (f = 0; f < new_nth; f++) {
2760       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2761       hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2762     }
2763     // Special flag: team size was changed by an omp_set_num_threads() call
2764     hot_team->t.t_size_changed = -1;
2765   }
2766 }
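// Illustrative usage (comment only, not compiled): this routine backs
// omp_set_num_threads(). When the new value is smaller than the current hot
// team size and no parallel region is active, the surplus workers are released
// here instead of at the next fork, e.g.:
//
//   omp_set_num_threads(8);
//   #pragma omp parallel        // hot team grows to 8 threads
//   { /* ... */ }
//   omp_set_num_threads(2);     // hot team trimmed to 2 threads right away
//
// Whether the trim happens also depends on the nested-hot-teams settings
// checked above.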
2767 
2768 /* Changes max_active_levels */
2769 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2770   kmp_info_t *thread;
2771 
2772   KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2773                 "%d = (%d)\n",
2774                 gtid, max_active_levels));
2775   KMP_DEBUG_ASSERT(__kmp_init_serial);
2776 
2777   // validate max_active_levels
2778   if (max_active_levels < 0) {
2779     KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2780     // We ignore this call if the user has specified a negative value.
2781     // The current setting won't be changed. The last valid setting will be
2782     // used. A warning will be issued (if warnings are allowed as controlled by
2783     // the KMP_WARNINGS env var).
2784     KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2785                   "max_active_levels for thread %d = (%d)\n",
2786                   gtid, max_active_levels));
2787     return;
2788   }
2789   if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2790     // It's OK: max_active_levels is within the valid range
2791     // [0, KMP_MAX_ACTIVE_LEVELS_LIMIT].
2792     // We allow a zero value (implementation-defined behavior).
2793   } else {
2794     KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2795                 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2796     max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2797     // Current upper limit is MAX_INT. (implementation defined behavior)
2798     // If the input exceeds the upper limit, we correct the input to be the
2799     // upper limit. (implementation defined behavior)
2800     // Actually, the flow should never get here while the limit is MAX_INT.
2801   }
2802   KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2803                 "max_active_levels for thread %d = (%d)\n",
2804                 gtid, max_active_levels));
2805 
2806   thread = __kmp_threads[gtid];
2807 
2808   __kmp_save_internal_controls(thread);
2809 
2810   set__max_active_levels(thread, max_active_levels);
2811 }
2812 
2813 /* Gets max_active_levels */
2814 int __kmp_get_max_active_levels(int gtid) {
2815   kmp_info_t *thread;
2816 
2817   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2818   KMP_DEBUG_ASSERT(__kmp_init_serial);
2819 
2820   thread = __kmp_threads[gtid];
2821   KMP_DEBUG_ASSERT(thread->th.th_current_task);
2822   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2823                 "curtask_maxaclevel=%d\n",
2824                 gtid, thread->th.th_current_task,
2825                 thread->th.th_current_task->td_icvs.max_active_levels));
2826   return thread->th.th_current_task->td_icvs.max_active_levels;
2827 }
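// Illustrative usage (comment only, not compiled): these accessors implement
// the max-active-levels-var ICV. Negative inputs are ignored with a warning
// and oversized inputs are clamped, so e.g.:
//
//   omp_set_max_active_levels(-3); // ignored; previous setting kept
//   omp_set_max_active_levels(2);  // accepted
//   omp_get_max_active_levels();   // returns 2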
2828 
2829 // nteams-var per-device ICV
2830 void __kmp_set_num_teams(int num_teams) {
2831   if (num_teams > 0)
2832     __kmp_nteams = num_teams;
2833 }
2834 int __kmp_get_max_teams(void) { return __kmp_nteams; }
2835 // teams-thread-limit-var per-device ICV
2836 void __kmp_set_teams_thread_limit(int limit) {
2837   if (limit > 0)
2838     __kmp_teams_thread_limit = limit;
2839 }
2840 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
2841 
2842 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2843 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2844 
2845 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2846 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2847   kmp_info_t *thread;
2848   kmp_sched_t orig_kind;
2849   //    kmp_team_t *team;
2850 
2851   KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2852                 gtid, (int)kind, chunk));
2853   KMP_DEBUG_ASSERT(__kmp_init_serial);
2854 
2855   // Check if the kind parameter is valid, correct if needed.
2856   // Valid parameters should fit in one of two intervals - standard or extended:
2857   //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2858   // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
2859   orig_kind = kind;
2860   kind = __kmp_sched_without_mods(kind);
2861 
2862   if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2863       (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2864     // TODO: Hint needs attention in case we change the default schedule.
2865     __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2866               KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2867               __kmp_msg_null);
2868     kind = kmp_sched_default;
2869     chunk = 0; // ignore chunk value in case of bad kind
2870   }
2871 
2872   thread = __kmp_threads[gtid];
2873 
2874   __kmp_save_internal_controls(thread);
2875 
2876   if (kind < kmp_sched_upper_std) {
2877     if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2878       // Differentiate static chunked vs. unchunked: an invalid chunk value
2879       // indicates the unchunked schedule (which is the default).
2880       thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2881     } else {
2882       thread->th.th_current_task->td_icvs.sched.r_sched_type =
2883           __kmp_sch_map[kind - kmp_sched_lower - 1];
2884     }
2885   } else {
2886     //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2887     //    kmp_sched_lower - 2 ];
2888     thread->th.th_current_task->td_icvs.sched.r_sched_type =
2889         __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2890                       kmp_sched_lower - 2];
2891   }
2892   __kmp_sched_apply_mods_intkind(
2893       orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2894   if (kind == kmp_sched_auto || chunk < 1) {
2895     // ignore parameter chunk for schedule auto
2896     thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2897   } else {
2898     thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2899   }
2900 }
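// Illustrative mapping (comment only, not compiled): this routine backs
// omp_set_schedule(). Representative translations of the public (kind, chunk)
// pair into the internal sched ICV, per the logic above:
//
//   omp_set_schedule(omp_sched_dynamic, 4); // dynamic chunked, chunk = 4
//   omp_set_schedule(omp_sched_static, 0);  // chunk < KMP_DEFAULT_CHUNK, so
//                                           // plain (unchunked) kmp_sch_static
//   omp_set_schedule(omp_sched_auto, 16);   // chunk argument ignored for auto
//
// An out-of-range kind falls back to the default schedule with no chunk, as
// warned above.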
2901 
2902 /* Gets def_sched_var ICV values */
2903 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2904   kmp_info_t *thread;
2905   enum sched_type th_type;
2906 
2907   KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2908   KMP_DEBUG_ASSERT(__kmp_init_serial);
2909 
2910   thread = __kmp_threads[gtid];
2911 
2912   th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2913   switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2914   case kmp_sch_static:
2915   case kmp_sch_static_greedy:
2916   case kmp_sch_static_balanced:
2917     *kind = kmp_sched_static;
2918     __kmp_sched_apply_mods_stdkind(kind, th_type);
2919     *chunk = 0; // chunk was not set; report that fact via a zero value
2920     return;
2921   case kmp_sch_static_chunked:
2922     *kind = kmp_sched_static;
2923     break;
2924   case kmp_sch_dynamic_chunked:
2925     *kind = kmp_sched_dynamic;
2926     break;
2927   case kmp_sch_guided_chunked:
2928   case kmp_sch_guided_iterative_chunked:
2929   case kmp_sch_guided_analytical_chunked:
2930     *kind = kmp_sched_guided;
2931     break;
2932   case kmp_sch_auto:
2933     *kind = kmp_sched_auto;
2934     break;
2935   case kmp_sch_trapezoidal:
2936     *kind = kmp_sched_trapezoidal;
2937     break;
2938 #if KMP_STATIC_STEAL_ENABLED
2939   case kmp_sch_static_steal:
2940     *kind = kmp_sched_static_steal;
2941     break;
2942 #endif
2943   default:
2944     KMP_FATAL(UnknownSchedulingType, th_type);
2945   }
2946 
2947   __kmp_sched_apply_mods_stdkind(kind, th_type);
2948   *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2949 }
2950 
2951 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2952 
2953   int ii, dd;
2954   kmp_team_t *team;
2955   kmp_info_t *thr;
2956 
2957   KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2958   KMP_DEBUG_ASSERT(__kmp_init_serial);
2959 
2960   // validate level
2961   if (level == 0)
2962     return 0;
2963   if (level < 0)
2964     return -1;
2965   thr = __kmp_threads[gtid];
2966   team = thr->th.th_team;
2967   ii = team->t.t_level;
2968   if (level > ii)
2969     return -1;
2970 
2971   if (thr->th.th_teams_microtask) {
2972     // AC: we are in a teams region where nested teams share the same level
2973     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2974     if (level <=
2975         tlevel) { // otherwise usual algorithm works (will not touch the teams)
2976       KMP_DEBUG_ASSERT(ii >= tlevel);
2977       // AC: As we need to pass by the teams league, we need to artificially
2978       // increase ii
2979       if (ii == tlevel) {
2980         ii += 2; // three teams have same level
2981       } else {
2982         ii++; // two teams have same level
2983       }
2984     }
2985   }
2986 
2987   if (ii == level)
2988     return __kmp_tid_from_gtid(gtid);
2989 
2990   dd = team->t.t_serialized;
2991   level++;
2992   while (ii > level) {
2993     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2994     }
2995     if ((team->t.t_serialized) && (!dd)) {
2996       team = team->t.t_parent;
2997       continue;
2998     }
2999     if (ii > level) {
3000       team = team->t.t_parent;
3001       dd = team->t.t_serialized;
3002       ii--;
3003     }
3004   }
3005 
3006   return (dd > 1) ? (0) : (team->t.t_master_tid);
3007 }
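// Illustrative sketch (comment only, not compiled): this routine backs
// omp_get_ancestor_thread_num(). With two active nested levels:
//
//   #pragma omp parallel num_threads(4)      // level 1
//   #pragma omp parallel num_threads(2)      // level 2
//   {
//     omp_get_ancestor_thread_num(0); // 0 (the initial thread)
//     omp_get_ancestor_thread_num(1); // tid of the level-1 ancestor, 0..3
//     omp_get_ancestor_thread_num(2); // the calling thread's own tid, 0 or 1
//     omp_get_ancestor_thread_num(3); // -1, no such level
//   }
//
// The teams-construct adjustment above artificially bumps the level count so
// the league of teams is walked correctly.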
3008 
3009 int __kmp_get_team_size(int gtid, int level) {
3010 
3011   int ii, dd;
3012   kmp_team_t *team;
3013   kmp_info_t *thr;
3014 
3015   KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
3016   KMP_DEBUG_ASSERT(__kmp_init_serial);
3017 
3018   // validate level
3019   if (level == 0)
3020     return 1;
3021   if (level < 0)
3022     return -1;
3023   thr = __kmp_threads[gtid];
3024   team = thr->th.th_team;
3025   ii = team->t.t_level;
3026   if (level > ii)
3027     return -1;
3028 
3029   if (thr->th.th_teams_microtask) {
3030     // AC: we are in a teams region where nested teams share the same level
3031     int tlevel = thr->th.th_teams_level; // the level of the teams construct
3032     if (level <=
3033         tlevel) { // otherwise usual algorithm works (will not touch the teams)
3034       KMP_DEBUG_ASSERT(ii >= tlevel);
3035       // AC: As we need to pass by the teams league, we need to artificially
3036       // increase ii
3037       if (ii == tlevel) {
3038         ii += 2; // three teams have same level
3039       } else {
3040         ii++; // two teams have same level
3041       }
3042     }
3043   }
3044 
3045   while (ii > level) {
3046     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3047     }
3048     if (team->t.t_serialized && (!dd)) {
3049       team = team->t.t_parent;
3050       continue;
3051     }
3052     if (ii > level) {
3053       team = team->t.t_parent;
3054       ii--;
3055     }
3056   }
3057 
3058   return team->t.t_nproc;
3059 }
3060 
3061 kmp_r_sched_t __kmp_get_schedule_global() {
3062   // This routine exists because the pairs (__kmp_sched, __kmp_chunk) and
3063   // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
3064   // independently, so the up-to-date schedule can be obtained here.
3065 
3066   kmp_r_sched_t r_sched;
3067 
3068   // Create the schedule from 4 globals: __kmp_sched, __kmp_chunk,
3069   // __kmp_static, __kmp_guided. __kmp_sched should keep its original value,
3070   // so that the user can set KMP_SCHEDULE multiple times and thus have
3071   // different run-time schedules in different roots (even in OMP 2.5).
3072   enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
3073   enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
3074   if (s == kmp_sch_static) {
3075     // replace STATIC with more detailed schedule (balanced or greedy)
3076     r_sched.r_sched_type = __kmp_static;
3077   } else if (s == kmp_sch_guided_chunked) {
3078     // replace GUIDED with more detailed schedule (iterative or analytical)
3079     r_sched.r_sched_type = __kmp_guided;
3080   } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3081     r_sched.r_sched_type = __kmp_sched;
3082   }
3083   SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
3084 
3085   if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
3086     // __kmp_chunk may be wrong here (if it was never set)
3087     r_sched.chunk = KMP_DEFAULT_CHUNK;
3088   } else {
3089     r_sched.chunk = __kmp_chunk;
3090   }
3091 
3092   return r_sched;
3093 }
3094 
3095 /* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
3096    at least argc number of *t_argv entries for the requested team. */
3097 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3098 
3099   KMP_DEBUG_ASSERT(team);
3100   if (!realloc || argc > team->t.t_max_argc) {
3101 
3102     KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3103                    "current entries=%d\n",
3104                    team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3105     /* if previously allocated heap space for args, free them */
3106     if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3107       __kmp_free((void *)team->t.t_argv);
3108 
3109     if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3110       /* use unused space in the cache line for arguments */
3111       team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3112       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3113                      "argv entries\n",
3114                      team->t.t_id, team->t.t_max_argc));
3115       team->t.t_argv = &team->t.t_inline_argv[0];
3116       if (__kmp_storage_map) {
3117         __kmp_print_storage_map_gtid(
3118             -1, &team->t.t_inline_argv[0],
3119             &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3120             (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3121             team->t.t_id);
3122       }
3123     } else {
3124       /* allocate space for arguments in the heap */
3125       team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3126                                ? KMP_MIN_MALLOC_ARGV_ENTRIES
3127                                : 2 * argc;
3128       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3129                      "argv entries\n",
3130                      team->t.t_id, team->t.t_max_argc));
3131       team->t.t_argv =
3132           (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3133       if (__kmp_storage_map) {
3134         __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3135                                      &team->t.t_argv[team->t.t_max_argc],
3136                                      sizeof(void *) * team->t.t_max_argc,
3137                                      "team_%d.t_argv", team->t.t_id);
3138       }
3139     }
3140   }
3141 }
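// Illustrative sizing (comment only, not compiled): small argument lists reuse
// the inline t_inline_argv storage in the team structure, larger ones go to
// the heap with headroom for growth, per the branches above:
//
//   argc <= KMP_INLINE_ARGV_ENTRIES         -> t_argv = t_inline_argv,
//                                              t_max_argc = KMP_INLINE_ARGV_ENTRIES
//   argc <= KMP_MIN_MALLOC_ARGV_ENTRIES / 2 -> heap, t_max_argc =
//                                              KMP_MIN_MALLOC_ARGV_ENTRIES
//   larger argc                             -> heap, t_max_argc = 2 * argc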
3142 
3143 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3144   int i;
3145   int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3146   team->t.t_threads =
3147       (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3148   team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3149       sizeof(dispatch_shared_info_t) * num_disp_buff);
3150   team->t.t_dispatch =
3151       (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3152   team->t.t_implicit_task_taskdata =
3153       (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3154   team->t.t_max_nproc = max_nth;
3155 
3156   /* setup dispatch buffers */
3157   for (i = 0; i < num_disp_buff; ++i) {
3158     team->t.t_disp_buffer[i].buffer_index = i;
3159     team->t.t_disp_buffer[i].doacross_buf_idx = i;
3160   }
3161 }
3162 
3163 static void __kmp_free_team_arrays(kmp_team_t *team) {
3164   /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3165   int i;
3166   for (i = 0; i < team->t.t_max_nproc; ++i) {
3167     if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3168       __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3169       team->t.t_dispatch[i].th_disp_buffer = NULL;
3170     }
3171   }
3172 #if KMP_USE_HIER_SCHED
3173   __kmp_dispatch_free_hierarchies(team);
3174 #endif
3175   __kmp_free(team->t.t_threads);
3176   __kmp_free(team->t.t_disp_buffer);
3177   __kmp_free(team->t.t_dispatch);
3178   __kmp_free(team->t.t_implicit_task_taskdata);
3179   team->t.t_threads = NULL;
3180   team->t.t_disp_buffer = NULL;
3181   team->t.t_dispatch = NULL;
3182   team->t.t_implicit_task_taskdata = 0;
3183 }
3184 
3185 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3186   kmp_info_t **oldThreads = team->t.t_threads;
3187 
3188   __kmp_free(team->t.t_disp_buffer);
3189   __kmp_free(team->t.t_dispatch);
3190   __kmp_free(team->t.t_implicit_task_taskdata);
3191   __kmp_allocate_team_arrays(team, max_nth);
3192 
3193   KMP_MEMCPY(team->t.t_threads, oldThreads,
3194              team->t.t_nproc * sizeof(kmp_info_t *));
3195 
3196   __kmp_free(oldThreads);
3197 }
3198 
3199 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3200 
3201   kmp_r_sched_t r_sched =
3202       __kmp_get_schedule_global(); // get current state of scheduling globals
3203 
3204   KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3205 
3206   kmp_internal_control_t g_icvs = {
3207     0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3208     (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3209     // adjustment of threads (per thread)
3210     (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3211     // whether blocktime is explicitly set
3212     __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3213 #if KMP_USE_MONITOR
3214     __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3215 // intervals
3216 #endif
3217     __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3218     // next parallel region (per thread)
3219     // (use a max ub on value if __kmp_parallel_initialize not called yet)
3220     __kmp_cg_max_nth, // int thread_limit;
3221     __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3222     // for max_active_levels
3223     r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3224     // {sched,chunk} pair
3225     __kmp_nested_proc_bind.bind_types[0],
3226     __kmp_default_device,
3227     NULL // struct kmp_internal_control *next;
3228   };
3229 
3230   return g_icvs;
3231 }
3232 
3233 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3234 
3235   kmp_internal_control_t gx_icvs;
3236   gx_icvs.serial_nesting_level =
3237       0; // probably =team->t.t_serialized as in __kmp_save_internal_controls
3238   copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3239   gx_icvs.next = NULL;
3240 
3241   return gx_icvs;
3242 }
3243 
3244 static void __kmp_initialize_root(kmp_root_t *root) {
3245   int f;
3246   kmp_team_t *root_team;
3247   kmp_team_t *hot_team;
3248   int hot_team_max_nth;
3249   kmp_r_sched_t r_sched =
3250       __kmp_get_schedule_global(); // get current state of scheduling globals
3251   kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3252   KMP_DEBUG_ASSERT(root);
3253   KMP_ASSERT(!root->r.r_begin);
3254 
3255   /* setup the root state structure */
3256   __kmp_init_lock(&root->r.r_begin_lock);
3257   root->r.r_begin = FALSE;
3258   root->r.r_active = FALSE;
3259   root->r.r_in_parallel = 0;
3260   root->r.r_blocktime = __kmp_dflt_blocktime;
3261 #if KMP_AFFINITY_SUPPORTED
3262   root->r.r_affinity_assigned = FALSE;
3263 #endif
3264 
3265   /* setup the root team for this task */
3266   /* allocate the root team structure */
3267   KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3268 
3269   root_team =
3270       __kmp_allocate_team(root,
3271                           1, // new_nproc
3272                           1, // max_nproc
3273 #if OMPT_SUPPORT
3274                           ompt_data_none, // root parallel id
3275 #endif
3276                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3277                           0 // argc
3278                           USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3279       );
3280 #if USE_DEBUGGER
3281   // Non-NULL value should be assigned to make the debugger display the root
3282   // team.
3283   TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3284 #endif
3285 
3286   KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3287 
3288   root->r.r_root_team = root_team;
3289   root_team->t.t_control_stack_top = NULL;
3290 
3291   /* initialize root team */
3292   root_team->t.t_threads[0] = NULL;
3293   root_team->t.t_nproc = 1;
3294   root_team->t.t_serialized = 1;
3295   // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3296   root_team->t.t_sched.sched = r_sched.sched;
3297   KA_TRACE(
3298       20,
3299       ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3300        root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3301 
3302   /* setup the hot team for this task */
3303   /* allocate the hot team structure */
3304   KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3305 
3306   hot_team =
3307       __kmp_allocate_team(root,
3308                           1, // new_nproc
3309                           __kmp_dflt_team_nth_ub * 2, // max_nproc
3310 #if OMPT_SUPPORT
3311                           ompt_data_none, // root parallel id
3312 #endif
3313                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3314                           0 // argc
3315                           USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3316       );
3317   KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3318 
3319   root->r.r_hot_team = hot_team;
3320   root_team->t.t_control_stack_top = NULL;
3321 
3322   /* first-time initialization */
3323   hot_team->t.t_parent = root_team;
3324 
3325   /* initialize hot team */
3326   hot_team_max_nth = hot_team->t.t_max_nproc;
3327   for (f = 0; f < hot_team_max_nth; ++f) {
3328     hot_team->t.t_threads[f] = NULL;
3329   }
3330   hot_team->t.t_nproc = 1;
3331   // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3332   hot_team->t.t_sched.sched = r_sched.sched;
3333   hot_team->t.t_size_changed = 0;
3334 }
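// Note on the two teams built above: the root team is the size-one serialized
// team the root (initial) thread runs in outside any parallel region, while
// the hot team is kept in reserve and re-used as the worker team for parallel
// regions under this root, which is why its max_nproc is sized up front to
// __kmp_dflt_team_nth_ub * 2.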
3335 
3336 #ifdef KMP_DEBUG
3337 
3338 typedef struct kmp_team_list_item {
3339   kmp_team_p const *entry;
3340   struct kmp_team_list_item *next;
3341 } kmp_team_list_item_t;
3342 typedef kmp_team_list_item_t *kmp_team_list_t;
3343 
3344 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3345     kmp_team_list_t list, // List of teams.
3346     kmp_team_p const *team // Team to add.
3347 ) {
3348 
3349   // List must terminate with item where both entry and next are NULL.
3350   // Team is added to the list only once.
3351   // List is sorted in ascending order by team id.
3352   // Team id is *not* a key.
3353 
3354   kmp_team_list_t l;
3355 
3356   KMP_DEBUG_ASSERT(list != NULL);
3357   if (team == NULL) {
3358     return;
3359   }
3360 
3361   __kmp_print_structure_team_accum(list, team->t.t_parent);
3362   __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3363 
3364   // Search list for the team.
3365   l = list;
3366   while (l->next != NULL && l->entry != team) {
3367     l = l->next;
3368   }
3369   if (l->next != NULL) {
3370     return; // Team has been added before, exit.
3371   }
3372 
3373   // Team is not found. Search list again for insertion point.
3374   l = list;
3375   while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3376     l = l->next;
3377   }
3378 
3379   // Insert team.
3380   {
3381     kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3382         sizeof(kmp_team_list_item_t));
3383     *item = *l;
3384     l->entry = team;
3385     l->next = item;
3386   }
3387 }
3388 
3389 static void __kmp_print_structure_team(char const *title,
3390                                        kmp_team_p const *team) {
3392   __kmp_printf("%s", title);
3393   if (team != NULL) {
3394     __kmp_printf("%2x %p\n", team->t.t_id, team);
3395   } else {
3396     __kmp_printf(" - (nil)\n");
3397   }
3398 }
3399 
3400 static void __kmp_print_structure_thread(char const *title,
3401                                          kmp_info_p const *thread) {
3402   __kmp_printf("%s", title);
3403   if (thread != NULL) {
3404     __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3405   } else {
3406     __kmp_printf(" - (nil)\n");
3407   }
3408 }
3409 
3410 void __kmp_print_structure(void) {
3411 
3412   kmp_team_list_t list;
3413 
3414   // Initialize list of teams.
3415   list =
3416       (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3417   list->entry = NULL;
3418   list->next = NULL;
3419 
3420   __kmp_printf("\n------------------------------\nGlobal Thread "
3421                "Table\n------------------------------\n");
3422   {
3423     int gtid;
3424     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3425       __kmp_printf("%2d", gtid);
3426       if (__kmp_threads != NULL) {
3427         __kmp_printf(" %p", __kmp_threads[gtid]);
3428       }
3429       if (__kmp_root != NULL) {
3430         __kmp_printf(" %p", __kmp_root[gtid]);
3431       }
3432       __kmp_printf("\n");
3433     }
3434   }
3435 
3436   // Print out __kmp_threads array.
3437   __kmp_printf("\n------------------------------\nThreads\n--------------------"
3438                "----------\n");
3439   if (__kmp_threads != NULL) {
3440     int gtid;
3441     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3442       kmp_info_t const *thread = __kmp_threads[gtid];
3443       if (thread != NULL) {
3444         __kmp_printf("GTID %2d %p:\n", gtid, thread);
3445         __kmp_printf("    Our Root:        %p\n", thread->th.th_root);
3446         __kmp_print_structure_team("    Our Team:     ", thread->th.th_team);
3447         __kmp_print_structure_team("    Serial Team:  ",
3448                                    thread->th.th_serial_team);
3449         __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
3450         __kmp_print_structure_thread("    Primary:      ",
3451                                      thread->th.th_team_master);
3452         __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
3453         __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
3454         __kmp_printf("    Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3455         __kmp_print_structure_thread("    Next in pool: ",
3456                                      thread->th.th_next_pool);
3457         __kmp_printf("\n");
3458         __kmp_print_structure_team_accum(list, thread->th.th_team);
3459         __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3460       }
3461     }
3462   } else {
3463     __kmp_printf("Threads array is not allocated.\n");
3464   }
3465 
3466   // Print out __kmp_root array.
3467   __kmp_printf("\n------------------------------\nUbers\n----------------------"
3468                "--------\n");
3469   if (__kmp_root != NULL) {
3470     int gtid;
3471     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3472       kmp_root_t const *root = __kmp_root[gtid];
3473       if (root != NULL) {
3474         __kmp_printf("GTID %2d %p:\n", gtid, root);
3475         __kmp_print_structure_team("    Root Team:    ", root->r.r_root_team);
3476         __kmp_print_structure_team("    Hot Team:     ", root->r.r_hot_team);
3477         __kmp_print_structure_thread("    Uber Thread:  ",
3478                                      root->r.r_uber_thread);
3479         __kmp_printf("    Active?:      %2d\n", root->r.r_active);
3480         __kmp_printf("    In Parallel:  %2d\n",
3481                      KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3482         __kmp_printf("\n");
3483         __kmp_print_structure_team_accum(list, root->r.r_root_team);
3484         __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3485       }
3486     }
3487   } else {
3488     __kmp_printf("Ubers array is not allocated.\n");
3489   }
3490 
3491   __kmp_printf("\n------------------------------\nTeams\n----------------------"
3492                "--------\n");
3493   while (list->next != NULL) {
3494     kmp_team_p const *team = list->entry;
3495     int i;
3496     __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3497     __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
3498     __kmp_printf("    Primary TID:      %2d\n", team->t.t_master_tid);
3499     __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
3500     __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
3501     __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
3502     for (i = 0; i < team->t.t_nproc; ++i) {
3503       __kmp_printf("    Thread %2d:      ", i);
3504       __kmp_print_structure_thread("", team->t.t_threads[i]);
3505     }
3506     __kmp_print_structure_team("    Next in pool:     ", team->t.t_next_pool);
3507     __kmp_printf("\n");
3508     list = list->next;
3509   }
3510 
3511   // Print out __kmp_thread_pool and __kmp_team_pool.
3512   __kmp_printf("\n------------------------------\nPools\n----------------------"
3513                "--------\n");
3514   __kmp_print_structure_thread("Thread pool:          ",
3515                                CCAST(kmp_info_t *, __kmp_thread_pool));
3516   __kmp_print_structure_team("Team pool:            ",
3517                              CCAST(kmp_team_t *, __kmp_team_pool));
3518   __kmp_printf("\n");
3519 
3520   // Free team list.
3521   while (list != NULL) {
3522     kmp_team_list_item_t *item = list;
3523     list = list->next;
3524     KMP_INTERNAL_FREE(item);
3525   }
3526 }
3527 
3528 #endif
3529 
3530 //---------------------------------------------------------------------------
3531 //  Stuff for per-thread fast random number generator
3532 //  Table of primes
3533 static const unsigned __kmp_primes[] = {
3534     0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3535     0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3536     0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3537     0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3538     0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3539     0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3540     0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3541     0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3542     0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3543     0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3544     0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3545 
3546 //---------------------------------------------------------------------------
3547 //  __kmp_get_random: Get a random number using a linear congruential method.
3548 unsigned short __kmp_get_random(kmp_info_t *thread) {
3549   unsigned x = thread->th.th_x;
3550   unsigned short r = (unsigned short)(x >> 16);
3551 
3552   thread->th.th_x = x * thread->th.th_a + 1;
3553 
3554   KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3555                 thread->th.th_info.ds.ds_tid, r));
3556 
3557   return r;
3558 }
3559 //--------------------------------------------------------
3560 // __kmp_init_random: Initialize a random number generator
3561 void __kmp_init_random(kmp_info_t *thread) {
3562   unsigned seed = thread->th.th_info.ds.ds_tid;
3563 
3564   thread->th.th_a =
3565       __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3566   thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3567   KA_TRACE(30,
3568            ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3569 }
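// Illustrative sketch (comment only, not compiled): each thread runs a plain
// 32-bit linear congruential generator with a per-thread multiplier drawn from
// the prime table above, returning the high 16 bits of the state:
//
//   a       = __kmp_primes[tid % table_size]   // picked in __kmp_init_random
//   x_0     = (tid + 1) * a + 1
//   x_{n+1} = x_n * a + 1                      // mod 2^32 via unsigned wrap
//   r_n     = (unsigned short)(x_n >> 16)
//
// Using distinct multipliers keeps the per-thread streams cheaply different;
// this is not intended to be a statistically strong generator.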
3570 
3571 #if KMP_OS_WINDOWS
3572 /* reclaim array entries for root threads that are already dead, returns number
3573  * reclaimed */
3574 static int __kmp_reclaim_dead_roots(void) {
3575   int i, r = 0;
3576 
3577   for (i = 0; i < __kmp_threads_capacity; ++i) {
3578     if (KMP_UBER_GTID(i) &&
3579         !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3580         !__kmp_root[i]
3581              ->r.r_active) { // AC: reclaim only roots died in non-active state
3582       r += __kmp_unregister_root_other_thread(i);
3583     }
3584   }
3585   return r;
3586 }
3587 #endif
3588 
3589 /* This function attempts to create free entries in __kmp_threads and
3590    __kmp_root, and returns the number of free entries generated.
3591 
3592    For Windows* OS static library, the first mechanism used is to reclaim array
3593    entries for root threads that are already dead.
3594 
3595    On all platforms, expansion is attempted on the arrays __kmp_threads and
3596    __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3597    capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3598    threadprivate cache array has been created. Synchronization with
3599    __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3600 
3601    After any dead root reclamation, if the clipping value allows array expansion
3602    to result in the generation of a total of nNeed free slots, the function does
3603    that expansion. If not, nothing is done beyond the possible initial root
3604    thread reclamation.
3605 
3606    If any argument is negative, the behavior is undefined. */
3607 static int __kmp_expand_threads(int nNeed) {
3608   int added = 0;
3609   int minimumRequiredCapacity;
3610   int newCapacity;
3611   kmp_info_t **newThreads;
3612   kmp_root_t **newRoot;
3613 
3614   // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3615   // resizing __kmp_threads does not need additional protection if foreign
3616   // threads are present
3617 
3618 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3619   /* only for Windows static library */
3620   /* reclaim array entries for root threads that are already dead */
3621   added = __kmp_reclaim_dead_roots();
3622 
3623   if (nNeed) {
3624     nNeed -= added;
3625     if (nNeed < 0)
3626       nNeed = 0;
3627   }
3628 #endif
3629   if (nNeed <= 0)
3630     return added;
3631 
3632   // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3633   // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3634   // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3635   // > __kmp_max_nth in one of two ways:
3636   //
3637   // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
3638   //    may not be reused by another thread, so we may need to increase
3639   //    __kmp_threads_capacity to __kmp_max_nth + 1.
3640   //
3641   // 2) New foreign root(s) are encountered.  We always register new foreign
3642   //    roots. This may cause a smaller # of threads to be allocated at
3643   //    subsequent parallel regions, but the worker threads hang around (and
3644   //    eventually go to sleep) and need slots in the __kmp_threads[] array.
3645   //
3646   // Anyway, that is the reason for moving the check to see if
3647   // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3648   // instead of having it performed here. -BB
3649 
3650   KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3651 
3652   /* compute expansion headroom to check if we can expand */
3653   if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3654     /* possible expansion too small -- give up */
3655     return added;
3656   }
3657   minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3658 
3659   newCapacity = __kmp_threads_capacity;
3660   do {
3661     newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3662                                                           : __kmp_sys_max_nth;
3663   } while (newCapacity < minimumRequiredCapacity);
3664   newThreads = (kmp_info_t **)__kmp_allocate(
3665       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3666   newRoot =
3667       (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3668   KMP_MEMCPY(newThreads, __kmp_threads,
3669              __kmp_threads_capacity * sizeof(kmp_info_t *));
3670   KMP_MEMCPY(newRoot, __kmp_root,
3671              __kmp_threads_capacity * sizeof(kmp_root_t *));
3672   // Put old __kmp_threads array on a list. Any ongoing references to the old
3673   // list will be valid. This list is cleaned up at library shutdown.
3674   kmp_old_threads_list_t *node =
3675       (kmp_old_threads_list_t *)__kmp_allocate(sizeof(kmp_old_threads_list_t));
3676   node->threads = __kmp_threads;
3677   node->next = __kmp_old_threads_list;
3678   __kmp_old_threads_list = node;
3679 
3680   *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3681   *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3682   added += newCapacity - __kmp_threads_capacity;
3683   *(volatile int *)&__kmp_threads_capacity = newCapacity;
3684 
3685   if (newCapacity > __kmp_tp_capacity) {
3686     __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3687     if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3688       __kmp_threadprivate_resize_cache(newCapacity);
3689     } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3690       *(volatile int *)&__kmp_tp_capacity = newCapacity;
3691     }
3692     __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3693   }
3694 
3695   return added;
3696 }
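// Illustrative growth (comment only, not compiled): capacity is doubled until
// it covers the request, clipped at __kmp_sys_max_nth, and the old
// __kmp_threads array is kept on a list rather than freed so stale readers
// remain valid. For example:
//
//   __kmp_threads_capacity == 32, nNeed == 5
//     -> minimumRequiredCapacity == 37, newCapacity: 32 -> 64, added == 32
//   capacity already close to __kmp_sys_max_nth
//     -> newCapacity is clipped to __kmp_sys_max_nth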
3697 
3698 /* Register the current thread as a root thread and obtain our gtid. We must
3699    have the __kmp_initz_lock held at this point. Argument TRUE only if we are
3700    the thread that calls this from __kmp_do_serial_initialize(). */
3701 int __kmp_register_root(int initial_thread) {
3702   kmp_info_t *root_thread;
3703   kmp_root_t *root;
3704   int gtid;
3705   int capacity;
3706   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3707   KA_TRACE(20, ("__kmp_register_root: entered\n"));
3708   KMP_MB();
3709 
3710   /* 2007-03-02:
3711      If the initial thread has not invoked the OpenMP RTL yet, and this thread
3712      is not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity"
3713      condition does not work as expected -- it may return false (meaning there
3714      is at least one empty slot in the __kmp_threads array), but the only free
3715      slot may be #0, which is reserved for the initial thread and so cannot be
3716      used for this one. The following code works around this bug.
3717 
3718      However, the right solution seems to be not to reserve slot #0 for the
3719      initial thread because:
3720      (1) there is no magic in slot #0,
3721      (2) we cannot detect the initial thread reliably (the first thread that
3722         performs serial initialization may not be the real initial thread).
3723   */
3724   capacity = __kmp_threads_capacity;
3725   if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3726     --capacity;
3727   }
3728 
3729   // If it is not for initializing the hidden helper team, we need to take
3730   // __kmp_hidden_helper_threads_num out of the capacity because it is included
3731   // in __kmp_threads_capacity.
3732   if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
3733     capacity -= __kmp_hidden_helper_threads_num;
3734   }
3735 
3736   /* see if there are too many threads */
3737   if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3738     if (__kmp_tp_cached) {
3739       __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3740                   KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3741                   KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3742     } else {
3743       __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3744                   __kmp_msg_null);
3745     }
3746   }
3747 
3748   // When hidden helper task is enabled, __kmp_threads is organized as follows:
3749   // 0: initial thread, also a regular OpenMP thread.
3750   // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3751   // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3752   // regular OpenMP threads.
3753   if (TCR_4(__kmp_init_hidden_helper_threads)) {
3754     // Find an available thread slot for hidden helper thread. Slots for hidden
3755     // helper threads start from 1 to __kmp_hidden_helper_threads_num.
3756     for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3757                    gtid <= __kmp_hidden_helper_threads_num;
3758          gtid++)
3759       ;
3760     KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3761     KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3762                  "hidden helper thread: T#%d\n",
3763                  gtid));
3764   } else {
3765     /* find an available thread slot */
3766     // Don't reassign the zero slot since we need that to only be used by
3767     // initial thread. Slots for hidden helper threads should also be skipped.
3768     if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3769       gtid = 0;
3770     } else {
3771       for (gtid = __kmp_hidden_helper_threads_num + 1;
3772            TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3773         ;
3774     }
3775     KA_TRACE(
3776         1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3777     KMP_ASSERT(gtid < __kmp_threads_capacity);
3778   }
3779 
3780   /* update global accounting */
3781   __kmp_all_nth++;
3782   TCW_4(__kmp_nth, __kmp_nth + 1);
3783 
3784   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3785   // numbers of procs, and method #2 (keyed API call) for higher numbers.
3786   if (__kmp_adjust_gtid_mode) {
3787     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3788       if (TCR_4(__kmp_gtid_mode) != 2) {
3789         TCW_4(__kmp_gtid_mode, 2);
3790       }
3791     } else {
3792       if (TCR_4(__kmp_gtid_mode) != 1) {
3793         TCW_4(__kmp_gtid_mode, 1);
3794       }
3795     }
3796   }
3797 
3798 #ifdef KMP_ADJUST_BLOCKTIME
3799   /* Adjust blocktime to zero if necessary            */
3800   /* Middle initialization might not have occurred yet */
3801   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3802     if (__kmp_nth > __kmp_avail_proc) {
3803       __kmp_zero_bt = TRUE;
3804     }
3805   }
3806 #endif /* KMP_ADJUST_BLOCKTIME */
3807 
3808   /* setup this new hierarchy */
3809   if (!(root = __kmp_root[gtid])) {
3810     root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3811     KMP_DEBUG_ASSERT(!root->r.r_root_team);
3812   }
3813 
3814 #if KMP_STATS_ENABLED
3815   // Initialize stats as soon as possible (right after gtid assignment).
3816   __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3817   __kmp_stats_thread_ptr->startLife();
3818   KMP_SET_THREAD_STATE(SERIAL_REGION);
3819   KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3820 #endif
3821   __kmp_initialize_root(root);
3822 
3823   /* setup new root thread structure */
3824   if (root->r.r_uber_thread) {
3825     root_thread = root->r.r_uber_thread;
3826   } else {
3827     root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3828     if (__kmp_storage_map) {
3829       __kmp_print_thread_storage_map(root_thread, gtid);
3830     }
3831     root_thread->th.th_info.ds.ds_gtid = gtid;
3832 #if OMPT_SUPPORT
3833     root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3834 #endif
3835     root_thread->th.th_root = root;
3836     if (__kmp_env_consistency_check) {
3837       root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3838     }
3839 #if USE_FAST_MEMORY
3840     __kmp_initialize_fast_memory(root_thread);
3841 #endif /* USE_FAST_MEMORY */
3842 
3843 #if KMP_USE_BGET
3844     KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3845     __kmp_initialize_bget(root_thread);
3846 #endif
3847     __kmp_init_random(root_thread); // Initialize random number generator
3848   }
3849 
3850   /* setup the serial team held in reserve by the root thread */
3851   if (!root_thread->th.th_serial_team) {
3852     kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3853     KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3854     root_thread->th.th_serial_team = __kmp_allocate_team(
3855         root, 1, 1,
3856 #if OMPT_SUPPORT
3857         ompt_data_none, // root parallel id
3858 #endif
3859         proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3860   }
3861   KMP_ASSERT(root_thread->th.th_serial_team);
3862   KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3863                 root_thread->th.th_serial_team));
3864 
3865   /* drop root_thread into place */
3866   TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3867 
3868   root->r.r_root_team->t.t_threads[0] = root_thread;
3869   root->r.r_hot_team->t.t_threads[0] = root_thread;
3870   root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3871   // AC: the team is created in reserve, not for execution (unused for now).
3872   root_thread->th.th_serial_team->t.t_serialized = 0;
3873   root->r.r_uber_thread = root_thread;
3874 
3875   /* initialize the thread, get it ready to go */
3876   __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3877   TCW_4(__kmp_init_gtid, TRUE);
3878 
3879   /* prepare the primary thread for get_gtid() */
3880   __kmp_gtid_set_specific(gtid);
3881 
3882 #if USE_ITT_BUILD
3883   __kmp_itt_thread_name(gtid);
3884 #endif /* USE_ITT_BUILD */
3885 
3886 #ifdef KMP_TDATA_GTID
3887   __kmp_gtid = gtid;
3888 #endif
3889   __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3890   KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3891 
3892   KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3893                 "plain=%u\n",
3894                 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3895                 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3896                 KMP_INIT_BARRIER_STATE));
3897   { // Initialize barrier data.
3898     int b;
3899     for (b = 0; b < bs_last_barrier; ++b) {
3900       root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3901 #if USE_DEBUGGER
3902       root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3903 #endif
3904     }
3905   }
3906   KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3907                    KMP_INIT_BARRIER_STATE);
3908 
3909 #if KMP_AFFINITY_SUPPORTED
3910   root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3911   root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3912   root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3913   root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3914 #endif /* KMP_AFFINITY_SUPPORTED */
3915   root_thread->th.th_def_allocator = __kmp_def_allocator;
3916   root_thread->th.th_prev_level = 0;
3917   root_thread->th.th_prev_num_threads = 1;
3918 
3919   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3920   tmp->cg_root = root_thread;
3921   tmp->cg_thread_limit = __kmp_cg_max_nth;
3922   tmp->cg_nthreads = 1;
3923   KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3924                  " cg_nthreads init to 1\n",
3925                  root_thread, tmp));
3926   tmp->up = NULL;
3927   root_thread->th.th_cg_roots = tmp;
3928 
3929   __kmp_root_counter++;
3930 
3931 #if OMPT_SUPPORT
3932   if (!initial_thread && ompt_enabled.enabled) {
3933 
3934     kmp_info_t *root_thread = ompt_get_thread();
3935 
3936     ompt_set_thread_state(root_thread, ompt_state_overhead);
3937 
3938     if (ompt_enabled.ompt_callback_thread_begin) {
3939       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3940           ompt_thread_initial, __ompt_get_thread_data_internal());
3941     }
3942     ompt_data_t *task_data;
3943     ompt_data_t *parallel_data;
3944     __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
3945                                   NULL);
3946     if (ompt_enabled.ompt_callback_implicit_task) {
3947       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3948           ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3949     }
3950 
3951     ompt_set_thread_state(root_thread, ompt_state_work_serial);
3952   }
3953 #endif
3954 #if OMPD_SUPPORT
3955   if (ompd_state & OMPD_ENABLE_BP)
3956     ompd_bp_thread_begin();
3957 #endif
3958 
3959   KMP_MB();
3960   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3961 
3962   return gtid;
3963 }
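// Illustrative slot layout (comment only, not compiled): with hidden helper
// threads enabled, gtid slots are partitioned as described above. Assuming,
// purely as an example, __kmp_hidden_helper_threads_num == 8:
//
//   gtid 0      : initial thread
//   gtid 1 .. 8 : hidden helper threads
//   gtid 9 ..   : regular roots and workers; a new root scans upward from 9
//
// The actual count comes from the runtime's hidden-helper configuration
// (__kmp_hidden_helper_threads_num), not from this example.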
3964 
3965 #if KMP_NESTED_HOT_TEAMS
3966 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3967                                 const int max_level) {
3968   int i, n, nth;
3969   kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3970   if (!hot_teams || !hot_teams[level].hot_team) {
3971     return 0;
3972   }
3973   KMP_DEBUG_ASSERT(level < max_level);
3974   kmp_team_t *team = hot_teams[level].hot_team;
3975   nth = hot_teams[level].hot_team_nth;
3976   n = nth - 1; // primary thread is not freed
3977   if (level < max_level - 1) {
3978     for (i = 0; i < nth; ++i) {
3979       kmp_info_t *th = team->t.t_threads[i];
3980       n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3981       if (i > 0 && th->th.th_hot_teams) {
3982         __kmp_free(th->th.th_hot_teams);
3983         th->th.th_hot_teams = NULL;
3984       }
3985     }
3986   }
3987   __kmp_free_team(root, team, NULL);
3988   return n;
3989 }
3990 #endif
3991 
3992 // Resets a root thread and clears its root and hot teams.
3993 // Returns the number of __kmp_threads entries directly and indirectly freed.
3994 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3995   kmp_team_t *root_team = root->r.r_root_team;
3996   kmp_team_t *hot_team = root->r.r_hot_team;
3997   int n = hot_team->t.t_nproc;
3998   int i;
3999 
4000   KMP_DEBUG_ASSERT(!root->r.r_active);
4001 
4002   root->r.r_root_team = NULL;
4003   root->r.r_hot_team = NULL;
4004   // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
4005   // before call to __kmp_free_team().
4006   __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
4007 #if KMP_NESTED_HOT_TEAMS
4008   if (__kmp_hot_teams_max_level >
4009       0) { // need to free nested hot teams and their threads if any
4010     for (i = 0; i < hot_team->t.t_nproc; ++i) {
4011       kmp_info_t *th = hot_team->t.t_threads[i];
4012       if (__kmp_hot_teams_max_level > 1) {
4013         n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
4014       }
4015       if (th->th.th_hot_teams) {
4016         __kmp_free(th->th.th_hot_teams);
4017         th->th.th_hot_teams = NULL;
4018       }
4019     }
4020   }
4021 #endif
4022   __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
4023 
4024   // Before we can reap the thread, we need to make certain that all other
4025   // threads in the teams that had this root as ancestor have stopped trying to
4026   // steal tasks.
4027   if (__kmp_tasking_mode != tskm_immediate_exec) {
4028     __kmp_wait_to_unref_task_teams();
4029   }
4030 
4031 #if KMP_OS_WINDOWS
4032   /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
4033   KA_TRACE(
4034       10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
4035            "\n",
4036            (LPVOID) & (root->r.r_uber_thread->th),
4037            root->r.r_uber_thread->th.th_info.ds.ds_thread));
4038   __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
4039 #endif /* KMP_OS_WINDOWS */
4040 
4041 #if OMPD_SUPPORT
4042   if (ompd_state & OMPD_ENABLE_BP)
4043     ompd_bp_thread_end();
4044 #endif
4045 
4046 #if OMPT_SUPPORT
4047   ompt_data_t *task_data;
4048   ompt_data_t *parallel_data;
4049   __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4050                                 NULL);
4051   if (ompt_enabled.ompt_callback_implicit_task) {
4052     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4053         ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
4054   }
4055   if (ompt_enabled.ompt_callback_thread_end) {
4056     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
4057         &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
4058   }
4059 #endif
4060 
4061   TCW_4(__kmp_nth,
4062         __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
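       // Note: cg_nthreads is post-decremented, so 'i' is the member count of
       // the contention group before this root leaves it; i == 1 means the root
       // was the last member and the CG structure can be freed.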
4063   i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
4064   KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
4065                  " to %d\n",
4066                  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
4067                  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
4068   if (i == 1) {
4069     // need to free contention group structure
4070     KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
4071                      root->r.r_uber_thread->th.th_cg_roots->cg_root);
4072     KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
4073     __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
4074     root->r.r_uber_thread->th.th_cg_roots = NULL;
4075   }
4076   __kmp_reap_thread(root->r.r_uber_thread, 1);
4077 
4078   // We cannot put the root thread into __kmp_thread_pool, so we have to
4079   // reap it instead of freeing it.
4080   root->r.r_uber_thread = NULL;
4081   /* mark root as no longer in use */
4082   root->r.r_begin = FALSE;
4083 
4084   return n;
4085 }
4086 
4087 void __kmp_unregister_root_current_thread(int gtid) {
4088   KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
4089   /* this lock should be ok, since unregister_root_current_thread is never
4090      called during an abort, only during a normal close. furthermore, if you
4091      have the forkjoin lock, you should never try to get the initz lock */
4092   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
4093   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
4094     KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
4095                   "exiting T#%d\n",
4096                   gtid));
4097     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4098     return;
4099   }
4100   kmp_root_t *root = __kmp_root[gtid];
4101 
4102   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4103   KMP_ASSERT(KMP_UBER_GTID(gtid));
4104   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4105   KMP_ASSERT(root->r.r_active == FALSE);
4106 
4107   KMP_MB();
4108 
4109   kmp_info_t *thread = __kmp_threads[gtid];
4110   kmp_team_t *team = thread->th.th_team;
4111   kmp_task_team_t *task_team = thread->th.th_task_team;
4112 
4113   // we need to wait for the proxy tasks before finishing the thread
4114   if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks ||
4115                             task_team->tt.tt_hidden_helper_task_encountered)) {
4116 #if OMPT_SUPPORT
4117     // the runtime is shutting down so we won't report any events
4118     thread->th.ompt_thread_info.state = ompt_state_undefined;
4119 #endif
4120     __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4121   }
4122 
4123   __kmp_reset_root(gtid, root);
4124 
4125   KMP_MB();
4126   KC_TRACE(10,
4127            ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4128 
4129   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4130 }
4131 
4132 #if KMP_OS_WINDOWS
4133 /* __kmp_forkjoin_lock must be already held
4134    Unregisters a root thread that is not the current thread.  Returns the number
4135    of __kmp_threads entries freed as a result. */
4136 static int __kmp_unregister_root_other_thread(int gtid) {
4137   kmp_root_t *root = __kmp_root[gtid];
4138   int r;
4139 
4140   KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4141   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4142   KMP_ASSERT(KMP_UBER_GTID(gtid));
4143   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4144   KMP_ASSERT(root->r.r_active == FALSE);
4145 
4146   r = __kmp_reset_root(gtid, root);
4147   KC_TRACE(10,
4148            ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4149   return r;
4150 }
4151 #endif
4152 
4153 #if KMP_DEBUG
4154 void __kmp_task_info() {
4155 
4156   kmp_int32 gtid = __kmp_entry_gtid();
4157   kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4158   kmp_info_t *this_thr = __kmp_threads[gtid];
4159   kmp_team_t *steam = this_thr->th.th_serial_team;
4160   kmp_team_t *team = this_thr->th.th_team;
4161 
4162   __kmp_printf(
4163       "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4164       "ptask=%p\n",
4165       gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4166       team->t.t_implicit_task_taskdata[tid].td_parent);
4167 }
4168 #endif // KMP_DEBUG
4169 
4170 /* TODO optimize with one big memclr, take out what isn't needed, split
4171    responsibility to workers as much as possible, and delay initialization of
4172    features as much as possible  */
4173 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4174                                   int tid, int gtid) {
4175   /* this_thr->th.th_info.ds.ds_gtid is setup in
4176      kmp_allocate_thread/create_worker.
4177      this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4178   KMP_DEBUG_ASSERT(this_thr != NULL);
4179   KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4180   KMP_DEBUG_ASSERT(team);
4181   KMP_DEBUG_ASSERT(team->t.t_threads);
4182   KMP_DEBUG_ASSERT(team->t.t_dispatch);
4183   kmp_info_t *master = team->t.t_threads[0];
4184   KMP_DEBUG_ASSERT(master);
4185   KMP_DEBUG_ASSERT(master->th.th_root);
4186 
4187   KMP_MB();
4188 
4189   TCW_SYNC_PTR(this_thr->th.th_team, team);
4190 
4191   this_thr->th.th_info.ds.ds_tid = tid;
4192   this_thr->th.th_set_nproc = 0;
4193   if (__kmp_tasking_mode != tskm_immediate_exec)
4194     // When tasking is possible, threads are not safe to reap until they are
4195     // done tasking; th_reap_state is set back to safe on exit from the wait code
4196     this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4197   else // no tasking --> always safe to reap
4198     this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4199   this_thr->th.th_set_proc_bind = proc_bind_default;
4200 #if KMP_AFFINITY_SUPPORTED
4201   this_thr->th.th_new_place = this_thr->th.th_current_place;
4202 #endif
4203   this_thr->th.th_root = master->th.th_root;
4204 
4205   /* setup the thread's cache of the team structure */
4206   this_thr->th.th_team_nproc = team->t.t_nproc;
4207   this_thr->th.th_team_master = master;
4208   this_thr->th.th_team_serialized = team->t.t_serialized;
4209 
4210   KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4211 
4212   KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4213                 tid, gtid, this_thr, this_thr->th.th_current_task));
4214 
4215   __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4216                            team, tid, TRUE);
4217 
4218   KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4219                 tid, gtid, this_thr, this_thr->th.th_current_task));
4220   // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4221   // __kmp_initialize_team()?
4222 
4223   /* TODO no worksharing in speculative threads */
4224   this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4225 
4226   this_thr->th.th_local.this_construct = 0;
4227 
4228   if (!this_thr->th.th_pri_common) {
4229     this_thr->th.th_pri_common =
4230         (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4231     if (__kmp_storage_map) {
4232       __kmp_print_storage_map_gtid(
4233           gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4234           sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4235     }
4236     this_thr->th.th_pri_head = NULL;
4237   }
4238 
4239   if (this_thr != master && // Primary thread's CG root is initialized elsewhere
4240       this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4241     // Make new thread's CG root same as primary thread's
4242     KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4243     kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4244     if (tmp) {
4245       // worker changes CG, need to check if old CG should be freed
4246       int i = tmp->cg_nthreads--;
4247       KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4248                      " on node %p of thread %p to %d\n",
4249                      this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4250       if (i == 1) {
4251         __kmp_free(tmp); // last thread left CG --> free it
4252       }
4253     }
4254     this_thr->th.th_cg_roots = master->th.th_cg_roots;
4255     // Increment new thread's CG root's counter to add the new thread
4256     this_thr->th.th_cg_roots->cg_nthreads++;
4257     KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4258                    " node %p of thread %p to %d\n",
4259                    this_thr, this_thr->th.th_cg_roots,
4260                    this_thr->th.th_cg_roots->cg_root,
4261                    this_thr->th.th_cg_roots->cg_nthreads));
4262     this_thr->th.th_current_task->td_icvs.thread_limit =
4263         this_thr->th.th_cg_roots->cg_thread_limit;
4264   }
4265 
4266   /* Initialize dynamic dispatch */
4267   {
4268     volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4269     // Use team max_nproc since this will never change for the team.
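         // A serialized team (t_max_nproc == 1) only ever needs one dispatch
         // buffer; otherwise allocate __kmp_dispatch_num_buffers buffers so that
         // consecutive dynamically scheduled loops can use distinct buffers.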
4270     size_t disp_size =
4271         sizeof(dispatch_private_info_t) *
4272         (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4273     KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4274                   team->t.t_max_nproc));
4275     KMP_ASSERT(dispatch);
4276     KMP_DEBUG_ASSERT(team->t.t_dispatch);
4277     KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4278 
4279     dispatch->th_disp_index = 0;
4280     dispatch->th_doacross_buf_idx = 0;
4281     if (!dispatch->th_disp_buffer) {
4282       dispatch->th_disp_buffer =
4283           (dispatch_private_info_t *)__kmp_allocate(disp_size);
4284 
4285       if (__kmp_storage_map) {
4286         __kmp_print_storage_map_gtid(
4287             gtid, &dispatch->th_disp_buffer[0],
4288             &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4289                                           ? 1
4290                                           : __kmp_dispatch_num_buffers],
4291             disp_size,
4292             "th_%d.th_dispatch.th_disp_buffer "
4293             "(team_%d.t_dispatch[%d].th_disp_buffer)",
4294             gtid, team->t.t_id, gtid);
4295       }
4296     } else {
4297       memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4298     }
4299 
4300     dispatch->th_dispatch_pr_current = 0;
4301     dispatch->th_dispatch_sh_current = 0;
4302 
4303     dispatch->th_deo_fcn = 0; /* ORDERED     */
4304     dispatch->th_dxo_fcn = 0; /* END ORDERED */
4305   }
4306 
4307   this_thr->th.th_next_pool = NULL;
4308 
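       // Lazily allocate the task-state memo stack (initially 4 entries), used
       // to save and restore th_task_state across nested (hot) teams.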
4309   if (!this_thr->th.th_task_state_memo_stack) {
4310     size_t i;
4311     this_thr->th.th_task_state_memo_stack =
4312         (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4313     this_thr->th.th_task_state_top = 0;
4314     this_thr->th.th_task_state_stack_sz = 4;
4315     for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4316          ++i) // zero init the stack
4317       this_thr->th.th_task_state_memo_stack[i] = 0;
4318   }
4319 
4320   KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4321   KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4322 
4323   KMP_MB();
4324 }
4325 
4326 /* Allocate a new thread for the requesting team. This is only called from
4327    within a forkjoin critical section. We will first try to get an available
4328    thread from the thread pool. If none is available, we will fork a new one,
4329    assuming we are able to create one; this should be assured, as the caller
4330    should have checked on this first. */
4331 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4332                                   int new_tid) {
4333   kmp_team_t *serial_team;
4334   kmp_info_t *new_thr;
4335   int new_gtid;
4336 
4337   KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4338   KMP_DEBUG_ASSERT(root && team);
4339 #if !KMP_NESTED_HOT_TEAMS
4340   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4341 #endif
4342   KMP_MB();
4343 
4344   /* first, try to get one from the thread pool */
4345   if (__kmp_thread_pool) {
4346     new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4347     __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4348     if (new_thr == __kmp_thread_pool_insert_pt) {
4349       __kmp_thread_pool_insert_pt = NULL;
4350     }
4351     TCW_4(new_thr->th.th_in_pool, FALSE);
4352     __kmp_suspend_initialize_thread(new_thr);
4353     __kmp_lock_suspend_mx(new_thr);
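         // The thread is leaving the pool: if it was counted as an active pool
         // thread, remove it from the pool's active-thread count.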
4354     if (new_thr->th.th_active_in_pool == TRUE) {
4355       KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4356       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4357       new_thr->th.th_active_in_pool = FALSE;
4358     }
4359     __kmp_unlock_suspend_mx(new_thr);
4360 
4361     KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4362                   __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4363     KMP_ASSERT(!new_thr->th.th_team);
4364     KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4365 
4366     /* setup the thread structure */
4367     __kmp_initialize_info(new_thr, team, new_tid,
4368                           new_thr->th.th_info.ds.ds_gtid);
4369     KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4370 
4371     TCW_4(__kmp_nth, __kmp_nth + 1);
4372 
4373     new_thr->th.th_task_state = 0;
4374     new_thr->th.th_task_state_top = 0;
4375     new_thr->th.th_task_state_stack_sz = 4;
4376 
4377     if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
4378       // Make sure pool thread has transitioned to waiting on own thread struct
4379       KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0);
4380       // Thread activated in __kmp_allocate_team when increasing team size
4381     }
4382 
4383 #ifdef KMP_ADJUST_BLOCKTIME
4384     /* Adjust blocktime back to zero if necessary */
4385     /* Middle initialization might not have occurred yet */
4386     if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4387       if (__kmp_nth > __kmp_avail_proc) {
4388         __kmp_zero_bt = TRUE;
4389       }
4390     }
4391 #endif /* KMP_ADJUST_BLOCKTIME */
4392 
4393 #if KMP_DEBUG
4394     // If thread entered pool via __kmp_free_thread, wait_flag should !=
4395     // KMP_BARRIER_PARENT_FLAG.
4396     int b;
4397     kmp_balign_t *balign = new_thr->th.th_bar;
4398     for (b = 0; b < bs_last_barrier; ++b)
4399       KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4400 #endif
4401 
4402     KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4403                   __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4404 
4405     KMP_MB();
4406     return new_thr;
4407   }
4408 
4409   /* no, we'll fork a new one */
4410   KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4411   KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4412 
4413 #if KMP_USE_MONITOR
4414   // If this is the first worker thread the RTL is creating, then also
4415   // launch the monitor thread.  We try to do this as early as possible.
4416   if (!TCR_4(__kmp_init_monitor)) {
4417     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4418     if (!TCR_4(__kmp_init_monitor)) {
4419       KF_TRACE(10, ("before __kmp_create_monitor\n"));
4420       TCW_4(__kmp_init_monitor, 1);
4421       __kmp_create_monitor(&__kmp_monitor);
4422       KF_TRACE(10, ("after __kmp_create_monitor\n"));
4423 #if KMP_OS_WINDOWS
4424       // AC: wait until the monitor has started. This is a fix for CQ232808.
4425       // The reason is that if the library is loaded/unloaded in a loop with
4426       // small (parallel) work in between, then there is a high probability that
4427       // the monitor thread starts after the library shutdown. At shutdown it is
4428       // too late to cope with the problem, because when the primary thread is
4429       // in DllMain (process detach) the monitor has no chance to start (it is
4430       // blocked), and the primary thread has no means to inform the monitor that
4431       // the library has gone, because all the memory which the monitor can
4432       // access is going to be released/reset.
4433       while (TCR_4(__kmp_init_monitor) < 2) {
4434         KMP_YIELD(TRUE);
4435       }
4436       KF_TRACE(10, ("after monitor thread has started\n"));
4437 #endif
4438     }
4439     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4440   }
4441 #endif
4442 
4443   KMP_MB();
4444 
4445   {
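         // Find the first free gtid slot. Hidden helper threads occupy gtids
         // 1..__kmp_hidden_helper_threads_num, so regular workers start the
         // search after that range; the hidden helpers themselves start at 1.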
4446     int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4447                              ? 1
4448                              : __kmp_hidden_helper_threads_num + 1;
4449 
4450     for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4451          ++new_gtid) {
4452       KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4453     }
4454 
4455     if (TCR_4(__kmp_init_hidden_helper_threads)) {
4456       KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4457     }
4458   }
4459 
4460   /* allocate space for it. */
4461   new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4462 
4463   TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4464 
4465 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4466   // Suppress race condition detection on synchronization flags in debug mode;
4467   // this helps to analyze library internals by eliminating false positives.
4468   __itt_suppress_mark_range(
4469       __itt_suppress_range, __itt_suppress_threading_errors,
4470       &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4471   __itt_suppress_mark_range(
4472       __itt_suppress_range, __itt_suppress_threading_errors,
4473       &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4474 #if KMP_OS_WINDOWS
4475   __itt_suppress_mark_range(
4476       __itt_suppress_range, __itt_suppress_threading_errors,
4477       &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4478 #else
4479   __itt_suppress_mark_range(__itt_suppress_range,
4480                             __itt_suppress_threading_errors,
4481                             &new_thr->th.th_suspend_init_count,
4482                             sizeof(new_thr->th.th_suspend_init_count));
4483 #endif
4484   // TODO: check if we need to also suppress b_arrived flags
4485   __itt_suppress_mark_range(__itt_suppress_range,
4486                             __itt_suppress_threading_errors,
4487                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4488                             sizeof(new_thr->th.th_bar[0].bb.b_go));
4489   __itt_suppress_mark_range(__itt_suppress_range,
4490                             __itt_suppress_threading_errors,
4491                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4492                             sizeof(new_thr->th.th_bar[1].bb.b_go));
4493   __itt_suppress_mark_range(__itt_suppress_range,
4494                             __itt_suppress_threading_errors,
4495                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4496                             sizeof(new_thr->th.th_bar[2].bb.b_go));
4497 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4498   if (__kmp_storage_map) {
4499     __kmp_print_thread_storage_map(new_thr, new_gtid);
4500   }
4501 
4502   // add a reserve serialized team, initialized from the team's primary thread
4503   {
4504     kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4505     KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4506     new_thr->th.th_serial_team = serial_team =
4507         (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4508 #if OMPT_SUPPORT
4509                                           ompt_data_none, // root parallel id
4510 #endif
4511                                           proc_bind_default, &r_icvs,
4512                                           0 USE_NESTED_HOT_ARG(NULL));
4513   }
4514   KMP_ASSERT(serial_team);
4515   serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not
4516   // for execution (it is unused for now).
4517   serial_team->t.t_threads[0] = new_thr;
4518   KF_TRACE(10,
4519            ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4520             new_thr));
4521 
4522   /* setup the thread structures */
4523   __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4524 
4525 #if USE_FAST_MEMORY
4526   __kmp_initialize_fast_memory(new_thr);
4527 #endif /* USE_FAST_MEMORY */
4528 
4529 #if KMP_USE_BGET
4530   KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4531   __kmp_initialize_bget(new_thr);
4532 #endif
4533 
4534   __kmp_init_random(new_thr); // Initialize random number generator
4535 
4536   /* Initialize these only once when thread is grabbed for a team allocation */
4537   KA_TRACE(20,
4538            ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4539             __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4540 
4541   int b;
4542   kmp_balign_t *balign = new_thr->th.th_bar;
4543   for (b = 0; b < bs_last_barrier; ++b) {
4544     balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4545     balign[b].bb.team = NULL;
4546     balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4547     balign[b].bb.use_oncore_barrier = 0;
4548   }
4549 
4550   TCW_PTR(new_thr->th.th_sleep_loc, NULL);
4551   new_thr->th.th_sleep_loc_type = flag_unset;
4552 
4553   new_thr->th.th_spin_here = FALSE;
4554   new_thr->th.th_next_waiting = 0;
4555 #if KMP_OS_UNIX
4556   new_thr->th.th_blocking = false;
4557 #endif
4558 
4559 #if KMP_AFFINITY_SUPPORTED
4560   new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4561   new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4562   new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4563   new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4564 #endif
4565   new_thr->th.th_def_allocator = __kmp_def_allocator;
4566   new_thr->th.th_prev_level = 0;
4567   new_thr->th.th_prev_num_threads = 1;
4568 
4569   TCW_4(new_thr->th.th_in_pool, FALSE);
4570   new_thr->th.th_active_in_pool = FALSE;
4571   TCW_4(new_thr->th.th_active, TRUE);
4572 
4573   /* adjust the global counters */
4574   __kmp_all_nth++;
4575   __kmp_nth++;
4576 
4577   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4578   // numbers of procs, and method #2 (keyed API call) for higher numbers.
4579   if (__kmp_adjust_gtid_mode) {
4580     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4581       if (TCR_4(__kmp_gtid_mode) != 2) {
4582         TCW_4(__kmp_gtid_mode, 2);
4583       }
4584     } else {
4585       if (TCR_4(__kmp_gtid_mode) != 1) {
4586         TCW_4(__kmp_gtid_mode, 1);
4587       }
4588     }
4589   }
4590 
4591 #ifdef KMP_ADJUST_BLOCKTIME
4592   /* Adjust blocktime back to zero if necessary       */
4593   /* Middle initialization might not have occurred yet */
4594   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4595     if (__kmp_nth > __kmp_avail_proc) {
4596       __kmp_zero_bt = TRUE;
4597     }
4598   }
4599 #endif /* KMP_ADJUST_BLOCKTIME */
4600 
4601   /* actually fork it and create the new worker thread */
4602   KF_TRACE(
4603       10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4604   __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4605   KF_TRACE(10,
4606            ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4607 
4608   KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4609                 new_gtid));
4610   KMP_MB();
4611   return new_thr;
4612 }
4613 
4614 /* Reinitialize team for reuse.
4615    The hot team code calls this routine at every fork barrier, so EPCC barrier
4616    tests are extremely sensitive to changes in it, esp. writes to the team
4617    struct, which cause a cache invalidation in all threads.
4618    IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4619 static void __kmp_reinitialize_team(kmp_team_t *team,
4620                                     kmp_internal_control_t *new_icvs,
4621                                     ident_t *loc) {
4622   KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4623                 team->t.t_threads[0], team));
4624   KMP_DEBUG_ASSERT(team && new_icvs);
4625   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4626   KMP_CHECK_UPDATE(team->t.t_ident, loc);
4627 
4628   KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4629   // Copy ICVs to the primary thread's implicit taskdata
4630   __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4631   copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4632 
4633   KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4634                 team->t.t_threads[0], team));
4635 }
4636 
4637 /* Initialize the team data structure.
4638    This assumes the t_threads and t_max_nproc are already set.
4639    Also, we don't touch the arguments */
4640 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4641                                   kmp_internal_control_t *new_icvs,
4642                                   ident_t *loc) {
4643   KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4644 
4645   /* verify */
4646   KMP_DEBUG_ASSERT(team);
4647   KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4648   KMP_DEBUG_ASSERT(team->t.t_threads);
4649   KMP_MB();
4650 
4651   team->t.t_master_tid = 0; /* not needed */
4652   /* team->t.t_master_bar;        not needed */
4653   team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4654   team->t.t_nproc = new_nproc;
4655 
4656   /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
4657   team->t.t_next_pool = NULL;
4658   /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4659    * up hot team */
4660 
4661   TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4662   team->t.t_invoke = NULL; /* not needed */
4663 
4664   // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
4665   team->t.t_sched.sched = new_icvs->sched.sched;
4666 
4667 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4668   team->t.t_fp_control_saved = FALSE; /* not needed */
4669   team->t.t_x87_fpu_control_word = 0; /* not needed */
4670   team->t.t_mxcsr = 0; /* not needed */
4671 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4672 
4673   team->t.t_construct = 0;
4674 
4675   team->t.t_ordered.dt.t_value = 0;
4676   team->t.t_master_active = FALSE;
4677 
4678 #ifdef KMP_DEBUG
4679   team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4680 #endif
4681 #if KMP_OS_WINDOWS
4682   team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4683 #endif
4684 
4685   team->t.t_control_stack_top = NULL;
4686 
4687   __kmp_reinitialize_team(team, new_icvs, loc);
4688 
4689   KMP_MB();
4690   KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4691 }
4692 
4693 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
4694 /* Sets full mask for thread and stores the old mask in *old_mask (if non-NULL); no changes to structures. */
4695 static void
4696 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4697   if (KMP_AFFINITY_CAPABLE()) {
4698     int status;
4699     if (old_mask != NULL) {
4700       status = __kmp_get_system_affinity(old_mask, TRUE);
4701       int error = errno;
4702       if (status != 0) {
4703         __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4704                     __kmp_msg_null);
4705       }
4706     }
4707     __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4708   }
4709 }
4710 #endif
4711 
4712 #if KMP_AFFINITY_SUPPORTED
4713 
4714 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4715 // It calculates the worker + primary thread's partition based upon the parent
4716 // thread's partition, and binds each worker to a place in its partition.
4717 // The primary thread's partition should already include its current binding.
4718 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4719   // Do not partition places for the hidden helper team
4720   if (KMP_HIDDEN_HELPER_TEAM(team))
4721     return;
4722   // Copy the primary thread's place partition to the team struct
4723   kmp_info_t *master_th = team->t.t_threads[0];
4724   KMP_DEBUG_ASSERT(master_th != NULL);
4725   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4726   int first_place = master_th->th.th_first_place;
4727   int last_place = master_th->th.th_last_place;
4728   int masters_place = master_th->th.th_current_place;
4729   team->t.t_first_place = first_place;
4730   team->t.t_last_place = last_place;
4731 
4732   KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4733                 "bound to place %d partition = [%d,%d]\n",
4734                 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4735                 team->t.t_id, masters_place, first_place, last_place));
4736 
4737   switch (proc_bind) {
4738 
4739   case proc_bind_default:
4740     // Serial teams might have the proc_bind policy set to proc_bind_default.
4741     // Not an issue -- we don't rebind primary thread for any proc_bind policy.
4742     KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4743     break;
4744 
4745   case proc_bind_primary: {
4746     int f;
4747     int n_th = team->t.t_nproc;
4748     for (f = 1; f < n_th; f++) {
4749       kmp_info_t *th = team->t.t_threads[f];
4750       KMP_DEBUG_ASSERT(th != NULL);
4751       th->th.th_first_place = first_place;
4752       th->th.th_last_place = last_place;
4753       th->th.th_new_place = masters_place;
4754       if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4755           team->t.t_display_affinity != 1) {
4756         team->t.t_display_affinity = 1;
4757       }
4758 
4759       KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
4760                      "partition = [%d,%d]\n",
4761                      __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4762                      f, masters_place, first_place, last_place));
4763     }
4764   } break;
4765 
4766   case proc_bind_close: {
4767     int f;
4768     int n_th = team->t.t_nproc;
4769     int n_places;
4770     if (first_place <= last_place) {
4771       n_places = last_place - first_place + 1;
4772     } else {
4773       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4774     }
4775     if (n_th <= n_places) {
4776       int place = masters_place;
4777       for (f = 1; f < n_th; f++) {
4778         kmp_info_t *th = team->t.t_threads[f];
4779         KMP_DEBUG_ASSERT(th != NULL);
4780 
4781         if (place == last_place) {
4782           place = first_place;
4783         } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4784           place = 0;
4785         } else {
4786           place++;
4787         }
4788         th->th.th_first_place = first_place;
4789         th->th.th_last_place = last_place;
4790         th->th.th_new_place = place;
4791         if (__kmp_display_affinity && place != th->th.th_current_place &&
4792             team->t.t_display_affinity != 1) {
4793           team->t.t_display_affinity = 1;
4794         }
4795 
4796         KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4797                        "partition = [%d,%d]\n",
4798                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4799                        team->t.t_id, f, place, first_place, last_place));
4800       }
4801     } else {
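           // More threads than places: assign S = n_th / n_places threads to
           // each place, and spread the remaining 'rem' threads by giving one
           // extra thread to every 'gap'-th place.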
4802       int S, rem, gap, s_count;
4803       S = n_th / n_places;
4804       s_count = 0;
4805       rem = n_th - (S * n_places);
4806       gap = rem > 0 ? n_places / rem : n_places;
4807       int place = masters_place;
4808       int gap_ct = gap;
4809       for (f = 0; f < n_th; f++) {
4810         kmp_info_t *th = team->t.t_threads[f];
4811         KMP_DEBUG_ASSERT(th != NULL);
4812 
4813         th->th.th_first_place = first_place;
4814         th->th.th_last_place = last_place;
4815         th->th.th_new_place = place;
4816         if (__kmp_display_affinity && place != th->th.th_current_place &&
4817             team->t.t_display_affinity != 1) {
4818           team->t.t_display_affinity = 1;
4819         }
4820         s_count++;
4821 
4822         if ((s_count == S) && rem && (gap_ct == gap)) {
4823           // do nothing, add an extra thread to place on next iteration
4824         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4825           // we added an extra thread to this place; move to next place
4826           if (place == last_place) {
4827             place = first_place;
4828           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4829             place = 0;
4830           } else {
4831             place++;
4832           }
4833           s_count = 0;
4834           gap_ct = 1;
4835           rem--;
4836         } else if (s_count == S) { // place full; don't add extra
4837           if (place == last_place) {
4838             place = first_place;
4839           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4840             place = 0;
4841           } else {
4842             place++;
4843           }
4844           gap_ct++;
4845           s_count = 0;
4846         }
4847 
4848         KA_TRACE(100,
4849                  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4850                   "partition = [%d,%d]\n",
4851                   __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4852                   th->th.th_new_place, first_place, last_place));
4853       }
4854       KMP_DEBUG_ASSERT(place == masters_place);
4855     }
4856   } break;
4857 
4858   case proc_bind_spread: {
4859     int f;
4860     int n_th = team->t.t_nproc;
4861     int n_places;
4862     int thidx;
4863     if (first_place <= last_place) {
4864       n_places = last_place - first_place + 1;
4865     } else {
4866       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4867     }
4868     if (n_th <= n_places) {
4869       int place = -1;
4870 
4871       if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
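             // Fewer threads than places: give each thread a sub-partition of
             // S = n_places / n_th consecutive places, handing out the 'rem'
             // leftover places one at a time every 'gap'-th thread.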
4872         int S = n_places / n_th;
4873         int s_count, rem, gap, gap_ct;
4874 
4875         place = masters_place;
4876         rem = n_places - n_th * S;
4877         gap = rem ? n_th / rem : 1;
4878         gap_ct = gap;
4879         thidx = n_th;
4880         if (update_master_only == 1)
4881           thidx = 1;
4882         for (f = 0; f < thidx; f++) {
4883           kmp_info_t *th = team->t.t_threads[f];
4884           KMP_DEBUG_ASSERT(th != NULL);
4885 
4886           th->th.th_first_place = place;
4887           th->th.th_new_place = place;
4888           if (__kmp_display_affinity && place != th->th.th_current_place &&
4889               team->t.t_display_affinity != 1) {
4890             team->t.t_display_affinity = 1;
4891           }
4892           s_count = 1;
4893           while (s_count < S) {
4894             if (place == last_place) {
4895               place = first_place;
4896             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4897               place = 0;
4898             } else {
4899               place++;
4900             }
4901             s_count++;
4902           }
4903           if (rem && (gap_ct == gap)) {
4904             if (place == last_place) {
4905               place = first_place;
4906             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4907               place = 0;
4908             } else {
4909               place++;
4910             }
4911             rem--;
4912             gap_ct = 0;
4913           }
4914           th->th.th_last_place = place;
4915           gap_ct++;
4916 
4917           if (place == last_place) {
4918             place = first_place;
4919           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4920             place = 0;
4921           } else {
4922             place++;
4923           }
4924 
4925           KA_TRACE(100,
4926                    ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4927                     "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4928                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4929                     f, th->th.th_new_place, th->th.th_first_place,
4930                     th->th.th_last_place, __kmp_affinity_num_masks));
4931         }
4932       } else {
4933         /* Having a uniform space of available computation places, we can
4934            create T partitions of round(P/T) places each and put a thread
4935            into the first place of each partition. */
4936         double current = static_cast<double>(masters_place);
4937         double spacing =
4938             (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
4939         int first, last;
4940         kmp_info_t *th;
4941 
4942         thidx = n_th + 1;
4943         if (update_master_only == 1)
4944           thidx = 1;
4945         for (f = 0; f < thidx; f++) {
4946           first = static_cast<int>(current);
4947           last = static_cast<int>(current + spacing) - 1;
4948           KMP_DEBUG_ASSERT(last >= first);
4949           if (first >= n_places) {
4950             if (masters_place) {
4951               first -= n_places;
4952               last -= n_places;
4953               if (first == (masters_place + 1)) {
4954                 KMP_DEBUG_ASSERT(f == n_th);
4955                 first--;
4956               }
4957               if (last == masters_place) {
4958                 KMP_DEBUG_ASSERT(f == (n_th - 1));
4959                 last--;
4960               }
4961             } else {
4962               KMP_DEBUG_ASSERT(f == n_th);
4963               first = 0;
4964               last = 0;
4965             }
4966           }
4967           if (last >= n_places) {
4968             last = (n_places - 1);
4969           }
4970           place = first;
4971           current += spacing;
4972           if (f < n_th) {
4973             KMP_DEBUG_ASSERT(0 <= first);
4974             KMP_DEBUG_ASSERT(n_places > first);
4975             KMP_DEBUG_ASSERT(0 <= last);
4976             KMP_DEBUG_ASSERT(n_places > last);
4977             KMP_DEBUG_ASSERT(last_place >= first_place);
4978             th = team->t.t_threads[f];
4979             KMP_DEBUG_ASSERT(th);
4980             th->th.th_first_place = first;
4981             th->th.th_new_place = place;
4982             th->th.th_last_place = last;
4983             if (__kmp_display_affinity && place != th->th.th_current_place &&
4984                 team->t.t_display_affinity != 1) {
4985               team->t.t_display_affinity = 1;
4986             }
4987             KA_TRACE(100,
4988                      ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4989                       "partition = [%d,%d], spacing = %.4f\n",
4990                       __kmp_gtid_from_thread(team->t.t_threads[f]),
4991                       team->t.t_id, f, th->th.th_new_place,
4992                       th->th.th_first_place, th->th.th_last_place, spacing));
4993           }
4994         }
4995       }
4996       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4997     } else {
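           // More threads than places: pack S = n_th / n_places threads into
           // each place (one extra for every 'gap'-th place while 'rem' lasts);
           // each thread's partition is just its single place.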
4998       int S, rem, gap, s_count;
4999       S = n_th / n_places;
5000       s_count = 0;
5001       rem = n_th - (S * n_places);
5002       gap = rem > 0 ? n_places / rem : n_places;
5003       int place = masters_place;
5004       int gap_ct = gap;
5005       thidx = n_th;
5006       if (update_master_only == 1)
5007         thidx = 1;
5008       for (f = 0; f < thidx; f++) {
5009         kmp_info_t *th = team->t.t_threads[f];
5010         KMP_DEBUG_ASSERT(th != NULL);
5011 
5012         th->th.th_first_place = place;
5013         th->th.th_last_place = place;
5014         th->th.th_new_place = place;
5015         if (__kmp_display_affinity && place != th->th.th_current_place &&
5016             team->t.t_display_affinity != 1) {
5017           team->t.t_display_affinity = 1;
5018         }
5019         s_count++;
5020 
5021         if ((s_count == S) && rem && (gap_ct == gap)) {
5022           // do nothing, add an extra thread to place on next iteration
5023         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
5024           // we added an extra thread to this place; move on to next place
5025           if (place == last_place) {
5026             place = first_place;
5027           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
5028             place = 0;
5029           } else {
5030             place++;
5031           }
5032           s_count = 0;
5033           gap_ct = 1;
5034           rem--;
5035         } else if (s_count == S) { // place is full; don't add extra thread
5036           if (place == last_place) {
5037             place = first_place;
5038           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
5039             place = 0;
5040           } else {
5041             place++;
5042           }
5043           gap_ct++;
5044           s_count = 0;
5045         }
5046 
5047         KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5048                        "partition = [%d,%d]\n",
5049                        __kmp_gtid_from_thread(team->t.t_threads[f]),
5050                        team->t.t_id, f, th->th.th_new_place,
5051                        th->th.th_first_place, th->th.th_last_place));
5052       }
5053       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5054     }
5055   } break;
5056 
5057   default:
5058     break;
5059   }
5060 
5061   KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
5062 }
5063 
5064 #endif // KMP_AFFINITY_SUPPORTED
5065 
5066 /* allocate a new team data structure to use.  take one off of the free pool if
5067    available */
5068 kmp_team_t *
5069 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
5070 #if OMPT_SUPPORT
5071                     ompt_data_t ompt_parallel_data,
5072 #endif
5073                     kmp_proc_bind_t new_proc_bind,
5074                     kmp_internal_control_t *new_icvs,
5075                     int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5076   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
5077   int f;
5078   kmp_team_t *team;
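       // The root's hot team can only be reused when the root is not already
       // inside an active parallel region; nested regions get a fresh team
       // unless nested hot teams are enabled (checked below).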
5079   int use_hot_team = !root->r.r_active;
5080   int level = 0;
5081   int do_place_partition = 1;
5082 
5083   KA_TRACE(20, ("__kmp_allocate_team: called\n"));
5084   KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
5085   KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
5086   KMP_MB();
5087 
5088 #if KMP_NESTED_HOT_TEAMS
5089   kmp_hot_team_ptr_t *hot_teams;
5090   if (master) {
5091     team = master->th.th_team;
5092     level = team->t.t_active_level;
5093     if (master->th.th_teams_microtask) { // in teams construct?
5094       if (master->th.th_teams_size.nteams > 1 &&
5095           ( // #teams > 1
5096               team->t.t_pkfn ==
5097                   (microtask_t)__kmp_teams_master || // inner fork of the teams
5098               master->th.th_teams_level <
5099                   team->t.t_level)) { // or nested parallel inside the teams
5100         ++level; // do not increment if #teams==1 or for the outer fork of the
5101         // teams; increment otherwise
5102       }
5103       // Do not perform the place partition if inner fork of the teams
5104       // Wait until nested parallel region encountered inside teams construct
5105       if ((master->th.th_teams_size.nteams == 1 &&
5106            master->th.th_teams_level >= team->t.t_level) ||
5107           (team->t.t_pkfn == (microtask_t)__kmp_teams_master))
5108         do_place_partition = 0;
5109     }
5110     hot_teams = master->th.th_hot_teams;
5111     if (level < __kmp_hot_teams_max_level && hot_teams &&
5112         hot_teams[level].hot_team) {
5113       // hot team has already been allocated for given level
5114       use_hot_team = 1;
5115     } else {
5116       use_hot_team = 0;
5117     }
5118   } else {
5119     // check we won't access uninitialized hot_teams, just in case
5120     KMP_DEBUG_ASSERT(new_nproc == 1);
5121   }
5122 #endif
5123   // Optimization to use a "hot" team
5124   if (use_hot_team && new_nproc > 1) {
5125     KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
5126 #if KMP_NESTED_HOT_TEAMS
5127     team = hot_teams[level].hot_team;
5128 #else
5129     team = root->r.r_hot_team;
5130 #endif
5131 #if KMP_DEBUG
5132     if (__kmp_tasking_mode != tskm_immediate_exec) {
5133       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5134                     "task_team[1] = %p before reinit\n",
5135                     team->t.t_task_team[0], team->t.t_task_team[1]));
5136     }
5137 #endif
5138 
5139     if (team->t.t_nproc != new_nproc &&
5140         __kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5141       // Distributed barrier may need a resize
5142       int old_nthr = team->t.t_nproc;
5143       __kmp_resize_dist_barrier(team, old_nthr, new_nproc);
5144     }
5145 
5146     // If not doing the place partition, then reset the team's proc bind
5147     // to indicate that partitioning of all threads still needs to take place
5148     if (do_place_partition == 0)
5149       team->t.t_proc_bind = proc_bind_default;
5150     // Has the number of threads changed?
5151     /* Let's assume the most common case is that the number of threads is
5152        unchanged, and put that case first. */
5153     if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5154       KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5155       // This case can mean that omp_set_num_threads() was called and the hot
5156       // team size was already reduced, so we check the special flag
5157       if (team->t.t_size_changed == -1) {
5158         team->t.t_size_changed = 1;
5159       } else {
5160         KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5161       }
5162 
5163       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5164       kmp_r_sched_t new_sched = new_icvs->sched;
5165       // set primary thread's schedule as new run-time schedule
5166       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5167 
5168       __kmp_reinitialize_team(team, new_icvs,
5169                               root->r.r_uber_thread->th.th_ident);
5170 
5171       KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5172                     team->t.t_threads[0], team));
5173       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5174 
5175 #if KMP_AFFINITY_SUPPORTED
5176       if ((team->t.t_size_changed == 0) &&
5177           (team->t.t_proc_bind == new_proc_bind)) {
5178         if (new_proc_bind == proc_bind_spread) {
5179           if (do_place_partition) {
5180             // add flag to update only master for spread
5181             __kmp_partition_places(team, 1);
5182           }
5183         }
5184         KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5185                        "proc_bind = %d, partition = [%d,%d]\n",
5186                        team->t.t_id, new_proc_bind, team->t.t_first_place,
5187                        team->t.t_last_place));
5188       } else {
5189         if (do_place_partition) {
5190           KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5191           __kmp_partition_places(team);
5192         }
5193       }
5194 #else
5195       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5196 #endif /* KMP_AFFINITY_SUPPORTED */
5197     } else if (team->t.t_nproc > new_nproc) {
5198       KA_TRACE(20,
5199                ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5200                 new_nproc));
5201 
5202       team->t.t_size_changed = 1;
5203       if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5204         // Barrier size already reduced earlier in this function
5205         // Activate team threads via th_used_in_team
5206         __kmp_add_threads_to_team(team, new_nproc);
5207       }
5208 #if KMP_NESTED_HOT_TEAMS
5209       if (__kmp_hot_teams_mode == 0) {
5210         // AC: in this mode the saved thread count should match the team's value;
5211         // it can be bigger in mode 1, when the hot team keeps threads in reserve
5212         KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5213         hot_teams[level].hot_team_nth = new_nproc;
5214 #endif // KMP_NESTED_HOT_TEAMS
5215         /* release the extra threads we don't need any more */
5216         for (f = new_nproc; f < team->t.t_nproc; f++) {
5217           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5218           if (__kmp_tasking_mode != tskm_immediate_exec) {
5219             // When decreasing team size, threads no longer in the team should
5220             // unref task team.
5221             team->t.t_threads[f]->th.th_task_team = NULL;
5222           }
5223           __kmp_free_thread(team->t.t_threads[f]);
5224           team->t.t_threads[f] = NULL;
5225         }
5226 #if KMP_NESTED_HOT_TEAMS
5227       } // (__kmp_hot_teams_mode == 0)
5228       else {
5229         // When keeping extra threads in team, switch threads to wait on own
5230         // b_go flag
5231         for (f = new_nproc; f < team->t.t_nproc; ++f) {
5232           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5233           kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5234           for (int b = 0; b < bs_last_barrier; ++b) {
5235             if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5236               balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5237             }
5238             KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5239           }
5240         }
5241       }
5242 #endif // KMP_NESTED_HOT_TEAMS
5243       team->t.t_nproc = new_nproc;
5244       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5245       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5246       __kmp_reinitialize_team(team, new_icvs,
5247                               root->r.r_uber_thread->th.th_ident);
5248 
5249       // Update remaining threads
5250       for (f = 0; f < new_nproc; ++f) {
5251         team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5252       }
5253 
5254       // restore the current task state of the primary thread: should be the
5255       // implicit task
5256       KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5257                     team->t.t_threads[0], team));
5258 
5259       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5260 
5261 #ifdef KMP_DEBUG
5262       for (f = 0; f < team->t.t_nproc; f++) {
5263         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5264                          team->t.t_threads[f]->th.th_team_nproc ==
5265                              team->t.t_nproc);
5266       }
5267 #endif
5268 
5269       if (do_place_partition) {
5270         KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5271 #if KMP_AFFINITY_SUPPORTED
5272         __kmp_partition_places(team);
5273 #endif
5274       }
5275     } else { // team->t.t_nproc < new_nproc
5276 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5277       kmp_affin_mask_t *old_mask;
5278       if (KMP_AFFINITY_CAPABLE()) {
5279         KMP_CPU_ALLOC(old_mask);
5280       }
5281 #endif
5282 
5283       KA_TRACE(20,
5284                ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5285                 new_nproc));
5286       int old_nproc = team->t.t_nproc; // save old value; used to update only the new threads below
5287       team->t.t_size_changed = 1;
5288 
5289 #if KMP_NESTED_HOT_TEAMS
5290       int avail_threads = hot_teams[level].hot_team_nth;
5291       if (new_nproc < avail_threads)
5292         avail_threads = new_nproc;
5293       kmp_info_t **other_threads = team->t.t_threads;
5294       for (f = team->t.t_nproc; f < avail_threads; ++f) {
5295         // Adjust barrier data of reserved threads (if any) of the team
5296         // Other data will be set in __kmp_initialize_info() below.
5297         int b;
5298         kmp_balign_t *balign = other_threads[f]->th.th_bar;
5299         for (b = 0; b < bs_last_barrier; ++b) {
5300           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5301           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5302 #if USE_DEBUGGER
5303           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5304 #endif
5305         }
5306       }
5307       if (hot_teams[level].hot_team_nth >= new_nproc) {
5308         // we have all needed threads in reserve, no need to allocate any
5309         // this is only possible in mode 1; there cannot be reserved threads in mode 0
5310         KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5311         team->t.t_nproc = new_nproc; // just get reserved threads involved
5312       } else {
5313         // We may have some threads in reserve, but not enough;
5314         // get reserved threads involved if any.
5315         team->t.t_nproc = hot_teams[level].hot_team_nth;
5316         hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5317 #endif // KMP_NESTED_HOT_TEAMS
5318         if (team->t.t_max_nproc < new_nproc) {
5319           /* reallocate larger arrays */
5320           __kmp_reallocate_team_arrays(team, new_nproc);
5321           __kmp_reinitialize_team(team, new_icvs, NULL);
5322         }
5323 
5324 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5325         /* Temporarily set full mask for primary thread before creation of
5326            workers. The reason is that workers inherit the affinity from the
5327            primary thread, so if a lot of workers are created on a single
5328            core quickly, they don't get a chance to set their own affinity for
5329            a long time. */
5330         __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5331 #endif
5332 
5333         /* allocate new threads for the hot team */
5334         for (f = team->t.t_nproc; f < new_nproc; f++) {
5335           kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5336           KMP_DEBUG_ASSERT(new_worker);
5337           team->t.t_threads[f] = new_worker;
5338 
5339           KA_TRACE(20,
5340                    ("__kmp_allocate_team: team %d init T#%d arrived: "
5341                     "join=%llu, plain=%llu\n",
5342                     team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5343                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5344                     team->t.t_bar[bs_plain_barrier].b_arrived));
5345 
5346           { // Initialize barrier data for new threads.
5347             int b;
5348             kmp_balign_t *balign = new_worker->th.th_bar;
5349             for (b = 0; b < bs_last_barrier; ++b) {
5350               balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5351               KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5352                                KMP_BARRIER_PARENT_FLAG);
5353 #if USE_DEBUGGER
5354               balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5355 #endif
5356             }
5357           }
5358         }
5359 
5360 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5361         if (KMP_AFFINITY_CAPABLE()) {
5362           /* Restore initial primary thread's affinity mask */
5363           __kmp_set_system_affinity(old_mask, TRUE);
5364           KMP_CPU_FREE(old_mask);
5365         }
5366 #endif
5367 #if KMP_NESTED_HOT_TEAMS
5368       } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5369 #endif // KMP_NESTED_HOT_TEAMS
5370       if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5371         // Barrier size already increased earlier in this function
5372         // Activate team threads via th_used_in_team
5373         __kmp_add_threads_to_team(team, new_nproc);
5374       }
5375       /* make sure everyone is synchronized */
5376       // new threads are initialized below
5377       __kmp_initialize_team(team, new_nproc, new_icvs,
5378                             root->r.r_uber_thread->th.th_ident);
5379 
5380       /* reinitialize the threads */
5381       KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5382       for (f = 0; f < team->t.t_nproc; ++f)
5383         __kmp_initialize_info(team->t.t_threads[f], team, f,
5384                               __kmp_gtid_from_tid(f, team));
5385 
5386       if (level) { // set th_task_state for new threads in nested hot team
5387         // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5388         // only need to set the th_task_state for the new threads. th_task_state
5389         // for primary thread will not be accurate until after this in
5390         // __kmp_fork_call(), so we look to the primary thread's memo_stack to
5391         // get the correct value.
5392         for (f = old_nproc; f < team->t.t_nproc; ++f)
5393           team->t.t_threads[f]->th.th_task_state =
5394               team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5395       } else { // set th_task_state for new threads in non-nested hot team
5396         // copy primary thread's state
5397         kmp_uint8 old_state = team->t.t_threads[0]->th.th_task_state;
5398         for (f = old_nproc; f < team->t.t_nproc; ++f)
5399           team->t.t_threads[f]->th.th_task_state = old_state;
5400       }
5401 
5402 #ifdef KMP_DEBUG
5403       for (f = 0; f < team->t.t_nproc; ++f) {
5404         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5405                          team->t.t_threads[f]->th.th_team_nproc ==
5406                              team->t.t_nproc);
5407       }
5408 #endif
5409 
5410       if (do_place_partition) {
5411         KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5412 #if KMP_AFFINITY_SUPPORTED
5413         __kmp_partition_places(team);
5414 #endif
5415       }
5416     } // Check changes in number of threads
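
    // Illustrative only (user code, not part of the runtime): the grow and
    // shrink paths above are taken when consecutive parallel regions of the
    // same root request different thread counts, e.g.
    //
    //   omp_set_num_threads(8);
    //   #pragma omp parallel  // hot team grows, new workers allocated above
    //   { ... }
    //   omp_set_num_threads(2);
    //   #pragma omp parallel  // hot team shrinks, surplus workers kept in
    //   { ... }               // reserve or released per __kmp_hot_teams_mode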
5417 
5418     kmp_info_t *master = team->t.t_threads[0];
5419     if (master->th.th_teams_microtask) {
5420       for (f = 1; f < new_nproc; ++f) {
5421         // propagate teams construct specific info to workers
5422         kmp_info_t *thr = team->t.t_threads[f];
5423         thr->th.th_teams_microtask = master->th.th_teams_microtask;
5424         thr->th.th_teams_level = master->th.th_teams_level;
5425         thr->th.th_teams_size = master->th.th_teams_size;
5426       }
5427     }
5428 #if KMP_NESTED_HOT_TEAMS
5429     if (level) {
5430       // Sync barrier state for nested hot teams, not needed for outermost hot
5431       // team.
5432       for (f = 1; f < new_nproc; ++f) {
5433         kmp_info_t *thr = team->t.t_threads[f];
5434         int b;
5435         kmp_balign_t *balign = thr->th.th_bar;
5436         for (b = 0; b < bs_last_barrier; ++b) {
5437           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5438           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5439 #if USE_DEBUGGER
5440           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5441 #endif
5442         }
5443       }
5444     }
5445 #endif // KMP_NESTED_HOT_TEAMS
5446 
5447     /* reallocate space for arguments if necessary */
5448     __kmp_alloc_argv_entries(argc, team, TRUE);
5449     KMP_CHECK_UPDATE(team->t.t_argc, argc);
5450     // The hot team re-uses the previous task team,
5451     // if untouched during the previous release->gather phase.
5452 
5453     KF_TRACE(10, (" hot_team = %p\n", team));
5454 
5455 #if KMP_DEBUG
5456     if (__kmp_tasking_mode != tskm_immediate_exec) {
5457       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5458                     "task_team[1] = %p after reinit\n",
5459                     team->t.t_task_team[0], team->t.t_task_team[1]));
5460     }
5461 #endif
5462 
5463 #if OMPT_SUPPORT
5464     __ompt_team_assign_id(team, ompt_parallel_data);
5465 #endif
5466 
5467     KMP_MB();
5468 
5469     return team;
5470   }
5471 
5472   /* next, let's try to take one from the team pool */
5473   KMP_MB();
5474   for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5475     /* TODO: consider resizing undersized teams instead of reaping them, now
5476        that we have a resizing mechanism */
5477     if (team->t.t_max_nproc >= max_nproc) {
5478       /* take this team from the team pool */
5479       __kmp_team_pool = team->t.t_next_pool;
5480 
5481       if (max_nproc > 1 &&
5482           __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5483         if (!team->t.b) { // Allocate barrier structure
5484           team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5485         }
5486       }
5487 
5488       /* setup the team for fresh use */
5489       __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5490 
5491       KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5492                     "task_team[1] %p to NULL\n",
5493                     &team->t.t_task_team[0], &team->t.t_task_team[1]));
5494       team->t.t_task_team[0] = NULL;
5495       team->t.t_task_team[1] = NULL;
5496 
5497       /* reallocate space for arguments if necessary */
5498       __kmp_alloc_argv_entries(argc, team, TRUE);
5499       KMP_CHECK_UPDATE(team->t.t_argc, argc);
5500 
5501       KA_TRACE(
5502           20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5503                team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5504       { // Initialize barrier data.
5505         int b;
5506         for (b = 0; b < bs_last_barrier; ++b) {
5507           team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5508 #if USE_DEBUGGER
5509           team->t.t_bar[b].b_master_arrived = 0;
5510           team->t.t_bar[b].b_team_arrived = 0;
5511 #endif
5512         }
5513       }
5514 
5515       team->t.t_proc_bind = new_proc_bind;
5516 
5517       KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5518                     team->t.t_id));
5519 
5520 #if OMPT_SUPPORT
5521       __ompt_team_assign_id(team, ompt_parallel_data);
5522 #endif
5523 
5524       KMP_MB();
5525 
5526       return team;
5527     }
5528 
5529     /* reap team if it is too small, then loop back and check the next one */
    // not sure if this is wise, but it will be redone during the hot-teams
    // rewrite.
5532     /* TODO: Use technique to find the right size hot-team, don't reap them */
5533     team = __kmp_reap_team(team);
5534     __kmp_team_pool = team;
5535   }
5536 
5537   /* nothing available in the pool, no matter, make a new team! */
5538   KMP_MB();
5539   team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5540 
5541   /* and set it up */
5542   team->t.t_max_nproc = max_nproc;
5543   if (max_nproc > 1 &&
5544       __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5545     // Allocate barrier structure
5546     team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5547   }
5548 
  /* NOTE well, for some reason allocating one big buffer and dividing it up
     seems to really hurt performance a lot on the P4, so let's not use this */
5551   __kmp_allocate_team_arrays(team, max_nproc);
5552 
5553   KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5554   __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5555 
5556   KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5557                 "%p to NULL\n",
5558                 &team->t.t_task_team[0], &team->t.t_task_team[1]));
  // These two stores could be removed: __kmp_allocate zeroes the memory, so
  // there is no need to duplicate that here.
  team->t.t_task_team[0] = NULL;
  team->t.t_task_team[1] = NULL;
5563 
5564   if (__kmp_storage_map) {
5565     __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5566   }
5567 
5568   /* allocate space for arguments */
5569   __kmp_alloc_argv_entries(argc, team, FALSE);
5570   team->t.t_argc = argc;
5571 
5572   KA_TRACE(20,
5573            ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5574             team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5575   { // Initialize barrier data.
5576     int b;
5577     for (b = 0; b < bs_last_barrier; ++b) {
5578       team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5579 #if USE_DEBUGGER
5580       team->t.t_bar[b].b_master_arrived = 0;
5581       team->t.t_bar[b].b_team_arrived = 0;
5582 #endif
5583     }
5584   }
5585 
5586   team->t.t_proc_bind = new_proc_bind;
5587 
5588 #if OMPT_SUPPORT
5589   __ompt_team_assign_id(team, ompt_parallel_data);
5590   team->t.ompt_serialized_team_info = NULL;
5591 #endif
5592 
5593   KMP_MB();
5594 
5595   KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5596                 team->t.t_id));
5597 
5598   return team;
5599 }
5600 
5601 /* TODO implement hot-teams at all levels */
5602 /* TODO implement lazy thread release on demand (disband request) */
5603 
5604 /* free the team.  return it to the team pool.  release all the threads
5605  * associated with it */
5606 void __kmp_free_team(kmp_root_t *root,
5607                      kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
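  // Hot teams (the root's r_hot_team and, with KMP_NESTED_HOT_TEAMS, nested
  // hot teams up to __kmp_hot_teams_max_level) keep their threads for reuse;
  // only the workers' CG root bookkeeping is cleaned up below when needed.
  // Any other team has its workers waited on until they are safe to reap,
  // its task team references dropped, its workers returned to the thread
  // pool, and the team itself placed on __kmp_team_pool.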
5608   int f;
5609   KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5610                 team->t.t_id));
5611 
5612   /* verify state */
5613   KMP_DEBUG_ASSERT(root);
5614   KMP_DEBUG_ASSERT(team);
5615   KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5616   KMP_DEBUG_ASSERT(team->t.t_threads);
5617 
5618   int use_hot_team = team == root->r.r_hot_team;
5619 #if KMP_NESTED_HOT_TEAMS
5620   int level;
5621   if (master) {
5622     level = team->t.t_active_level - 1;
5623     if (master->th.th_teams_microtask) { // in teams construct?
5624       if (master->th.th_teams_size.nteams > 1) {
5625         ++level; // level was not increased in teams construct for
5626         // team_of_masters
5627       }
5628       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5629           master->th.th_teams_level == team->t.t_level) {
5630         ++level; // level was not increased in teams construct for
5631         // team_of_workers before the parallel
5632       } // team->t.t_level will be increased inside parallel
5633     }
5634 #if KMP_DEBUG
5635     kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams;
5636 #endif
5637     if (level < __kmp_hot_teams_max_level) {
5638       KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5639       use_hot_team = 1;
5640     }
5641   }
5642 #endif // KMP_NESTED_HOT_TEAMS
5643 
5644   /* team is done working */
5645   TCW_SYNC_PTR(team->t.t_pkfn,
5646                NULL); // Important for Debugging Support Library.
5647 #if KMP_OS_WINDOWS
5648   team->t.t_copyin_counter = 0; // init counter for possible reuse
5649 #endif
5650   // Do not reset pointer to parent team to NULL for hot teams.
5651 
5652   /* if we are non-hot team, release our threads */
5653   if (!use_hot_team) {
5654     if (__kmp_tasking_mode != tskm_immediate_exec) {
5655       // Wait for threads to reach reapable state
5656       for (f = 1; f < team->t.t_nproc; ++f) {
5657         KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5658         kmp_info_t *th = team->t.t_threads[f];
5659         volatile kmp_uint32 *state = &th->th.th_reap_state;
5660         while (*state != KMP_SAFE_TO_REAP) {
5661 #if KMP_OS_WINDOWS
5662           // On Windows a thread can be killed at any time, check this
5663           DWORD ecode;
5664           if (!__kmp_is_thread_alive(th, &ecode)) {
5665             *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5666             break;
5667           }
5668 #endif
5669           // first check if thread is sleeping
5670           kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5671           if (fl.is_sleeping())
5672             fl.resume(__kmp_gtid_from_thread(th));
5673           KMP_CPU_PAUSE();
5674         }
5675       }
5676 
5677       // Delete task teams
5678       int tt_idx;
5679       for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5680         kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5681         if (task_team != NULL) {
5682           for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5683             KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5684             team->t.t_threads[f]->th.th_task_team = NULL;
5685           }
5686           KA_TRACE(
5687               20,
5688               ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5689                __kmp_get_gtid(), task_team, team->t.t_id));
5690 #if KMP_NESTED_HOT_TEAMS
5691           __kmp_free_task_team(master, task_team);
5692 #endif
5693           team->t.t_task_team[tt_idx] = NULL;
5694         }
5695       }
5696     }
5697 
5698     // Reset pointer to parent team only for non-hot teams.
5699     team->t.t_parent = NULL;
5700     team->t.t_level = 0;
5701     team->t.t_active_level = 0;
5702 
5703     /* free the worker threads */
5704     for (f = 1; f < team->t.t_nproc; ++f) {
5705       KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5706       if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5707         KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team),
5708                                     1, 2);
5709       }
5710       __kmp_free_thread(team->t.t_threads[f]);
5711     }
5712 
5713     if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5714       if (team->t.b) {
5715         // wake up thread at old location
5716         team->t.b->go_release();
5717         if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5718           for (f = 1; f < team->t.t_nproc; ++f) {
5719             if (team->t.b->sleep[f].sleep) {
5720               __kmp_atomic_resume_64(
5721                   team->t.t_threads[f]->th.th_info.ds.ds_gtid,
5722                   (kmp_atomic_flag_64<> *)NULL);
5723             }
5724           }
5725         }
5726         // Wait for threads to be removed from team
5727         for (int f = 1; f < team->t.t_nproc; ++f) {
5728           while (team->t.t_threads[f]->th.th_used_in_team.load() != 0)
5729             KMP_CPU_PAUSE();
5730         }
5731       }
5732     }
5733 
5734     for (f = 1; f < team->t.t_nproc; ++f) {
5735       team->t.t_threads[f] = NULL;
5736     }
5737 
5738     if (team->t.t_max_nproc > 1 &&
5739         __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5740       distributedBarrier::deallocate(team->t.b);
5741       team->t.b = NULL;
5742     }
5743     /* put the team back in the team pool */
5744     /* TODO limit size of team pool, call reap_team if pool too large */
5745     team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5746     __kmp_team_pool = (volatile kmp_team_t *)team;
5747   } else { // Check if team was created for primary threads in teams construct
5748     // See if first worker is a CG root
5749     KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5750                      team->t.t_threads[1]->th.th_cg_roots);
5751     if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5752       // Clean up the CG root nodes on workers so that this team can be re-used
5753       for (f = 1; f < team->t.t_nproc; ++f) {
5754         kmp_info_t *thr = team->t.t_threads[f];
5755         KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5756                          thr->th.th_cg_roots->cg_root == thr);
5757         // Pop current CG root off list
5758         kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5759         thr->th.th_cg_roots = tmp->up;
5760         KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5761                        " up to node %p. cg_nthreads was %d\n",
5762                        thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5763         int i = tmp->cg_nthreads--;
5764         if (i == 1) {
5765           __kmp_free(tmp); // free CG if we are the last thread in it
5766         }
5767         // Restore current task's thread_limit from CG root
5768         if (thr->th.th_cg_roots)
5769           thr->th.th_current_task->td_icvs.thread_limit =
5770               thr->th.th_cg_roots->cg_thread_limit;
5771       }
5772     }
5773   }
5774 
5775   KMP_MB();
5776 }
5777 
5778 /* reap the team.  destroy it, reclaim all its resources and free its memory */
5779 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5780   kmp_team_t *next_pool = team->t.t_next_pool;
5781 
5782   KMP_DEBUG_ASSERT(team);
5783   KMP_DEBUG_ASSERT(team->t.t_dispatch);
5784   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5785   KMP_DEBUG_ASSERT(team->t.t_threads);
5786   KMP_DEBUG_ASSERT(team->t.t_argv);
5787 
5788   /* TODO clean the threads that are a part of this? */
5789 
5790   /* free stuff */
5791   __kmp_free_team_arrays(team);
5792   if (team->t.t_argv != &team->t.t_inline_argv[0])
5793     __kmp_free((void *)team->t.t_argv);
5794   __kmp_free(team);
5795 
5796   KMP_MB();
5797   return next_pool;
5798 }
5799 
5800 // Free the thread.  Don't reap it, just place it on the pool of available
5801 // threads.
5802 //
5803 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5804 // binding for the affinity mechanism to be useful.
5805 //
5806 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5807 // However, we want to avoid a potential performance problem by always
5808 // scanning through the list to find the correct point at which to insert
5809 // the thread (potential N**2 behavior).  To do this we keep track of the
5810 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5811 // With single-level parallelism, threads will always be added to the tail
5812 // of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
5813 // parallelism, all bets are off and we may need to scan through the entire
5814 // free list.
5815 //
5816 // This change also has a potentially large performance benefit, for some
5817 // applications.  Previously, as threads were freed from the hot team, they
5818 // would be placed back on the free list in inverse order.  If the hot team
// grew back to its original size, then the freed thread would be placed
5820 // back on the hot team in reverse order.  This could cause bad cache
5821 // locality problems on programs where the size of the hot team regularly
5822 // grew and shrunk.
5823 //
5824 // Now, for single-level parallelism, the OMP tid is always == gtid.
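//
// An illustrative sketch of the invariant (gtids shown), assuming threads 3,
// 5 and 7 were freed in that order:
//
//   __kmp_thread_pool:           T#3 -> T#5 -> T#7 -> NULL
//   __kmp_thread_pool_insert_pt: T#7   (last insertion point)
//
// If T#6 is freed next, 6 < 7 resets the insert point, the scan restarts at
// the head, T#6 is linked between T#5 and T#7, and the insert point moves to
// T#6.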
5825 void __kmp_free_thread(kmp_info_t *this_th) {
5826   int gtid;
5827   kmp_info_t **scan;
5828 
5829   KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5830                 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5831 
5832   KMP_DEBUG_ASSERT(this_th);
5833 
  // When moving a thread to the pool, switch it to wait on its own b_go flag
  // and clear the team pointers in its barrier data (NULL team).
5836   int b;
5837   kmp_balign_t *balign = this_th->th.th_bar;
5838   for (b = 0; b < bs_last_barrier; ++b) {
5839     if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5840       balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5841     balign[b].bb.team = NULL;
5842     balign[b].bb.leaf_kids = 0;
5843   }
5844   this_th->th.th_task_state = 0;
5845   this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5846 
5847   /* put thread back on the free pool */
5848   TCW_PTR(this_th->th.th_team, NULL);
5849   TCW_PTR(this_th->th.th_root, NULL);
5850   TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5851 
5852   while (this_th->th.th_cg_roots) {
5853     this_th->th.th_cg_roots->cg_nthreads--;
5854     KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5855                    " %p of thread  %p to %d\n",
5856                    this_th, this_th->th.th_cg_roots,
5857                    this_th->th.th_cg_roots->cg_root,
5858                    this_th->th.th_cg_roots->cg_nthreads));
5859     kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5860     if (tmp->cg_root == this_th) { // Thread is a cg_root
5861       KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5862       KA_TRACE(
5863           5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5864       this_th->th.th_cg_roots = tmp->up;
5865       __kmp_free(tmp);
5866     } else { // Worker thread
5867       if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5868         __kmp_free(tmp);
5869       }
5870       this_th->th.th_cg_roots = NULL;
5871       break;
5872     }
5873   }
5874 
  /* If the implicit task assigned to this thread can be used by other threads,
   * multiple threads may share the data and try to free the task in
   * __kmp_reap_thread at exit. This duplicate use of the task data is more
   * likely when the hot team is disabled, but it can occur even when the hot
   * team is enabled. */
5880   __kmp_free_implicit_task(this_th);
5881   this_th->th.th_current_task = NULL;
5882 
5883   // If the __kmp_thread_pool_insert_pt is already past the new insert
5884   // point, then we need to re-scan the entire list.
5885   gtid = this_th->th.th_info.ds.ds_gtid;
5886   if (__kmp_thread_pool_insert_pt != NULL) {
5887     KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5888     if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5889       __kmp_thread_pool_insert_pt = NULL;
5890     }
5891   }
5892 
5893   // Scan down the list to find the place to insert the thread.
5894   // scan is the address of a link in the list, possibly the address of
5895   // __kmp_thread_pool itself.
5896   //
5897   // In the absence of nested parallelism, the for loop will have 0 iterations.
5898   if (__kmp_thread_pool_insert_pt != NULL) {
5899     scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5900   } else {
5901     scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5902   }
5903   for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5904        scan = &((*scan)->th.th_next_pool))
5905     ;
5906 
5907   // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5908   // to its address.
5909   TCW_PTR(this_th->th.th_next_pool, *scan);
5910   __kmp_thread_pool_insert_pt = *scan = this_th;
5911   KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5912                    (this_th->th.th_info.ds.ds_gtid <
5913                     this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5914   TCW_4(this_th->th.th_in_pool, TRUE);
5915   __kmp_suspend_initialize_thread(this_th);
5916   __kmp_lock_suspend_mx(this_th);
5917   if (this_th->th.th_active == TRUE) {
5918     KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5919     this_th->th.th_active_in_pool = TRUE;
5920   }
5921 #if KMP_DEBUG
5922   else {
5923     KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5924   }
5925 #endif
5926   __kmp_unlock_suspend_mx(this_th);
5927 
5928   TCW_4(__kmp_nth, __kmp_nth - 1);
5929 
5930 #ifdef KMP_ADJUST_BLOCKTIME
5931   /* Adjust blocktime back to user setting or default if necessary */
5932   /* Middle initialization might never have occurred                */
5933   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5934     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5935     if (__kmp_nth <= __kmp_avail_proc) {
5936       __kmp_zero_bt = FALSE;
5937     }
5938   }
5939 #endif /* KMP_ADJUST_BLOCKTIME */
5940 
5941   KMP_MB();
5942 }
5943 
5944 /* ------------------------------------------------------------------------ */
5945 
5946 void *__kmp_launch_thread(kmp_info_t *this_thr) {
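  // Worker thread main routine. In outline: wait at the fork barrier until a
  // parallel region hands this thread a team, run the team's microtask via
  // t_invoke, wait at the join barrier, and repeat until library shutdown
  // sets __kmp_global.g.g_done.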
5947 #if OMP_PROFILING_SUPPORT
5948   ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
5949   // TODO: add a configuration option for time granularity
5950   if (ProfileTraceFile)
5951     llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
5952 #endif
5953 
5954   int gtid = this_thr->th.th_info.ds.ds_gtid;
5955   /*    void                 *stack_data;*/
5956   kmp_team_t **volatile pteam;
5957 
5958   KMP_MB();
5959   KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5960 
5961   if (__kmp_env_consistency_check) {
5962     this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5963   }
5964 
5965 #if OMPD_SUPPORT
5966   if (ompd_state & OMPD_ENABLE_BP)
5967     ompd_bp_thread_begin();
5968 #endif
5969 
5970 #if OMPT_SUPPORT
5971   ompt_data_t *thread_data = nullptr;
5972   if (ompt_enabled.enabled) {
5973     thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5974     *thread_data = ompt_data_none;
5975 
5976     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5977     this_thr->th.ompt_thread_info.wait_id = 0;
5978     this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5979     this_thr->th.ompt_thread_info.parallel_flags = 0;
5980     if (ompt_enabled.ompt_callback_thread_begin) {
5981       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5982           ompt_thread_worker, thread_data);
5983     }
5984     this_thr->th.ompt_thread_info.state = ompt_state_idle;
5985   }
5986 #endif
5987 
5988   /* This is the place where threads wait for work */
5989   while (!TCR_4(__kmp_global.g.g_done)) {
5990     KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5991     KMP_MB();
5992 
5993     /* wait for work to do */
5994     KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5995 
5996     /* No tid yet since not part of a team */
5997     __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5998 
5999 #if OMPT_SUPPORT
6000     if (ompt_enabled.enabled) {
6001       this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6002     }
6003 #endif
6004 
6005     pteam = &this_thr->th.th_team;
6006 
6007     /* have we been allocated? */
6008     if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
6009       /* we were just woken up, so run our new task */
6010       if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
6011         int rc;
6012         KA_TRACE(20,
6013                  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
6014                   gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6015                   (*pteam)->t.t_pkfn));
6016 
6017         updateHWFPControl(*pteam);
6018 
6019 #if OMPT_SUPPORT
6020         if (ompt_enabled.enabled) {
6021           this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
6022         }
6023 #endif
6024 
6025         rc = (*pteam)->t.t_invoke(gtid);
6026         KMP_ASSERT(rc);
6027 
6028         KMP_MB();
6029         KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
6030                       gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6031                       (*pteam)->t.t_pkfn));
6032       }
6033 #if OMPT_SUPPORT
6034       if (ompt_enabled.enabled) {
6035         /* no frame set while outside task */
6036         __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
6037 
6038         this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6039       }
6040 #endif
6041       /* join barrier after parallel region */
6042       __kmp_join_barrier(gtid);
6043     }
6044   }
6045   TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
6046 
6047 #if OMPD_SUPPORT
6048   if (ompd_state & OMPD_ENABLE_BP)
6049     ompd_bp_thread_end();
6050 #endif
6051 
6052 #if OMPT_SUPPORT
6053   if (ompt_enabled.ompt_callback_thread_end) {
6054     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
6055   }
6056 #endif
6057 
6058   this_thr->th.th_task_team = NULL;
6059   /* run the destructors for the threadprivate data for this thread */
6060   __kmp_common_destroy_gtid(gtid);
6061 
6062   KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
6063   KMP_MB();
6064 
6065 #if OMP_PROFILING_SUPPORT
6066   llvm::timeTraceProfilerFinishThread();
6067 #endif
6068   return this_thr;
6069 }
6070 
6071 /* ------------------------------------------------------------------------ */
6072 
6073 void __kmp_internal_end_dest(void *specific_gtid) {
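  // Destructor for the thread-specific gtid data: it receives the raw value
  // stored in thread-local storage (gtid+1, see the note below) when a thread
  // that registered a gtid exits.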
6074   // Make sure no significant bits are lost
6075   int gtid;
6076   __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
6077 
6078   KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
  /* NOTE: the gtid is stored as gtid+1 in the thread-local-storage
6080    * this is because 0 is reserved for the nothing-stored case */
6081 
6082   __kmp_internal_end_thread(gtid);
6083 }
6084 
6085 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
6086 
6087 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
6088   __kmp_internal_end_atexit();
6089 }
6090 
6091 #endif
6092 
6093 /* [Windows] josh: when the atexit handler is called, there may still be more
6094    than one thread alive */
6095 void __kmp_internal_end_atexit(void) {
6096   KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
6097   /* [Windows]
6098      josh: ideally, we want to completely shutdown the library in this atexit
6099      handler, but stat code that depends on thread specific data for gtid fails
6100      because that data becomes unavailable at some point during the shutdown, so
6101      we call __kmp_internal_end_thread instead. We should eventually remove the
6102      dependency on __kmp_get_specific_gtid in the stat code and use
6103      __kmp_internal_end_library to cleanly shutdown the library.
6104 
6105      // TODO: Can some of this comment about GVS be removed?
6106      I suspect that the offending stat code is executed when the calling thread
6107      tries to clean up a dead root thread's data structures, resulting in GVS
6108      code trying to close the GVS structures for that thread, but since the stat
6109      code uses __kmp_get_specific_gtid to get the gtid with the assumption that
     the calling thread is cleaning up itself instead of another thread, it
     gets confused. This happens because allowing a thread to unregister and
     clean up another thread is a recent modification for addressing an issue.
6113      Based on the current design (20050722), a thread may end up
6114      trying to unregister another thread only if thread death does not trigger
6115      the calling of __kmp_internal_end_thread.  For Linux* OS, there is the
6116      thread specific data destructor function to detect thread death. For
6117      Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
6118      is nothing.  Thus, the workaround is applicable only for Windows static
6119      stat library. */
6120   __kmp_internal_end_library(-1);
6121 #if KMP_OS_WINDOWS
6122   __kmp_close_console();
6123 #endif
6124 }
6125 
6126 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
6127   // It is assumed __kmp_forkjoin_lock is acquired.
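  // In outline: for non-root threads, wake the thread from the fork barrier if
  // needed and join the underlying OS thread via __kmp_reap_worker; then free
  // the per-thread resources (implicit task, fast memory, suspend state, cons
  // stack, th_pri_common, task-state memo stack, affinity mask, hierarchical
  // barrier data, serial team) and finally the kmp_info_t itself.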
6128 
6129   int gtid;
6130 
6131   KMP_DEBUG_ASSERT(thread != NULL);
6132 
6133   gtid = thread->th.th_info.ds.ds_gtid;
6134 
6135   if (!is_root) {
6136     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
6137       /* Assume the threads are at the fork barrier here */
6138       KA_TRACE(
6139           20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
6140                gtid));
6141       if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
6142         while (
6143             !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3))
6144           KMP_CPU_PAUSE();
6145         __kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL);
6146       } else {
6147         /* Need release fence here to prevent seg faults for tree forkjoin
6148            barrier (GEH) */
6149         kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
6150                            thread);
6151         __kmp_release_64(&flag);
6152       }
6153     }
6154 
6155     // Terminate OS thread.
6156     __kmp_reap_worker(thread);
6157 
6158     // The thread was killed asynchronously.  If it was actively
6159     // spinning in the thread pool, decrement the global count.
6160     //
6161     // There is a small timing hole here - if the worker thread was just waking
    // up after sleeping in the pool, had reset its th_active_in_pool flag but
6163     // not decremented the global counter __kmp_thread_pool_active_nth yet, then
6164     // the global counter might not get updated.
6165     //
6166     // Currently, this can only happen as the library is unloaded,
6167     // so there are no harmful side effects.
6168     if (thread->th.th_active_in_pool) {
6169       thread->th.th_active_in_pool = FALSE;
6170       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
6171       KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
6172     }
6173   }
6174 
6175   __kmp_free_implicit_task(thread);
6176 
6177 // Free the fast memory for tasking
6178 #if USE_FAST_MEMORY
6179   __kmp_free_fast_memory(thread);
6180 #endif /* USE_FAST_MEMORY */
6181 
6182   __kmp_suspend_uninitialize_thread(thread);
6183 
6184   KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
6185   TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
6186 
6187   --__kmp_all_nth;
6188   // __kmp_nth was decremented when thread is added to the pool.
6189 
6190 #ifdef KMP_ADJUST_BLOCKTIME
6191   /* Adjust blocktime back to user setting or default if necessary */
6192   /* Middle initialization might never have occurred                */
6193   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6194     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6195     if (__kmp_nth <= __kmp_avail_proc) {
6196       __kmp_zero_bt = FALSE;
6197     }
6198   }
6199 #endif /* KMP_ADJUST_BLOCKTIME */
6200 
6201   /* free the memory being used */
6202   if (__kmp_env_consistency_check) {
6203     if (thread->th.th_cons) {
6204       __kmp_free_cons_stack(thread->th.th_cons);
6205       thread->th.th_cons = NULL;
6206     }
6207   }
6208 
6209   if (thread->th.th_pri_common != NULL) {
6210     __kmp_free(thread->th.th_pri_common);
6211     thread->th.th_pri_common = NULL;
6212   }
6213 
6214   if (thread->th.th_task_state_memo_stack != NULL) {
6215     __kmp_free(thread->th.th_task_state_memo_stack);
6216     thread->th.th_task_state_memo_stack = NULL;
6217   }
6218 
6219 #if KMP_USE_BGET
6220   if (thread->th.th_local.bget_data != NULL) {
6221     __kmp_finalize_bget(thread);
6222   }
6223 #endif
6224 
6225 #if KMP_AFFINITY_SUPPORTED
6226   if (thread->th.th_affin_mask != NULL) {
6227     KMP_CPU_FREE(thread->th.th_affin_mask);
6228     thread->th.th_affin_mask = NULL;
6229   }
6230 #endif /* KMP_AFFINITY_SUPPORTED */
6231 
6232 #if KMP_USE_HIER_SCHED
6233   if (thread->th.th_hier_bar_data != NULL) {
6234     __kmp_free(thread->th.th_hier_bar_data);
6235     thread->th.th_hier_bar_data = NULL;
6236   }
6237 #endif
6238 
6239   __kmp_reap_team(thread->th.th_serial_team);
6240   thread->th.th_serial_team = NULL;
6241   __kmp_free(thread);
6242 
6243   KMP_MB();
6244 
6245 } // __kmp_reap_thread
6246 
6247 static void __kmp_itthash_clean(kmp_info_t *th) {
6248 #if USE_ITT_NOTIFY
6249   if (__kmp_itt_region_domains.count > 0) {
6250     for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6251       kmp_itthash_entry_t *bucket = __kmp_itt_region_domains.buckets[i];
6252       while (bucket) {
6253         kmp_itthash_entry_t *next = bucket->next_in_bucket;
6254         __kmp_thread_free(th, bucket);
6255         bucket = next;
6256       }
6257     }
6258   }
6259   if (__kmp_itt_barrier_domains.count > 0) {
6260     for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6261       kmp_itthash_entry_t *bucket = __kmp_itt_barrier_domains.buckets[i];
6262       while (bucket) {
6263         kmp_itthash_entry_t *next = bucket->next_in_bucket;
6264         __kmp_thread_free(th, bucket);
6265         bucket = next;
6266       }
6267     }
6268   }
6269 #endif
6270 }
6271 
6272 static void __kmp_internal_end(void) {
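  // Final teardown. Unregister the library and mark the runtime as done; if no
  // root is still active, reap all pooled worker threads and teams as well as
  // the task teams (the monitor thread, when used, is reaped in either case),
  // then run __kmp_cleanup().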
6273   int i;
6274 
6275   /* First, unregister the library */
6276   __kmp_unregister_library();
6277 
6278 #if KMP_OS_WINDOWS
6279   /* In Win static library, we can't tell when a root actually dies, so we
6280      reclaim the data structures for any root threads that have died but not
6281      unregistered themselves, in order to shut down cleanly.
6282      In Win dynamic library we also can't tell when a thread dies.  */
6283   __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6284 // dead roots
6285 #endif
6286 
6287   for (i = 0; i < __kmp_threads_capacity; i++)
6288     if (__kmp_root[i])
6289       if (__kmp_root[i]->r.r_active)
6290         break;
6291   KMP_MB(); /* Flush all pending memory write invalidates.  */
6292   TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6293 
6294   if (i < __kmp_threads_capacity) {
6295 #if KMP_USE_MONITOR
6296     // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6297     KMP_MB(); /* Flush all pending memory write invalidates.  */
6298 
6299     // Need to check that monitor was initialized before reaping it. If we are
    // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6301     // __kmp_monitor will appear to contain valid data, but it is only valid in
6302     // the parent process, not the child.
6303     // New behavior (201008): instead of keying off of the flag
6304     // __kmp_init_parallel, the monitor thread creation is keyed off
6305     // of the new flag __kmp_init_monitor.
6306     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6307     if (TCR_4(__kmp_init_monitor)) {
6308       __kmp_reap_monitor(&__kmp_monitor);
6309       TCW_4(__kmp_init_monitor, 0);
6310     }
6311     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6312     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6313 #endif // KMP_USE_MONITOR
6314   } else {
6315 /* TODO move this to cleanup code */
6316 #ifdef KMP_DEBUG
6317     /* make sure that everything has properly ended */
6318     for (i = 0; i < __kmp_threads_capacity; i++) {
6319       if (__kmp_root[i]) {
        // KMP_ASSERT(!KMP_UBER_GTID(i));
        // AC: there can be uber threads alive here
6322         KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6323       }
6324     }
6325 #endif
6326 
6327     KMP_MB();
6328 
6329     // Reap the worker threads.
6330     // This is valid for now, but be careful if threads are reaped sooner.
    while (__kmp_thread_pool != NULL) { // Loop through the thread pool.
6332       // Get the next thread from the pool.
6333       kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6334       __kmp_thread_pool = thread->th.th_next_pool;
6335       // Reap it.
6336       KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6337       thread->th.th_next_pool = NULL;
6338       thread->th.th_in_pool = FALSE;
6339       __kmp_reap_thread(thread, 0);
6340     }
6341     __kmp_thread_pool_insert_pt = NULL;
6342 
6343     // Reap teams.
    while (__kmp_team_pool != NULL) { // Loop through the team pool.
6345       // Get the next team from the pool.
6346       kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6347       __kmp_team_pool = team->t.t_next_pool;
6348       // Reap it.
6349       team->t.t_next_pool = NULL;
6350       __kmp_reap_team(team);
6351     }
6352 
6353     __kmp_reap_task_teams();
6354 
6355 #if KMP_OS_UNIX
6356     // Threads that are not reaped should not access any resources since they
6357     // are going to be deallocated soon, so the shutdown sequence should wait
6358     // until all threads either exit the final spin-waiting loop or begin
6359     // sleeping after the given blocktime.
6360     for (i = 0; i < __kmp_threads_capacity; i++) {
6361       kmp_info_t *thr = __kmp_threads[i];
6362       while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6363         KMP_CPU_PAUSE();
6364     }
6365 #endif
6366 
6367     for (i = 0; i < __kmp_threads_capacity; ++i) {
6368       // TBD: Add some checking...
6369       // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6370     }
6371 
6372     /* Make sure all threadprivate destructors get run by joining with all
6373        worker threads before resetting this flag */
6374     TCW_SYNC_4(__kmp_init_common, FALSE);
6375 
6376     KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6377     KMP_MB();
6378 
6379 #if KMP_USE_MONITOR
6380     // See note above: One of the possible fixes for CQ138434 / CQ140126
6381     //
6382     // FIXME: push both code fragments down and CSE them?
6383     // push them into __kmp_cleanup() ?
6384     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6385     if (TCR_4(__kmp_init_monitor)) {
6386       __kmp_reap_monitor(&__kmp_monitor);
6387       TCW_4(__kmp_init_monitor, 0);
6388     }
6389     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6390     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6391 #endif
  } /* else: no active roots were found */
6393   TCW_4(__kmp_init_gtid, FALSE);
6394   KMP_MB(); /* Flush all pending memory write invalidates.  */
6395 
6396   __kmp_cleanup();
6397 #if OMPT_SUPPORT
6398   ompt_fini();
6399 #endif
6400 }
6401 
6402 void __kmp_internal_end_library(int gtid_req) {
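  // Shutdown entry point used when the whole library is going away, e.g. from
  // the atexit handler or the shared-library destructor. gtid_req < 0 means
  // "determine the calling thread's gtid here".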
6403   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6404   /* this shouldn't be a race condition because __kmp_internal_end() is the
     only place to clear __kmp_init_serial */
6406   /* we'll check this later too, after we get the lock */
6407   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6408   // redundant, because the next check will work in any case.
6409   if (__kmp_global.g.g_abort) {
6410     KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6411     /* TODO abort? */
6412     return;
6413   }
6414   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6415     KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6416     return;
6417   }
6418 
6419   // If hidden helper team has been initialized, we need to deinit it
6420   if (TCR_4(__kmp_init_hidden_helper) &&
6421       !TCR_4(__kmp_hidden_helper_team_done)) {
6422     TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6423     // First release the main thread to let it continue its work
6424     __kmp_hidden_helper_main_thread_release();
6425     // Wait until the hidden helper team has been destroyed
6426     __kmp_hidden_helper_threads_deinitz_wait();
6427   }
6428 
6429   KMP_MB(); /* Flush all pending memory write invalidates.  */
6430   /* find out who we are and what we should do */
6431   {
6432     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6433     KA_TRACE(
6434         10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req));
6435     if (gtid == KMP_GTID_SHUTDOWN) {
6436       KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6437                     "already shutdown\n"));
6438       return;
6439     } else if (gtid == KMP_GTID_MONITOR) {
6440       KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6441                     "registered, or system shutdown\n"));
6442       return;
6443     } else if (gtid == KMP_GTID_DNE) {
6444       KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6445                     "shutdown\n"));
6446       /* we don't know who we are, but we may still shutdown the library */
6447     } else if (KMP_UBER_GTID(gtid)) {
6448       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6449       if (__kmp_root[gtid]->r.r_active) {
6450         __kmp_global.g.g_abort = -1;
6451         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6452         __kmp_unregister_library();
6453         KA_TRACE(10,
6454                  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6455                   gtid));
6456         return;
6457       } else {
6458         __kmp_itthash_clean(__kmp_threads[gtid]);
6459         KA_TRACE(
6460             10,
6461             ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6462         __kmp_unregister_root_current_thread(gtid);
6463       }
6464     } else {
6465 /* worker threads may call this function through the atexit handler, if they
6466  * call exit() */
6467 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6468    TODO: do a thorough shutdown instead */
6469 #ifdef DUMP_DEBUG_ON_EXIT
6470       if (__kmp_debug_buf)
6471         __kmp_dump_debug_buffer();
6472 #endif
      // The unregister-library call was added here for the Linux shared-memory
      // (shm) registration scheme; without it, lots of files would be left in
      // /dev/shm. Clean up the shared memory file before exiting.
6476       __kmp_unregister_library();
6477       return;
6478     }
6479   }
6480   /* synchronize the termination process */
6481   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6482 
6483   /* have we already finished */
6484   if (__kmp_global.g.g_abort) {
6485     KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6486     /* TODO abort? */
6487     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6488     return;
6489   }
6490   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6491     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6492     return;
6493   }
6494 
6495   /* We need this lock to enforce mutex between this reading of
6496      __kmp_threads_capacity and the writing by __kmp_register_root.
6497      Alternatively, we can use a counter of roots that is atomically updated by
6498      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6499      __kmp_internal_end_*.  */
6500   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6501 
6502   /* now we can safely conduct the actual termination */
6503   __kmp_internal_end();
6504 
6505   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6506   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6507 
6508   KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6509 
6510 #ifdef DUMP_DEBUG_ON_EXIT
6511   if (__kmp_debug_buf)
6512     __kmp_dump_debug_buffer();
6513 #endif
6514 
6515 #if KMP_OS_WINDOWS
6516   __kmp_close_console();
6517 #endif
6518 
6519   __kmp_fini_allocator();
6520 
6521 } // __kmp_internal_end_library
6522 
6523 void __kmp_internal_end_thread(int gtid_req) {
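  // Shutdown entry point used when a single thread exits, e.g. via the TLS
  // destructor path (__kmp_internal_end_dest). The runtime itself is torn
  // down only once no other uber (root) threads remain registered.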
6524   int i;
6525 
6526   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6527   /* this shouldn't be a race condition because __kmp_internal_end() is the
   * only place to clear __kmp_init_serial */
6529   /* we'll check this later too, after we get the lock */
6530   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6531   // redundant, because the next check will work in any case.
6532   if (__kmp_global.g.g_abort) {
6533     KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6534     /* TODO abort? */
6535     return;
6536   }
6537   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6538     KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6539     return;
6540   }
6541 
6542   // If hidden helper team has been initialized, we need to deinit it
6543   if (TCR_4(__kmp_init_hidden_helper) &&
6544       !TCR_4(__kmp_hidden_helper_team_done)) {
6545     TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6546     // First release the main thread to let it continue its work
6547     __kmp_hidden_helper_main_thread_release();
6548     // Wait until the hidden helper team has been destroyed
6549     __kmp_hidden_helper_threads_deinitz_wait();
6550   }
6551 
6552   KMP_MB(); /* Flush all pending memory write invalidates.  */
6553 
6554   /* find out who we are and what we should do */
6555   {
6556     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6557     KA_TRACE(10,
6558              ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req));
6559     if (gtid == KMP_GTID_SHUTDOWN) {
6560       KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6561                     "already shutdown\n"));
6562       return;
6563     } else if (gtid == KMP_GTID_MONITOR) {
6564       KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6565                     "registered, or system shutdown\n"));
6566       return;
6567     } else if (gtid == KMP_GTID_DNE) {
6568       KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6569                     "shutdown\n"));
6570       return;
6571       /* we don't know who we are */
6572     } else if (KMP_UBER_GTID(gtid)) {
6573       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6574       if (__kmp_root[gtid]->r.r_active) {
6575         __kmp_global.g.g_abort = -1;
6576         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6577         KA_TRACE(10,
6578                  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6579                   gtid));
6580         return;
6581       } else {
6582         KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6583                       gtid));
6584         __kmp_unregister_root_current_thread(gtid);
6585       }
6586     } else {
6587       /* just a worker thread, let's leave */
6588       KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6589 
6590       if (gtid >= 0) {
6591         __kmp_threads[gtid]->th.th_task_team = NULL;
6592       }
6593 
6594       KA_TRACE(10,
6595                ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6596                 gtid));
6597       return;
6598     }
6599   }
6600 #if KMP_DYNAMIC_LIB
6601   if (__kmp_pause_status != kmp_hard_paused)
  // AC: let's not shut down the dynamic library at the exit of an uber
  // thread; it is better to shut down later, in the library destructor.
6604   {
6605     KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6606     return;
6607   }
6608 #endif
6609   /* synchronize the termination process */
6610   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6611 
6612   /* have we already finished */
6613   if (__kmp_global.g.g_abort) {
6614     KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6615     /* TODO abort? */
6616     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6617     return;
6618   }
6619   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6620     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6621     return;
6622   }
6623 
6624   /* We need this lock to enforce mutex between this reading of
6625      __kmp_threads_capacity and the writing by __kmp_register_root.
6626      Alternatively, we can use a counter of roots that is atomically updated by
6627      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6628      __kmp_internal_end_*.  */
6629 
6630   /* should we finish the run-time?  are all siblings done? */
6631   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6632 
6633   for (i = 0; i < __kmp_threads_capacity; ++i) {
6634     if (KMP_UBER_GTID(i)) {
6635       KA_TRACE(
6636           10,
6637           ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6638       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6639       __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6640       return;
6641     }
6642   }
6643 
6644   /* now we can safely conduct the actual termination */
6645 
6646   __kmp_internal_end();
6647 
6648   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6649   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6650 
6651   KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6652 
6653 #ifdef DUMP_DEBUG_ON_EXIT
6654   if (__kmp_debug_buf)
6655     __kmp_dump_debug_buffer();
6656 #endif
6657 } // __kmp_internal_end_thread
6658 
6659 // -----------------------------------------------------------------------------
6660 // Library registration stuff.
6661 
6662 static long __kmp_registration_flag = 0;
6663 // Random value used to indicate library initialization.
6664 static char *__kmp_registration_str = NULL;
6665 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6666 
6667 static inline char *__kmp_reg_status_name() {
6668 /* On RHEL 3u5 if linked statically, getpid() returns different values in
6669    each thread. If registration and unregistration go in different threads
6670    (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env
   env var cannot be found, because the name will contain a different pid. */
6672 // macOS* complains about name being too long with additional getuid()
6673 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6674   return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6675                           (int)getuid());
6676 #else
6677   return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6678 #endif
} // __kmp_reg_status_name
6680 
6681 void __kmp_register_library_startup(void) {
6682 
6683   char *name = __kmp_reg_status_name(); // Name of the environment variable.
6684   int done = 0;
6685   union {
6686     double dtime;
6687     long ltime;
6688   } time;
6689 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6690   __kmp_initialize_system_tick();
6691 #endif
6692   __kmp_read_system_time(&time.dtime);
6693   __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6694   __kmp_registration_str =
6695       __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6696                        __kmp_registration_flag, KMP_LIBRARY_FILE);
6697 
6698   KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6699                 __kmp_registration_str));
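
  // The registration value has the form "<flag address>-<flag value>-<library
  // file>", e.g. "0x7f2a5c0011c0-cafe1a2b-libomp.so" (hypothetical values). A
  // second copy of the runtime that finds this value can later check whether
  // the address is still mapped and still holds the flag value to decide
  // whether this copy is still alive (see the neighbor check below).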
6700 
6701   while (!done) {
6702 
6703     char *value = NULL; // Actual value of the environment variable.
6704 
6705 #if defined(KMP_USE_SHM)
6706     char *shm_name = __kmp_str_format("/%s", name);
6707     int shm_preexist = 0;
6708     char *data1;
6709     int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6710     if ((fd1 == -1) && (errno == EEXIST)) {
6711       // file didn't open because it already exists.
6712       // try opening existing file
6713       fd1 = shm_open(shm_name, O_RDWR, 0666);
6714       if (fd1 == -1) { // file didn't open
6715         // error out here
6716         __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0),
6717                     __kmp_msg_null);
6718       } else {
6719         // able to open existing file
6720         shm_preexist = 1;
6721       }
    } else if (fd1 == -1) {
      // SHM didn't open due to an error other than EEXIST; error out here.
6725       __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno),
6726                   __kmp_msg_null);
6727     }
6728     if (shm_preexist == 0) {
      // we created the SHM; now set its size
6730       if (ftruncate(fd1, SHM_SIZE) == -1) {
        // error occurred setting size
6732         __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"),
6733                     KMP_ERR(errno), __kmp_msg_null);
6734       }
6735     }
6736     data1 =
6737         (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
6738     if (data1 == MAP_FAILED) {
6739       // failed to map shared memory
6740       __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno),
6741                   __kmp_msg_null);
6742     }
6743     if (shm_preexist == 0) { // set data to SHM, set value
6744       KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6745     }
6746     // Read value from either what we just wrote or existing file.
6747     value = __kmp_str_format("%s", data1); // read value from SHM
6748     munmap(data1, SHM_SIZE);
6749     close(fd1);
6750 #else // Windows and unix with static library
    // Set environment variable, but do not overwrite if it already exists.
6752     __kmp_env_set(name, __kmp_registration_str, 0);
6753     // read value to see if it got set
6754     value = __kmp_env_get(name);
6755 #endif
6756 
6757     if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6758       done = 1; // Ok, environment variable set successfully, exit the loop.
6759     } else {
6760       // Oops. Write failed. Another copy of OpenMP RTL is in memory.
      // Check whether it is alive or dead.
6762       int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6763       char *tail = value;
6764       char *flag_addr_str = NULL;
6765       char *flag_val_str = NULL;
6766       char const *file_name = NULL;
6767       __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6768       __kmp_str_split(tail, '-', &flag_val_str, &tail);
6769       file_name = tail;
6770       if (tail != NULL) {
6771         unsigned long *flag_addr = 0;
6772         unsigned long flag_val = 0;
6773         KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6774         KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6775         if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6776           // First, check whether environment-encoded address is mapped into
6777           // addr space.
6778           // If so, dereference it to see if it still has the right value.
6779           if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6780             neighbor = 1;
6781           } else {
6782             // If not, then we know the other copy of the library is no longer
6783             // running.
6784             neighbor = 2;
6785           }
6786         }
6787       }
6788       switch (neighbor) {
6789       case 0: // Cannot parse environment variable -- neighbor status unknown.
        // Assume it is an incompatible format from a future version of the
        // library. Assume the other library is alive.
6792         // WARN( ... ); // TODO: Issue a warning.
6793         file_name = "unknown library";
6794         KMP_FALLTHROUGH();
      // Attention! Falling through to the next case is intentional.
6796       case 1: { // Neighbor is alive.
6797         // Check it is allowed.
6798         char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6799         if (!__kmp_str_match_true(duplicate_ok)) {
6800           // That's not allowed. Issue fatal error.
6801           __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6802                       KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6803         }
6804         KMP_INTERNAL_FREE(duplicate_ok);
6805         __kmp_duplicate_library_ok = 1;
6806         done = 1; // Exit the loop.
6807       } break;
6808       case 2: { // Neighbor is dead.
6809 
6810 #if defined(KMP_USE_SHM)
        // Remove the stale shared memory object so that registration can be
        // retried.
        shm_unlink(shm_name); // this removes the file in /dev/shm
6813 #else
6814         // Clear the variable and try to register library again.
6815         __kmp_env_unset(name);
6816 #endif
6817       } break;
6818       default: {
6819         KMP_DEBUG_ASSERT(0);
6820       } break;
6821       }
6822     }
6823     KMP_INTERNAL_FREE((void *)value);
6824 #if defined(KMP_USE_SHM)
6825     KMP_INTERNAL_FREE((void *)shm_name);
6826 #endif
6827   } // while
6828   KMP_INTERNAL_FREE((void *)name);
6829 
6830 } // func __kmp_register_library_startup
6831 
6832 void __kmp_unregister_library(void) {
6833 
6834   char *name = __kmp_reg_status_name();
6835   char *value = NULL;
6836 
6837 #if defined(KMP_USE_SHM)
6838   char *shm_name = __kmp_str_format("/%s", name);
6839   int fd1 = shm_open(shm_name, O_RDONLY, 0666);
6840   if (fd1 == -1) {
6841     // file did not open. return.
6842     return;
6843   }
6844   char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6845   if (data1 != MAP_FAILED) {
6846     value = __kmp_str_format("%s", data1); // read value from SHM
6847     munmap(data1, SHM_SIZE);
6848   }
6849   close(fd1);
6850 #else
6851   value = __kmp_env_get(name);
6852 #endif
6853 
6854   KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6855   KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6856   if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6857 //  Ok, this is our variable. Delete it.
6858 #if defined(KMP_USE_SHM)
    shm_unlink(shm_name); // this removes the file in /dev/shm
6860 #else
6861     __kmp_env_unset(name);
6862 #endif
6863   }
6864 
6865 #if defined(KMP_USE_SHM)
6866   KMP_INTERNAL_FREE(shm_name);
6867 #endif
6868 
6869   KMP_INTERNAL_FREE(__kmp_registration_str);
6870   KMP_INTERNAL_FREE(value);
6871   KMP_INTERNAL_FREE(name);
6872 
6873   __kmp_registration_flag = 0;
6874   __kmp_registration_str = NULL;
6875 
6876 } // __kmp_unregister_library
6877 
6878 // End of Library registration stuff.
6879 // -----------------------------------------------------------------------------
6880 
6881 #if KMP_MIC_SUPPORTED
6882 
6883 static void __kmp_check_mic_type() {
6884   kmp_cpuid_t cpuid_state = {0};
6885   kmp_cpuid_t *cs_p = &cpuid_state;
6886   __kmp_x86_cpuid(1, 0, cs_p);
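  // CPUID leaf 1 reports the family/model in EAX; the masks below distinguish
  // KNC (mic2) from KNL (mic3) parts.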
6887   // We don't support mic1 at the moment
6888   if ((cs_p->eax & 0xff0) == 0xB10) {
6889     __kmp_mic_type = mic2;
6890   } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6891     __kmp_mic_type = mic3;
6892   } else {
6893     __kmp_mic_type = non_mic;
6894   }
6895 }
6896 
6897 #endif /* KMP_MIC_SUPPORTED */
6898 
6899 #if KMP_HAVE_UMWAIT
6900 static void __kmp_user_level_mwait_init() {
6901   struct kmp_cpuid buf;
6902   __kmp_x86_cpuid(7, 0, &buf);
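  // CPUID.(EAX=7,ECX=0):ECX bit 5 reports WAITPKG (umwait/umonitor/tpause).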
6903   __kmp_waitpkg_enabled = ((buf.ecx >> 5) & 1);
6904   __kmp_umwait_enabled = __kmp_waitpkg_enabled && __kmp_user_level_mwait;
6905   __kmp_tpause_enabled = __kmp_waitpkg_enabled && (__kmp_tpause_state > 0);
6906   KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
6907                 __kmp_umwait_enabled));
6908 }
6909 #elif KMP_HAVE_MWAIT
6910 #ifndef AT_INTELPHIUSERMWAIT
// Spurious, non-existent value that should always fail to return anything.
// It will be replaced with the correct value once that value is known.
6913 #define AT_INTELPHIUSERMWAIT 10000
6914 #endif
// The getauxval() function is available in RHEL7 and SLES12. If a system with
// an earlier OS is used to build the RTL, we fall back to the following weak
// internal function when the entry is not found.
6918 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
6919 unsigned long getauxval(unsigned long) { return 0; }
6920 
6921 static void __kmp_user_level_mwait_init() {
6922   // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available
6923   // use them to find if the user-level mwait is enabled. Otherwise, forcibly
6924   // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable
6925   // KMP_USER_LEVEL_MWAIT was set to TRUE.
6926   if (__kmp_mic_type == mic3) {
6927     unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
6928     if ((res & 0x1) || __kmp_user_level_mwait) {
6929       __kmp_mwait_enabled = TRUE;
6930       if (__kmp_user_level_mwait) {
6931         KMP_INFORM(EnvMwaitWarn);
6932       }
6933     } else {
6934       __kmp_mwait_enabled = FALSE;
6935     }
6936   }
6937   KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
6938                 "__kmp_mwait_enabled = %d\n",
6939                 __kmp_mic_type, __kmp_mwait_enabled));
6940 }
6941 #endif /* KMP_HAVE_UMWAIT */
6942 
6943 static void __kmp_do_serial_initialize(void) {
6944   int i, gtid;
6945   size_t size;
6946 
6947   KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6948 
6949   KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6950   KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6951   KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6952   KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6953   KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6954 
6955 #if OMPT_SUPPORT
6956   ompt_pre_init();
6957 #endif
6958 #if OMPD_SUPPORT
6959   __kmp_env_dump();
6960   ompd_init();
6961 #endif
6962 
6963   __kmp_validate_locks();
6964 
6965   /* Initialize internal memory allocator */
6966   __kmp_init_allocator();
6967 
  /* Register the library startup via an environment variable or via a mapped
     shared memory file and check whether another copy of the library is
     already registered. Since a forked child process is often terminated, we
     postpone the registration until middle initialization in the child. */
6972   if (__kmp_need_register_serial)
6973     __kmp_register_library_startup();
6974 
6975   /* TODO reinitialization of library */
6976   if (TCR_4(__kmp_global.g.g_done)) {
6977     KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6978   }
6979 
6980   __kmp_global.g.g_abort = 0;
6981   TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6982 
6983 /* initialize the locks */
6984 #if KMP_USE_ADAPTIVE_LOCKS
6985 #if KMP_DEBUG_ADAPTIVE_LOCKS
6986   __kmp_init_speculative_stats();
6987 #endif
6988 #endif
6989 #if KMP_STATS_ENABLED
6990   __kmp_stats_init();
6991 #endif
6992   __kmp_init_lock(&__kmp_global_lock);
6993   __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6994   __kmp_init_lock(&__kmp_debug_lock);
6995   __kmp_init_atomic_lock(&__kmp_atomic_lock);
6996   __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6997   __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6998   __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6999   __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
7000   __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
7001   __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
7002   __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
7003   __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
7004   __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
7005   __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
7006   __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
7007   __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
7008   __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
7009   __kmp_init_bootstrap_lock(&__kmp_exit_lock);
7010 #if KMP_USE_MONITOR
7011   __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
7012 #endif
7013   __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
7014 
7015   /* conduct initialization and initial setup of configuration */
7016 
7017   __kmp_runtime_initialize();
7018 
7019 #if KMP_MIC_SUPPORTED
7020   __kmp_check_mic_type();
7021 #endif
7022 
7023 // Some global variable initialization moved here from kmp_env_initialize()
7024 #ifdef KMP_DEBUG
7025   kmp_diag = 0;
7026 #endif
7027   __kmp_abort_delay = 0;
7028 
7029   // From __kmp_init_dflt_team_nth()
7030   /* assume the entire machine will be used */
7031   __kmp_dflt_team_nth_ub = __kmp_xproc;
7032   if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
7033     __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
7034   }
7035   if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
7036     __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
7037   }
7038   __kmp_max_nth = __kmp_sys_max_nth;
7039   __kmp_cg_max_nth = __kmp_sys_max_nth;
7040   __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
7041   if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
7042     __kmp_teams_max_nth = __kmp_sys_max_nth;
7043   }
7044 
7045   // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
7046   // part
7047   __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
7048 #if KMP_USE_MONITOR
7049   __kmp_monitor_wakeups =
7050       KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7051   __kmp_bt_intervals =
7052       KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7053 #endif
7054   // From "KMP_LIBRARY" part of __kmp_env_initialize()
7055   __kmp_library = library_throughput;
7056   // From KMP_SCHEDULE initialization
7057   __kmp_static = kmp_sch_static_balanced;
// AC: do not use analytical here, because it is non-monotonic
7059 //__kmp_guided = kmp_sch_guided_iterative_chunked;
7060 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
7061 // need to repeat assignment
7062 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
7063 // bit control and barrier method control parts
7064 #if KMP_FAST_REDUCTION_BARRIER
7065 #define kmp_reduction_barrier_gather_bb ((int)1)
7066 #define kmp_reduction_barrier_release_bb ((int)1)
7067 #define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt
7068 #define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt
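// The branch bits are the log2 of the barrier branching factor, so the value
// 1 above selects a binary (2-way) tree for the reduction barrier by default.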
7069 #endif // KMP_FAST_REDUCTION_BARRIER
7070   for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
7071     __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
7072     __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
7073     __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
7074     __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
7075 #if KMP_FAST_REDUCTION_BARRIER
    if (i == bs_reduction_barrier) {
      // Tested and confirmed on ALTIX (lin_64) only: hyper,1
7078       __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
7079       __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
7080       __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
7081       __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
7082     }
7083 #endif // KMP_FAST_REDUCTION_BARRIER
7084   }
7085 #if KMP_FAST_REDUCTION_BARRIER
7086 #undef kmp_reduction_barrier_release_pat
7087 #undef kmp_reduction_barrier_gather_pat
7088 #undef kmp_reduction_barrier_release_bb
7089 #undef kmp_reduction_barrier_gather_bb
7090 #endif // KMP_FAST_REDUCTION_BARRIER
7091 #if KMP_MIC_SUPPORTED
7092   if (__kmp_mic_type == mic2) { // KNC
7093     // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
7094     __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
7095     __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
7096         1; // forkjoin release
7097     __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7098     __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7099   }
7100 #if KMP_FAST_REDUCTION_BARRIER
7101   if (__kmp_mic_type == mic2) { // KNC
7102     __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7103     __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7104   }
7105 #endif // KMP_FAST_REDUCTION_BARRIER
7106 #endif // KMP_MIC_SUPPORTED
7107 
7108 // From KMP_CHECKS initialization
7109 #ifdef KMP_DEBUG
7110   __kmp_env_checks = TRUE; /* development versions have the extra checks */
7111 #else
7112   __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
7113 #endif
7114 
7115   // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
7116   __kmp_foreign_tp = TRUE;
7117 
7118   __kmp_global.g.g_dynamic = FALSE;
7119   __kmp_global.g.g_dynamic_mode = dynamic_default;
7120 
7121   __kmp_init_nesting_mode();
7122 
7123   __kmp_env_initialize(NULL);
7124 
7125 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
7126   __kmp_user_level_mwait_init();
7127 #endif
7128 // Print all messages in message catalog for testing purposes.
7129 #ifdef KMP_DEBUG
7130   char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
7131   if (__kmp_str_match_true(val)) {
7132     kmp_str_buf_t buffer;
7133     __kmp_str_buf_init(&buffer);
7134     __kmp_i18n_dump_catalog(&buffer);
7135     __kmp_printf("%s", buffer.str);
7136     __kmp_str_buf_free(&buffer);
7137   }
7138   __kmp_env_free(&val);
7139 #endif
7140 
7141   __kmp_threads_capacity =
7142       __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
7143   // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
7144   __kmp_tp_capacity = __kmp_default_tp_capacity(
7145       __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
7146 
7147   // If the library is shut down properly, both pools must be NULL. Just in
7148   // case, set them to NULL -- some memory may leak, but subsequent code will
7149   // work even if pools are not freed.
7150   KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
7151   KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
7152   KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
7153   __kmp_thread_pool = NULL;
7154   __kmp_thread_pool_insert_pt = NULL;
7155   __kmp_team_pool = NULL;
7156 
7157   /* Allocate all of the variable sized records */
7158   /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
7159    * expandable */
7160   /* Since allocation is cache-aligned, just add extra padding at the end */
7161   size =
7162       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
7163       CACHE_LINE;
7164   __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
7165   __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
7166                                sizeof(kmp_info_t *) * __kmp_threads_capacity);
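  // Note that __kmp_root points into the same allocation, directly after the
  // __kmp_threads pointer array; only __kmp_threads is freed in __kmp_cleanup.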
7167 
7168   /* init thread counts */
7169   KMP_DEBUG_ASSERT(__kmp_all_nth ==
7170                    0); // Asserts fail if the library is reinitializing and
7171   KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
7172   __kmp_all_nth = 0;
7173   __kmp_nth = 0;
7174 
7175   /* setup the uber master thread and hierarchy */
7176   gtid = __kmp_register_root(TRUE);
7177   KA_TRACE(10, ("__kmp_do_serial_initialize  T#%d\n", gtid));
7178   KMP_ASSERT(KMP_UBER_GTID(gtid));
7179   KMP_ASSERT(KMP_INITIAL_GTID(gtid));
7180 
7181   KMP_MB(); /* Flush all pending memory write invalidates.  */
7182 
7183   __kmp_common_initialize();
7184 
7185 #if KMP_OS_UNIX
7186   /* invoke the child fork handler */
7187   __kmp_register_atfork();
7188 #endif
7189 
7190 #if !KMP_DYNAMIC_LIB
7191   {
7192     /* Invoke the exit handler when the program finishes, only for static
7193        library. For dynamic library, we already have _fini and DllMain. */
7194     int rc = atexit(__kmp_internal_end_atexit);
7195     if (rc != 0) {
7196       __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
7197                   __kmp_msg_null);
7198     }
7199   }
7200 #endif
7201 
7202 #if KMP_HANDLE_SIGNALS
7203 #if KMP_OS_UNIX
  /* NOTE: make sure that this is called before the user installs their own
     signal handlers so that the user handlers are called first. This way they
     can return false, not call our handler, avoid terminating the library, and
     continue execution where they left off. */
7208   __kmp_install_signals(FALSE);
7209 #endif /* KMP_OS_UNIX */
7210 #if KMP_OS_WINDOWS
7211   __kmp_install_signals(TRUE);
7212 #endif /* KMP_OS_WINDOWS */
7213 #endif
7214 
7215   /* we have finished the serial initialization */
7216   __kmp_init_counter++;
7217 
7218   __kmp_init_serial = TRUE;
7219 
7220   if (__kmp_settings) {
7221     __kmp_env_print();
7222   }
7223 
7224   if (__kmp_display_env || __kmp_display_env_verbose) {
7225     __kmp_env_print_2();
7226   }
7227 
7228 #if OMPT_SUPPORT
7229   ompt_post_init();
7230 #endif
7231 
7232   KMP_MB();
7233 
7234   KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
7235 }
7236 
7237 void __kmp_serial_initialize(void) {
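  // Double-checked locking: test the flag, take the bootstrap lock, re-test,
  // and only then run the actual initialization.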
7238   if (__kmp_init_serial) {
7239     return;
7240   }
7241   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7242   if (__kmp_init_serial) {
7243     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7244     return;
7245   }
7246   __kmp_do_serial_initialize();
7247   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7248 }
7249 
7250 static void __kmp_do_middle_initialize(void) {
7251   int i, j;
7252   int prev_dflt_team_nth;
7253 
7254   if (!__kmp_init_serial) {
7255     __kmp_do_serial_initialize();
7256   }
7257 
7258   KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
7259 
7260   if (UNLIKELY(!__kmp_need_register_serial)) {
7261     // We are in a forked child process. The registration was skipped during
7262     // serial initialization in __kmp_atfork_child handler. Do it here.
7263     __kmp_register_library_startup();
7264   }
7265 
7266   // Save the previous value for the __kmp_dflt_team_nth so that
7267   // we can avoid some reinitialization if it hasn't changed.
7268   prev_dflt_team_nth = __kmp_dflt_team_nth;
7269 
7270 #if KMP_AFFINITY_SUPPORTED
7271   // __kmp_affinity_initialize() will try to set __kmp_ncores to the
7272   // number of cores on the machine.
7273   __kmp_affinity_initialize();
7274 
7275 #endif /* KMP_AFFINITY_SUPPORTED */
7276 
7277   KMP_ASSERT(__kmp_xproc > 0);
7278   if (__kmp_avail_proc == 0) {
7279     __kmp_avail_proc = __kmp_xproc;
7280   }
7281 
7282   // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
7283   // correct them now
7284   j = 0;
7285   while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
7286     __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7287         __kmp_avail_proc;
7288     j++;
7289   }
7290 
7291   if (__kmp_dflt_team_nth == 0) {
7292 #ifdef KMP_DFLT_NTH_CORES
7293     // Default #threads = #cores
7294     __kmp_dflt_team_nth = __kmp_ncores;
7295     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7296                   "__kmp_ncores (%d)\n",
7297                   __kmp_dflt_team_nth));
7298 #else
7299     // Default #threads = #available OS procs
7300     __kmp_dflt_team_nth = __kmp_avail_proc;
7301     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7302                   "__kmp_avail_proc(%d)\n",
7303                   __kmp_dflt_team_nth));
7304 #endif /* KMP_DFLT_NTH_CORES */
7305   }
7306 
7307   if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7308     __kmp_dflt_team_nth = KMP_MIN_NTH;
7309   }
7310   if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7311     __kmp_dflt_team_nth = __kmp_sys_max_nth;
7312   }
7313 
7314   if (__kmp_nesting_mode > 0)
7315     __kmp_set_nesting_mode_threads();
7316 
7317   // There's no harm in continuing if the following check fails,
7318   // but it indicates an error in the previous logic.
7319   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7320 
7321   if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7322     // Run through the __kmp_threads array and set the num threads icv for each
7323     // root thread that is currently registered with the RTL (which has not
7324     // already explicitly set its nthreads-var with a call to
7325     // omp_set_num_threads()).
7326     for (i = 0; i < __kmp_threads_capacity; i++) {
7327       kmp_info_t *thread = __kmp_threads[i];
7328       if (thread == NULL)
7329         continue;
7330       if (thread->th.th_current_task->td_icvs.nproc != 0)
7331         continue;
7332 
7333       set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7334     }
7335   }
7336   KA_TRACE(
7337       20,
7338       ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7339        __kmp_dflt_team_nth));
7340 
7341 #ifdef KMP_ADJUST_BLOCKTIME
7342   /* Adjust blocktime to zero if necessary  now that __kmp_avail_proc is set */
7343   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7344     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7345     if (__kmp_nth > __kmp_avail_proc) {
7346       __kmp_zero_bt = TRUE;
7347     }
7348   }
7349 #endif /* KMP_ADJUST_BLOCKTIME */
7350 
7351   /* we have finished middle initialization */
7352   TCW_SYNC_4(__kmp_init_middle, TRUE);
7353 
7354   KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7355 }
7356 
7357 void __kmp_middle_initialize(void) {
7358   if (__kmp_init_middle) {
7359     return;
7360   }
7361   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7362   if (__kmp_init_middle) {
7363     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7364     return;
7365   }
7366   __kmp_do_middle_initialize();
7367   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7368 }
7369 
7370 void __kmp_parallel_initialize(void) {
7371   int gtid = __kmp_entry_gtid(); // this might be a new root
7372 
7373   /* synchronize parallel initialization (for sibling) */
7374   if (TCR_4(__kmp_init_parallel))
7375     return;
7376   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7377   if (TCR_4(__kmp_init_parallel)) {
7378     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7379     return;
7380   }
7381 
7382   /* TODO reinitialization after we have already shut down */
7383   if (TCR_4(__kmp_global.g.g_done)) {
7384     KA_TRACE(
7385         10,
7386         ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7387     __kmp_infinite_loop();
7388   }
7389 
7390   /* jc: The lock __kmp_initz_lock is already held, so calling
7391      __kmp_serial_initialize would cause a deadlock.  So we call
7392      __kmp_do_serial_initialize directly. */
7393   if (!__kmp_init_middle) {
7394     __kmp_do_middle_initialize();
7395   }
7396   __kmp_assign_root_init_mask();
7397   __kmp_resume_if_hard_paused();
7398 
7399   /* begin initialization */
7400   KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7401   KMP_ASSERT(KMP_UBER_GTID(gtid));
7402 
7403 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7404   // Save the FP control regs.
7405   // Worker threads will set theirs to these values at thread startup.
7406   __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7407   __kmp_store_mxcsr(&__kmp_init_mxcsr);
7408   __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7409 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7410 
7411 #if KMP_OS_UNIX
7412 #if KMP_HANDLE_SIGNALS
7413   /*  must be after __kmp_serial_initialize  */
7414   __kmp_install_signals(TRUE);
7415 #endif
7416 #endif
7417 
7418   __kmp_suspend_initialize();
7419 
7420 #if defined(USE_LOAD_BALANCE)
7421   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7422     __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7423   }
7424 #else
7425   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7426     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7427   }
7428 #endif
7429 
7430   if (__kmp_version) {
7431     __kmp_print_version_2();
7432   }
7433 
7434   /* we have finished parallel initialization */
7435   TCW_SYNC_4(__kmp_init_parallel, TRUE);
7436 
7437   KMP_MB();
7438   KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7439 
7440   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7441 }
7442 
7443 void __kmp_hidden_helper_initialize() {
7444   if (TCR_4(__kmp_init_hidden_helper))
7445     return;
7446 
7447   // __kmp_parallel_initialize is required before we initialize hidden helper
7448   if (!TCR_4(__kmp_init_parallel))
7449     __kmp_parallel_initialize();
7450 
  // Double check. Note that this double check should not be placed before
  // __kmp_parallel_initialize as it would cause a deadlock.
7453   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7454   if (TCR_4(__kmp_init_hidden_helper)) {
7455     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7456     return;
7457   }
7458 
7459   // Set the count of hidden helper tasks to be executed to zero
7460   KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7461 
7462   // Set the global variable indicating that we're initializing hidden helper
7463   // team/threads
7464   TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7465 
7466   // Platform independent initialization
7467   __kmp_do_initialize_hidden_helper_threads();
7468 
7469   // Wait here for the finish of initialization of hidden helper teams
7470   __kmp_hidden_helper_threads_initz_wait();
7471 
7472   // We have finished hidden helper initialization
7473   TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7474 
7475   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7476 }
7477 
7478 /* ------------------------------------------------------------------------ */
7479 
7480 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7481                                    kmp_team_t *team) {
7482   kmp_disp_t *dispatch;
7483 
7484   KMP_MB();
7485 
  /* None of the threads have encountered any constructs yet. */
7487   this_thr->th.th_local.this_construct = 0;
7488 #if KMP_CACHE_MANAGE
7489   KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7490 #endif /* KMP_CACHE_MANAGE */
7491   dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7492   KMP_DEBUG_ASSERT(dispatch);
7493   KMP_DEBUG_ASSERT(team->t.t_dispatch);
7494   // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7495   // this_thr->th.th_info.ds.ds_tid ] );
7496 
7497   dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7498   dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7499   if (__kmp_env_consistency_check)
7500     __kmp_push_parallel(gtid, team->t.t_ident);
7501 
7502   KMP_MB(); /* Flush all pending memory write invalidates.  */
7503 }
7504 
7505 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7506                                   kmp_team_t *team) {
7507   if (__kmp_env_consistency_check)
7508     __kmp_pop_parallel(gtid, team->t.t_ident);
7509 
7510   __kmp_finish_implicit_task(this_thr);
7511 }
7512 
7513 int __kmp_invoke_task_func(int gtid) {
7514   int rc;
7515   int tid = __kmp_tid_from_gtid(gtid);
7516   kmp_info_t *this_thr = __kmp_threads[gtid];
7517   kmp_team_t *team = this_thr->th.th_team;
7518 
7519   __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7520 #if USE_ITT_BUILD
7521   if (__itt_stack_caller_create_ptr) {
7522     // inform ittnotify about entering user's code
7523     if (team->t.t_stack_id != NULL) {
7524       __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id);
7525     } else {
7526       KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7527       __kmp_itt_stack_callee_enter(
7528           (__itt_caller)team->t.t_parent->t.t_stack_id);
7529     }
7530   }
7531 #endif /* USE_ITT_BUILD */
7532 #if INCLUDE_SSC_MARKS
7533   SSC_MARK_INVOKING();
7534 #endif
7535 
7536 #if OMPT_SUPPORT
7537   void *dummy;
7538   void **exit_frame_p;
7539   ompt_data_t *my_task_data;
7540   ompt_data_t *my_parallel_data;
7541   int ompt_team_size;
7542 
7543   if (ompt_enabled.enabled) {
7544     exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
7545                          .ompt_task_info.frame.exit_frame.ptr);
7546   } else {
7547     exit_frame_p = &dummy;
7548   }
7549 
7550   my_task_data =
7551       &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7552   my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7553   if (ompt_enabled.ompt_callback_implicit_task) {
7554     ompt_team_size = team->t.t_nproc;
7555     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7556         ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7557         __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7558     OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7559   }
7560 #endif
7561 
7562 #if KMP_STATS_ENABLED
7563   stats_state_e previous_state = KMP_GET_THREAD_STATE();
7564   if (previous_state == stats_state_e::TEAMS_REGION) {
7565     KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7566   } else {
7567     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7568   }
7569   KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7570 #endif
7571 
7572   rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7573                               tid, (int)team->t.t_argc, (void **)team->t.t_argv
7574 #if OMPT_SUPPORT
7575                               ,
7576                               exit_frame_p
7577 #endif
7578   );
7579 #if OMPT_SUPPORT
7580   *exit_frame_p = NULL;
7581   this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7582 #endif
7583 
7584 #if KMP_STATS_ENABLED
7585   if (previous_state == stats_state_e::TEAMS_REGION) {
7586     KMP_SET_THREAD_STATE(previous_state);
7587   }
7588   KMP_POP_PARTITIONED_TIMER();
7589 #endif
7590 
7591 #if USE_ITT_BUILD
7592   if (__itt_stack_caller_create_ptr) {
7593     // inform ittnotify about leaving user's code
7594     if (team->t.t_stack_id != NULL) {
7595       __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id);
7596     } else {
7597       KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7598       __kmp_itt_stack_callee_leave(
7599           (__itt_caller)team->t.t_parent->t.t_stack_id);
7600     }
7601   }
7602 #endif /* USE_ITT_BUILD */
7603   __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7604 
7605   return rc;
7606 }
7607 
7608 void __kmp_teams_master(int gtid) {
  // This routine is called by all primary threads in the teams construct.
7610   kmp_info_t *thr = __kmp_threads[gtid];
7611   kmp_team_t *team = thr->th.th_team;
7612   ident_t *loc = team->t.t_ident;
7613   thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7614   KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7615   KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7616   KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7617                 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7618 
7619   // This thread is a new CG root.  Set up the proper variables.
7620   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7621   tmp->cg_root = thr; // Make thr the CG root
7622   // Init to thread limit stored when league primary threads were forked
7623   tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7624   tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7625   KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7626                  " cg_nthreads to 1\n",
7627                  thr, tmp));
7628   tmp->up = thr->th.th_cg_roots;
7629   thr->th.th_cg_roots = tmp;
7630 
// Launch the league of teams now, but do not let workers execute
// (they wait on the fork barrier until the next parallel region)
7633 #if INCLUDE_SSC_MARKS
7634   SSC_MARK_FORKING();
7635 #endif
7636   __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7637                   (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7638                   VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7639 #if INCLUDE_SSC_MARKS
7640   SSC_MARK_JOINING();
7641 #endif
7642   // If the team size was reduced from the limit, set it to the new size
7643   if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7644     thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7645   // AC: last parameter "1" eliminates join barrier which won't work because
7646   // worker threads are in a fork barrier waiting for more parallel regions
7647   __kmp_join_call(loc, gtid
7648 #if OMPT_SUPPORT
7649                   ,
7650                   fork_context_intel
7651 #endif
7652                   ,
7653                   1);
7654 }
7655 
7656 int __kmp_invoke_teams_master(int gtid) {
7657   kmp_info_t *this_thr = __kmp_threads[gtid];
7658   kmp_team_t *team = this_thr->th.th_team;
7659 #if KMP_DEBUG
7660   if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7661     KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7662                      (void *)__kmp_teams_master);
7663 #endif
7664   __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7665 #if OMPT_SUPPORT
7666   int tid = __kmp_tid_from_gtid(gtid);
7667   ompt_data_t *task_data =
7668       &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7669   ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7670   if (ompt_enabled.ompt_callback_implicit_task) {
7671     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7672         ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7673         ompt_task_initial);
7674     OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7675   }
7676 #endif
7677   __kmp_teams_master(gtid);
7678 #if OMPT_SUPPORT
7679   this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7680 #endif
7681   __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7682   return 1;
7683 }
7684 
/* This sets the requested number of threads for the next parallel region
   encountered by this team. Since this should be enclosed in the forkjoin
   critical section, it should avoid race conditions with asymmetrical nested
   parallelism. */
7689 
7690 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7691   kmp_info_t *thr = __kmp_threads[gtid];
7692 
7693   if (num_threads > 0)
7694     thr->th.th_set_nproc = num_threads;
7695 }
7696 
7697 static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
7698                                     int num_threads) {
7699   KMP_DEBUG_ASSERT(thr);
7700   // Remember the number of threads for inner parallel regions
7701   if (!TCR_4(__kmp_init_middle))
7702     __kmp_middle_initialize(); // get internal globals calculated
7703   __kmp_assign_root_init_mask();
7704   KMP_DEBUG_ASSERT(__kmp_avail_proc);
7705   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7706 
7707   if (num_threads == 0) {
7708     if (__kmp_teams_thread_limit > 0) {
7709       num_threads = __kmp_teams_thread_limit;
7710     } else {
7711       num_threads = __kmp_avail_proc / num_teams;
7712     }
    // Adjust num_threads without a warning since it is not a user setting.
    // num_threads = min(num_threads, nthreads-var, thread-limit-var)
    // No thread_limit clause specified - do not change thread-limit-var ICV.
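    // Illustrative numbers: with no teams thread limit, num_teams=4 and
    // __kmp_avail_proc=16 give 16/4=4 threads per team, which already fits
    // under nthreads-var=8, thread-limit-var=6 and __kmp_teams_max_nth=16.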
7716     if (num_threads > __kmp_dflt_team_nth) {
7717       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7718     }
7719     if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7720       num_threads = thr->th.th_current_task->td_icvs.thread_limit;
    } // prevent the team size from exceeding thread-limit-var
7722     if (num_teams * num_threads > __kmp_teams_max_nth) {
7723       num_threads = __kmp_teams_max_nth / num_teams;
7724     }
7725     if (num_threads == 0) {
7726       num_threads = 1;
7727     }
7728   } else {
7729     if (num_threads < 0) {
7730       __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, num_threads, 1),
7731                 __kmp_msg_null);
7732       num_threads = 1;
7733     }
    // This thread will be the primary thread of the league's primary threads.
7735     // Store new thread limit; old limit is saved in th_cg_roots list
7736     thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7737     // num_threads = min(num_threads, nthreads-var)
7738     if (num_threads > __kmp_dflt_team_nth) {
7739       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7740     }
7741     if (num_teams * num_threads > __kmp_teams_max_nth) {
7742       int new_threads = __kmp_teams_max_nth / num_teams;
7743       if (new_threads == 0) {
7744         new_threads = 1;
7745       }
7746       if (new_threads != num_threads) {
7747         if (!__kmp_reserve_warn) { // user asked for too many threads
7748           __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7749           __kmp_msg(kmp_ms_warning,
7750                     KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7751                     KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7752         }
7753       }
7754       num_threads = new_threads;
7755     }
7756   }
7757   thr->th.th_teams_size.nth = num_threads;
7758 }
7759 
7760 /* this sets the requested number of teams for the teams region and/or
7761    the number of threads for the next parallel region encountered  */
7762 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7763                           int num_threads) {
7764   kmp_info_t *thr = __kmp_threads[gtid];
7765   if (num_teams < 0) {
7766     // OpenMP specification requires requested values to be positive,
7767     // but people can send us any value, so we'd better check
7768     __kmp_msg(kmp_ms_warning, KMP_MSG(NumTeamsNotPositive, num_teams, 1),
7769               __kmp_msg_null);
7770     num_teams = 1;
7771   }
7772   if (num_teams == 0) {
7773     if (__kmp_nteams > 0) {
7774       num_teams = __kmp_nteams;
7775     } else {
7776       num_teams = 1; // default number of teams is 1.
7777     }
7778   }
  if (num_teams > __kmp_teams_max_nth) { // too many teams requested?
7780     if (!__kmp_reserve_warn) {
7781       __kmp_reserve_warn = 1;
7782       __kmp_msg(kmp_ms_warning,
7783                 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7784                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7785     }
7786     num_teams = __kmp_teams_max_nth;
7787   }
7788   // Set number of teams (number of threads in the outer "parallel" of the
7789   // teams)
7790   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7791 
7792   __kmp_push_thread_limit(thr, num_teams, num_threads);
7793 }
7794 
7795 /* This sets the requested number of teams for the teams region and/or
7796    the number of threads for the next parallel region encountered  */
7797 void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
7798                              int num_teams_ub, int num_threads) {
7799   kmp_info_t *thr = __kmp_threads[gtid];
7800   KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
7801   KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
7802   KMP_DEBUG_ASSERT(num_threads >= 0);
7803 
7804   if (num_teams_lb > num_teams_ub) {
7805     __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
7806                 KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
7807   }
7808 
  int num_teams = 1; // default number of teams is 1.
7810 
7811   if (num_teams_lb == 0 && num_teams_ub > 0)
7812     num_teams_lb = num_teams_ub;
7813 
7814   if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
7815     num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
7816     if (num_teams > __kmp_teams_max_nth) {
7817       if (!__kmp_reserve_warn) {
7818         __kmp_reserve_warn = 1;
7819         __kmp_msg(kmp_ms_warning,
7820                   KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7821                   KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7822       }
7823       num_teams = __kmp_teams_max_nth;
7824     }
7825   } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
7826     num_teams = num_teams_ub;
7827   } else { // num_teams_lb <= num_teams <= num_teams_ub
7828     if (num_threads <= 0) {
7829       if (num_teams_ub > __kmp_teams_max_nth) {
7830         num_teams = num_teams_lb;
7831       } else {
7832         num_teams = num_teams_ub;
7833       }
7834     } else {
7835       num_teams = (num_threads > __kmp_teams_max_nth)
7836                       ? num_teams
7837                       : __kmp_teams_max_nth / num_threads;
7838       if (num_teams < num_teams_lb) {
7839         num_teams = num_teams_lb;
7840       } else if (num_teams > num_teams_ub) {
7841         num_teams = num_teams_ub;
7842       }
7843     }
7844   }
7845   // Set number of teams (number of threads in the outer "parallel" of the
7846   // teams)
7847   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7848 
7849   __kmp_push_thread_limit(thr, num_teams, num_threads);
7850 }
7851 
7852 // Set the proc_bind var to use in the following parallel region.
7853 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7854   kmp_info_t *thr = __kmp_threads[gtid];
7855   thr->th.th_set_proc_bind = proc_bind;
7856 }
7857 
7858 /* Launch the worker threads into the microtask. */
7859 
7860 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7861   kmp_info_t *this_thr = __kmp_threads[gtid];
7862 
7863 #ifdef KMP_DEBUG
7864   int f;
7865 #endif /* KMP_DEBUG */
7866 
7867   KMP_DEBUG_ASSERT(team);
7868   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7869   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7870   KMP_MB(); /* Flush all pending memory write invalidates.  */
7871 
7872   team->t.t_construct = 0; /* no single directives seen yet */
7873   team->t.t_ordered.dt.t_value =
7874       0; /* thread 0 enters the ordered section first */
7875 
7876   /* Reset the identifiers on the dispatch buffer */
7877   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7878   if (team->t.t_max_nproc > 1) {
7879     int i;
7880     for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7881       team->t.t_disp_buffer[i].buffer_index = i;
7882       team->t.t_disp_buffer[i].doacross_buf_idx = i;
7883     }
7884   } else {
7885     team->t.t_disp_buffer[0].buffer_index = 0;
7886     team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7887   }
7888 
7889   KMP_MB(); /* Flush all pending memory write invalidates.  */
7890   KMP_ASSERT(this_thr->th.th_team == team);
7891 
7892 #ifdef KMP_DEBUG
7893   for (f = 0; f < team->t.t_nproc; f++) {
7894     KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7895                      team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7896   }
7897 #endif /* KMP_DEBUG */
7898 
7899   /* release the worker threads so they may begin working */
7900   __kmp_fork_barrier(gtid, 0);
7901 }
7902 
7903 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7904   kmp_info_t *this_thr = __kmp_threads[gtid];
7905 
7906   KMP_DEBUG_ASSERT(team);
7907   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7908   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7909   KMP_MB(); /* Flush all pending memory write invalidates.  */
7910 
7911   /* Join barrier after fork */
7912 
7913 #ifdef KMP_DEBUG
7914   if (__kmp_threads[gtid] &&
7915       __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7916     __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7917                  __kmp_threads[gtid]);
7918     __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7919                  "team->t.t_nproc=%d\n",
7920                  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7921                  team->t.t_nproc);
7922     __kmp_print_structure();
7923   }
7924   KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7925                    __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7926 #endif /* KMP_DEBUG */
7927 
7928   __kmp_join_barrier(gtid); /* wait for everyone */
7929 #if OMPT_SUPPORT
7930   if (ompt_enabled.enabled &&
7931       this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7932     int ds_tid = this_thr->th.th_info.ds.ds_tid;
7933     ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7934     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7935 #if OMPT_OPTIONAL
7936     void *codeptr = NULL;
7937     if (KMP_MASTER_TID(ds_tid) &&
7938         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7939          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7940       codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7941 
7942     if (ompt_enabled.ompt_callback_sync_region_wait) {
7943       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7944           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7945           codeptr);
7946     }
7947     if (ompt_enabled.ompt_callback_sync_region) {
7948       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7949           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7950           codeptr);
7951     }
7952 #endif
7953     if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7954       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7955           ompt_scope_end, NULL, task_data, 0, ds_tid,
7956           ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7957     }
7958   }
7959 #endif
7960 
7961   KMP_MB(); /* Flush all pending memory write invalidates.  */
7962   KMP_ASSERT(this_thr->th.th_team == team);
7963 }
7964 
7965 /* ------------------------------------------------------------------------ */
7966 
7967 #ifdef USE_LOAD_BALANCE
7968 
// Return the number of worker threads actively spinning in the hot team, if
// we are at the outermost level of parallelism.  Otherwise, return 0.
7971 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7972   int i;
7973   int retval;
7974   kmp_team_t *hot_team;
7975 
7976   if (root->r.r_active) {
7977     return 0;
7978   }
7979   hot_team = root->r.r_hot_team;
7980   if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7981     return hot_team->t.t_nproc - 1; // Don't count primary thread
7982   }
7983 
7984   // Skip the primary thread - it is accounted for elsewhere.
7985   retval = 0;
7986   for (i = 1; i < hot_team->t.t_nproc; i++) {
7987     if (hot_team->t.t_threads[i]->th.th_active) {
7988       retval++;
7989     }
7990   }
7991   return retval;
7992 }
7993 
7994 // Perform an automatic adjustment to the number of
7995 // threads used by the next parallel region.
7996 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7997   int retval;
7998   int pool_active;
7999   int hot_team_active;
8000   int team_curr_active;
8001   int system_active;
8002 
8003   KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
8004                 set_nproc));
8005   KMP_DEBUG_ASSERT(root);
8006   KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
8007                        ->th.th_current_task->td_icvs.dynamic == TRUE);
8008   KMP_DEBUG_ASSERT(set_nproc > 1);
8009 
8010   if (set_nproc == 1) {
8011     KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
8012     return 1;
8013   }
8014 
8015   // Threads that are active in the thread pool, active in the hot team for this
8016   // particular root (if we are at the outer par level), and the currently
8017   // executing thread (to become the primary thread) are available to add to the
8018   // new team, but are currently contributing to the system load, and must be
8019   // accounted for.
8020   pool_active = __kmp_thread_pool_active_nth;
8021   hot_team_active = __kmp_active_hot_team_nproc(root);
8022   team_curr_active = pool_active + hot_team_active + 1;
8023 
8024   // Check the system load.
8025   system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
8026   KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
8027                 "hot team active = %d\n",
8028                 system_active, pool_active, hot_team_active));
8029 
8030   if (system_active < 0) {
8031     // There was an error reading the necessary info from /proc, so use the
8032     // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
8033     // = dynamic_thread_limit, we shouldn't wind up getting back here.
8034     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
8035     KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
8036 
8037     // Make this call behave like the thread limit algorithm.
8038     retval = __kmp_avail_proc - __kmp_nth +
8039              (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
8040     if (retval > set_nproc) {
8041       retval = set_nproc;
8042     }
8043     if (retval < KMP_MIN_NTH) {
8044       retval = KMP_MIN_NTH;
8045     }
8046 
8047     KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
8048                   retval));
8049     return retval;
8050   }
8051 
  // There is a slight delay in the load balance algorithm in detecting new
  // running procs. The real system load at this instant should be at least as
  // large as the number of active OMP threads that are available to add to
  // the team.
8055   if (system_active < team_curr_active) {
8056     system_active = team_curr_active;
8057   }
8058   retval = __kmp_avail_proc - system_active + team_curr_active;
8059   if (retval > set_nproc) {
8060     retval = set_nproc;
8061   }
8062   if (retval < KMP_MIN_NTH) {
8063     retval = KMP_MIN_NTH;
8064   }
8065 
8066   KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
8067   return retval;
8068 } // __kmp_load_balance_nproc()
8069 
8070 #endif /* USE_LOAD_BALANCE */
8071 
8072 /* ------------------------------------------------------------------------ */
8073 
8074 /* NOTE: this is called with the __kmp_init_lock held */
8075 void __kmp_cleanup(void) {
8076   int f;
8077 
8078   KA_TRACE(10, ("__kmp_cleanup: enter\n"));
8079 
8080   if (TCR_4(__kmp_init_parallel)) {
8081 #if KMP_HANDLE_SIGNALS
8082     __kmp_remove_signals();
8083 #endif
8084     TCW_4(__kmp_init_parallel, FALSE);
8085   }
8086 
8087   if (TCR_4(__kmp_init_middle)) {
8088 #if KMP_AFFINITY_SUPPORTED
8089     __kmp_affinity_uninitialize();
8090 #endif /* KMP_AFFINITY_SUPPORTED */
8091     __kmp_cleanup_hierarchy();
8092     TCW_4(__kmp_init_middle, FALSE);
8093   }
8094 
8095   KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
8096 
8097   if (__kmp_init_serial) {
8098     __kmp_runtime_destroy();
8099     __kmp_init_serial = FALSE;
8100   }
8101 
8102   __kmp_cleanup_threadprivate_caches();
8103 
8104   for (f = 0; f < __kmp_threads_capacity; f++) {
8105     if (__kmp_root[f] != NULL) {
8106       __kmp_free(__kmp_root[f]);
8107       __kmp_root[f] = NULL;
8108     }
8109   }
8110   __kmp_free(__kmp_threads);
  // __kmp_threads and __kmp_root were allocated at once, as a single block,
  // so there is no need to free __kmp_root separately.
8113   __kmp_threads = NULL;
8114   __kmp_root = NULL;
8115   __kmp_threads_capacity = 0;
8116 
8117   // Free old __kmp_threads arrays if they exist.
8118   kmp_old_threads_list_t *ptr = __kmp_old_threads_list;
8119   while (ptr) {
8120     kmp_old_threads_list_t *next = ptr->next;
8121     __kmp_free(ptr->threads);
8122     __kmp_free(ptr);
8123     ptr = next;
8124   }
8125 
8126 #if KMP_USE_DYNAMIC_LOCK
8127   __kmp_cleanup_indirect_user_locks();
8128 #else
8129   __kmp_cleanup_user_locks();
8130 #endif
8131 #if OMPD_SUPPORT
8132   if (ompd_state) {
8133     __kmp_free(ompd_env_block);
8134     ompd_env_block = NULL;
8135     ompd_env_block_size = 0;
8136   }
8137 #endif
8138 
8139 #if KMP_AFFINITY_SUPPORTED
8140   KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
8141   __kmp_cpuinfo_file = NULL;
8142 #endif /* KMP_AFFINITY_SUPPORTED */
8143 
8144 #if KMP_USE_ADAPTIVE_LOCKS
8145 #if KMP_DEBUG_ADAPTIVE_LOCKS
8146   __kmp_print_speculative_stats();
8147 #endif
8148 #endif
8149   KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
8150   __kmp_nested_nth.nth = NULL;
8151   __kmp_nested_nth.size = 0;
8152   __kmp_nested_nth.used = 0;
8153   KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
8154   __kmp_nested_proc_bind.bind_types = NULL;
8155   __kmp_nested_proc_bind.size = 0;
8156   __kmp_nested_proc_bind.used = 0;
8157   if (__kmp_affinity_format) {
8158     KMP_INTERNAL_FREE(__kmp_affinity_format);
8159     __kmp_affinity_format = NULL;
8160   }
8161 
8162   __kmp_i18n_catclose();
8163 
8164 #if KMP_USE_HIER_SCHED
8165   __kmp_hier_scheds.deallocate();
8166 #endif
8167 
8168 #if KMP_STATS_ENABLED
8169   __kmp_stats_fini();
8170 #endif
8171 
8172   KA_TRACE(10, ("__kmp_cleanup: exit\n"));
8173 }
8174 
8175 /* ------------------------------------------------------------------------ */
8176 
8177 int __kmp_ignore_mppbeg(void) {
8178   char *env;
8179 
8180   if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
8181     if (__kmp_str_match_false(env))
8182       return FALSE;
8183   }
8184   // By default __kmpc_begin() is no-op.
8185   return TRUE;
8186 }
8187 
8188 int __kmp_ignore_mppend(void) {
8189   char *env;
8190 
8191   if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
8192     if (__kmp_str_match_false(env))
8193       return FALSE;
8194   }
8195   // By default __kmpc_end() is no-op.
8196   return TRUE;
8197 }
8198 
8199 void __kmp_internal_begin(void) {
8200   int gtid;
8201   kmp_root_t *root;
8202 
  /* This is a very important step as it will register new sibling threads
     and assign these new uber threads a new gtid. */
8205   gtid = __kmp_entry_gtid();
8206   root = __kmp_threads[gtid]->th.th_root;
8207   KMP_ASSERT(KMP_UBER_GTID(gtid));
8208 
8209   if (root->r.r_begin)
8210     return;
8211   __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
8212   if (root->r.r_begin) {
8213     __kmp_release_lock(&root->r.r_begin_lock, gtid);
8214     return;
8215   }
8216 
8217   root->r.r_begin = TRUE;
8218 
8219   __kmp_release_lock(&root->r.r_begin_lock, gtid);
8220 }
8221 
8222 /* ------------------------------------------------------------------------ */
8223 
8224 void __kmp_user_set_library(enum library_type arg) {
8225   int gtid;
8226   kmp_root_t *root;
8227   kmp_info_t *thread;
8228 
8229   /* first, make sure we are initialized so we can get our gtid */
8230 
8231   gtid = __kmp_entry_gtid();
8232   thread = __kmp_threads[gtid];
8233 
8234   root = thread->th.th_root;
8235 
8236   KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
8237                 library_serial));
8238   if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
8239                                   thread */
8240     KMP_WARNING(SetLibraryIncorrectCall);
8241     return;
8242   }
8243 
8244   switch (arg) {
8245   case library_serial:
8246     thread->th.th_set_nproc = 0;
8247     set__nproc(thread, 1);
8248     break;
8249   case library_turnaround:
8250     thread->th.th_set_nproc = 0;
8251     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8252                                            : __kmp_dflt_team_nth_ub);
8253     break;
8254   case library_throughput:
8255     thread->th.th_set_nproc = 0;
8256     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8257                                            : __kmp_dflt_team_nth_ub);
8258     break;
8259   default:
8260     KMP_FATAL(UnknownLibraryType, arg);
8261   }
8262 
8263   __kmp_aux_set_library(arg);
8264 }
8265 
8266 void __kmp_aux_set_stacksize(size_t arg) {
8267   if (!__kmp_init_serial)
8268     __kmp_serial_initialize();
8269 
8270 #if KMP_OS_DARWIN
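  // Round the requested size up to a 0x1000-byte (4 KB) boundary, unless the
  // rounded value would wrap around to zero.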
8271   if (arg & (0x1000 - 1)) {
8272     arg &= ~(0x1000 - 1);
8273     if (arg + 0x1000) /* check for overflow if we round up */
8274       arg += 0x1000;
8275   }
8276 #endif
8277   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8278 
8279   /* only change the default stacksize before the first parallel region */
8280   if (!TCR_4(__kmp_init_parallel)) {
8281     size_t value = arg; /* argument is in bytes */
8282 
8283     if (value < __kmp_sys_min_stksize)
8284       value = __kmp_sys_min_stksize;
8285     else if (value > KMP_MAX_STKSIZE)
8286       value = KMP_MAX_STKSIZE;
8287 
8288     __kmp_stksize = value;
8289 
8290     __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
8291   }
8292 
8293   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8294 }
8295 
8296 /* set the behaviour of the runtime library */
8297 /* TODO this can cause some odd behaviour with sibling parallelism... */
8298 void __kmp_aux_set_library(enum library_type arg) {
8299   __kmp_library = arg;
8300 
8301   switch (__kmp_library) {
8302   case library_serial: {
8303     KMP_INFORM(LibraryIsSerial);
8304   } break;
8305   case library_turnaround:
8306     if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
8307       __kmp_use_yield = 2; // only yield when oversubscribed
8308     break;
8309   case library_throughput:
8310     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
8311       __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
8312     break;
8313   default:
8314     KMP_FATAL(UnknownLibraryType, arg);
8315   }
8316 }
8317 
/* Get team information common to all team API routines */
8319 // Returns NULL if not in teams construct
8320 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
8321   kmp_info_t *thr = __kmp_entry_thread();
8322   teams_serialized = 0;
8323   if (thr->th.th_teams_microtask) {
8324     kmp_team_t *team = thr->th.th_team;
8325     int tlevel = thr->th.th_teams_level; // the level of the teams construct
8326     int ii = team->t.t_level;
8327     teams_serialized = team->t.t_serialized;
8328     int level = tlevel + 1;
8329     KMP_DEBUG_ASSERT(ii >= tlevel);
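    // Walk from the current nesting level back to the level just inside the
    // teams construct: serialized levels are consumed without switching teams,
    // and each remaining level moves us to the parent team.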
8330     while (ii > level) {
8331       for (teams_serialized = team->t.t_serialized;
8332            (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
8333       }
8334       if (team->t.t_serialized && (!teams_serialized)) {
8335         team = team->t.t_parent;
8336         continue;
8337       }
8338       if (ii > level) {
8339         team = team->t.t_parent;
8340         ii--;
8341       }
8342     }
8343     return team;
8344   }
8345   return NULL;
8346 }
8347 
8348 int __kmp_aux_get_team_num() {
8349   int serialized;
8350   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8351   if (team) {
8352     if (serialized > 1) {
      return 0; // teams region is serialized (1 team of 1 thread).
8354     } else {
8355       return team->t.t_master_tid;
8356     }
8357   }
8358   return 0;
8359 }
8360 
8361 int __kmp_aux_get_num_teams() {
8362   int serialized;
8363   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8364   if (team) {
8365     if (serialized > 1) {
8366       return 1;
8367     } else {
8368       return team->t.t_parent->t.t_nproc;
8369     }
8370   }
8371   return 1;
8372 }
8373 
8374 /* ------------------------------------------------------------------------ */
8375 
8376 /*
8377  * Affinity Format Parser
8378  *
8379  * Field is in form of: %[[[0].]size]type
8380  * % and type are required (%% means print a literal '%')
8381  * type is either single char or long name surrounded by {},
8382  * e.g., N or {num_threads}
8383  * 0 => leading zeros
8384  * . => right justified when size is specified
8385  * by default output is left justified
8386  * size is the *minimum* field length
8387  * All other characters are printed as is
8388  *
 * Available field types:
 * t {team_num}          - omp_get_team_num()
 * T {num_teams}         - omp_get_num_teams()
 * L {nesting_level}     - omp_get_level()
 * n {thread_num}        - omp_get_thread_num()
 * N {num_threads}       - omp_get_num_threads()
 * a {ancestor_tnum}     - omp_get_ancestor_thread_num(omp_get_level()-1)
 * H {host}              - name of host machine
 * P {process_id}        - process id (integer)
 * i {native_thread_id}  - native thread identifier (integer)
 * A {thread_affinity}   - comma separated list of integers or integer ranges
 *                         (values of affinity mask)
8399  *
8400  * Implementation-specific field types can be added
8401  * If a type is unknown, print "undefined"
8402  */
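// Example (illustrative): with the format "host=%H pid=%P thread=%0.4n of %N",
// thread 2 of an 8-thread team on host "node01" with pid 1234 would produce
// "host=node01 pid=1234 thread=0002 of 8".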
8403 
8404 // Structure holding the short name, long name, and corresponding data type
8405 // for snprintf.  A table of these will represent the entire valid keyword
8406 // field types.
8407 typedef struct kmp_affinity_format_field_t {
8408   char short_name; // from spec e.g., L -> thread level
8409   const char *long_name; // from spec thread_level -> thread level
8410   char field_format; // data type for snprintf (typically 'd' or 's'
8411   // for integer or string)
8412 } kmp_affinity_format_field_t;
8413 
8414 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
8415 #if KMP_AFFINITY_SUPPORTED
8416     {'A', "thread_affinity", 's'},
8417 #endif
8418     {'t', "team_num", 'd'},
8419     {'T', "num_teams", 'd'},
8420     {'L', "nesting_level", 'd'},
8421     {'n', "thread_num", 'd'},
8422     {'N', "num_threads", 'd'},
8423     {'a', "ancestor_tnum", 'd'},
8424     {'H', "host", 's'},
8425     {'P', "process_id", 'd'},
8426     {'i', "native_thread_id", 'd'}};
8427 
// Return the number of characters needed to hold the formatted field
8429 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8430                                             const char **ptr,
8431                                             kmp_str_buf_t *field_buffer) {
8432   int rc, format_index, field_value;
8433   const char *width_left, *width_right;
8434   bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8435   static const int FORMAT_SIZE = 20;
8436   char format[FORMAT_SIZE] = {0};
8437   char absolute_short_name = 0;
8438 
8439   KMP_DEBUG_ASSERT(gtid >= 0);
8440   KMP_DEBUG_ASSERT(th);
8441   KMP_DEBUG_ASSERT(**ptr == '%');
8442   KMP_DEBUG_ASSERT(field_buffer);
8443 
8444   __kmp_str_buf_clear(field_buffer);
8445 
8446   // Skip the initial %
8447   (*ptr)++;
8448 
8449   // Check for %% first
8450   if (**ptr == '%') {
8451     __kmp_str_buf_cat(field_buffer, "%", 1);
8452     (*ptr)++; // skip over the second %
8453     return 1;
8454   }
8455 
8456   // Parse field modifiers if they are present
8457   pad_zeros = false;
8458   if (**ptr == '0') {
8459     pad_zeros = true;
8460     (*ptr)++; // skip over 0
8461   }
8462   right_justify = false;
8463   if (**ptr == '.') {
8464     right_justify = true;
8465     (*ptr)++; // skip over .
8466   }
8467   // Parse width of field: [width_left, width_right)
8468   width_left = width_right = NULL;
8469   if (**ptr >= '0' && **ptr <= '9') {
8470     width_left = *ptr;
8471     SKIP_DIGITS(*ptr);
8472     width_right = *ptr;
8473   }
8474 
8475   // Create the format for KMP_SNPRINTF based on flags parsed above
8476   format_index = 0;
8477   format[format_index++] = '%';
8478   if (!right_justify)
8479     format[format_index++] = '-';
8480   if (pad_zeros)
8481     format[format_index++] = '0';
8482   if (width_left && width_right) {
8483     int i = 0;
8484     // Only allow 8 digit number widths.
8485     // This also prevents overflowing format variable
8486     while (i < 8 && width_left < width_right) {
8487       format[format_index++] = *width_left;
8488       width_left++;
8489       i++;
8490     }
8491   }
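  // At this point 'format' holds the prefix of a printf-style specifier, e.g.
  // "%-" or "%04"; the conversion character ('d' or 's') is appended below
  // once the field type has been matched.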
8492 
8493   // Parse a name (long or short)
8494   // Canonicalize the name into absolute_short_name
8495   found_valid_name = false;
8496   parse_long_name = (**ptr == '{');
8497   if (parse_long_name)
8498     (*ptr)++; // skip initial left brace
8499   for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8500                              sizeof(__kmp_affinity_format_table[0]);
8501        ++i) {
8502     char short_name = __kmp_affinity_format_table[i].short_name;
8503     const char *long_name = __kmp_affinity_format_table[i].long_name;
8504     char field_format = __kmp_affinity_format_table[i].field_format;
8505     if (parse_long_name) {
8506       size_t length = KMP_STRLEN(long_name);
8507       if (strncmp(*ptr, long_name, length) == 0) {
8508         found_valid_name = true;
8509         (*ptr) += length; // skip the long name
8510       }
8511     } else if (**ptr == short_name) {
8512       found_valid_name = true;
8513       (*ptr)++; // skip the short name
8514     }
8515     if (found_valid_name) {
8516       format[format_index++] = field_format;
8517       format[format_index++] = '\0';
8518       absolute_short_name = short_name;
8519       break;
8520     }
8521   }
8522   if (parse_long_name) {
8523     if (**ptr != '}') {
8524       absolute_short_name = 0;
8525     } else {
8526       (*ptr)++; // skip over the right brace
8527     }
8528   }
8529 
8530   // Attempt to fill the buffer with the requested
8531   // value using snprintf within __kmp_str_buf_print()
8532   switch (absolute_short_name) {
8533   case 't':
8534     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8535     break;
8536   case 'T':
8537     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8538     break;
8539   case 'L':
8540     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8541     break;
8542   case 'n':
8543     rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8544     break;
8545   case 'H': {
8546     static const int BUFFER_SIZE = 256;
8547     char buf[BUFFER_SIZE];
8548     __kmp_expand_host_name(buf, BUFFER_SIZE);
8549     rc = __kmp_str_buf_print(field_buffer, format, buf);
8550   } break;
8551   case 'P':
8552     rc = __kmp_str_buf_print(field_buffer, format, getpid());
8553     break;
8554   case 'i':
8555     rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8556     break;
8557   case 'N':
8558     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8559     break;
8560   case 'a':
8561     field_value =
8562         __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8563     rc = __kmp_str_buf_print(field_buffer, format, field_value);
8564     break;
8565 #if KMP_AFFINITY_SUPPORTED
8566   case 'A': {
8567     kmp_str_buf_t buf;
8568     __kmp_str_buf_init(&buf);
8569     __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8570     rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8571     __kmp_str_buf_free(&buf);
8572   } break;
8573 #endif
8574   default:
    // According to the spec, if an implementation does not have info for a
    // field type, then "undefined" is printed
8577     rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8578     // Skip the field
8579     if (parse_long_name) {
8580       SKIP_TOKEN(*ptr);
8581       if (**ptr == '}')
8582         (*ptr)++;
8583     } else {
8584       (*ptr)++;
8585     }
8586   }
8587 
8588   KMP_ASSERT(format_index <= FORMAT_SIZE);
8589   return rc;
8590 }
8591 
8592 /*
8593  * Return number of characters needed to hold the affinity string
8594  * (not including null byte character)
8595  * The resultant string is printed to buffer, which the caller can then
8596  * handle afterwards
8597  */
8598 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8599                                   kmp_str_buf_t *buffer) {
8600   const char *parse_ptr;
8601   size_t retval;
8602   const kmp_info_t *th;
8603   kmp_str_buf_t field;
8604 
8605   KMP_DEBUG_ASSERT(buffer);
8606   KMP_DEBUG_ASSERT(gtid >= 0);
8607 
8608   __kmp_str_buf_init(&field);
8609   __kmp_str_buf_clear(buffer);
8610 
8611   th = __kmp_threads[gtid];
8612   retval = 0;
8613 
8614   // If format is NULL or zero-length string, then we use
8615   // affinity-format-var ICV
8616   parse_ptr = format;
8617   if (parse_ptr == NULL || *parse_ptr == '\0') {
8618     parse_ptr = __kmp_affinity_format;
8619   }
8620   KMP_DEBUG_ASSERT(parse_ptr);
8621 
8622   while (*parse_ptr != '\0') {
8623     // Parse a field
8624     if (*parse_ptr == '%') {
8625       // Put field in the buffer
8626       int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8627       __kmp_str_buf_catbuf(buffer, &field);
8628       retval += rc;
8629     } else {
8630       // Put literal character in buffer
8631       __kmp_str_buf_cat(buffer, parse_ptr, 1);
8632       retval++;
8633       parse_ptr++;
8634     }
8635   }
8636   __kmp_str_buf_free(&field);
8637   return retval;
8638 }
8639 
8640 // Displays the affinity string to stdout
8641 void __kmp_aux_display_affinity(int gtid, const char *format) {
8642   kmp_str_buf_t buf;
8643   __kmp_str_buf_init(&buf);
8644   __kmp_aux_capture_affinity(gtid, format, &buf);
8645   __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8646   __kmp_str_buf_free(&buf);
8647 }
8648 
8649 /* ------------------------------------------------------------------------ */
8650 
8651 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8652   int blocktime = arg; /* argument is in milliseconds */
8653 #if KMP_USE_MONITOR
8654   int bt_intervals;
8655 #endif
8656   kmp_int8 bt_set;
8657 
8658   __kmp_save_internal_controls(thread);
8659 
8660   /* Normalize and set blocktime for the teams */
8661   if (blocktime < KMP_MIN_BLOCKTIME)
8662     blocktime = KMP_MIN_BLOCKTIME;
8663   else if (blocktime > KMP_MAX_BLOCKTIME)
8664     blocktime = KMP_MAX_BLOCKTIME;
8665 
8666   set__blocktime_team(thread->th.th_team, tid, blocktime);
8667   set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8668 
8669 #if KMP_USE_MONITOR
8670   /* Calculate and set blocktime intervals for the teams */
8671   bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8672 
8673   set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8674   set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8675 #endif
8676 
8677   /* Set whether blocktime has been set to "TRUE" */
8678   bt_set = TRUE;
8679 
8680   set__bt_set_team(thread->th.th_team, tid, bt_set);
8681   set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8682 #if KMP_USE_MONITOR
8683   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8684                 "bt_intervals=%d, monitor_updates=%d\n",
8685                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8686                 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8687                 __kmp_monitor_wakeups));
8688 #else
8689   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8690                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8691                 thread->th.th_team->t.t_id, tid, blocktime));
8692 #endif
8693 }
8694 
8695 void __kmp_aux_set_defaults(char const *str, size_t len) {
8696   if (!__kmp_init_serial) {
8697     __kmp_serial_initialize();
8698   }
8699   __kmp_env_initialize(str);
8700 
8701   if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8702     __kmp_env_print();
8703   }
8704 } // __kmp_aux_set_defaults
8705 
8706 /* ------------------------------------------------------------------------ */
8707 /* internal fast reduction routines */
8708 
8709 PACKED_REDUCTION_METHOD_T
8710 __kmp_determine_reduction_method(
8711     ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8712     void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8713     kmp_critical_name *lck) {
8714 
  // Default reduction method: critical construct (lck != NULL, as in current
  // PAROPT).
  // If (reduce_data != NULL && reduce_func != NULL): the tree-reduction method
  // can be selected by the RTL.
  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
  // can be selected by the RTL.
  // Finally, it is up to the OpenMP RTL to decide which method to select among
  // those generated by PAROPT.
8723 
8724   PACKED_REDUCTION_METHOD_T retval;
8725 
8726   int team_size;
8727 
8728   KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8729   KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8730 
8731 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
8732   (loc &&                                                                      \
8733    ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE)))
8734 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8735 
8736   retval = critical_reduce_block;
8737 
  // An alternative way of getting the team size (with one dynamic dereference)
  // is slower.
8739   team_size = __kmp_get_team_num_threads(global_tid);
8740   if (team_size == 1) {
8741 
8742     retval = empty_reduce_block;
8743 
8744   } else {
8745 
8746     int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8747 
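    // Platform-specific tuning: choose between atomic, tree, and critical
    // section reductions based on team size, reduction size, and the number of
    // reduction variables.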
8748 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 ||                   \
8749     KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64
8750 
8751 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
8752     KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8753 
8754     int teamsize_cutoff = 4;
8755 
8756 #if KMP_MIC_SUPPORTED
8757     if (__kmp_mic_type != non_mic) {
8758       teamsize_cutoff = 8;
8759     }
8760 #endif
8761     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8762     if (tree_available) {
8763       if (team_size <= teamsize_cutoff) {
8764         if (atomic_available) {
8765           retval = atomic_reduce_block;
8766         }
8767       } else {
8768         retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8769       }
8770     } else if (atomic_available) {
8771       retval = atomic_reduce_block;
8772     }
8773 #else
8774 #error "Unknown or unsupported OS"
8775 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8776        // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8777 
8778 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8779 
8780 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD
8781 
8782     // basic tuning
8783 
8784     if (atomic_available) {
8785       if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8786         retval = atomic_reduce_block;
8787       }
8788     } // otherwise: use critical section
8789 
8790 #elif KMP_OS_DARWIN
8791 
8792     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8793     if (atomic_available && (num_vars <= 3)) {
8794       retval = atomic_reduce_block;
8795     } else if (tree_available) {
8796       if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8797           (reduce_size < (2000 * sizeof(kmp_real64)))) {
8798         retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8799       }
8800     } // otherwise: use critical section
8801 
8802 #else
8803 #error "Unknown or unsupported OS"
8804 #endif
8805 
8806 #else
8807 #error "Unknown or unsupported architecture"
8808 #endif
8809   }
8810 
8811   // KMP_FORCE_REDUCTION
8812 
8813   // If the team is serialized (team_size == 1), ignore the forced reduction
8814   // method and stay with the unsynchronized method (empty_reduce_block)
8815   if (__kmp_force_reduction_method != reduction_method_not_defined &&
8816       team_size != 1) {
8817 
8818     PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8819 
8820     int atomic_available, tree_available;
8821 
8822     switch ((forced_retval = __kmp_force_reduction_method)) {
8823     case critical_reduce_block:
8824       KMP_ASSERT(lck); // lck should be != 0
8825       break;
8826 
8827     case atomic_reduce_block:
8828       atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8829       if (!atomic_available) {
8830         KMP_WARNING(RedMethodNotSupported, "atomic");
8831         forced_retval = critical_reduce_block;
8832       }
8833       break;
8834 
8835     case tree_reduce_block:
8836       tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8837       if (!tree_available) {
8838         KMP_WARNING(RedMethodNotSupported, "tree");
8839         forced_retval = critical_reduce_block;
8840       } else {
8841 #if KMP_FAST_REDUCTION_BARRIER
8842         forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8843 #endif
8844       }
8845       break;
8846 
8847     default:
8848       KMP_ASSERT(0); // "unsupported method specified"
8849     }
8850 
8851     retval = forced_retval;
8852   }
8853 
8854   KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8855 
8856 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
8857 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8858 
8859   return (retval);
8860 }
8861 // this function is for testing set/get/determine reduce method
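// The packed value keeps the reduction method in the upper bits and (for tree
// reductions) the barrier type in the low byte, so shifting right by 8 yields
// just the method (see the PACKED_REDUCTION_METHOD_T helpers in kmp.h).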
8862 kmp_int32 __kmp_get_reduce_method(void) {
8863   return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8864 }
8865 
8866 // Soft pause sets up threads to ignore blocktime and just go to sleep.
8867 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
8868 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8869 
8870 // Hard pause shuts down the runtime completely.  Resume happens naturally when
8871 // OpenMP is used subsequently.
8872 void __kmp_hard_pause() {
8873   __kmp_pause_status = kmp_hard_paused;
8874   __kmp_internal_end_thread(-1);
8875 }
8876 
8877 // Soft resume sets __kmp_pause_status, and wakes up all threads.
8878 void __kmp_resume_if_soft_paused() {
8879   if (__kmp_pause_status == kmp_soft_paused) {
8880     __kmp_pause_status = kmp_not_paused;
8881 
8882     for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8883       kmp_info_t *thread = __kmp_threads[gtid];
8884       if (thread) { // Wake it if sleeping
8885         kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
8886                          thread);
8887         if (fl.is_sleeping())
8888           fl.resume(gtid);
8889         else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8890           __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8891         } else { // thread holds the lock and may sleep soon
8892           do { // until either the thread sleeps, or we can get the lock
8893             if (fl.is_sleeping()) {
8894               fl.resume(gtid);
8895               break;
8896             } else if (__kmp_try_suspend_mx(thread)) {
8897               __kmp_unlock_suspend_mx(thread);
8898               break;
8899             }
8900           } while (1);
8901         }
8902       }
8903     }
8904   }
8905 }
8906 
8907 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
8908 // TODO: add warning messages
8909 int __kmp_pause_resource(kmp_pause_status_t level) {
8910   if (level == kmp_not_paused) { // requesting resume
8911     if (__kmp_pause_status == kmp_not_paused) {
8912       // error message about runtime not being paused, so can't resume
8913       return 1;
8914     } else {
8915       KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
8916                        __kmp_pause_status == kmp_hard_paused);
8917       __kmp_pause_status = kmp_not_paused;
8918       return 0;
8919     }
8920   } else if (level == kmp_soft_paused) { // requesting soft pause
8921     if (__kmp_pause_status != kmp_not_paused) {
8922       // error message about already being paused
8923       return 1;
8924     } else {
8925       __kmp_soft_pause();
8926       return 0;
8927     }
8928   } else if (level == kmp_hard_paused) { // requesting hard pause
8929     if (__kmp_pause_status != kmp_not_paused) {
8930       // error message about already being paused
8931       return 1;
8932     } else {
8933       __kmp_hard_pause();
8934       return 0;
8935     }
8936   } else {
8937     // error message about invalid level
8938     return 1;
8939   }
8940 }
8941 
8942 void __kmp_omp_display_env(int verbose) {
8943   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8944   if (__kmp_init_serial == 0)
8945     __kmp_do_serial_initialize();
8946   __kmp_display_env_impl(!verbose, verbose);
8947   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8948 }
8949 
8950 // The team size is changing, so distributed barrier must be modified
8951 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
8952                                int new_nthreads) {
8953   KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
8954                    bp_dist_bar);
8955   kmp_info_t **other_threads = team->t.t_threads;
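  // th_used_in_team acts as a small state machine here: 1 = in the team,
  // 0 = not in the team, 3 = transitioning into the team, and 2 =
  // transitioning out of the team (set below; workers observe 2 and move
  // themselves to 0).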
8956 
8957   // We want all the workers to stop waiting on the barrier while we adjust the
8958   // size of the team.
8959   for (int f = 1; f < old_nthreads; ++f) {
8960     KMP_DEBUG_ASSERT(other_threads[f] != NULL);
8961     // Ignore threads that are already inactive or not present in the team
8962     if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) {
8963       // teams construct causes thread_limit to get passed in, and some of
8964       // those could be inactive; just ignore them
8965       continue;
8966     }
8967     // If thread is transitioning still to in_use state, wait for it
8968     if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) {
8969       while (team->t.t_threads[f]->th.th_used_in_team.load() == 3)
8970         KMP_CPU_PAUSE();
8971     }
8972     // The thread should be in_use now
8973     KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1);
8974     // Transition to unused state
8975     team->t.t_threads[f]->th.th_used_in_team.store(2);
8976     KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2);
8977   }
8978   // Release all the workers
8979   team->t.b->go_release();
8980 
8981   KMP_MFENCE();
8982 
  // Workers should see transition status 2 and move to 0, but they may need to
  // be woken up first.
8985   int count = old_nthreads - 1;
8986   while (count > 0) {
8987     count = old_nthreads - 1;
8988     for (int f = 1; f < old_nthreads; ++f) {
8989       if (other_threads[f]->th.th_used_in_team.load() != 0) {
8990         if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers
8991           kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST(
8992               void *, other_threads[f]->th.th_sleep_loc);
8993           __kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag);
8994         }
8995       } else {
8996         KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0);
8997         count--;
8998       }
8999     }
9000   }
9001   // Now update the barrier size
9002   team->t.b->update_num_threads(new_nthreads);
9003   team->t.b->go_reset();
9004 }
9005 
9006 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) {
9007   // Add the threads back to the team
9008   KMP_DEBUG_ASSERT(team);
9009   // Threads were paused and pointed at th_used_in_team temporarily during a
9010   // resize of the team. We're going to set th_used_in_team to 3 to indicate to
9011   // the thread that it should transition itself back into the team. Then, if
9012   // blocktime isn't infinite, the thread could be sleeping, so we send a resume
9013   // to wake it up.
9014   for (int f = 1; f < new_nthreads; ++f) {
9015     KMP_DEBUG_ASSERT(team->t.t_threads[f]);
9016     KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 0,
9017                                 3);
9018     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads
9019       __kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid,
9020                       (kmp_flag_32<false, false> *)NULL);
9021     }
9022   }
  // The threads should be transitioning to the team; when they are done, they
  // should have set th_used_in_team to 1. This loop forces the master thread
  // to wait until all threads have moved into the team and are waiting in the
  // barrier.
9026   int count = new_nthreads - 1;
9027   while (count > 0) {
9028     count = new_nthreads - 1;
9029     for (int f = 1; f < new_nthreads; ++f) {
9030       if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) {
9031         count--;
9032       }
9033     }
9034   }
9035 }
9036 
9037 // Globals and functions for hidden helper task
9038 kmp_info_t **__kmp_hidden_helper_threads;
9039 kmp_info_t *__kmp_hidden_helper_main_thread;
9040 std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
9041 #if KMP_OS_LINUX
9042 kmp_int32 __kmp_hidden_helper_threads_num = 8;
9043 kmp_int32 __kmp_enable_hidden_helper = TRUE;
9044 #else
9045 kmp_int32 __kmp_hidden_helper_threads_num = 0;
9046 kmp_int32 __kmp_enable_hidden_helper = FALSE;
9047 #endif
9048 
9049 namespace {
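// Number of hidden helper threads that have entered the wrapper function
// below; used to make sure the whole hidden helper team has started up before
// any of its threads proceeds.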
9050 std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
9051 
9052 void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
  // This is an explicit synchronization of all hidden helper threads. It
  // guards against the case where a regular thread pushes a hidden helper task
  // to a hidden helper thread that has not yet been awakened since the main
  // thread released them after creating the team.
9057   KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
9058   while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
9059          __kmp_hidden_helper_threads_num)
9060     ;
9061 
9062   // If main thread, then wait for signal
9063   if (__kmpc_master(nullptr, *gtid)) {
9064     // First, unset the initial state and release the initial thread
9065     TCW_4(__kmp_init_hidden_helper_threads, FALSE);
9066     __kmp_hidden_helper_initz_release();
9067     __kmp_hidden_helper_main_thread_wait();
9068     // Now wake up all worker threads
9069     for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
9070       __kmp_hidden_helper_worker_thread_signal();
9071     }
9072   }
9073 }
9074 } // namespace
9075 
9076 void __kmp_hidden_helper_threads_initz_routine() {
9077   // Create a new root for hidden helper team/threads
9078   const int gtid = __kmp_register_root(TRUE);
9079   __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
9080   __kmp_hidden_helper_threads = &__kmp_threads[gtid];
9081   __kmp_hidden_helper_main_thread->th.th_set_nproc =
9082       __kmp_hidden_helper_threads_num;
9083 
9084   KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
9085 
9086   __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
9087 
9088   // Set the initialization flag to FALSE
9089   TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
9090 
9091   __kmp_hidden_helper_threads_deinitz_release();
9092 }
9093 
9094 /* Nesting Mode:
9095    Set via KMP_NESTING_MODE, which takes an integer.
9096    Note: we skip duplicate topology levels, and skip levels with only
9097       one entity.
9098    KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
9099    KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
9100       in the topology, and initializes the number of threads at each of those
9101       levels to the number of entities at each level, respectively, below the
9102       entity at the parent level.
9103    KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
9104       but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
9105       the user to turn nesting on explicitly. This is an even more experimental
9106       option to this experimental feature, and may change or go away in the
9107       future.
9108 */
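// Example (illustrative): on a machine with 2 sockets, 8 cores per socket, and
// 2 hardware threads per core, KMP_NESTING_MODE=1 would set up three nesting
// levels with 2, 8, and 2 threads, respectively.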
9109 
9110 // Allocate space to store nesting levels
9111 void __kmp_init_nesting_mode() {
9112   int levels = KMP_HW_LAST;
9113   __kmp_nesting_mode_nlevels = levels;
9114   __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
9115   for (int i = 0; i < levels; ++i)
9116     __kmp_nesting_nth_level[i] = 0;
9117   if (__kmp_nested_nth.size < levels) {
9118     __kmp_nested_nth.nth =
9119         (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
9120     __kmp_nested_nth.size = levels;
9121   }
9122 }
9123 
9124 // Set # threads for top levels of nesting; must be called after topology set
9125 void __kmp_set_nesting_mode_threads() {
9126   kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];
9127 
9128   if (__kmp_nesting_mode == 1)
9129     __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
9130   else if (__kmp_nesting_mode > 1)
9131     __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9132 
9133   if (__kmp_topology) { // use topology info
9134     int loc, hw_level;
9135     for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
9136                                 loc < __kmp_nesting_mode_nlevels;
9137          loc++, hw_level++) {
9138       __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
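      // Per the note above, a level with only one entity is skipped by reusing
      // this slot on the next iteration.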
9139       if (__kmp_nesting_nth_level[loc] == 1)
9140         loc--;
9141     }
9142     // Make sure all cores are used
9143     if (__kmp_nesting_mode > 1 && loc > 1) {
9144       int core_level = __kmp_topology->get_level(KMP_HW_CORE);
9145       int num_cores = __kmp_topology->get_count(core_level);
9146       int upper_levels = 1;
9147       for (int level = 0; level < loc - 1; ++level)
9148         upper_levels *= __kmp_nesting_nth_level[level];
9149       if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
9150         __kmp_nesting_nth_level[loc - 1] =
9151             num_cores / __kmp_nesting_nth_level[loc - 2];
9152     }
9153     __kmp_nesting_mode_nlevels = loc;
9154     __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9155   } else { // no topology info available; provide a reasonable guesstimation
9156     if (__kmp_avail_proc >= 4) {
9157       __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
9158       __kmp_nesting_nth_level[1] = 2;
9159       __kmp_nesting_mode_nlevels = 2;
9160     } else {
9161       __kmp_nesting_nth_level[0] = __kmp_avail_proc;
9162       __kmp_nesting_mode_nlevels = 1;
9163     }
9164     __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9165   }
9166   for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
9167     __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
9168   }
9169   set__nproc(thread, __kmp_nesting_nth_level[0]);
9170   if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
9171     __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9172   if (get__max_active_levels(thread) > 1) {
9173     // if max levels was set, set nesting mode levels to same
9174     __kmp_nesting_mode_nlevels = get__max_active_levels(thread);
9175   }
9176   if (__kmp_nesting_mode == 1) // turn on nesting for this case only
9177     set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
9178 }
9179