1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
29 #endif
30 
31 #if OMPT_SUPPORT
32 #include "ompt-specific.h"
33 #endif
34 #if OMPD_SUPPORT
35 #include "ompd-specific.h"
36 #endif
37 
38 #if OMP_PROFILING_SUPPORT
39 #include "llvm/Support/TimeProfiler.h"
40 static char *ProfileTraceFile = nullptr;
41 #endif
42 
43 /* these are temporary issues to be dealt with */
44 #define KMP_USE_PRCTL 0
45 
46 #if KMP_OS_WINDOWS
47 #include <process.h>
48 #endif
49 
50 #if KMP_OS_WINDOWS
51 // Windows does not need these include files as it doesn't use shared memory
52 #else
53 #include <sys/mman.h>
54 #include <sys/stat.h>
55 #include <fcntl.h>
56 #define SHM_SIZE 1024
57 #endif
58 
59 #if defined(KMP_GOMP_COMPAT)
60 char const __kmp_version_alt_comp[] =
61     KMP_VERSION_PREFIX "alternative compiler support: yes";
62 #endif /* defined(KMP_GOMP_COMPAT) */
63 
64 char const __kmp_version_omp_api[] =
65     KMP_VERSION_PREFIX "API version: 5.0 (201611)";
66 
67 #ifdef KMP_DEBUG
68 char const __kmp_version_lock[] =
69     KMP_VERSION_PREFIX "lock type: run time selectable";
70 #endif /* KMP_DEBUG */
71 
72 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
73 
74 /* ------------------------------------------------------------------------ */
75 
76 #if KMP_USE_MONITOR
77 kmp_info_t __kmp_monitor;
78 #endif
79 
80 /* Forward declarations */
81 
82 void __kmp_cleanup(void);
83 
84 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
85                                   int gtid);
86 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
87                                   kmp_internal_control_t *new_icvs,
88                                   ident_t *loc);
89 #if KMP_AFFINITY_SUPPORTED
90 static void __kmp_partition_places(kmp_team_t *team,
91                                    int update_master_only = 0);
92 #endif
93 static void __kmp_do_serial_initialize(void);
94 void __kmp_fork_barrier(int gtid, int tid);
95 void __kmp_join_barrier(int gtid);
96 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
97                           kmp_internal_control_t *new_icvs, ident_t *loc);
98 
99 #ifdef USE_LOAD_BALANCE
100 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
101 #endif
102 
103 static int __kmp_expand_threads(int nNeed);
104 #if KMP_OS_WINDOWS
105 static int __kmp_unregister_root_other_thread(int gtid);
106 #endif
107 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
108 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
109 
110 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
111                                int new_nthreads);
112 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads);
113 
114 /* Calculate the identifier of the current thread. This is a fast (and
115    somewhat portable) way to get a unique identifier for the executing
116    thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
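/* Editorial sketch (not part of the original source): depending on
   __kmp_gtid_mode the lookup below prefers, in order,
     1. the thread-local __kmp_gtid variable          (mode >= 3, KMP_TDATA_GTID),
     2. the keyed TLS value __kmp_gtid_get_specific() (mode >= 2),
     3. a scan of __kmp_threads[] that compares the address of a local
        variable against each registered thread's stack window, roughly:

          char local;                                  // lives on the caller's stack
          char *base  = (char *)thr->th.th_info.ds.ds_stackbase;  // high end
          size_t size = thr->th.th_info.ds.ds_stacksize;
          bool on_this_stack =
              (&local <= base) && ((size_t)(base - &local) <= size);
*/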
117 int __kmp_get_global_thread_id() {
118   int i;
119   kmp_info_t **other_threads;
120   size_t stack_data;
121   char *stack_addr;
122   size_t stack_size;
123   char *stack_base;
124 
125   KA_TRACE(
126       1000,
127       ("*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
128        __kmp_nth, __kmp_all_nth));
129 
130   /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
131      a parallel region, made it return KMP_GTID_DNE to force serial_initialize
132      by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
133      __kmp_init_gtid for this to work. */
134 
135   if (!TCR_4(__kmp_init_gtid))
136     return KMP_GTID_DNE;
137 
138 #ifdef KMP_TDATA_GTID
139   if (TCR_4(__kmp_gtid_mode) >= 3) {
140     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
141     return __kmp_gtid;
142   }
143 #endif
144   if (TCR_4(__kmp_gtid_mode) >= 2) {
145     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
146     return __kmp_gtid_get_specific();
147   }
148   KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
149 
150   stack_addr = (char *)&stack_data;
151   other_threads = __kmp_threads;
152 
153   /* ATT: The code below is a source of potential bugs due to unsynchronized
154      access to __kmp_threads array. For example:
155      1. Current thread loads other_threads[i] to thr and checks it, it is
156         non-NULL.
157      2. Current thread is suspended by OS.
158      3. Another thread unregisters and finishes (debug versions of free()
159         may fill memory with something like 0xEF).
160      4. Current thread is resumed.
161      5. Current thread reads junk from *thr.
162      TODO: Fix it.  --ln  */
163 
164   for (i = 0; i < __kmp_threads_capacity; i++) {
165 
166     kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
167     if (!thr)
168       continue;
169 
170     stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
171     stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
172 
173     /* stack grows down -- search through all of the active threads */
174 
175     if (stack_addr <= stack_base) {
176       size_t stack_diff = stack_base - stack_addr;
177 
178       if (stack_diff <= stack_size) {
179         /* The only way we can be closer than the allocated
180            stack size is if we are running on this thread. */
181         KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
182         return i;
183       }
184     }
185   }
186 
187   /* get specific to try and determine our gtid */
188   KA_TRACE(1000,
189            ("*** __kmp_get_global_thread_id: internal alg. failed to find "
190             "thread, using TLS\n"));
191   i = __kmp_gtid_get_specific();
192 
193   /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
194 
195   /* if we haven't been assigned a gtid, then return the code */
196   if (i < 0)
197     return i;
198 
199   /* dynamically updated stack window for uber threads to avoid get_specific
200      call */
201   if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
202     KMP_FATAL(StackOverflow, i);
203   }
204 
205   stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
206   if (stack_addr > stack_base) {
207     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
208     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
209             other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
210                 stack_base);
211   } else {
212     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
213             stack_base - stack_addr);
214   }
215 
216   /* Reprint stack bounds for ubermaster since they have been refined */
217   if (__kmp_storage_map) {
218     char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
219     char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
220     __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
221                                  other_threads[i]->th.th_info.ds.ds_stacksize,
222                                  "th_%d stack (refinement)", i);
223   }
224   return i;
225 }
226 
227 int __kmp_get_global_thread_id_reg() {
228   int gtid;
229 
230   if (!__kmp_init_serial) {
231     gtid = KMP_GTID_DNE;
232   } else
233 #ifdef KMP_TDATA_GTID
234       if (TCR_4(__kmp_gtid_mode) >= 3) {
235     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
236     gtid = __kmp_gtid;
237   } else
238 #endif
239       if (TCR_4(__kmp_gtid_mode) >= 2) {
240     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
241     gtid = __kmp_gtid_get_specific();
242   } else {
243     KA_TRACE(1000,
244              ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
245     gtid = __kmp_get_global_thread_id();
246   }
247 
248   /* we must be a new uber master sibling thread */
249   if (gtid == KMP_GTID_DNE) {
250     KA_TRACE(10,
251              ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
252               "Registering a new gtid.\n"));
253     __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
254     if (!__kmp_init_serial) {
255       __kmp_do_serial_initialize();
256       gtid = __kmp_gtid_get_specific();
257     } else {
258       gtid = __kmp_register_root(FALSE);
259     }
260     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
261     /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
262   }
263 
264   KMP_DEBUG_ASSERT(gtid >= 0);
265 
266   return gtid;
267 }
268 
269 /* caller must hold forkjoin_lock */
270 void __kmp_check_stack_overlap(kmp_info_t *th) {
271   int f;
272   char *stack_beg = NULL;
273   char *stack_end = NULL;
274   int gtid;
275 
276   KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
277   if (__kmp_storage_map) {
278     stack_end = (char *)th->th.th_info.ds.ds_stackbase;
279     stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
280 
281     gtid = __kmp_gtid_from_thread(th);
282 
283     if (gtid == KMP_GTID_MONITOR) {
284       __kmp_print_storage_map_gtid(
285           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
286           "th_%s stack (%s)", "mon",
287           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
288     } else {
289       __kmp_print_storage_map_gtid(
290           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
291           "th_%d stack (%s)", gtid,
292           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
293     }
294   }
295 
296   /* No point in checking ubermaster threads since they use refinement and
297    * cannot overlap */
298   gtid = __kmp_gtid_from_thread(th);
299   if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
300     KA_TRACE(10,
301              ("__kmp_check_stack_overlap: performing extensive checking\n"));
302     if (stack_beg == NULL) {
303       stack_end = (char *)th->th.th_info.ds.ds_stackbase;
304       stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
305     }
306 
307     for (f = 0; f < __kmp_threads_capacity; f++) {
308       kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
309 
310       if (f_th && f_th != th) {
311         char *other_stack_end =
312             (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
313         char *other_stack_beg =
314             other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
315         if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
316             (stack_end > other_stack_beg && stack_end < other_stack_end)) {
317 
318           /* Print the other stack values before the abort */
319           if (__kmp_storage_map)
320             __kmp_print_storage_map_gtid(
321                 -1, other_stack_beg, other_stack_end,
322                 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
323                 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
324 
325           __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
326                       __kmp_msg_null);
327         }
328       }
329     }
330   }
331   KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
332 }
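/* Editorial note (sketch, not part of the original source): the extensive
   check above treats each stack as the interval [beg, end) computed from
   ds_stackbase and ds_stacksize, and aborts with StackOverlap when either
   endpoint of the current thread's interval lies strictly inside another
   live thread's interval:

     bool overlaps(char *beg, char *end, char *o_beg, char *o_end) {
       return (beg > o_beg && beg < o_end) || (end > o_beg && end < o_end);
     }
*/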
333 
334 /* ------------------------------------------------------------------------ */
335 
336 void __kmp_infinite_loop(void) {
337   static int done = FALSE;
338 
339   while (!done) {
340     KMP_YIELD(TRUE);
341   }
342 }
343 
344 #define MAX_MESSAGE 512
345 
346 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
347                                   char const *format, ...) {
348   char buffer[MAX_MESSAGE];
349   va_list ap;
350 
351   va_start(ap, format);
352   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
353                p2, (unsigned long)size, format);
354   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
355   __kmp_vprintf(kmp_err, buffer, ap);
356 #if KMP_PRINT_DATA_PLACEMENT
357   int node;
358   if (gtid >= 0) {
359     if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
360       if (__kmp_storage_map_verbose) {
361         node = __kmp_get_host_node(p1);
362         if (node < 0) /* doesn't work, so don't try this next time */
363           __kmp_storage_map_verbose = FALSE;
364         else {
365           char *last;
366           int lastNode;
367           int localProc = __kmp_get_cpu_from_gtid(gtid);
368 
369           const int page_size = KMP_GET_PAGE_SIZE();
370 
371           p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
372           p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
373           if (localProc >= 0)
374             __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
375                                  localProc >> 1);
376           else
377             __kmp_printf_no_lock("  GTID %d\n", gtid);
378 #if KMP_USE_PRCTL
379           /* The more elaborate format is disabled for now because of the prctl
380            * hanging bug. */
381           do {
382             last = p1;
383             lastNode = node;
384             /* This loop collates adjacent pages with the same host node. */
385             do {
386               (char *)p1 += page_size;
387             } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
388             __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
389                                  lastNode);
390           } while (p1 <= p2);
391 #else
392           __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
393                                (char *)p1 + (page_size - 1),
394                                __kmp_get_host_node(p1));
395           if (p1 < p2) {
396             __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
397                                  (char *)p2 + (page_size - 1),
398                                  __kmp_get_host_node(p2));
399           }
400 #endif
401         }
402       }
403     } else
404       __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
405   }
406 #endif /* KMP_PRINT_DATA_PLACEMENT */
407   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
408 }
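/* Usage sketch (values are hypothetical): callers report one contiguous
   allocation per call, for example

     __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[0],
                                  &thr->th.th_bar[bs_last_barrier],
                                  sizeof(kmp_balign_t) * bs_last_barrier,
                                  "th_%d.th_bar", gtid);

   which emits a line of the form
     OMP storage map: <p1> <p2>    <size> th_3.th_bar
   on kmp_err while holding the bootstrap stdio lock. */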
409 
410 void __kmp_warn(char const *format, ...) {
411   char buffer[MAX_MESSAGE];
412   va_list ap;
413 
414   if (__kmp_generate_warnings == kmp_warnings_off) {
415     return;
416   }
417 
418   va_start(ap, format);
419 
420   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
421   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
422   __kmp_vprintf(kmp_err, buffer, ap);
423   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
424 
425   va_end(ap);
426 }
427 
428 void __kmp_abort_process() {
429   // Later threads may stall here, but that's ok because abort() will kill them.
430   __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
431 
432   if (__kmp_debug_buf) {
433     __kmp_dump_debug_buffer();
434   }
435 
436   if (KMP_OS_WINDOWS) {
437     // Let other threads know of abnormal termination and prevent deadlock
438     // if abort happened during library initialization or shutdown
439     __kmp_global.g.g_abort = SIGABRT;
440 
441     /* On Windows* OS by default abort() causes pop-up error box, which stalls
442        nightly testing. Unfortunately, we cannot reliably suppress pop-up error
443        boxes. _set_abort_behavior() works well, but this function is not
444        available in VS7 (this is not problem for DLL, but it is a problem for
445        static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not
446        help, at least in some versions of MS C RTL.
447 
448        It seems following sequence is the only way to simulate abort() and
449        avoid pop-up error box. */
450     raise(SIGABRT);
451     _exit(3); // Just in case, if signal ignored, exit anyway.
452   } else {
453     __kmp_unregister_library();
454     abort();
455   }
456 
457   __kmp_infinite_loop();
458   __kmp_release_bootstrap_lock(&__kmp_exit_lock);
459 
460 } // __kmp_abort_process
461 
462 void __kmp_abort_thread(void) {
463   // TODO: Eliminate g_abort global variable and this function.
464   // In case of abort just call abort(), it will kill all the threads.
465   __kmp_infinite_loop();
466 } // __kmp_abort_thread
467 
468 /* Print out the storage map for the major kmp_info_t thread data structures
469    that are allocated together. */
470 
471 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
472   __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
473                                gtid);
474 
475   __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
476                                sizeof(kmp_desc_t), "th_%d.th_info", gtid);
477 
478   __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
479                                sizeof(kmp_local_t), "th_%d.th_local", gtid);
480 
481   __kmp_print_storage_map_gtid(
482       gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
483       sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
484 
485   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
486                                &thr->th.th_bar[bs_plain_barrier + 1],
487                                sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
488                                gtid);
489 
490   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
491                                &thr->th.th_bar[bs_forkjoin_barrier + 1],
492                                sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
493                                gtid);
494 
495 #if KMP_FAST_REDUCTION_BARRIER
496   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
497                                &thr->th.th_bar[bs_reduction_barrier + 1],
498                                sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
499                                gtid);
500 #endif // KMP_FAST_REDUCTION_BARRIER
501 }
502 
503 /* Print out the storage map for the major kmp_team_t team data structures
504    that are allocated together. */
505 
506 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
507                                          int team_id, int num_thr) {
508   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
509   __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
510                                header, team_id);
511 
512   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
513                                &team->t.t_bar[bs_last_barrier],
514                                sizeof(kmp_balign_team_t) * bs_last_barrier,
515                                "%s_%d.t_bar", header, team_id);
516 
517   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
518                                &team->t.t_bar[bs_plain_barrier + 1],
519                                sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
520                                header, team_id);
521 
522   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
523                                &team->t.t_bar[bs_forkjoin_barrier + 1],
524                                sizeof(kmp_balign_team_t),
525                                "%s_%d.t_bar[forkjoin]", header, team_id);
526 
527 #if KMP_FAST_REDUCTION_BARRIER
528   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
529                                &team->t.t_bar[bs_reduction_barrier + 1],
530                                sizeof(kmp_balign_team_t),
531                                "%s_%d.t_bar[reduction]", header, team_id);
532 #endif // KMP_FAST_REDUCTION_BARRIER
533 
534   __kmp_print_storage_map_gtid(
535       -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
536       sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
537 
538   __kmp_print_storage_map_gtid(
539       -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
540       sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
541 
542   __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
543                                &team->t.t_disp_buffer[num_disp_buff],
544                                sizeof(dispatch_shared_info_t) * num_disp_buff,
545                                "%s_%d.t_disp_buffer", header, team_id);
546 }
547 
548 static void __kmp_init_allocator() {
549   __kmp_init_memkind();
550   __kmp_init_target_mem();
551 }
552 static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
553 
554 /* ------------------------------------------------------------------------ */
555 
556 #if KMP_DYNAMIC_LIB
557 #if KMP_OS_WINDOWS
558 
559 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
560   //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
561 
562   switch (fdwReason) {
563 
564   case DLL_PROCESS_ATTACH:
565     KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
566 
567     return TRUE;
568 
569   case DLL_PROCESS_DETACH:
570     KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
571 
572     // According to Windows* documentation for DllMain entry point:
573     // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
574     //   lpReserved == NULL when FreeLibrary() is called,
575     //   lpReserved != NULL when the process is terminated.
576     // When FreeLibrary() is called, worker threads remain alive. So the
577     // runtime's state is consistent and executing proper shutdown is OK.
578     // When the process is terminated, worker threads have exited or been
579     // forcefully terminated by the OS and only the shutdown thread remains.
580     // This can leave the runtime in an inconsistent state.
581     // Hence, only attempt proper cleanup when FreeLibrary() is called.
582     // Otherwise, rely on OS to reclaim resources.
583     if (lpReserved == NULL)
584       __kmp_internal_end_library(__kmp_gtid_get_specific());
585 
586     return TRUE;
587 
588   case DLL_THREAD_ATTACH:
589     KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
590 
591     /* if we want to register new siblings all the time here call
592      * __kmp_get_gtid(); */
593     return TRUE;
594 
595   case DLL_THREAD_DETACH:
596     KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
597 
598     __kmp_internal_end_thread(__kmp_gtid_get_specific());
599     return TRUE;
600   }
601 
602   return TRUE;
603 }
604 
605 #endif /* KMP_OS_WINDOWS */
606 #endif /* KMP_DYNAMIC_LIB */
607 
608 /* __kmp_parallel_deo -- Wait until it's our turn. */
609 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
610   int gtid = *gtid_ref;
611 #ifdef BUILD_PARALLEL_ORDERED
612   kmp_team_t *team = __kmp_team_from_gtid(gtid);
613 #endif /* BUILD_PARALLEL_ORDERED */
614 
615   if (__kmp_env_consistency_check) {
616     if (__kmp_threads[gtid]->th.th_root->r.r_active)
617 #if KMP_USE_DYNAMIC_LOCK
618       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
619 #else
620       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
621 #endif
622   }
623 #ifdef BUILD_PARALLEL_ORDERED
624   if (!team->t.t_serialized) {
625     KMP_MB();
626     KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
627              NULL);
628     KMP_MB();
629   }
630 #endif /* BUILD_PARALLEL_ORDERED */
631 }
632 
633 /* __kmp_parallel_dxo -- Signal the next task. */
634 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
635   int gtid = *gtid_ref;
636 #ifdef BUILD_PARALLEL_ORDERED
637   int tid = __kmp_tid_from_gtid(gtid);
638   kmp_team_t *team = __kmp_team_from_gtid(gtid);
639 #endif /* BUILD_PARALLEL_ORDERED */
640 
641   if (__kmp_env_consistency_check) {
642     if (__kmp_threads[gtid]->th.th_root->r.r_active)
643       __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
644   }
645 #ifdef BUILD_PARALLEL_ORDERED
646   if (!team->t.t_serialized) {
647     KMP_MB(); /* Flush all pending memory write invalidates.  */
648 
649     /* use the tid of the next thread in this team */
650     /* TODO replace with general release procedure */
651     team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
652 
653     KMP_MB(); /* Flush all pending memory write invalidates.  */
654   }
655 #endif /* BUILD_PARALLEL_ORDERED */
656 }
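/* Editorial sketch of the ordered hand-off implemented by deo/dxo above:
   team->t.t_ordered.dt.t_value holds the tid whose turn it currently is.

     __kmp_parallel_deo: KMP_WAIT until t_value == tid   // wait for our turn
     __kmp_parallel_dxo: t_value = (tid + 1) % t_nproc   // pass the turn on

   so the threads of a non-serialized team enter their ordered regions in
   thread-id order. */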
657 
658 /* ------------------------------------------------------------------------ */
659 /* The BARRIER for a SINGLE process section is always explicit   */
660 
661 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
662   int status;
663   kmp_info_t *th;
664   kmp_team_t *team;
665 
666   if (!TCR_4(__kmp_init_parallel))
667     __kmp_parallel_initialize();
668   __kmp_resume_if_soft_paused();
669 
670   th = __kmp_threads[gtid];
671   team = th->th.th_team;
672   status = 0;
673 
674   th->th.th_ident = id_ref;
675 
676   if (team->t.t_serialized) {
677     status = 1;
678   } else {
679     kmp_int32 old_this = th->th.th_local.this_construct;
680 
681     ++th->th.th_local.this_construct;
682     /* try to set team count to thread count--success means thread got the
683        single block */
684     /* TODO: Should this be acquire or release? */
685     if (team->t.t_construct == old_this) {
686       status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
687                                               th->th.th_local.this_construct);
688     }
689 #if USE_ITT_BUILD
690     if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
691         KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
692         team->t.t_active_level == 1) {
693       // Only report metadata by primary thread of active team at level 1
694       __kmp_itt_metadata_single(id_ref);
695     }
696 #endif /* USE_ITT_BUILD */
697   }
698 
699   if (__kmp_env_consistency_check) {
700     if (status && push_ws) {
701       __kmp_push_workshare(gtid, ct_psingle, id_ref);
702     } else {
703       __kmp_check_workshare(gtid, ct_psingle, id_ref);
704     }
705   }
706 #if USE_ITT_BUILD
707   if (status) {
708     __kmp_itt_single_start(gtid);
709   }
710 #endif /* USE_ITT_BUILD */
711   return status;
712 }
713 
714 void __kmp_exit_single(int gtid) {
715 #if USE_ITT_BUILD
716   __kmp_itt_single_end(gtid);
717 #endif /* USE_ITT_BUILD */
718   if (__kmp_env_consistency_check)
719     __kmp_pop_workshare(gtid, ct_psingle, NULL);
720 }
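/* Editorial sketch (an assumption about the compiler-generated pairing, not
   a quote from the library): a SINGLE construct is expected to wrap these
   two entry points roughly as

     if (__kmp_enter_single(gtid, loc, TRUE)) {
       ... body of the single region ...
       __kmp_exit_single(gtid);
     }
     // followed by a barrier unless a nowait clause was present

   Only the thread that wins the atomic race on team->t.t_construct receives
   a nonzero status and executes the body. */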
721 
722 /* Determine whether we can go parallel or must use a serialized parallel
723  * region, and how many threads we can use.
724  * set_nthreads is the number of threads requested for the team.
725  * Returns 1 if we should serialize or only use one thread,
726  * otherwise the number of threads to use.
727  * The forkjoin lock is held by the caller. */
728 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
729                                  int master_tid, int set_nthreads,
730                                  int enter_teams) {
731   int capacity;
732   int new_nthreads;
733   KMP_DEBUG_ASSERT(__kmp_init_serial);
734   KMP_DEBUG_ASSERT(root && parent_team);
735   kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
736 
737   // If dyn-var is set, dynamically adjust the number of desired threads,
738   // according to the method specified by dynamic_mode.
739   new_nthreads = set_nthreads;
740   if (!get__dynamic_2(parent_team, master_tid)) {
741     ; // dyn-var is off: keep the requested number of threads unchanged
742   }
743 #ifdef USE_LOAD_BALANCE
744   else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
745     new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
746     if (new_nthreads == 1) {
747       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
748                     "reservation to 1 thread\n",
749                     master_tid));
750       return 1;
751     }
752     if (new_nthreads < set_nthreads) {
753       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
754                     "reservation to %d threads\n",
755                     master_tid, new_nthreads));
756     }
757   }
758 #endif /* USE_LOAD_BALANCE */
759   else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
760     new_nthreads = __kmp_avail_proc - __kmp_nth +
761                    (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
762     if (new_nthreads <= 1) {
763       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
764                     "reservation to 1 thread\n",
765                     master_tid));
766       return 1;
767     }
768     if (new_nthreads < set_nthreads) {
769       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
770                     "reservation to %d threads\n",
771                     master_tid, new_nthreads));
772     } else {
773       new_nthreads = set_nthreads;
774     }
775   } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
776     if (set_nthreads > 2) {
777       new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
778       new_nthreads = (new_nthreads % set_nthreads) + 1;
779       if (new_nthreads == 1) {
780         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
781                       "reservation to 1 thread\n",
782                       master_tid));
783         return 1;
784       }
785       if (new_nthreads < set_nthreads) {
786         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
787                       "reservation to %d threads\n",
788                       master_tid, new_nthreads));
789       }
790     }
791   } else {
792     KMP_ASSERT(0);
793   }
794 
795   // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
796   if (__kmp_nth + new_nthreads -
797           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
798       __kmp_max_nth) {
799     int tl_nthreads = __kmp_max_nth - __kmp_nth +
800                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
801     if (tl_nthreads <= 0) {
802       tl_nthreads = 1;
803     }
804 
805     // If dyn-var is false, emit a 1-time warning.
806     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
807       __kmp_reserve_warn = 1;
808       __kmp_msg(kmp_ms_warning,
809                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
810                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
811     }
812     if (tl_nthreads == 1) {
813       KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
814                     "reduced reservation to 1 thread\n",
815                     master_tid));
816       return 1;
817     }
818     KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
819                   "reservation to %d threads\n",
820                   master_tid, tl_nthreads));
821     new_nthreads = tl_nthreads;
822   }
823 
824   // Respect OMP_THREAD_LIMIT
825   int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
826   int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
827   if (cg_nthreads + new_nthreads -
828           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
829       max_cg_threads) {
830     int tl_nthreads = max_cg_threads - cg_nthreads +
831                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
832     if (tl_nthreads <= 0) {
833       tl_nthreads = 1;
834     }
835 
836     // If dyn-var is false, emit a 1-time warning.
837     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
838       __kmp_reserve_warn = 1;
839       __kmp_msg(kmp_ms_warning,
840                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
841                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
842     }
843     if (tl_nthreads == 1) {
844       KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
845                     "reduced reservation to 1 thread\n",
846                     master_tid));
847       return 1;
848     }
849     KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
850                   "reservation to %d threads\n",
851                   master_tid, tl_nthreads));
852     new_nthreads = tl_nthreads;
853   }
854 
855   // Check if the threads array is large enough, or needs expanding.
856   // See comment in __kmp_register_root() about the adjustment if
857   // __kmp_threads[0] == NULL.
858   capacity = __kmp_threads_capacity;
859   if (TCR_PTR(__kmp_threads[0]) == NULL) {
860     --capacity;
861   }
862   // If it is not for initializing the hidden helper team, we need to take
863   // __kmp_hidden_helper_threads_num out of the capacity because it is included
864   // in __kmp_threads_capacity.
865   if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
866     capacity -= __kmp_hidden_helper_threads_num;
867   }
868   if (__kmp_nth + new_nthreads -
869           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
870       capacity) {
871     // Expand the threads array.
872     int slotsRequired = __kmp_nth + new_nthreads -
873                         (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
874                         capacity;
875     int slotsAdded = __kmp_expand_threads(slotsRequired);
876     if (slotsAdded < slotsRequired) {
877       // The threads array was not expanded enough.
878       new_nthreads -= (slotsRequired - slotsAdded);
879       KMP_ASSERT(new_nthreads >= 1);
880 
881       // If dyn-var is false, emit a 1-time warning.
882       if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
883         __kmp_reserve_warn = 1;
884         if (__kmp_tp_cached) {
885           __kmp_msg(kmp_ms_warning,
886                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
887                     KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
888                     KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
889         } else {
890           __kmp_msg(kmp_ms_warning,
891                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
892                     KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
893         }
894       }
895     }
896   }
897 
898 #ifdef KMP_DEBUG
899   if (new_nthreads == 1) {
900     KC_TRACE(10,
901              ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
902               "dead roots and rechecking; requested %d threads\n",
903               __kmp_get_gtid(), set_nthreads));
904   } else {
905     KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
906                   " %d threads\n",
907                   __kmp_get_gtid(), new_nthreads, set_nthreads));
908   }
909 #endif // KMP_DEBUG
910   return new_nthreads;
911 }
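/* Editorial example (hypothetical numbers) of the dynamic_thread_limit case
   above: with __kmp_avail_proc == 8, __kmp_nth == 3 and an active root (so
   the "+ 1" term applies),

     new_nthreads = 8 - 3 + 1 = 6

   a request for more than 6 threads is therefore trimmed to 6 before the
   KMP_DEVICE_THREAD_LIMIT, OMP_THREAD_LIMIT and capacity checks run. */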
912 
913 /* Allocate threads from the thread pool and assign them to the new team. We are
914    assured that there are enough threads available, because we checked on that
915    earlier within the forkjoin critical section. */
916 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
917                                     kmp_info_t *master_th, int master_gtid) {
918   int i;
919   int use_hot_team;
920 
921   KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
922   KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
923   KMP_MB();
924 
925   /* first, let's setup the primary thread */
926   master_th->th.th_info.ds.ds_tid = 0;
927   master_th->th.th_team = team;
928   master_th->th.th_team_nproc = team->t.t_nproc;
929   master_th->th.th_team_master = master_th;
930   master_th->th.th_team_serialized = FALSE;
931   master_th->th.th_dispatch = &team->t.t_dispatch[0];
932 
933 /* make sure we are not the optimized hot team */
934 #if KMP_NESTED_HOT_TEAMS
935   use_hot_team = 0;
936   kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
937   if (hot_teams) { // hot teams array is not allocated if
938     // KMP_HOT_TEAMS_MAX_LEVEL=0
939     int level = team->t.t_active_level - 1; // index in array of hot teams
940     if (master_th->th.th_teams_microtask) { // are we inside the teams?
941       if (master_th->th.th_teams_size.nteams > 1) {
942         ++level; // level was not increased in teams construct for
943         // team_of_masters
944       }
945       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
946           master_th->th.th_teams_level == team->t.t_level) {
947         ++level; // level was not increased in teams construct for
948         // team_of_workers before the parallel
949       } // team->t.t_level will be increased inside parallel
950     }
951     if (level < __kmp_hot_teams_max_level) {
952       if (hot_teams[level].hot_team) {
953         // hot team has already been allocated for given level
954         KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
955         use_hot_team = 1; // the team is ready to use
956       } else {
957         use_hot_team = 0; // AC: threads are not allocated yet
958         hot_teams[level].hot_team = team; // remember new hot team
959         hot_teams[level].hot_team_nth = team->t.t_nproc;
960       }
961     } else {
962       use_hot_team = 0;
963     }
964   }
965 #else
966   use_hot_team = team == root->r.r_hot_team;
967 #endif
968   if (!use_hot_team) {
969 
970     /* install the primary thread */
971     team->t.t_threads[0] = master_th;
972     __kmp_initialize_info(master_th, team, 0, master_gtid);
973 
974     /* now, install the worker threads */
975     for (i = 1; i < team->t.t_nproc; i++) {
976 
977       /* fork or reallocate a new thread and install it in team */
978       kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
979       team->t.t_threads[i] = thr;
980       KMP_DEBUG_ASSERT(thr);
981       KMP_DEBUG_ASSERT(thr->th.th_team == team);
982       /* align team and thread arrived states */
983       KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
984                     "T#%d(%d:%d) join =%llu, plain=%llu\n",
985                     __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
986                     __kmp_gtid_from_tid(i, team), team->t.t_id, i,
987                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
988                     team->t.t_bar[bs_plain_barrier].b_arrived));
989       thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
990       thr->th.th_teams_level = master_th->th.th_teams_level;
991       thr->th.th_teams_size = master_th->th.th_teams_size;
992       { // Initialize threads' barrier data.
993         int b;
994         kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
995         for (b = 0; b < bs_last_barrier; ++b) {
996           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
997           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
998 #if USE_DEBUGGER
999           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1000 #endif
1001         }
1002       }
1003     }
1004 
1005 #if KMP_AFFINITY_SUPPORTED
1006     __kmp_partition_places(team);
1007 #endif
1008   }
1009 
1010   if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1011     for (i = 0; i < team->t.t_nproc; i++) {
1012       kmp_info_t *thr = team->t.t_threads[i];
1013       if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1014           thr->th.th_prev_level != team->t.t_level) {
1015         team->t.t_display_affinity = 1;
1016         break;
1017       }
1018     }
1019   }
1020 
1021   KMP_MB();
1022 }
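/* Editorial note: when use_hot_team is set above (the team is the root's hot
   team, or, with KMP_NESTED_HOT_TEAMS, the cached hot team for this nesting
   level), the worker slots are already populated, so the per-worker
   allocation, barrier-state alignment and affinity partitioning are skipped;
   only the primary-thread fields are refreshed. */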
1023 
1024 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1025 // Propagate any changes to the floating point control registers out to the
1026 // team. We try to avoid unnecessary writes to the relevant cache line in the
1027 // team structure, so we don't make changes unless they are needed.
1028 inline static void propagateFPControl(kmp_team_t *team) {
1029   if (__kmp_inherit_fp_control) {
1030     kmp_int16 x87_fpu_control_word;
1031     kmp_uint32 mxcsr;
1032 
1033     // Get primary thread's values of FPU control flags (both X87 and vector)
1034     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1035     __kmp_store_mxcsr(&mxcsr);
1036     mxcsr &= KMP_X86_MXCSR_MASK;
1037 
1038     // There is no point looking at t_fp_control_saved here.
1039     // If it is TRUE, we still have to update the values if they are different
1040     // from those we now have. If it is FALSE we didn't save anything yet, but
1041     // our objective is the same. We have to ensure that the values in the team
1042     // are the same as those we have.
1043     // So, this code achieves what we need whether or not t_fp_control_saved is
1044     // true. By checking whether the value needs updating we avoid unnecessary
1045     // writes that would put the cache-line into a written state, causing all
1046     // threads in the team to have to read it again.
1047     KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1048     KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1049     // Although we don't use this value, other code in the runtime wants to know
1050     // whether it should restore them. So we must ensure it is correct.
1051     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1052   } else {
1053     // Similarly here. Don't write to this cache-line in the team structure
1054     // unless we have to.
1055     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1056   }
1057 }
1058 
1059 // Do the opposite, setting the hardware registers to the updated values from
1060 // the team.
1061 inline static void updateHWFPControl(kmp_team_t *team) {
1062   if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1063     // Only reset the fp control regs if they have been changed in the team
1064     // during the parallel region that we are exiting.
1065     kmp_int16 x87_fpu_control_word;
1066     kmp_uint32 mxcsr;
1067     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1068     __kmp_store_mxcsr(&mxcsr);
1069     mxcsr &= KMP_X86_MXCSR_MASK;
1070 
1071     if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1072       __kmp_clear_x87_fpu_status_word();
1073       __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1074     }
1075 
1076     if (team->t.t_mxcsr != mxcsr) {
1077       __kmp_load_mxcsr(&team->t.t_mxcsr);
1078     }
1079   }
1080 }
1081 #else
1082 #define propagateFPControl(x) ((void)0)
1083 #define updateHWFPControl(x) ((void)0)
1084 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
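/* Editorial sketch of how the two helpers above are meant to cooperate
   (assuming __kmp_inherit_fp_control is set; the call sites are an
   assumption except for the serialized-parallel path later in this file):

     propagateFPControl(team);  // primary thread: capture x87 control word and
                                // MXCSR into team->t.t_x87_fpu_control_word /
                                // t_mxcsr and set t_fp_control_saved
     ...
     updateHWFPControl(team);   // any thread: reload the saved control word /
                                // MXCSR into hardware if its own values differ
*/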
1085 
1086 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1087                                      int realloc); // forward declaration
1088 
1089 /* Run a parallel region that has been serialized, so runs only in a team of the
1090    single primary thread. */
1091 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1092   kmp_info_t *this_thr;
1093   kmp_team_t *serial_team;
1094 
1095   KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1096 
1097   /* Skip all this code for autopar serialized loops since it results in
1098      unacceptable overhead */
1099   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1100     return;
1101 
1102   if (!TCR_4(__kmp_init_parallel))
1103     __kmp_parallel_initialize();
1104   __kmp_resume_if_soft_paused();
1105 
1106   this_thr = __kmp_threads[global_tid];
1107   serial_team = this_thr->th.th_serial_team;
1108 
1109   /* utilize the serialized team held by this thread */
1110   KMP_DEBUG_ASSERT(serial_team);
1111   KMP_MB();
1112 
1113   if (__kmp_tasking_mode != tskm_immediate_exec) {
1114     KMP_DEBUG_ASSERT(
1115         this_thr->th.th_task_team ==
1116         this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1117     KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1118                      NULL);
1119     KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1120                   "team %p, new task_team = NULL\n",
1121                   global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1122     this_thr->th.th_task_team = NULL;
1123   }
1124 
1125   kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1126   if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1127     proc_bind = proc_bind_false;
1128   } else if (proc_bind == proc_bind_default) {
1129     // No proc_bind clause was specified, so use the current value
1130     // of proc-bind-var for this parallel region.
1131     proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1132   }
1133   // Reset for next parallel region
1134   this_thr->th.th_set_proc_bind = proc_bind_default;
1135 
1136 #if OMPT_SUPPORT
1137   ompt_data_t ompt_parallel_data = ompt_data_none;
1138   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1139   if (ompt_enabled.enabled &&
1140       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1141 
1142     ompt_task_info_t *parent_task_info;
1143     parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1144 
1145     parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1146     if (ompt_enabled.ompt_callback_parallel_begin) {
1147       int team_size = 1;
1148 
1149       ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1150           &(parent_task_info->task_data), &(parent_task_info->frame),
1151           &ompt_parallel_data, team_size,
1152           ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1153     }
1154   }
1155 #endif // OMPT_SUPPORT
1156 
1157   if (this_thr->th.th_team != serial_team) {
1158     // Nested level will be an index in the nested nthreads array
1159     int level = this_thr->th.th_team->t.t_level;
1160 
1161     if (serial_team->t.t_serialized) {
1162       /* this serial team was already used
1163          TODO: increase performance by making these locks more specific */
1164       kmp_team_t *new_team;
1165 
1166       __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1167 
1168       new_team =
1169           __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1170 #if OMPT_SUPPORT
1171                               ompt_parallel_data,
1172 #endif
1173                               proc_bind, &this_thr->th.th_current_task->td_icvs,
1174                               0 USE_NESTED_HOT_ARG(NULL));
1175       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1176       KMP_ASSERT(new_team);
1177 
1178       /* setup new serialized team and install it */
1179       new_team->t.t_threads[0] = this_thr;
1180       new_team->t.t_parent = this_thr->th.th_team;
1181       serial_team = new_team;
1182       this_thr->th.th_serial_team = serial_team;
1183 
1184       KF_TRACE(
1185           10,
1186           ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1187            global_tid, serial_team));
1188 
1189       /* TODO the above breaks the requirement that if we run out of resources,
1190          then we can still guarantee that serialized teams are ok, since we may
1191          need to allocate a new one */
1192     } else {
1193       KF_TRACE(
1194           10,
1195           ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1196            global_tid, serial_team));
1197     }
1198 
1199     /* we have to initialize this serial team */
1200     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1201     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1202     KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1203     serial_team->t.t_ident = loc;
1204     serial_team->t.t_serialized = 1;
1205     serial_team->t.t_nproc = 1;
1206     serial_team->t.t_parent = this_thr->th.th_team;
1207     serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1208     this_thr->th.th_team = serial_team;
1209     serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1210 
1211     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1212                   this_thr->th.th_current_task));
1213     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1214     this_thr->th.th_current_task->td_flags.executing = 0;
1215 
1216     __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1217 
1218     /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1219        implicit task for each serialized task represented by
1220        team->t.t_serialized? */
1221     copy_icvs(&this_thr->th.th_current_task->td_icvs,
1222               &this_thr->th.th_current_task->td_parent->td_icvs);
1223 
1224     // Thread value exists in the nested nthreads array for the next nested
1225     // level
1226     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1227       this_thr->th.th_current_task->td_icvs.nproc =
1228           __kmp_nested_nth.nth[level + 1];
1229     }
1230 
1231     if (__kmp_nested_proc_bind.used &&
1232         (level + 1 < __kmp_nested_proc_bind.used)) {
1233       this_thr->th.th_current_task->td_icvs.proc_bind =
1234           __kmp_nested_proc_bind.bind_types[level + 1];
1235     }
1236 
1237 #if USE_DEBUGGER
1238     serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1239 #endif
1240     this_thr->th.th_info.ds.ds_tid = 0;
1241 
1242     /* set thread cache values */
1243     this_thr->th.th_team_nproc = 1;
1244     this_thr->th.th_team_master = this_thr;
1245     this_thr->th.th_team_serialized = 1;
1246 
1247     serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1248     serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1249     serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1250 
1251     propagateFPControl(serial_team);
1252 
1253     /* check if we need to allocate dispatch buffers stack */
1254     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1255     if (!serial_team->t.t_dispatch->th_disp_buffer) {
1256       serial_team->t.t_dispatch->th_disp_buffer =
1257           (dispatch_private_info_t *)__kmp_allocate(
1258               sizeof(dispatch_private_info_t));
1259     }
1260     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1261 
1262     KMP_MB();
1263 
1264   } else {
1265     /* this serialized team is already being used,
1266      * that's fine, just add another nested level */
1267     KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1268     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1269     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1270     ++serial_team->t.t_serialized;
1271     this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1272 
1273     // Nested level will be an index in the nested nthreads array
1274     int level = this_thr->th.th_team->t.t_level;
1275     // Thread value exists in the nested nthreads array for the next nested
1276     // level
1277     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1278       this_thr->th.th_current_task->td_icvs.nproc =
1279           __kmp_nested_nth.nth[level + 1];
1280     }
1281     serial_team->t.t_level++;
1282     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1283                   "of serial team %p to %d\n",
1284                   global_tid, serial_team, serial_team->t.t_level));
1285 
1286     /* allocate/push dispatch buffers stack */
1287     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1288     {
1289       dispatch_private_info_t *disp_buffer =
1290           (dispatch_private_info_t *)__kmp_allocate(
1291               sizeof(dispatch_private_info_t));
1292       disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1293       serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1294     }
1295     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1296 
1297     KMP_MB();
1298   }
1299   KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1300 
1301   // Perform the display affinity functionality for
1302   // serialized parallel regions
1303   if (__kmp_display_affinity) {
1304     if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1305         this_thr->th.th_prev_num_threads != 1) {
1306       // NULL means use the affinity-format-var ICV
1307       __kmp_aux_display_affinity(global_tid, NULL);
1308       this_thr->th.th_prev_level = serial_team->t.t_level;
1309       this_thr->th.th_prev_num_threads = 1;
1310     }
1311   }
1312 
1313   if (__kmp_env_consistency_check)
1314     __kmp_push_parallel(global_tid, NULL);
1315 #if OMPT_SUPPORT
1316   serial_team->t.ompt_team_info.master_return_address = codeptr;
1317   if (ompt_enabled.enabled &&
1318       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1319     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1320         OMPT_GET_FRAME_ADDRESS(0);
1321 
1322     ompt_lw_taskteam_t lw_taskteam;
1323     __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1324                             &ompt_parallel_data, codeptr);
1325 
1326     __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1327     // don't use lw_taskteam after linking. Content was swapped.
1328 
1329     /* OMPT implicit task begin */
1330     if (ompt_enabled.ompt_callback_implicit_task) {
1331       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1332           ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1333           OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
1334           ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1335       OMPT_CUR_TASK_INFO(this_thr)->thread_num =
1336           __kmp_tid_from_gtid(global_tid);
1337     }
1338 
1339     /* OMPT state */
1340     this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1341     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1342         OMPT_GET_FRAME_ADDRESS(0);
1343   }
1344 #endif
1345 }
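/* Editorial note: for nested serialized regions the same serial team is
   reused; each additional level increments serial_team->t.t_serialized and
   pushes a fresh dispatch_private_info_t onto th_disp_buffer, so the
   matching end-of-serialized-parallel path is expected to pop one buffer and
   decrement the counter per level (that unwinding code is outside this
   excerpt). */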
1346 
1347 /* most of the work for a fork */
1348 /* return true if we really went parallel, false if serialized */
1349 int __kmp_fork_call(ident_t *loc, int gtid,
1350                     enum fork_context_e call_context, // Intel, GNU, ...
1351                     kmp_int32 argc, microtask_t microtask, launch_t invoker,
1352                     kmp_va_list ap) {
1353   void **argv;
1354   int i;
1355   int master_tid;
1356   int master_this_cons;
1357   kmp_team_t *team;
1358   kmp_team_t *parent_team;
1359   kmp_info_t *master_th;
1360   kmp_root_t *root;
1361   int nthreads;
1362   int master_active;
1363   int master_set_numthreads;
1364   int level;
1365   int active_level;
1366   int teams_level;
1367 #if KMP_NESTED_HOT_TEAMS
1368   kmp_hot_team_ptr_t **p_hot_teams;
1369 #endif
1370   { // KMP_TIME_BLOCK
1371     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1372     KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1373 
1374     KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1375     if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1376       /* Some systems prefer the stack for the root thread(s) to start with */
1377       /* some gap from the parent stack to prevent false sharing. */
1378       void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1379       /* These 2 lines below are so this does not get optimized out */
1380       if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1381         __kmp_stkpadding += (short)((kmp_int64)dummy);
1382     }
1383 
1384     /* initialize if needed */
1385     KMP_DEBUG_ASSERT(
1386         __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1387     if (!TCR_4(__kmp_init_parallel))
1388       __kmp_parallel_initialize();
1389     __kmp_resume_if_soft_paused();
1390 
1391     /* setup current data */
1392     master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1393     // shutdown
1394     parent_team = master_th->th.th_team;
1395     master_tid = master_th->th.th_info.ds.ds_tid;
1396     master_this_cons = master_th->th.th_local.this_construct;
1397     root = master_th->th.th_root;
1398     master_active = root->r.r_active;
1399     master_set_numthreads = master_th->th.th_set_nproc;
1400 
1401 #if OMPT_SUPPORT
1402     ompt_data_t ompt_parallel_data = ompt_data_none;
1403     ompt_data_t *parent_task_data;
1404     ompt_frame_t *ompt_frame;
1405     ompt_data_t *implicit_task_data;
1406     void *return_address = NULL;
1407 
1408     if (ompt_enabled.enabled) {
1409       __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1410                                     NULL, NULL);
1411       return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1412     }
1413 #endif
1414 
1415     // Assign affinity to root thread if it hasn't happened yet
1416     __kmp_assign_root_init_mask();
1417 
1418     // Nested level will be an index in the nested nthreads array
1419     level = parent_team->t.t_level;
1420     // used to launch non-serial teams even if nesting is not allowed
1421     active_level = parent_team->t.t_active_level;
1422     // needed to check nesting inside the teams
1423     teams_level = master_th->th.th_teams_level;
1424 #if KMP_NESTED_HOT_TEAMS
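    // Lazily allocate the per-thread array of hot-team descriptors, one entry
    // per allowed nesting level; entry 0 describes the root's hot team.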
1425     p_hot_teams = &master_th->th.th_hot_teams;
1426     if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1427       *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1428           sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1429       (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1430       // it is either the actual hot team or not needed (when active_level > 0)
1431       (*p_hot_teams)[0].hot_team_nth = 1;
1432     }
1433 #endif
1434 
1435 #if OMPT_SUPPORT
1436     if (ompt_enabled.enabled) {
1437       if (ompt_enabled.ompt_callback_parallel_begin) {
1438         int team_size = master_set_numthreads
1439                             ? master_set_numthreads
1440                             : get__nproc_2(parent_team, master_tid);
1441         int flags = OMPT_INVOKER(call_context) |
1442                     ((microtask == (microtask_t)__kmp_teams_master)
1443                          ? ompt_parallel_league
1444                          : ompt_parallel_team);
1445         ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1446             parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1447             return_address);
1448       }
1449       master_th->th.ompt_thread_info.state = ompt_state_overhead;
1450     }
1451 #endif
1452 
1453     master_th->th.th_ident = loc;
1454 
1455     if (master_th->th.th_teams_microtask && ap &&
1456         microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1457       // AC: This is start of parallel that is nested inside teams construct.
1458       // The team is actual (hot), all workers are ready at the fork barrier.
1459       // No lock needed to initialize the team a bit, then free workers.
1460       parent_team->t.t_ident = loc;
1461       __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1462       parent_team->t.t_argc = argc;
1463       argv = (void **)parent_team->t.t_argv;
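      // Copy the outlined-function arguments from the varargs list into the
      // parent team's argv so the (already existing) workers can pick them up.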
1464       for (i = argc - 1; i >= 0; --i)
1465         *argv++ = va_arg(kmp_va_deref(ap), void *);
1466       // Increment our nested depth level, but do not increase serialization
1467       if (parent_team == master_th->th.th_serial_team) {
1468         // AC: we are in serialized parallel
1469         __kmpc_serialized_parallel(loc, gtid);
1470         KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1471 
1472         if (call_context == fork_context_gnu) {
1473           // AC: need to decrement t_serialized for enquiry functions to work
1474           // correctly, will restore at join time
1475           parent_team->t.t_serialized--;
1476           return TRUE;
1477         }
1478 
1479 #if OMPD_SUPPORT
1480         parent_team->t.t_pkfn = microtask;
1481 #endif
1482 
1483 #if OMPT_SUPPORT
1484         void *dummy;
1485         void **exit_frame_p;
1486 
1487         ompt_lw_taskteam_t lw_taskteam;
1488 
1489         if (ompt_enabled.enabled) {
1490           __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1491                                   &ompt_parallel_data, return_address);
1492           exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1493 
1494           __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1495           // don't use lw_taskteam after linking; its content was swapped
1496 
1497           /* OMPT implicit task begin */
1498           implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1499           if (ompt_enabled.ompt_callback_implicit_task) {
1500             OMPT_CUR_TASK_INFO(master_th)->thread_num =
1501                 __kmp_tid_from_gtid(gtid);
1502             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1503                 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1504                 implicit_task_data, 1,
1505                 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1506           }
1507 
1508           /* OMPT state */
1509           master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1510         } else {
1511           exit_frame_p = &dummy;
1512         }
1513 #endif
1514         // AC: need to decrement t_serialized for enquiry functions to work
1515         // correctly, will restore at join time
1516         parent_team->t.t_serialized--;
1517 
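        // Serialized execution: the primary thread invokes the microtask
        // directly.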
1518         {
1519           KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1520           KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1521           __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1522 #if OMPT_SUPPORT
1523                                  ,
1524                                  exit_frame_p
1525 #endif
1526           );
1527         }
1528 
1529 #if OMPT_SUPPORT
1530         if (ompt_enabled.enabled) {
1531           *exit_frame_p = NULL;
1532           OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1533           if (ompt_enabled.ompt_callback_implicit_task) {
1534             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1535                 ompt_scope_end, NULL, implicit_task_data, 1,
1536                 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1537           }
1538           ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1539           __ompt_lw_taskteam_unlink(master_th);
1540           if (ompt_enabled.ompt_callback_parallel_end) {
1541             ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1542                 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1543                 OMPT_INVOKER(call_context) | ompt_parallel_team,
1544                 return_address);
1545           }
1546           master_th->th.ompt_thread_info.state = ompt_state_overhead;
1547         }
1548 #endif
1549         return TRUE;
1550       }
1551 
1552       parent_team->t.t_pkfn = microtask;
1553       parent_team->t.t_invoke = invoker;
1554       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1555       parent_team->t.t_active_level++;
1556       parent_team->t.t_level++;
1557       parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1558 
1559 #if OMPT_SUPPORT
1560       if (ompt_enabled.enabled) {
1561         ompt_lw_taskteam_t lw_taskteam;
1562         __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1563                                 &ompt_parallel_data, return_address);
1564         __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1565       }
1566 #endif
1567 
1568       /* Change number of threads in the team if requested */
1569       if (master_set_numthreads) { // The parallel has num_threads clause
1570         if (master_set_numthreads <= master_th->th.th_teams_size.nth) {
1571           // AC: can only reduce the number of threads dynamically, not increase it
1572           kmp_info_t **other_threads = parent_team->t.t_threads;
1573           // NOTE: if using distributed barrier, we need to run this code block
1574           // even when the team size appears not to have changed from the max.
1575           int old_proc = master_th->th.th_teams_size.nth;
1576           if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
1577               bp_dist_bar) {
1578             __kmp_resize_dist_barrier(parent_team, old_proc,
1579                                       master_set_numthreads);
1580             __kmp_add_threads_to_team(parent_team, master_set_numthreads);
1581           }
1582           parent_team->t.t_nproc = master_set_numthreads;
1583           for (i = 0; i < master_set_numthreads; ++i) {
1584             other_threads[i]->th.th_team_nproc = master_set_numthreads;
1585           }
1586         }
1587         // Keep extra threads hot in the team for possible next parallels
1588         master_th->th.th_set_nproc = 0;
1589       }
1590 
1591 #if USE_DEBUGGER
1592       if (__kmp_debugging) { // Let debugger override number of threads.
1593         int nth = __kmp_omp_num_threads(loc);
1594         if (nth > 0) { // 0 means debugger doesn't want to change num threads
1595           master_set_numthreads = nth;
1596         }
1597       }
1598 #endif
1599 
1600 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1601       if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1602            KMP_ITT_DEBUG) &&
1603           __kmp_forkjoin_frames_mode == 3 &&
1604           parent_team->t.t_active_level == 1 // only report frames at level 1
1605           && master_th->th.th_teams_size.nteams == 1) {
1606         kmp_uint64 tmp_time = __itt_get_timestamp();
1607         master_th->th.th_frame_time = tmp_time;
1608         parent_team->t.t_region_time = tmp_time;
1609       }
1610       if (__itt_stack_caller_create_ptr) {
1611         KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
1612         // create new stack stitching id before entering fork barrier
1613         parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1614       }
1615 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1616 
1617       KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1618                     "master_th=%p, gtid=%d\n",
1619                     root, parent_team, master_th, gtid));
1620       __kmp_internal_fork(loc, gtid, parent_team);
1621       KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1622                     "master_th=%p, gtid=%d\n",
1623                     root, parent_team, master_th, gtid));
1624 
1625       if (call_context == fork_context_gnu)
1626         return TRUE;
1627 
1628       /* Invoke microtask for PRIMARY thread */
1629       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1630                     parent_team->t.t_id, parent_team->t.t_pkfn));
1631 
1632       if (!parent_team->t.t_invoke(gtid)) {
1633         KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
1634       }
1635       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1636                     parent_team->t.t_id, parent_team->t.t_pkfn));
1637       KMP_MB(); /* Flush all pending memory write invalidates.  */
1638 
1639       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1640 
1641       return TRUE;
1642     } // Parallel closely nested in teams construct
1643 
1644 #if KMP_DEBUG
1645     if (__kmp_tasking_mode != tskm_immediate_exec) {
1646       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1647                        parent_team->t.t_task_team[master_th->th.th_task_state]);
1648     }
1649 #endif
1650 
1651     // Need this to happen before we determine the number of threads, not while
1652     // we are allocating the team
1653     //__kmp_push_current_task_to_thread(master_th, parent_team, 0);
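    // Determine how many threads the new team gets: serialize if the
    // max-active-levels limit has been reached, otherwise start from the
    // num_threads clause (or nthreads-var) and let __kmp_reserve_threads()
    // cap the request against the threads actually available.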
1654     int enter_teams = 0;
1655     if (parent_team->t.t_active_level >=
1656         master_th->th.th_current_task->td_icvs.max_active_levels) {
1657       nthreads = 1;
1658     } else {
1659       enter_teams = ((ap == NULL && active_level == 0) ||
1660                      (ap && teams_level > 0 && teams_level == level));
1661       nthreads = master_set_numthreads
1662                      ? master_set_numthreads
1663                      // TODO: get nproc directly from current task
1664                      : get__nproc_2(parent_team, master_tid);
1665       // Check if we need to take the forkjoin lock (no need for a serialized
1666       // parallel outside of a teams construct). This code was moved here from
1667       // __kmp_reserve_threads() to speed up nested serialized parallels.
1668       if (nthreads > 1) {
1669         if ((get__max_active_levels(master_th) == 1 &&
1670              (root->r.r_in_parallel && !enter_teams)) ||
1671             (__kmp_library == library_serial)) {
1672           KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1673                         " threads\n",
1674                         gtid, nthreads));
1675           nthreads = 1;
1676         }
1677       }
1678       if (nthreads > 1) {
1679         /* determine how many new threads we can use */
1680         __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1681         /* AC: If we execute teams from a parallel region (on the host), the
1682            teams should be created, but each can have only 1 thread if nesting
1683            is disabled. If teams is called from a serial region, the teams and
1684            their threads should be created regardless of the nesting setting. */
1685         nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
1686                                          nthreads, enter_teams);
1687         if (nthreads == 1) {
1688           // Free the lock for single-thread execution here; for multi-thread
1689           // execution it will be freed later, after the team of threads has
1690           // been created and initialized
1691           __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1692         }
1693       }
1694     }
1695     KMP_DEBUG_ASSERT(nthreads > 0);
1696 
1697     // If we temporarily changed the set number of threads then restore it now
1698     master_th->th.th_set_nproc = 0;
1699 
1700     /* create a serialized parallel region? */
1701     if (nthreads == 1) {
1702 /* josh todo: hypothetical question: what do we do for OS X*? */
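      // Scratch buffer for the outlined-function arguments: a VLA where the
      // toolchain supports it, otherwise alloca().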
1703 #if KMP_OS_LINUX &&                                                            \
1704     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1705       void *args[argc];
1706 #else
1707       void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1708 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1709           KMP_ARCH_AARCH64) */
1710 
1711       KA_TRACE(20,
1712                ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1713 
1714       __kmpc_serialized_parallel(loc, gtid);
1715 
1716 #if OMPD_SUPPORT
1717       master_th->th.th_serial_team->t.t_pkfn = microtask;
1718 #endif
1719 
1720       if (call_context == fork_context_intel) {
1721         /* TODO this sucks, use the compiler itself to pass args! :) */
1722         master_th->th.th_serial_team->t.t_ident = loc;
1723         if (!ap) {
1724           // revert change made in __kmpc_serialized_parallel()
1725           master_th->th.th_serial_team->t.t_level--;
1726           // Get args from parent team for teams construct
1727 
1728 #if OMPT_SUPPORT
1729           void *dummy;
1730           void **exit_frame_p;
1731           ompt_task_info_t *task_info;
1732 
1733           ompt_lw_taskteam_t lw_taskteam;
1734 
1735           if (ompt_enabled.enabled) {
1736             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1737                                     &ompt_parallel_data, return_address);
1738 
1739             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1740             // don't use lw_taskteam after linking; its content was swapped
1741 
1742             task_info = OMPT_CUR_TASK_INFO(master_th);
1743             exit_frame_p = &(task_info->frame.exit_frame.ptr);
1744             if (ompt_enabled.ompt_callback_implicit_task) {
1745               OMPT_CUR_TASK_INFO(master_th)->thread_num =
1746                   __kmp_tid_from_gtid(gtid);
1747               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1748                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1749                   &(task_info->task_data), 1,
1750                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1751                   ompt_task_implicit);
1752             }
1753 
1754             /* OMPT state */
1755             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1756           } else {
1757             exit_frame_p = &dummy;
1758           }
1759 #endif
1760 
1761           {
1762             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1763             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1764             __kmp_invoke_microtask(microtask, gtid, 0, argc,
1765                                    parent_team->t.t_argv
1766 #if OMPT_SUPPORT
1767                                    ,
1768                                    exit_frame_p
1769 #endif
1770             );
1771           }
1772 
1773 #if OMPT_SUPPORT
1774           if (ompt_enabled.enabled) {
1775             *exit_frame_p = NULL;
1776             if (ompt_enabled.ompt_callback_implicit_task) {
1777               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1778                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1779                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1780                   ompt_task_implicit);
1781             }
1782             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1783             __ompt_lw_taskteam_unlink(master_th);
1784             if (ompt_enabled.ompt_callback_parallel_end) {
1785               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1786                   &ompt_parallel_data, parent_task_data,
1787                   OMPT_INVOKER(call_context) | ompt_parallel_team,
1788                   return_address);
1789             }
1790             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1791           }
1792 #endif
1793         } else if (microtask == (microtask_t)__kmp_teams_master) {
1794           KMP_DEBUG_ASSERT(master_th->th.th_team ==
1795                            master_th->th.th_serial_team);
1796           team = master_th->th.th_team;
1797           // team->t.t_pkfn = microtask;
1798           team->t.t_invoke = invoker;
1799           __kmp_alloc_argv_entries(argc, team, TRUE);
1800           team->t.t_argc = argc;
1801           argv = (void **)team->t.t_argv;
1802           if (ap) {
1803             for (i = argc - 1; i >= 0; --i)
1804               *argv++ = va_arg(kmp_va_deref(ap), void *);
1805           } else {
1806             for (i = 0; i < argc; ++i)
1807               // Get args from parent team for teams construct
1808               argv[i] = parent_team->t.t_argv[i];
1809           }
1810           // AC: revert change made in __kmpc_serialized_parallel()
1811           //     because initial code in teams should have level=0
1812           team->t.t_level--;
1813           // AC: call special invoker for outer "parallel" of teams construct
1814           invoker(gtid);
1815 #if OMPT_SUPPORT
1816           if (ompt_enabled.enabled) {
1817             ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1818             if (ompt_enabled.ompt_callback_implicit_task) {
1819               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1820                   ompt_scope_end, NULL, &(task_info->task_data), 0,
1821                   OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1822             }
1823             if (ompt_enabled.ompt_callback_parallel_end) {
1824               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1825                   &ompt_parallel_data, parent_task_data,
1826                   OMPT_INVOKER(call_context) | ompt_parallel_league,
1827                   return_address);
1828             }
1829             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1830           }
1831 #endif
1832         } else {
1833           argv = args;
1834           for (i = argc - 1; i >= 0; --i)
1835             *argv++ = va_arg(kmp_va_deref(ap), void *);
1836           KMP_MB();
1837 
1838 #if OMPT_SUPPORT
1839           void *dummy;
1840           void **exit_frame_p;
1841           ompt_task_info_t *task_info;
1842 
1843           ompt_lw_taskteam_t lw_taskteam;
1844 
1845           if (ompt_enabled.enabled) {
1846             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1847                                     &ompt_parallel_data, return_address);
1848             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1849             // don't use lw_taskteam after linking; its content was swapped
1850             task_info = OMPT_CUR_TASK_INFO(master_th);
1851             exit_frame_p = &(task_info->frame.exit_frame.ptr);
1852 
1853             /* OMPT implicit task begin */
1854             implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1855             if (ompt_enabled.ompt_callback_implicit_task) {
1856               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1857                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1858                   implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1859                   ompt_task_implicit);
1860               OMPT_CUR_TASK_INFO(master_th)->thread_num =
1861                   __kmp_tid_from_gtid(gtid);
1862             }
1863 
1864             /* OMPT state */
1865             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1866           } else {
1867             exit_frame_p = &dummy;
1868           }
1869 #endif
1870 
1871           {
1872             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1873             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1874             __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1875 #if OMPT_SUPPORT
1876                                    ,
1877                                    exit_frame_p
1878 #endif
1879             );
1880           }
1881 
1882 #if OMPT_SUPPORT
1883           if (ompt_enabled.enabled) {
1884             *exit_frame_p = NULL;
1885             if (ompt_enabled.ompt_callback_implicit_task) {
1886               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1887                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1888                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1889                   ompt_task_implicit);
1890             }
1891 
1892             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1893             __ompt_lw_taskteam_unlink(master_th);
1894             if (ompt_enabled.ompt_callback_parallel_end) {
1895               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1896                   &ompt_parallel_data, parent_task_data,
1897                   OMPT_INVOKER(call_context) | ompt_parallel_team,
1898                   return_address);
1899             }
1900             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1901           }
1902 #endif
1903         }
1904       } else if (call_context == fork_context_gnu) {
1905 #if OMPT_SUPPORT
1906         ompt_lw_taskteam_t lwt;
1907         __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1908                                 return_address);
1909 
1910         lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1911         __ompt_lw_taskteam_link(&lwt, master_th, 1);
1912 // don't use lw_taskteam after linking; its content was swapped
1913 #endif
1914 
1915         // we were called from GNU native code
1916         KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1917         return FALSE;
1918       } else {
1919         KMP_ASSERT2(call_context < fork_context_last,
1920                     "__kmp_fork_call: unknown fork_context parameter");
1921       }
1922 
1923       KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1924       KMP_MB();
1925       return FALSE;
1926     } // if (nthreads == 1)
1927 
1928     // GEH: only modify the executing flag in the case when not serialized
1929     //      serialized case is handled in kmpc_serialized_parallel
1930     KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1931                   "curtask=%p, curtask_max_aclevel=%d\n",
1932                   parent_team->t.t_active_level, master_th,
1933                   master_th->th.th_current_task,
1934                   master_th->th.th_current_task->td_icvs.max_active_levels));
1935     // TODO: GEH - cannot do this assertion because root thread not set up as
1936     // executing
1937     // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1938     master_th->th.th_current_task->td_flags.executing = 0;
1939 
1940     if (!master_th->th.th_teams_microtask || level > teams_level) {
1941       /* Increment our nested depth level */
1942       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1943     }
1944 
1945     // See if we need to make a copy of the ICVs.
1946     int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
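    // If the nested nthreads list (e.g. OMP_NUM_THREADS=4,3,2) specifies a
    // different value for the next nesting level, the new team's ICV must be
    // updated; nthreads_icv == 0 below means "no update needed".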
1947     if ((level + 1 < __kmp_nested_nth.used) &&
1948         (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1949       nthreads_icv = __kmp_nested_nth.nth[level + 1];
1950     } else {
1951       nthreads_icv = 0; // don't update
1952     }
1953 
1954     // Figure out the proc_bind_policy for the new team.
1955     kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1956     kmp_proc_bind_t proc_bind_icv =
1957         proc_bind_default; // proc_bind_default means don't update
1958     if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1959       proc_bind = proc_bind_false;
1960     } else {
1961       if (proc_bind == proc_bind_default) {
1962         // No proc_bind clause specified; use current proc-bind-var for this
1963         // parallel region
1964         proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1965       }
1966       /* else: The proc_bind policy was specified explicitly on parallel clause.
1967          This overrides proc-bind-var for this parallel region, but does not
1968          change proc-bind-var. */
1969       // Figure the value of proc-bind-var for the child threads.
1970       if ((level + 1 < __kmp_nested_proc_bind.used) &&
1971           (__kmp_nested_proc_bind.bind_types[level + 1] !=
1972            master_th->th.th_current_task->td_icvs.proc_bind)) {
1973         proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1974       }
1975     }
1976 
1977     // Reset for next parallel region
1978     master_th->th.th_set_proc_bind = proc_bind_default;
1979 
1980     if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
1981       kmp_internal_control_t new_icvs;
1982       copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
1983       new_icvs.next = NULL;
1984       if (nthreads_icv > 0) {
1985         new_icvs.nproc = nthreads_icv;
1986       }
1987       if (proc_bind_icv != proc_bind_default) {
1988         new_icvs.proc_bind = proc_bind_icv;
1989       }
1990 
1991       /* allocate a new parallel team */
1992       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
1993       team = __kmp_allocate_team(root, nthreads, nthreads,
1994 #if OMPT_SUPPORT
1995                                  ompt_parallel_data,
1996 #endif
1997                                  proc_bind, &new_icvs,
1998                                  argc USE_NESTED_HOT_ARG(master_th));
1999       if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2000         copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs);
2001     } else {
2002       /* allocate a new parallel team */
2003       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2004       team = __kmp_allocate_team(root, nthreads, nthreads,
2005 #if OMPT_SUPPORT
2006                                  ompt_parallel_data,
2007 #endif
2008                                  proc_bind,
2009                                  &master_th->th.th_current_task->td_icvs,
2010                                  argc USE_NESTED_HOT_ARG(master_th));
2011       if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2012         copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs,
2013                   &master_th->th.th_current_task->td_icvs);
2014     }
2015     KF_TRACE(
2016         10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2017 
2018     /* setup the new team */
2019     KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2020     KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2021     KMP_CHECK_UPDATE(team->t.t_ident, loc);
2022     KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2023     KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2024 #if OMPT_SUPPORT
2025     KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2026                           return_address);
2027 #endif
2028     KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2029     // TODO: parent_team->t.t_level == INT_MAX ???
2030     if (!master_th->th.th_teams_microtask || level > teams_level) {
2031       int new_level = parent_team->t.t_level + 1;
2032       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2033       new_level = parent_team->t.t_active_level + 1;
2034       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2035     } else {
2036       // AC: Do not increase parallel level at start of the teams construct
2037       int new_level = parent_team->t.t_level;
2038       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2039       new_level = parent_team->t.t_active_level;
2040       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2041     }
2042     kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2043     // set primary thread's schedule as new run-time schedule
2044     KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2045 
2046     KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2047     KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2048 
2049     // Update the floating point rounding in the team if required.
2050     propagateFPControl(team);
2051 #if OMPD_SUPPORT
2052     if (ompd_state & OMPD_ENABLE_BP)
2053       ompd_bp_parallel_begin();
2054 #endif
2055 
2056     if (__kmp_tasking_mode != tskm_immediate_exec) {
2057       // Set the primary thread's task team to the team's task team. Unless
2058       // this is a hot team, it should be NULL.
2059       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2060                        parent_team->t.t_task_team[master_th->th.th_task_state]);
2061       KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team "
2062                     "%p, new task_team %p / team %p\n",
2063                     __kmp_gtid_from_thread(master_th),
2064                     master_th->th.th_task_team, parent_team,
2065                     team->t.t_task_team[master_th->th.th_task_state], team));
2066 
2067       if (active_level || master_th->th.th_task_team) {
2068         // Take a memo of primary thread's task_state
2069         KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2070         if (master_th->th.th_task_state_top >=
2071             master_th->th.th_task_state_stack_sz) { // increase size
2072           kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2073           kmp_uint8 *old_stack, *new_stack;
2074           kmp_uint32 i;
2075           new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2076           for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2077             new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2078           }
2079           for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2080                ++i) { // zero-init rest of stack
2081             new_stack[i] = 0;
2082           }
2083           old_stack = master_th->th.th_task_state_memo_stack;
2084           master_th->th.th_task_state_memo_stack = new_stack;
2085           master_th->th.th_task_state_stack_sz = new_size;
2086           __kmp_free(old_stack);
2087         }
2088         // Store primary thread's task_state on stack
2089         master_th->th
2090             .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2091             master_th->th.th_task_state;
2092         master_th->th.th_task_state_top++;
2093 #if KMP_NESTED_HOT_TEAMS
2094         if (master_th->th.th_hot_teams &&
2095             active_level < __kmp_hot_teams_max_level &&
2096             team == master_th->th.th_hot_teams[active_level].hot_team) {
2097           // Restore primary thread's nested state if nested hot team
2098           master_th->th.th_task_state =
2099               master_th->th
2100                   .th_task_state_memo_stack[master_th->th.th_task_state_top];
2101         } else {
2102 #endif
2103           master_th->th.th_task_state = 0;
2104 #if KMP_NESTED_HOT_TEAMS
2105         }
2106 #endif
2107       }
2108 #if !KMP_NESTED_HOT_TEAMS
2109       KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2110                        (team == root->r.r_hot_team));
2111 #endif
2112     }
2113 
2114     KA_TRACE(
2115         20,
2116         ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2117          gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2118          team->t.t_nproc));
2119     KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2120                      (team->t.t_master_tid == 0 &&
2121                       (team->t.t_parent == root->r.r_root_team ||
2122                        team->t.t_parent->t.t_serialized)));
2123     KMP_MB();
2124 
2125     /* now, setup the arguments */
2126     argv = (void **)team->t.t_argv;
2127     if (ap) {
2128       for (i = argc - 1; i >= 0; --i) {
2129         void *new_argv = va_arg(kmp_va_deref(ap), void *);
2130         KMP_CHECK_UPDATE(*argv, new_argv);
2131         argv++;
2132       }
2133     } else {
2134       for (i = 0; i < argc; ++i) {
2135         // Get args from parent team for teams construct
2136         KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2137       }
2138     }
2139 
2140     /* now actually fork the threads */
2141     KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2142     if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2143       root->r.r_active = TRUE;
2144 
2145     __kmp_fork_team_threads(root, team, master_th, gtid);
2146     __kmp_setup_icv_copy(team, nthreads,
2147                          &master_th->th.th_current_task->td_icvs, loc);
2148 
2149 #if OMPT_SUPPORT
2150     master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2151 #endif
2152 
2153     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2154 
2155 #if USE_ITT_BUILD
2156     if (team->t.t_active_level == 1 // only report frames at level 1
2157         && !master_th->th.th_teams_microtask) { // not in teams construct
2158 #if USE_ITT_NOTIFY
2159       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2160           (__kmp_forkjoin_frames_mode == 3 ||
2161            __kmp_forkjoin_frames_mode == 1)) {
2162         kmp_uint64 tmp_time = 0;
2163         if (__itt_get_timestamp_ptr)
2164           tmp_time = __itt_get_timestamp();
2165         // Internal fork - report frame begin
2166         master_th->th.th_frame_time = tmp_time;
2167         if (__kmp_forkjoin_frames_mode == 3)
2168           team->t.t_region_time = tmp_time;
2169       } else
2170 // only one notification scheme (either "submit" or "forking/joined", not both)
2171 #endif /* USE_ITT_NOTIFY */
2172           if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2173               __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2174         // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2175         __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2176       }
2177     }
2178 #endif /* USE_ITT_BUILD */
2179 
2180     /* now go on and do the work */
2181     KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2182     KMP_MB();
2183     KF_TRACE(10,
2184              ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2185               root, team, master_th, gtid));
2186 
2187 #if USE_ITT_BUILD
2188     if (__itt_stack_caller_create_ptr) {
2189       // create new stack stitching id before entering fork barrier
2190       if (!enter_teams) {
2191         KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
2192         team->t.t_stack_id = __kmp_itt_stack_caller_create();
2193       } else if (parent_team->t.t_serialized) {
2194         // keep stack stitching id in the serialized parent_team;
2195         // current team will be used for parallel inside the teams;
2196         // if parent_team is active, then it already keeps stack stitching id
2197         // for the league of teams
2198         KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
2199         parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
2200       }
2201     }
2202 #endif /* USE_ITT_BUILD */
2203 
2204     // AC: skip __kmp_internal_fork at the teams construct; let only the
2205     // primary threads execute
2206     if (ap) {
2207       __kmp_internal_fork(loc, gtid, team);
2208       KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2209                     "master_th=%p, gtid=%d\n",
2210                     root, team, master_th, gtid));
2211     }
2212 
2213     if (call_context == fork_context_gnu) {
2214       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2215       return TRUE;
2216     }
2217 
2218     /* Invoke microtask for PRIMARY thread */
2219     KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2220                   team->t.t_id, team->t.t_pkfn));
2221   } // END of timer KMP_fork_call block
2222 
2223 #if KMP_STATS_ENABLED
2224   // If beginning a teams construct, then change thread state
2225   stats_state_e previous_state = KMP_GET_THREAD_STATE();
2226   if (!ap) {
2227     KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2228   }
2229 #endif
2230 
2231   if (!team->t.t_invoke(gtid)) {
2232     KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
2233   }
2234 
2235 #if KMP_STATS_ENABLED
2236   // If was beginning of a teams construct, then reset thread state
2237   if (!ap) {
2238     KMP_SET_THREAD_STATE(previous_state);
2239   }
2240 #endif
2241 
2242   KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2243                 team->t.t_id, team->t.t_pkfn));
2244   KMP_MB(); /* Flush all pending memory write invalidates.  */
2245 
2246   KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2247 #if OMPT_SUPPORT
2248   if (ompt_enabled.enabled) {
2249     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2250   }
2251 #endif
2252 
2253   return TRUE;
2254 }
2255 
2256 #if OMPT_SUPPORT
2257 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2258                                             kmp_team_t *team) {
2259   // restore state outside the region
2260   thread->th.ompt_thread_info.state =
2261       ((team->t.t_serialized) ? ompt_state_work_serial
2262                               : ompt_state_work_parallel);
2263 }
2264 
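// Report the parallel-end event to the tool (if the callback is registered),
// clear the task's enter frame, and restore the thread state for the
// enclosing region.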
2265 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2266                                    kmp_team_t *team, ompt_data_t *parallel_data,
2267                                    int flags, void *codeptr) {
2268   ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2269   if (ompt_enabled.ompt_callback_parallel_end) {
2270     ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2271         parallel_data, &(task_info->task_data), flags, codeptr);
2272   }
2273 
2274   task_info->frame.enter_frame = ompt_data_none;
2275   __kmp_join_restore_state(thread, team);
2276 }
2277 #endif
2278 
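/* Join the parallel region started by __kmp_fork_call: pass the join barrier
   (unless exiting a teams construct), report OMPT/ITT events, release or
   shrink the team as needed, and restore the primary thread's state to that
   of the parent team. */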
2279 void __kmp_join_call(ident_t *loc, int gtid
2280 #if OMPT_SUPPORT
2281                      ,
2282                      enum fork_context_e fork_context
2283 #endif
2284                      ,
2285                      int exit_teams) {
2286   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2287   kmp_team_t *team;
2288   kmp_team_t *parent_team;
2289   kmp_info_t *master_th;
2290   kmp_root_t *root;
2291   int master_active;
2292 
2293   KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2294 
2295   /* setup current data */
2296   master_th = __kmp_threads[gtid];
2297   root = master_th->th.th_root;
2298   team = master_th->th.th_team;
2299   parent_team = team->t.t_parent;
2300 
2301   master_th->th.th_ident = loc;
2302 
2303 #if OMPT_SUPPORT
2304   void *team_microtask = (void *)team->t.t_pkfn;
2305   // For the GOMP interface with a serialized parallel, we need
2306   // __kmpc_end_serialized_parallel to call the hooks for the OMPT
2307   // end-implicit-task and end-parallel events.
2308   if (ompt_enabled.enabled &&
2309       !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2310     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2311   }
2312 #endif
2313 
2314 #if KMP_DEBUG
2315   if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2316     KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2317                   "th_task_team = %p\n",
2318                   __kmp_gtid_from_thread(master_th), team,
2319                   team->t.t_task_team[master_th->th.th_task_state],
2320                   master_th->th.th_task_team));
2321     KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2322                      team->t.t_task_team[master_th->th.th_task_state]);
2323   }
2324 #endif
2325 
2326   if (team->t.t_serialized) {
2327     if (master_th->th.th_teams_microtask) {
2328       // We are in teams construct
2329       int level = team->t.t_level;
2330       int tlevel = master_th->th.th_teams_level;
2331       if (level == tlevel) {
2332         // AC: we did not increment it earlier at the start of the teams
2333         //     construct, so do it here, at the end of the teams construct
2334         team->t.t_level++;
2335       } else if (level == tlevel + 1) {
2336         // AC: we are exiting a parallel inside teams; we need to increment
2337         // the serialization count in order to restore it in the next call to
2338         // __kmpc_end_serialized_parallel
2339         team->t.t_serialized++;
2340       }
2341     }
2342     __kmpc_end_serialized_parallel(loc, gtid);
2343 
2344 #if OMPT_SUPPORT
2345     if (ompt_enabled.enabled) {
2346       __kmp_join_restore_state(master_th, parent_team);
2347     }
2348 #endif
2349 
2350     return;
2351   }
2352 
2353   master_active = team->t.t_master_active;
2354 
2355   if (!exit_teams) {
2356     // AC: No barrier for internal teams at exit from the teams construct,
2357     //     but there is a barrier for the external team (the league).
2358     __kmp_internal_join(loc, gtid, team);
2359 #if USE_ITT_BUILD
2360     if (__itt_stack_caller_create_ptr) {
2361       KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
2362       // destroy the stack stitching id after join barrier
2363       __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2364       team->t.t_stack_id = NULL;
2365     }
2366 #endif
2367   } else {
2368     master_th->th.th_task_state =
2369         0; // AC: no tasking in teams (out of any parallel)
2370 #if USE_ITT_BUILD
2371     if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
2372       KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
2373       // destroy the stack stitching id on exit from the teams construct
2374       // if parent_team is active, then the id will be destroyed later on
2375       // by master of the league of teams
2376       __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
2377       parent_team->t.t_stack_id = NULL;
2378     }
2379 #endif
2380 
2381     if (team->t.t_nproc > 1 &&
2382         __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2383       team->t.b->update_num_threads(team->t.t_nproc);
2384       __kmp_add_threads_to_team(team, team->t.t_nproc);
2385     }
2386   }
2387 
2388   KMP_MB();
2389 
2390 #if OMPT_SUPPORT
2391   ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2392   void *codeptr = team->t.ompt_team_info.master_return_address;
2393 #endif
2394 
2395 #if USE_ITT_BUILD
2396   // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2397   if (team->t.t_active_level == 1 &&
2398       (!master_th->th.th_teams_microtask || /* not in teams construct */
2399        master_th->th.th_teams_size.nteams == 1)) {
2400     master_th->th.th_ident = loc;
2401     // only one notification scheme (either "submit" or "forking/joined", not
2402     // both)
2403     if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2404         __kmp_forkjoin_frames_mode == 3)
2405       __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2406                              master_th->th.th_frame_time, 0, loc,
2407                              master_th->th.th_team_nproc, 1);
2408     else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2409              !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2410       __kmp_itt_region_joined(gtid);
2411   } // active_level == 1
2412 #endif /* USE_ITT_BUILD */
2413 
2414   if (master_th->th.th_teams_microtask && !exit_teams &&
2415       team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2416       team->t.t_level == master_th->th.th_teams_level + 1) {
2417 // AC: We need to leave the team structure intact at the end of a parallel
2418 // inside the teams construct, so that the same (hot) team works at the next
2419 // parallel; only adjust the nesting levels
2420 #if OMPT_SUPPORT
2421     ompt_data_t ompt_parallel_data = ompt_data_none;
2422     if (ompt_enabled.enabled) {
2423       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2424       if (ompt_enabled.ompt_callback_implicit_task) {
2425         int ompt_team_size = team->t.t_nproc;
2426         ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2427             ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2428             OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2429       }
2430       task_info->frame.exit_frame = ompt_data_none;
2431       task_info->task_data = ompt_data_none;
2432       ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2433       __ompt_lw_taskteam_unlink(master_th);
2434     }
2435 #endif
2436     /* Decrement our nested depth level */
2437     team->t.t_level--;
2438     team->t.t_active_level--;
2439     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2440 
2441     // Restore number of threads in the team if needed. This code relies on
2442     // the proper adjustment of th_teams_size.nth after the fork in
2443     // __kmp_teams_master on each teams primary thread in the case that
2444     // __kmp_reserve_threads reduced it.
2445     if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2446       int old_num = master_th->th.th_team_nproc;
2447       int new_num = master_th->th.th_teams_size.nth;
2448       kmp_info_t **other_threads = team->t.t_threads;
2449       team->t.t_nproc = new_num;
2450       for (int i = 0; i < old_num; ++i) {
2451         other_threads[i]->th.th_team_nproc = new_num;
2452       }
2453       // Adjust the state of the unused threads of the team
2454       for (int i = old_num; i < new_num; ++i) {
2455         // Re-initialize thread's barrier data.
2456         KMP_DEBUG_ASSERT(other_threads[i]);
2457         kmp_balign_t *balign = other_threads[i]->th.th_bar;
2458         for (int b = 0; b < bs_last_barrier; ++b) {
2459           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2460           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2461 #if USE_DEBUGGER
2462           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2463 #endif
2464         }
2465         if (__kmp_tasking_mode != tskm_immediate_exec) {
2466           // Synchronize thread's task state
2467           other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2468         }
2469       }
2470     }
2471 
2472 #if OMPT_SUPPORT
2473     if (ompt_enabled.enabled) {
2474       __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2475                       OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2476     }
2477 #endif
2478 
2479     return;
2480   }
2481 
2482   /* do cleanup and restore the parent team */
2483   master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2484   master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2485 
2486   master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2487 
2488   /* jc: The following lock has instructions with REL and ACQ semantics,
2489      separating the parallel user code called in this parallel region
2490      from the serial user code called after this function returns. */
2491   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2492 
2493   if (!master_th->th.th_teams_microtask ||
2494       team->t.t_level > master_th->th.th_teams_level) {
2495     /* Decrement our nested depth level */
2496     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2497   }
2498   KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2499 
2500 #if OMPT_SUPPORT
2501   if (ompt_enabled.enabled) {
2502     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2503     if (ompt_enabled.ompt_callback_implicit_task) {
2504       int flags = (team_microtask == (void *)__kmp_teams_master)
2505                       ? ompt_task_initial
2506                       : ompt_task_implicit;
2507       int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2508       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2509           ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2510           OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2511     }
2512     task_info->frame.exit_frame = ompt_data_none;
2513     task_info->task_data = ompt_data_none;
2514   }
2515 #endif
2516 
2517   KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2518                 master_th, team));
2519   __kmp_pop_current_task_from_thread(master_th);
2520 
2521 #if KMP_AFFINITY_SUPPORTED
2522   // Restore master thread's partition.
2523   master_th->th.th_first_place = team->t.t_first_place;
2524   master_th->th.th_last_place = team->t.t_last_place;
2525 #endif // KMP_AFFINITY_SUPPORTED
2526   master_th->th.th_def_allocator = team->t.t_def_allocator;
2527 
2528 #if OMPD_SUPPORT
2529   if (ompd_state & OMPD_ENABLE_BP)
2530     ompd_bp_parallel_end();
2531 #endif
2532   updateHWFPControl(team);
2533 
2534   if (root->r.r_active != master_active)
2535     root->r.r_active = master_active;
2536 
2537   __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2538                             master_th)); // this will free worker threads
2539 
2540   /* This race was fun to find. Make sure the following is in the critical
2541      region; otherwise assertions may fail occasionally since the old team may
2542      be reallocated and the hierarchy appears inconsistent. It is actually safe
2543      to run and won't cause any bugs, just those assertion failures. It's only
2544      one deref & assign, so we might as well keep it in the critical region. */
2545   master_th->th.th_team = parent_team;
2546   master_th->th.th_team_nproc = parent_team->t.t_nproc;
2547   master_th->th.th_team_master = parent_team->t.t_threads[0];
2548   master_th->th.th_team_serialized = parent_team->t.t_serialized;
2549 
2550   /* restore serialized team, if need be */
2551   if (parent_team->t.t_serialized &&
2552       parent_team != master_th->th.th_serial_team &&
2553       parent_team != root->r.r_root_team) {
2554     __kmp_free_team(root,
2555                     master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2556     master_th->th.th_serial_team = parent_team;
2557   }
2558 
2559   if (__kmp_tasking_mode != tskm_immediate_exec) {
2560     if (master_th->th.th_task_state_top >
2561         0) { // Restore task state from memo stack
2562       KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2563       // Remember primary thread's state if we re-use this nested hot team
2564       master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2565           master_th->th.th_task_state;
2566       --master_th->th.th_task_state_top; // pop
2567       // Now restore state at this level
2568       master_th->th.th_task_state =
2569           master_th->th
2570               .th_task_state_memo_stack[master_th->th.th_task_state_top];
2571     }
2572     // Copy the task team from the parent team to the primary thread
2573     master_th->th.th_task_team =
2574         parent_team->t.t_task_team[master_th->th.th_task_state];
2575     KA_TRACE(20,
2576              ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
2577               __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2578               parent_team));
2579   }
2580 
2581   // TODO: GEH - cannot do this assertion because root thread not set up as
2582   // executing
2583   // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2584   master_th->th.th_current_task->td_flags.executing = 1;
2585 
2586   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2587 
2588 #if OMPT_SUPPORT
2589   int flags =
2590       OMPT_INVOKER(fork_context) |
2591       ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2592                                                       : ompt_parallel_team);
2593   if (ompt_enabled.enabled) {
2594     __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2595                     codeptr);
2596   }
2597 #endif
2598 
2599   KMP_MB();
2600   KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2601 }
2602 
2603 /* Check whether we should push an internal control record onto the
2604    serial team stack.  If so, do it.  */
2605 void __kmp_save_internal_controls(kmp_info_t *thread) {
2606 
2607   if (thread->th.th_team != thread->th.th_serial_team) {
2608     return;
2609   }
2610   if (thread->th.th_team->t.t_serialized > 1) {
2611     int push = 0;
2612 
2613     if (thread->th.th_team->t.t_control_stack_top == NULL) {
2614       push = 1;
2615     } else {
2616       if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2617           thread->th.th_team->t.t_serialized) {
2618         push = 1;
2619       }
2620     }
2621     if (push) { /* push a record on the serial team's stack */
2622       kmp_internal_control_t *control =
2623           (kmp_internal_control_t *)__kmp_allocate(
2624               sizeof(kmp_internal_control_t));
2625 
2626       copy_icvs(control, &thread->th.th_current_task->td_icvs);
2627 
2628       control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2629 
2630       control->next = thread->th.th_team->t.t_control_stack_top;
2631       thread->th.th_team->t.t_control_stack_top = control;
2632     }
2633   }
2634 }
2635 
2636 /* Changes set_nproc */
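// Backs omp_set_num_threads(): clamps the request to [1, __kmp_max_nth],
// saves the current internal controls, updates the nproc ICV, and may shrink
// the root's hot team immediately.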
2637 void __kmp_set_num_threads(int new_nth, int gtid) {
2638   kmp_info_t *thread;
2639   kmp_root_t *root;
2640 
2641   KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2642   KMP_DEBUG_ASSERT(__kmp_init_serial);
2643 
2644   if (new_nth < 1)
2645     new_nth = 1;
2646   else if (new_nth > __kmp_max_nth)
2647     new_nth = __kmp_max_nth;
2648 
2649   KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2650   thread = __kmp_threads[gtid];
2651   if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2652     return; // nothing to do
2653 
2654   __kmp_save_internal_controls(thread);
2655 
2656   set__nproc(thread, new_nth);
2657 
2658   // If this omp_set_num_threads() call will cause the hot team size to be
2659   // reduced (in the absence of a num_threads clause), then reduce it now,
2660   // rather than waiting for the next parallel region.
2661   root = thread->th.th_root;
2662   if (__kmp_init_parallel && (!root->r.r_active) &&
2663       (root->r.r_hot_team->t.t_nproc > new_nth)
2664 #if KMP_NESTED_HOT_TEAMS
2665       && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2666 #endif
2667   ) {
2668     kmp_team_t *hot_team = root->r.r_hot_team;
2669     int f;
2670 
2671     __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2672 
2673     if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2674       __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth);
2675     }
2676     // Release the extra threads we don't need any more.
2677     for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2678       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2679       if (__kmp_tasking_mode != tskm_immediate_exec) {
2680         // When decreasing team size, threads no longer in the team should unref
2681         // task team.
2682         hot_team->t.t_threads[f]->th.th_task_team = NULL;
2683       }
2684       __kmp_free_thread(hot_team->t.t_threads[f]);
2685       hot_team->t.t_threads[f] = NULL;
2686     }
2687     hot_team->t.t_nproc = new_nth;
2688 #if KMP_NESTED_HOT_TEAMS
2689     if (thread->th.th_hot_teams) {
2690       KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2691       thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2692     }
2693 #endif
2694 
2695     if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2696       hot_team->t.b->update_num_threads(new_nth);
2697       __kmp_add_threads_to_team(hot_team, new_nth);
2698     }
2699 
2700     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2701 
2702     // Update the t_nproc field in the threads that are still active.
2703     for (f = 0; f < new_nth; f++) {
2704       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2705       hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2706     }
    // Special flag to indicate that the team size was changed by an
    // omp_set_num_threads() call
2708     hot_team->t.t_size_changed = -1;
2709   }
2710 }
2711 
2712 /* Changes max_active_levels */
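/* Validation summary (matches the checks below): a negative request is
   ignored with a warning and the previous value is kept; a request above
   KMP_MAX_ACTIVE_LEVELS_LIMIT is clamped to that limit; zero is accepted.
   This routine is typically reached from the omp_set_max_active_levels()
   entry point, roughly (a sketch, assuming the usual gtid lookup):

     void omp_set_max_active_levels(int levels) {
       __kmp_set_max_active_levels(__kmp_entry_gtid(), levels);
     }
*/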
2713 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2714   kmp_info_t *thread;
2715 
2716   KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2717                 "%d = (%d)\n",
2718                 gtid, max_active_levels));
2719   KMP_DEBUG_ASSERT(__kmp_init_serial);
2720 
2721   // validate max_active_levels
2722   if (max_active_levels < 0) {
2723     KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2724     // We ignore this call if the user has specified a negative value.
2725     // The current setting won't be changed. The last valid setting will be
2726     // used. A warning will be issued (if warnings are allowed as controlled by
2727     // the KMP_WARNINGS env var).
2728     KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2729                   "max_active_levels for thread %d = (%d)\n",
2730                   gtid, max_active_levels));
2731     return;
2732   }
2733   if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
    // max_active_levels is within the valid range:
    // [ 0; KMP_MAX_ACTIVE_LEVELS_LIMIT ]
    // We allow a zero value (implementation-defined behavior).
2737   } else {
2738     KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2739                 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2740     max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
    // The current upper limit is MAX_INT (implementation-defined behavior).
    // If the input exceeds the upper limit, we clamp the input to the upper
    // limit (implementation-defined behavior).
    // In practice, control should never reach this branch while the upper
    // limit is MAX_INT.
2745   }
2746   KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2747                 "max_active_levels for thread %d = (%d)\n",
2748                 gtid, max_active_levels));
2749 
2750   thread = __kmp_threads[gtid];
2751 
2752   __kmp_save_internal_controls(thread);
2753 
2754   set__max_active_levels(thread, max_active_levels);
2755 }
2756 
2757 /* Gets max_active_levels */
2758 int __kmp_get_max_active_levels(int gtid) {
2759   kmp_info_t *thread;
2760 
2761   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2762   KMP_DEBUG_ASSERT(__kmp_init_serial);
2763 
2764   thread = __kmp_threads[gtid];
2765   KMP_DEBUG_ASSERT(thread->th.th_current_task);
2766   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2767                 "curtask_maxaclevel=%d\n",
2768                 gtid, thread->th.th_current_task,
2769                 thread->th.th_current_task->td_icvs.max_active_levels));
2770   return thread->th.th_current_task->td_icvs.max_active_levels;
2771 }
2772 
2773 // nteams-var per-device ICV
2774 void __kmp_set_num_teams(int num_teams) {
2775   if (num_teams > 0)
2776     __kmp_nteams = num_teams;
2777 }
2778 int __kmp_get_max_teams(void) { return __kmp_nteams; }
2779 // teams-thread-limit-var per-device ICV
2780 void __kmp_set_teams_thread_limit(int limit) {
2781   if (limit > 0)
2782     __kmp_teams_thread_limit = limit;
2783 }
2784 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
2785 
2786 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2787 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2788 
2789 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2790 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2791   kmp_info_t *thread;
2792   kmp_sched_t orig_kind;
2793   //    kmp_team_t *team;
2794 
2795   KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2796                 gtid, (int)kind, chunk));
2797   KMP_DEBUG_ASSERT(__kmp_init_serial);
2798 
2799   // Check if the kind parameter is valid, correct if needed.
2800   // Valid parameters should fit in one of two intervals - standard or extended:
2801   //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2802   // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
2803   orig_kind = kind;
2804   kind = __kmp_sched_without_mods(kind);
2805 
2806   if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2807       (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2808     // TODO: Hint needs attention in case we change the default schedule.
2809     __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2810               KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2811               __kmp_msg_null);
2812     kind = kmp_sched_default;
2813     chunk = 0; // ignore chunk value in case of bad kind
2814   }
2815 
2816   thread = __kmp_threads[gtid];
2817 
2818   __kmp_save_internal_controls(thread);
2819 
2820   if (kind < kmp_sched_upper_std) {
2821     if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
      // differentiate static chunked vs. unchunked: an invalid chunk value
      // indicates the unchunked schedule (which is the default)
2824       thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2825     } else {
2826       thread->th.th_current_task->td_icvs.sched.r_sched_type =
2827           __kmp_sch_map[kind - kmp_sched_lower - 1];
2828     }
2829   } else {
2830     //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2831     //    kmp_sched_lower - 2 ];
2832     thread->th.th_current_task->td_icvs.sched.r_sched_type =
2833         __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2834                       kmp_sched_lower - 2];
2835   }
2836   __kmp_sched_apply_mods_intkind(
2837       orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2838   if (kind == kmp_sched_auto || chunk < 1) {
2839     // ignore parameter chunk for schedule auto
2840     thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2841   } else {
2842     thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2843   }
2844 }
2845 
2846 /* Gets def_sched_var ICV values */
2847 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2848   kmp_info_t *thread;
2849   enum sched_type th_type;
2850 
2851   KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2852   KMP_DEBUG_ASSERT(__kmp_init_serial);
2853 
2854   thread = __kmp_threads[gtid];
2855 
2856   th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2857   switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2858   case kmp_sch_static:
2859   case kmp_sch_static_greedy:
2860   case kmp_sch_static_balanced:
2861     *kind = kmp_sched_static;
2862     __kmp_sched_apply_mods_stdkind(kind, th_type);
2863     *chunk = 0; // chunk was not set, try to show this fact via zero value
2864     return;
2865   case kmp_sch_static_chunked:
2866     *kind = kmp_sched_static;
2867     break;
2868   case kmp_sch_dynamic_chunked:
2869     *kind = kmp_sched_dynamic;
2870     break;
2871   case kmp_sch_guided_chunked:
2872   case kmp_sch_guided_iterative_chunked:
2873   case kmp_sch_guided_analytical_chunked:
2874     *kind = kmp_sched_guided;
2875     break;
2876   case kmp_sch_auto:
2877     *kind = kmp_sched_auto;
2878     break;
2879   case kmp_sch_trapezoidal:
2880     *kind = kmp_sched_trapezoidal;
2881     break;
2882 #if KMP_STATIC_STEAL_ENABLED
2883   case kmp_sch_static_steal:
2884     *kind = kmp_sched_static_steal;
2885     break;
2886 #endif
2887   default:
2888     KMP_FATAL(UnknownSchedulingType, th_type);
2889   }
2890 
2891   __kmp_sched_apply_mods_stdkind(kind, th_type);
2892   *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2893 }
2894 
2895 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2896 
2897   int ii, dd;
2898   kmp_team_t *team;
2899   kmp_info_t *thr;
2900 
2901   KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2902   KMP_DEBUG_ASSERT(__kmp_init_serial);
2903 
2904   // validate level
2905   if (level == 0)
2906     return 0;
2907   if (level < 0)
2908     return -1;
2909   thr = __kmp_threads[gtid];
2910   team = thr->th.th_team;
2911   ii = team->t.t_level;
2912   if (level > ii)
2913     return -1;
2914 
2915   if (thr->th.th_teams_microtask) {
    // AC: we are in a teams region where nested teams share the same level
2917     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2918     if (level <=
2919         tlevel) { // otherwise usual algorithm works (will not touch the teams)
2920       KMP_DEBUG_ASSERT(ii >= tlevel);
      // AC: since we must pass over the teams league, we artificially
      // increase ii
2923       if (ii == tlevel) {
2924         ii += 2; // three teams have same level
2925       } else {
2926         ii++; // two teams have same level
2927       }
2928     }
2929   }
2930 
2931   if (ii == level)
2932     return __kmp_tid_from_gtid(gtid);
2933 
2934   dd = team->t.t_serialized;
2935   level++;
2936   while (ii > level) {
2937     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2938     }
2939     if ((team->t.t_serialized) && (!dd)) {
2940       team = team->t.t_parent;
2941       continue;
2942     }
2943     if (ii > level) {
2944       team = team->t.t_parent;
2945       dd = team->t.t_serialized;
2946       ii--;
2947     }
2948   }
2949 
2950   return (dd > 1) ? (0) : (team->t.t_master_tid);
2951 }
2952 
2953 int __kmp_get_team_size(int gtid, int level) {
2954 
2955   int ii, dd;
2956   kmp_team_t *team;
2957   kmp_info_t *thr;
2958 
2959   KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2960   KMP_DEBUG_ASSERT(__kmp_init_serial);
2961 
2962   // validate level
2963   if (level == 0)
2964     return 1;
2965   if (level < 0)
2966     return -1;
2967   thr = __kmp_threads[gtid];
2968   team = thr->th.th_team;
2969   ii = team->t.t_level;
2970   if (level > ii)
2971     return -1;
2972 
2973   if (thr->th.th_teams_microtask) {
    // AC: we are in a teams region where nested teams share the same level
2975     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2976     if (level <=
2977         tlevel) { // otherwise usual algorithm works (will not touch the teams)
2978       KMP_DEBUG_ASSERT(ii >= tlevel);
      // AC: since we must pass over the teams league, we artificially
      // increase ii
2981       if (ii == tlevel) {
2982         ii += 2; // three teams have same level
2983       } else {
2984         ii++; // two teams have same level
2985       }
2986     }
2987   }
2988 
2989   while (ii > level) {
2990     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2991     }
2992     if (team->t.t_serialized && (!dd)) {
2993       team = team->t.t_parent;
2994       continue;
2995     }
2996     if (ii > level) {
2997       team = team->t.t_parent;
2998       ii--;
2999     }
3000   }
3001 
3002   return team->t.t_nproc;
3003 }
3004 
3005 kmp_r_sched_t __kmp_get_schedule_global() {
  // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
  // independently, so the up-to-date schedule can be obtained here.
3009 
3010   kmp_r_sched_t r_sched;
3011 
3012   // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
3013   // __kmp_guided. __kmp_sched should keep original value, so that user can set
3014   // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
3015   // different roots (even in OMP 2.5)
3016   enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
3017   enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
3018   if (s == kmp_sch_static) {
3019     // replace STATIC with more detailed schedule (balanced or greedy)
3020     r_sched.r_sched_type = __kmp_static;
3021   } else if (s == kmp_sch_guided_chunked) {
3022     // replace GUIDED with more detailed schedule (iterative or analytical)
3023     r_sched.r_sched_type = __kmp_guided;
3024   } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3025     r_sched.r_sched_type = __kmp_sched;
3026   }
3027   SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
3028 
3029   if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
3030     // __kmp_chunk may be wrong here (if it was not ever set)
3031     r_sched.chunk = KMP_DEFAULT_CHUNK;
3032   } else {
3033     r_sched.chunk = __kmp_chunk;
3034   }
3035 
3036   return r_sched;
3037 }
3038 
/* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
3040    at least argc number of *t_argv entries for the requested team. */
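/* Growth policy implied by the code below: up to KMP_INLINE_ARGV_ENTRIES
   arguments reuse inline storage within the team structure; larger requests
   allocate max(KMP_MIN_MALLOC_ARGV_ENTRIES, 2 * argc) heap entries so that
   repeated small increases do not force a reallocation on every call. */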
3041 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3042 
3043   KMP_DEBUG_ASSERT(team);
3044   if (!realloc || argc > team->t.t_max_argc) {
3045 
3046     KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3047                    "current entries=%d\n",
3048                    team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3049     /* if previously allocated heap space for args, free them */
3050     if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3051       __kmp_free((void *)team->t.t_argv);
3052 
3053     if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3054       /* use unused space in the cache line for arguments */
3055       team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3056       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3057                      "argv entries\n",
3058                      team->t.t_id, team->t.t_max_argc));
3059       team->t.t_argv = &team->t.t_inline_argv[0];
3060       if (__kmp_storage_map) {
3061         __kmp_print_storage_map_gtid(
3062             -1, &team->t.t_inline_argv[0],
3063             &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3064             (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3065             team->t.t_id);
3066       }
3067     } else {
3068       /* allocate space for arguments in the heap */
3069       team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3070                                ? KMP_MIN_MALLOC_ARGV_ENTRIES
3071                                : 2 * argc;
3072       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3073                      "argv entries\n",
3074                      team->t.t_id, team->t.t_max_argc));
3075       team->t.t_argv =
3076           (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3077       if (__kmp_storage_map) {
3078         __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3079                                      &team->t.t_argv[team->t.t_max_argc],
3080                                      sizeof(void *) * team->t.t_max_argc,
3081                                      "team_%d.t_argv", team->t.t_id);
3082       }
3083     }
3084   }
3085 }
3086 
3087 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3088   int i;
3089   int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3090   team->t.t_threads =
3091       (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3092   team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3093       sizeof(dispatch_shared_info_t) * num_disp_buff);
3094   team->t.t_dispatch =
3095       (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3096   team->t.t_implicit_task_taskdata =
3097       (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3098   team->t.t_max_nproc = max_nth;
3099 
3100   /* setup dispatch buffers */
3101   for (i = 0; i < num_disp_buff; ++i) {
3102     team->t.t_disp_buffer[i].buffer_index = i;
3103     team->t.t_disp_buffer[i].doacross_buf_idx = i;
3104   }
3105 }
3106 
3107 static void __kmp_free_team_arrays(kmp_team_t *team) {
3108   /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3109   int i;
3110   for (i = 0; i < team->t.t_max_nproc; ++i) {
3111     if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3112       __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3113       team->t.t_dispatch[i].th_disp_buffer = NULL;
3114     }
3115   }
3116 #if KMP_USE_HIER_SCHED
3117   __kmp_dispatch_free_hierarchies(team);
3118 #endif
3119   __kmp_free(team->t.t_threads);
3120   __kmp_free(team->t.t_disp_buffer);
3121   __kmp_free(team->t.t_dispatch);
3122   __kmp_free(team->t.t_implicit_task_taskdata);
3123   team->t.t_threads = NULL;
3124   team->t.t_disp_buffer = NULL;
3125   team->t.t_dispatch = NULL;
3126   team->t.t_implicit_task_taskdata = 0;
3127 }
3128 
3129 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3130   kmp_info_t **oldThreads = team->t.t_threads;
3131 
3132   __kmp_free(team->t.t_disp_buffer);
3133   __kmp_free(team->t.t_dispatch);
3134   __kmp_free(team->t.t_implicit_task_taskdata);
3135   __kmp_allocate_team_arrays(team, max_nth);
3136 
3137   KMP_MEMCPY(team->t.t_threads, oldThreads,
3138              team->t.t_nproc * sizeof(kmp_info_t *));
3139 
3140   __kmp_free(oldThreads);
3141 }
3142 
3143 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3144 
3145   kmp_r_sched_t r_sched =
3146       __kmp_get_schedule_global(); // get current state of scheduling globals
3147 
3148   KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3149 
3150   kmp_internal_control_t g_icvs = {
3151     0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3152     (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3153     // adjustment of threads (per thread)
3154     (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3155     // whether blocktime is explicitly set
3156     __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3157 #if KMP_USE_MONITOR
3158     __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3159 // intervals
3160 #endif
3161     __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3162     // next parallel region (per thread)
    // (use a max upper bound on the value if __kmp_parallel_initialize has
    // not been called yet)
3164     __kmp_cg_max_nth, // int thread_limit;
3165     __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3166     // for max_active_levels
3167     r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3168     // {sched,chunk} pair
3169     __kmp_nested_proc_bind.bind_types[0],
3170     __kmp_default_device,
3171     NULL // struct kmp_internal_control *next;
3172   };
3173 
3174   return g_icvs;
3175 }
3176 
3177 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3178 
3179   kmp_internal_control_t gx_icvs;
  gx_icvs.serial_nesting_level =
      0; // probably =team->t.t_serialized as in __kmp_save_internal_controls
3182   copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3183   gx_icvs.next = NULL;
3184 
3185   return gx_icvs;
3186 }
3187 
3188 static void __kmp_initialize_root(kmp_root_t *root) {
3189   int f;
3190   kmp_team_t *root_team;
3191   kmp_team_t *hot_team;
3192   int hot_team_max_nth;
3193   kmp_r_sched_t r_sched =
3194       __kmp_get_schedule_global(); // get current state of scheduling globals
3195   kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3196   KMP_DEBUG_ASSERT(root);
3197   KMP_ASSERT(!root->r.r_begin);
3198 
3199   /* setup the root state structure */
3200   __kmp_init_lock(&root->r.r_begin_lock);
3201   root->r.r_begin = FALSE;
3202   root->r.r_active = FALSE;
3203   root->r.r_in_parallel = 0;
3204   root->r.r_blocktime = __kmp_dflt_blocktime;
3205 #if KMP_AFFINITY_SUPPORTED
3206   root->r.r_affinity_assigned = FALSE;
3207 #endif
3208 
3209   /* setup the root team for this task */
3210   /* allocate the root team structure */
3211   KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3212 
3213   root_team =
3214       __kmp_allocate_team(root,
3215                           1, // new_nproc
3216                           1, // max_nproc
3217 #if OMPT_SUPPORT
3218                           ompt_data_none, // root parallel id
3219 #endif
3220                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3221                           0 // argc
3222                           USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3223                           );
3224 #if USE_DEBUGGER
3225   // Non-NULL value should be assigned to make the debugger display the root
3226   // team.
3227   TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3228 #endif
3229 
3230   KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3231 
3232   root->r.r_root_team = root_team;
3233   root_team->t.t_control_stack_top = NULL;
3234 
3235   /* initialize root team */
3236   root_team->t.t_threads[0] = NULL;
3237   root_team->t.t_nproc = 1;
3238   root_team->t.t_serialized = 1;
3239   // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3240   root_team->t.t_sched.sched = r_sched.sched;
3241   KA_TRACE(
3242       20,
3243       ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3244        root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3245 
3246   /* setup the  hot team for this task */
3247   /* allocate the hot team structure */
3248   KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3249 
3250   hot_team =
3251       __kmp_allocate_team(root,
3252                           1, // new_nproc
3253                           __kmp_dflt_team_nth_ub * 2, // max_nproc
3254 #if OMPT_SUPPORT
3255                           ompt_data_none, // root parallel id
3256 #endif
3257                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3258                           0 // argc
3259                           USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3260                           );
3261   KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3262 
3263   root->r.r_hot_team = hot_team;
3264   root_team->t.t_control_stack_top = NULL;
3265 
3266   /* first-time initialization */
3267   hot_team->t.t_parent = root_team;
3268 
3269   /* initialize hot team */
3270   hot_team_max_nth = hot_team->t.t_max_nproc;
3271   for (f = 0; f < hot_team_max_nth; ++f) {
3272     hot_team->t.t_threads[f] = NULL;
3273   }
3274   hot_team->t.t_nproc = 1;
3275   // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3276   hot_team->t.t_sched.sched = r_sched.sched;
3277   hot_team->t.t_size_changed = 0;
3278 }
3279 
3280 #ifdef KMP_DEBUG
3281 
3282 typedef struct kmp_team_list_item {
3283   kmp_team_p const *entry;
3284   struct kmp_team_list_item *next;
3285 } kmp_team_list_item_t;
3286 typedef kmp_team_list_item_t *kmp_team_list_t;
3287 
3288 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3289     kmp_team_list_t list, // List of teams.
3290     kmp_team_p const *team // Team to add.
3291 ) {
3292 
3293   // List must terminate with item where both entry and next are NULL.
3294   // Team is added to the list only once.
3295   // List is sorted in ascending order by team id.
3296   // Team id is *not* a key.
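  // Concretely, insertion copies the contents of the node at the insertion
  // point into a freshly allocated item linked right after it, then
  // overwrites the node in place with the new team; this keeps the
  // NULL/NULL sentinel at the tail without a separate tail pointer.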
3297 
3298   kmp_team_list_t l;
3299 
3300   KMP_DEBUG_ASSERT(list != NULL);
3301   if (team == NULL) {
3302     return;
3303   }
3304 
3305   __kmp_print_structure_team_accum(list, team->t.t_parent);
3306   __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3307 
3308   // Search list for the team.
3309   l = list;
3310   while (l->next != NULL && l->entry != team) {
3311     l = l->next;
3312   }
3313   if (l->next != NULL) {
3314     return; // Team has been added before, exit.
3315   }
3316 
3317   // Team is not found. Search list again for insertion point.
3318   l = list;
3319   while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3320     l = l->next;
3321   }
3322 
3323   // Insert team.
3324   {
3325     kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3326         sizeof(kmp_team_list_item_t));
3327     *item = *l;
3328     l->entry = team;
3329     l->next = item;
3330   }
3331 }
3332 
static void __kmp_print_structure_team(char const *title,
                                       kmp_team_p const *team) {
3336   __kmp_printf("%s", title);
3337   if (team != NULL) {
3338     __kmp_printf("%2x %p\n", team->t.t_id, team);
3339   } else {
3340     __kmp_printf(" - (nil)\n");
3341   }
3342 }
3343 
3344 static void __kmp_print_structure_thread(char const *title,
3345                                          kmp_info_p const *thread) {
3346   __kmp_printf("%s", title);
3347   if (thread != NULL) {
3348     __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3349   } else {
3350     __kmp_printf(" - (nil)\n");
3351   }
3352 }
3353 
3354 void __kmp_print_structure(void) {
3355 
3356   kmp_team_list_t list;
3357 
3358   // Initialize list of teams.
3359   list =
3360       (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3361   list->entry = NULL;
3362   list->next = NULL;
3363 
3364   __kmp_printf("\n------------------------------\nGlobal Thread "
3365                "Table\n------------------------------\n");
3366   {
3367     int gtid;
3368     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3369       __kmp_printf("%2d", gtid);
3370       if (__kmp_threads != NULL) {
3371         __kmp_printf(" %p", __kmp_threads[gtid]);
3372       }
3373       if (__kmp_root != NULL) {
3374         __kmp_printf(" %p", __kmp_root[gtid]);
3375       }
3376       __kmp_printf("\n");
3377     }
3378   }
3379 
3380   // Print out __kmp_threads array.
3381   __kmp_printf("\n------------------------------\nThreads\n--------------------"
3382                "----------\n");
3383   if (__kmp_threads != NULL) {
3384     int gtid;
3385     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3386       kmp_info_t const *thread = __kmp_threads[gtid];
3387       if (thread != NULL) {
3388         __kmp_printf("GTID %2d %p:\n", gtid, thread);
3389         __kmp_printf("    Our Root:        %p\n", thread->th.th_root);
3390         __kmp_print_structure_team("    Our Team:     ", thread->th.th_team);
3391         __kmp_print_structure_team("    Serial Team:  ",
3392                                    thread->th.th_serial_team);
3393         __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
3394         __kmp_print_structure_thread("    Primary:      ",
3395                                      thread->th.th_team_master);
3396         __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
3397         __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
3398         __kmp_printf("    Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3399         __kmp_print_structure_thread("    Next in pool: ",
3400                                      thread->th.th_next_pool);
3401         __kmp_printf("\n");
3402         __kmp_print_structure_team_accum(list, thread->th.th_team);
3403         __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3404       }
3405     }
3406   } else {
3407     __kmp_printf("Threads array is not allocated.\n");
3408   }
3409 
3410   // Print out __kmp_root array.
3411   __kmp_printf("\n------------------------------\nUbers\n----------------------"
3412                "--------\n");
3413   if (__kmp_root != NULL) {
3414     int gtid;
3415     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3416       kmp_root_t const *root = __kmp_root[gtid];
3417       if (root != NULL) {
3418         __kmp_printf("GTID %2d %p:\n", gtid, root);
3419         __kmp_print_structure_team("    Root Team:    ", root->r.r_root_team);
3420         __kmp_print_structure_team("    Hot Team:     ", root->r.r_hot_team);
3421         __kmp_print_structure_thread("    Uber Thread:  ",
3422                                      root->r.r_uber_thread);
3423         __kmp_printf("    Active?:      %2d\n", root->r.r_active);
3424         __kmp_printf("    In Parallel:  %2d\n",
3425                      KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3426         __kmp_printf("\n");
3427         __kmp_print_structure_team_accum(list, root->r.r_root_team);
3428         __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3429       }
3430     }
3431   } else {
3432     __kmp_printf("Ubers array is not allocated.\n");
3433   }
3434 
3435   __kmp_printf("\n------------------------------\nTeams\n----------------------"
3436                "--------\n");
3437   while (list->next != NULL) {
3438     kmp_team_p const *team = list->entry;
3439     int i;
3440     __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3441     __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
3442     __kmp_printf("    Primary TID:      %2d\n", team->t.t_master_tid);
3443     __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
3444     __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
3445     __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
3446     for (i = 0; i < team->t.t_nproc; ++i) {
3447       __kmp_printf("    Thread %2d:      ", i);
3448       __kmp_print_structure_thread("", team->t.t_threads[i]);
3449     }
3450     __kmp_print_structure_team("    Next in pool:     ", team->t.t_next_pool);
3451     __kmp_printf("\n");
3452     list = list->next;
3453   }
3454 
3455   // Print out __kmp_thread_pool and __kmp_team_pool.
3456   __kmp_printf("\n------------------------------\nPools\n----------------------"
3457                "--------\n");
3458   __kmp_print_structure_thread("Thread pool:          ",
3459                                CCAST(kmp_info_t *, __kmp_thread_pool));
3460   __kmp_print_structure_team("Team pool:            ",
3461                              CCAST(kmp_team_t *, __kmp_team_pool));
3462   __kmp_printf("\n");
3463 
3464   // Free team list.
3465   while (list != NULL) {
3466     kmp_team_list_item_t *item = list;
3467     list = list->next;
3468     KMP_INTERNAL_FREE(item);
3469   }
3470 }
3471 
3472 #endif
3473 
3474 //---------------------------------------------------------------------------
3475 //  Stuff for per-thread fast random number generator
3476 //  Table of primes
3477 static const unsigned __kmp_primes[] = {
3478     0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3479     0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3480     0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3481     0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3482     0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3483     0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3484     0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3485     0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3486     0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3487     0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3488     0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3489 
3490 //---------------------------------------------------------------------------
3491 //  __kmp_get_random: Get a random number using a linear congruential method.
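//  The code below is the textbook LCG x_{n+1} = a * x_n + 1 (mod 2^32) with a
//  per-thread multiplier 'a' drawn from __kmp_primes; only the high 16 bits
//  of the state are returned, since the low-order bits of such a generator
//  are the least random.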
3492 unsigned short __kmp_get_random(kmp_info_t *thread) {
3493   unsigned x = thread->th.th_x;
3494   unsigned short r = (unsigned short)(x >> 16);
3495 
3496   thread->th.th_x = x * thread->th.th_a + 1;
3497 
3498   KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3499                 thread->th.th_info.ds.ds_tid, r));
3500 
3501   return r;
3502 }
3503 //--------------------------------------------------------
3504 // __kmp_init_random: Initialize a random number generator
3505 void __kmp_init_random(kmp_info_t *thread) {
3506   unsigned seed = thread->th.th_info.ds.ds_tid;
3507 
3508   thread->th.th_a =
3509       __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3510   thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3511   KA_TRACE(30,
3512            ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3513 }
3514 
3515 #if KMP_OS_WINDOWS
3516 /* reclaim array entries for root threads that are already dead, returns number
3517  * reclaimed */
3518 static int __kmp_reclaim_dead_roots(void) {
3519   int i, r = 0;
3520 
3521   for (i = 0; i < __kmp_threads_capacity; ++i) {
3522     if (KMP_UBER_GTID(i) &&
3523         !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3524         !__kmp_root[i]
3525              ->r.r_active) { // AC: reclaim only roots died in non-active state
3526       r += __kmp_unregister_root_other_thread(i);
3527     }
3528   }
3529   return r;
3530 }
3531 #endif
3532 
3533 /* This function attempts to create free entries in __kmp_threads and
3534    __kmp_root, and returns the number of free entries generated.
3535 
3536    For Windows* OS static library, the first mechanism used is to reclaim array
3537    entries for root threads that are already dead.
3538 
   On all platforms, expansion is attempted on the arrays __kmp_threads and
3540    __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3541    capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3542    threadprivate cache array has been created. Synchronization with
3543    __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3544 
3545    After any dead root reclamation, if the clipping value allows array expansion
3546    to result in the generation of a total of nNeed free slots, the function does
3547    that expansion. If not, nothing is done beyond the possible initial root
3548    thread reclamation.
3549 
3550    If any argument is negative, the behavior is undefined. */
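/* Sizing example implied by the doubling loop below (assuming
   __kmp_sys_max_nth is large enough): with a current capacity of 64 and
   nNeed == 3, newCapacity doubles once, 64 -> 128 >= 67; the capacity is
   never grown past __kmp_sys_max_nth. */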
3551 static int __kmp_expand_threads(int nNeed) {
3552   int added = 0;
3553   int minimumRequiredCapacity;
3554   int newCapacity;
3555   kmp_info_t **newThreads;
3556   kmp_root_t **newRoot;
3557 
3558   // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3559   // resizing __kmp_threads does not need additional protection if foreign
3560   // threads are present
3561 
3562 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3563   /* only for Windows static library */
3564   /* reclaim array entries for root threads that are already dead */
3565   added = __kmp_reclaim_dead_roots();
3566 
3567   if (nNeed) {
3568     nNeed -= added;
3569     if (nNeed < 0)
3570       nNeed = 0;
3571   }
3572 #endif
3573   if (nNeed <= 0)
3574     return added;
3575 
3576   // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3577   // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3578   // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3579   // > __kmp_max_nth in one of two ways:
3580   //
3581   // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
3582   //    may not be reused by another thread, so we may need to increase
3583   //    __kmp_threads_capacity to __kmp_max_nth + 1.
3584   //
3585   // 2) New foreign root(s) are encountered.  We always register new foreign
3586   //    roots. This may cause a smaller # of threads to be allocated at
3587   //    subsequent parallel regions, but the worker threads hang around (and
3588   //    eventually go to sleep) and need slots in the __kmp_threads[] array.
3589   //
3590   // Anyway, that is the reason for moving the check to see if
3591   // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3592   // instead of having it performed here. -BB
3593 
3594   KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3595 
3596   /* compute expansion headroom to check if we can expand */
3597   if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3598     /* possible expansion too small -- give up */
3599     return added;
3600   }
3601   minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3602 
3603   newCapacity = __kmp_threads_capacity;
3604   do {
3605     newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3606                                                           : __kmp_sys_max_nth;
3607   } while (newCapacity < minimumRequiredCapacity);
3608   newThreads = (kmp_info_t **)__kmp_allocate(
3609       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3610   newRoot =
3611       (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3612   KMP_MEMCPY(newThreads, __kmp_threads,
3613              __kmp_threads_capacity * sizeof(kmp_info_t *));
3614   KMP_MEMCPY(newRoot, __kmp_root,
3615              __kmp_threads_capacity * sizeof(kmp_root_t *));
3616 
3617   kmp_info_t **temp_threads = __kmp_threads;
3618   *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3619   *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3620   __kmp_free(temp_threads);
3621   added += newCapacity - __kmp_threads_capacity;
3622   *(volatile int *)&__kmp_threads_capacity = newCapacity;
3623 
3624   if (newCapacity > __kmp_tp_capacity) {
3625     __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3626     if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3627       __kmp_threadprivate_resize_cache(newCapacity);
3628     } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3629       *(volatile int *)&__kmp_tp_capacity = newCapacity;
3630     }
3631     __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3632   }
3633 
3634   return added;
3635 }
3636 
3637 /* Register the current thread as a root thread and obtain our gtid. We must
3638    have the __kmp_initz_lock held at this point. Argument TRUE only if are the
3639    thread that calls from __kmp_do_serial_initialize() */
3640 int __kmp_register_root(int initial_thread) {
3641   kmp_info_t *root_thread;
3642   kmp_root_t *root;
3643   int gtid;
3644   int capacity;
3645   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3646   KA_TRACE(20, ("__kmp_register_root: entered\n"));
3647   KMP_MB();
3648 
3649   /* 2007-03-02:
3650      If initial thread did not invoke OpenMP RTL yet, and this thread is not an
3651      initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not
3652      work as expected -- it may return false (that means there is at least one
3653      empty slot in __kmp_threads array), but it is possible the only free slot
3654      is #0, which is reserved for initial thread and so cannot be used for this
3655      one. Following code workarounds this bug.
3656 
3657      However, right solution seems to be not reserving slot #0 for initial
3658      thread because:
3659      (1) there is no magic in slot #0,
3660      (2) we cannot detect initial thread reliably (the first thread which does
3661         serial initialization may be not a real initial thread).
3662   */
3663   capacity = __kmp_threads_capacity;
3664   if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3665     --capacity;
3666   }
3667 
  // If we are not initializing the hidden helper team, we need to subtract
  // __kmp_hidden_helper_threads_num from the capacity because it is included
  // in __kmp_threads_capacity.
3671   if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
3672     capacity -= __kmp_hidden_helper_threads_num;
3673   }
3674 
3675   /* see if there are too many threads */
3676   if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3677     if (__kmp_tp_cached) {
3678       __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3679                   KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3680                   KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3681     } else {
3682       __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3683                   __kmp_msg_null);
3684     }
3685   }
3686 
3687   // When hidden helper task is enabled, __kmp_threads is organized as follows:
3688   // 0: initial thread, also a regular OpenMP thread.
3689   // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3690   // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3691   // regular OpenMP threads.
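  // For example, with __kmp_hidden_helper_threads_num == 8 the hidden helper
  // threads occupy gtids 1..8 and regular (non-initial) roots are assigned
  // gtids starting at 9, which is what the two search loops below implement.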
3692   if (TCR_4(__kmp_init_hidden_helper_threads)) {
3693     // Find an available thread slot for hidden helper thread. Slots for hidden
3694     // helper threads start from 1 to __kmp_hidden_helper_threads_num.
3695     for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3696                    gtid <= __kmp_hidden_helper_threads_num;
3697          gtid++)
3698       ;
3699     KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3700     KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3701                  "hidden helper thread: T#%d\n",
3702                  gtid));
3703   } else {
3704     /* find an available thread slot */
    // Don't reassign the zero slot since it must only be used by the initial
    // thread. Slots for hidden helper threads should also be skipped.
3707     if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3708       gtid = 0;
3709     } else {
3710       for (gtid = __kmp_hidden_helper_threads_num + 1;
3711            TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3712         ;
3713     }
3714     KA_TRACE(
3715         1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3716     KMP_ASSERT(gtid < __kmp_threads_capacity);
3717   }
3718 
3719   /* update global accounting */
3720   __kmp_all_nth++;
3721   TCW_4(__kmp_nth, __kmp_nth + 1);
3722 
3723   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3724   // numbers of procs, and method #2 (keyed API call) for higher numbers.
3725   if (__kmp_adjust_gtid_mode) {
3726     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3727       if (TCR_4(__kmp_gtid_mode) != 2) {
3728         TCW_4(__kmp_gtid_mode, 2);
3729       }
3730     } else {
3731       if (TCR_4(__kmp_gtid_mode) != 1) {
3732         TCW_4(__kmp_gtid_mode, 1);
3733       }
3734     }
3735   }
3736 
3737 #ifdef KMP_ADJUST_BLOCKTIME
3738   /* Adjust blocktime to zero if necessary            */
3739   /* Middle initialization might not have occurred yet */
3740   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3741     if (__kmp_nth > __kmp_avail_proc) {
3742       __kmp_zero_bt = TRUE;
3743     }
3744   }
3745 #endif /* KMP_ADJUST_BLOCKTIME */
3746 
3747   /* setup this new hierarchy */
3748   if (!(root = __kmp_root[gtid])) {
3749     root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3750     KMP_DEBUG_ASSERT(!root->r.r_root_team);
3751   }
3752 
3753 #if KMP_STATS_ENABLED
3754   // Initialize stats as soon as possible (right after gtid assignment).
3755   __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3756   __kmp_stats_thread_ptr->startLife();
3757   KMP_SET_THREAD_STATE(SERIAL_REGION);
3758   KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3759 #endif
3760   __kmp_initialize_root(root);
3761 
3762   /* setup new root thread structure */
3763   if (root->r.r_uber_thread) {
3764     root_thread = root->r.r_uber_thread;
3765   } else {
3766     root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3767     if (__kmp_storage_map) {
3768       __kmp_print_thread_storage_map(root_thread, gtid);
3769     }
3770     root_thread->th.th_info.ds.ds_gtid = gtid;
3771 #if OMPT_SUPPORT
3772     root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3773 #endif
3774     root_thread->th.th_root = root;
3775     if (__kmp_env_consistency_check) {
3776       root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3777     }
3778 #if USE_FAST_MEMORY
3779     __kmp_initialize_fast_memory(root_thread);
3780 #endif /* USE_FAST_MEMORY */
3781 
3782 #if KMP_USE_BGET
3783     KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3784     __kmp_initialize_bget(root_thread);
3785 #endif
3786     __kmp_init_random(root_thread); // Initialize random number generator
3787   }
3788 
3789   /* setup the serial team held in reserve by the root thread */
3790   if (!root_thread->th.th_serial_team) {
3791     kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3792     KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3793     root_thread->th.th_serial_team = __kmp_allocate_team(
3794         root, 1, 1,
3795 #if OMPT_SUPPORT
3796         ompt_data_none, // root parallel id
3797 #endif
3798         proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3799   }
3800   KMP_ASSERT(root_thread->th.th_serial_team);
3801   KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3802                 root_thread->th.th_serial_team));
3803 
3804   /* drop root_thread into place */
3805   TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3806 
3807   root->r.r_root_team->t.t_threads[0] = root_thread;
3808   root->r.r_hot_team->t.t_threads[0] = root_thread;
3809   root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
  // AC: the team is created in reserve, not for execution (it is unused for
  // now).
3811   root_thread->th.th_serial_team->t.t_serialized = 0;
3812   root->r.r_uber_thread = root_thread;
3813 
3814   /* initialize the thread, get it ready to go */
3815   __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3816   TCW_4(__kmp_init_gtid, TRUE);
3817 
3818   /* prepare the primary thread for get_gtid() */
3819   __kmp_gtid_set_specific(gtid);
3820 
3821 #if USE_ITT_BUILD
3822   __kmp_itt_thread_name(gtid);
3823 #endif /* USE_ITT_BUILD */
3824 
3825 #ifdef KMP_TDATA_GTID
3826   __kmp_gtid = gtid;
3827 #endif
3828   __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3829   KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3830 
3831   KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3832                 "plain=%u\n",
3833                 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3834                 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3835                 KMP_INIT_BARRIER_STATE));
3836   { // Initialize barrier data.
3837     int b;
3838     for (b = 0; b < bs_last_barrier; ++b) {
3839       root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3840 #if USE_DEBUGGER
3841       root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3842 #endif
3843     }
3844   }
3845   KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3846                    KMP_INIT_BARRIER_STATE);
3847 
3848 #if KMP_AFFINITY_SUPPORTED
3849   root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3850   root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3851   root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3852   root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3853 #endif /* KMP_AFFINITY_SUPPORTED */
3854   root_thread->th.th_def_allocator = __kmp_def_allocator;
3855   root_thread->th.th_prev_level = 0;
3856   root_thread->th.th_prev_num_threads = 1;
3857 
3858   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3859   tmp->cg_root = root_thread;
3860   tmp->cg_thread_limit = __kmp_cg_max_nth;
3861   tmp->cg_nthreads = 1;
3862   KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3863                  " cg_nthreads init to 1\n",
3864                  root_thread, tmp));
3865   tmp->up = NULL;
3866   root_thread->th.th_cg_roots = tmp;
3867 
3868   __kmp_root_counter++;
3869 
3870 #if OMPT_SUPPORT
3871   if (!initial_thread && ompt_enabled.enabled) {
3872 
3873     kmp_info_t *root_thread = ompt_get_thread();
3874 
3875     ompt_set_thread_state(root_thread, ompt_state_overhead);
3876 
3877     if (ompt_enabled.ompt_callback_thread_begin) {
3878       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3879           ompt_thread_initial, __ompt_get_thread_data_internal());
3880     }
3881     ompt_data_t *task_data;
3882     ompt_data_t *parallel_data;
3883     __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
3884                                   NULL);
3885     if (ompt_enabled.ompt_callback_implicit_task) {
3886       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3887           ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3888     }
3889 
3890     ompt_set_thread_state(root_thread, ompt_state_work_serial);
3891   }
3892 #endif
3893 #if OMPD_SUPPORT
3894   if (ompd_state & OMPD_ENABLE_BP)
3895     ompd_bp_thread_begin();
3896 #endif
3897 
3898   KMP_MB();
3899   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3900 
3901   return gtid;
3902 }
3903 
3904 #if KMP_NESTED_HOT_TEAMS
3905 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3906                                 const int max_level) {
3907   int i, n, nth;
3908   kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3909   if (!hot_teams || !hot_teams[level].hot_team) {
3910     return 0;
3911   }
3912   KMP_DEBUG_ASSERT(level < max_level);
3913   kmp_team_t *team = hot_teams[level].hot_team;
3914   nth = hot_teams[level].hot_team_nth;
3915   n = nth - 1; // primary thread is not freed
3916   if (level < max_level - 1) {
3917     for (i = 0; i < nth; ++i) {
3918       kmp_info_t *th = team->t.t_threads[i];
3919       n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3920       if (i > 0 && th->th.th_hot_teams) {
3921         __kmp_free(th->th.th_hot_teams);
3922         th->th.th_hot_teams = NULL;
3923       }
3924     }
3925   }
3926   __kmp_free_team(root, team, NULL);
3927   return n;
3928 }
3929 #endif
3930 
// Resets a root thread and clears its root and hot teams.
3932 // Returns the number of __kmp_threads entries directly and indirectly freed.
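// Teardown order below: free the root team, then any nested hot teams, then
// the hot team itself; wait for task teams to be unreferenced when tasking is
// enabled; finally reap the uber thread, since it cannot be returned to the
// thread pool.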
3933 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3934   kmp_team_t *root_team = root->r.r_root_team;
3935   kmp_team_t *hot_team = root->r.r_hot_team;
3936   int n = hot_team->t.t_nproc;
3937   int i;
3938 
3939   KMP_DEBUG_ASSERT(!root->r.r_active);
3940 
3941   root->r.r_root_team = NULL;
3942   root->r.r_hot_team = NULL;
3943   // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
  // before the call to __kmp_free_team().
3945   __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3946 #if KMP_NESTED_HOT_TEAMS
3947   if (__kmp_hot_teams_max_level >
3948       0) { // need to free nested hot teams and their threads if any
3949     for (i = 0; i < hot_team->t.t_nproc; ++i) {
3950       kmp_info_t *th = hot_team->t.t_threads[i];
3951       if (__kmp_hot_teams_max_level > 1) {
3952         n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3953       }
3954       if (th->th.th_hot_teams) {
3955         __kmp_free(th->th.th_hot_teams);
3956         th->th.th_hot_teams = NULL;
3957       }
3958     }
3959   }
3960 #endif
3961   __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3962 
3963   // Before we can reap the thread, we need to make certain that all other
3964   // threads in the teams that had this root as ancestor have stopped trying to
3965   // steal tasks.
3966   if (__kmp_tasking_mode != tskm_immediate_exec) {
3967     __kmp_wait_to_unref_task_teams();
3968   }
3969 
3970 #if KMP_OS_WINDOWS
3971   /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3972   KA_TRACE(
3973       10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3974            "\n",
3975            (LPVOID) & (root->r.r_uber_thread->th),
3976            root->r.r_uber_thread->th.th_info.ds.ds_thread));
3977   __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3978 #endif /* KMP_OS_WINDOWS */
3979 
3980 #if OMPD_SUPPORT
3981   if (ompd_state & OMPD_ENABLE_BP)
3982     ompd_bp_thread_end();
3983 #endif
3984 
3985 #if OMPT_SUPPORT
3986   ompt_data_t *task_data;
3987   ompt_data_t *parallel_data;
3988   __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
3989                                 NULL);
3990   if (ompt_enabled.ompt_callback_implicit_task) {
3991     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3992         ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
3993   }
3994   if (ompt_enabled.ompt_callback_thread_end) {
3995     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3996         &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3997   }
3998 #endif
3999 
4000   TCW_4(__kmp_nth,
4001         __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
4002   i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
4003   KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
4004                  " to %d\n",
4005                  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
4006                  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
4007   if (i == 1) {
4008     // need to free contention group structure
4009     KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
4010                      root->r.r_uber_thread->th.th_cg_roots->cg_root);
4011     KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
4012     __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
4013     root->r.r_uber_thread->th.th_cg_roots = NULL;
4014   }
4015   __kmp_reap_thread(root->r.r_uber_thread, 1);
4016 
  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
  // it instead of freeing it.
4019   root->r.r_uber_thread = NULL;
4020   /* mark root as no longer in use */
4021   root->r.r_begin = FALSE;
4022 
4023   return n;
4024 }
4025 
4026 void __kmp_unregister_root_current_thread(int gtid) {
4027   KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
  /* This lock should be OK, since unregister_root_current_thread is never
     called during an abort, only during a normal close. Furthermore, if you
     hold the forkjoin lock, you should never try to get the initz lock. */
4031   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
4032   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
4033     KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
4034                   "exiting T#%d\n",
4035                   gtid));
4036     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4037     return;
4038   }
4039   kmp_root_t *root = __kmp_root[gtid];
4040 
4041   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4042   KMP_ASSERT(KMP_UBER_GTID(gtid));
4043   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4044   KMP_ASSERT(root->r.r_active == FALSE);
4045 
4046   KMP_MB();
4047 
4048   kmp_info_t *thread = __kmp_threads[gtid];
4049   kmp_team_t *team = thread->th.th_team;
4050   kmp_task_team_t *task_team = thread->th.th_task_team;
4051 
4052   // we need to wait for the proxy tasks before finishing the thread
4053   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
4054 #if OMPT_SUPPORT
4055     // the runtime is shutting down so we won't report any events
4056     thread->th.ompt_thread_info.state = ompt_state_undefined;
4057 #endif
4058     __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4059   }
4060 
4061   __kmp_reset_root(gtid, root);
4062 
4063   KMP_MB();
4064   KC_TRACE(10,
4065            ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4066 
4067   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4068 }
4069 
4070 #if KMP_OS_WINDOWS
4071 /* __kmp_forkjoin_lock must already be held.
4072    Unregisters a root thread that is not the current thread. Returns the number
4073    of __kmp_threads entries freed as a result. */
4074 static int __kmp_unregister_root_other_thread(int gtid) {
4075   kmp_root_t *root = __kmp_root[gtid];
4076   int r;
4077 
4078   KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4079   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4080   KMP_ASSERT(KMP_UBER_GTID(gtid));
4081   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4082   KMP_ASSERT(root->r.r_active == FALSE);
4083 
4084   r = __kmp_reset_root(gtid, root);
4085   KC_TRACE(10,
4086            ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4087   return r;
4088 }
4089 #endif
4090 
4091 #if KMP_DEBUG
4092 void __kmp_task_info() {
4093 
4094   kmp_int32 gtid = __kmp_entry_gtid();
4095   kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4096   kmp_info_t *this_thr = __kmp_threads[gtid];
4097   kmp_team_t *steam = this_thr->th.th_serial_team;
4098   kmp_team_t *team = this_thr->th.th_team;
4099 
4100   __kmp_printf(
4101       "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4102       "ptask=%p\n",
4103       gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4104       team->t.t_implicit_task_taskdata[tid].td_parent);
4105 }
4106 #endif // KMP_DEBUG
4107 
4108 /* TODO optimize with one big memclr, take out what isn't needed, split
4109    responsibility to workers as much as possible, and delay initialization of
4110    features as much as possible  */
4111 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4112                                   int tid, int gtid) {
4113   /* this_thr->th.th_info.ds.ds_gtid is setup in
4114      kmp_allocate_thread/create_worker.
4115      this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4116   KMP_DEBUG_ASSERT(this_thr != NULL);
4117   KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4118   KMP_DEBUG_ASSERT(team);
4119   KMP_DEBUG_ASSERT(team->t.t_threads);
4120   KMP_DEBUG_ASSERT(team->t.t_dispatch);
4121   kmp_info_t *master = team->t.t_threads[0];
4122   KMP_DEBUG_ASSERT(master);
4123   KMP_DEBUG_ASSERT(master->th.th_root);
4124 
4125   KMP_MB();
4126 
4127   TCW_SYNC_PTR(this_thr->th.th_team, team);
4128 
4129   this_thr->th.th_info.ds.ds_tid = tid;
4130   this_thr->th.th_set_nproc = 0;
4131   if (__kmp_tasking_mode != tskm_immediate_exec)
4132     // When tasking is possible, threads are not safe to reap until they are
4133     // done tasking; this will be set when tasking code is exited in wait
4134     this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4135   else // no tasking --> always safe to reap
4136     this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4137   this_thr->th.th_set_proc_bind = proc_bind_default;
4138 #if KMP_AFFINITY_SUPPORTED
4139   this_thr->th.th_new_place = this_thr->th.th_current_place;
4140 #endif
4141   this_thr->th.th_root = master->th.th_root;
4142 
4143   /* setup the thread's cache of the team structure */
4144   this_thr->th.th_team_nproc = team->t.t_nproc;
4145   this_thr->th.th_team_master = master;
4146   this_thr->th.th_team_serialized = team->t.t_serialized;
4147 
4148   KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4149 
4150   KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4151                 tid, gtid, this_thr, this_thr->th.th_current_task));
4152 
4153   __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4154                            team, tid, TRUE);
4155 
4156   KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4157                 tid, gtid, this_thr, this_thr->th.th_current_task));
4158   // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4159   // __kmp_initialize_team()?
4160 
4161   /* TODO no worksharing in speculative threads */
4162   this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4163 
4164   this_thr->th.th_local.this_construct = 0;
4165 
4166   if (!this_thr->th.th_pri_common) {
4167     this_thr->th.th_pri_common =
4168         (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4169     if (__kmp_storage_map) {
4170       __kmp_print_storage_map_gtid(
4171           gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4172           sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4173     }
4174     this_thr->th.th_pri_head = NULL;
4175   }
4176 
4177   if (this_thr != master && // Primary thread's CG root is initialized elsewhere
4178       this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4179     // Make new thread's CG root same as primary thread's
4180     KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4181     kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4182     if (tmp) {
4183       // worker changes CG, need to check if old CG should be freed
4184       int i = tmp->cg_nthreads--;
4185       KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4186                      " on node %p of thread %p to %d\n",
4187                      this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4188       if (i == 1) {
4189         __kmp_free(tmp); // last thread left CG --> free it
4190       }
4191     }
4192     this_thr->th.th_cg_roots = master->th.th_cg_roots;
4193     // Increment new thread's CG root's counter to add the new thread
4194     this_thr->th.th_cg_roots->cg_nthreads++;
4195     KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4196                    " node %p of thread %p to %d\n",
4197                    this_thr, this_thr->th.th_cg_roots,
4198                    this_thr->th.th_cg_roots->cg_root,
4199                    this_thr->th.th_cg_roots->cg_nthreads));
4200     this_thr->th.th_current_task->td_icvs.thread_limit =
4201         this_thr->th.th_cg_roots->cg_thread_limit;
4202   }
4203 
4204   /* Initialize dynamic dispatch */
4205   {
4206     volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4207     // Use team max_nproc since this will never change for the team.
4208     size_t disp_size =
4209         sizeof(dispatch_private_info_t) *
4210         (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
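    // A serialized team (max_nproc == 1) needs only a single dispatch buffer;
    // otherwise __kmp_dispatch_num_buffers buffers are allocated so that threads
    // can be in different consecutive dynamically scheduled loops (e.g. nowait
    // loops) at the same time without sharing one dispatch buffer.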
4211     KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4212                   team->t.t_max_nproc));
4213     KMP_ASSERT(dispatch);
4214     KMP_DEBUG_ASSERT(team->t.t_dispatch);
4215     KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4216 
4217     dispatch->th_disp_index = 0;
4218     dispatch->th_doacross_buf_idx = 0;
4219     if (!dispatch->th_disp_buffer) {
4220       dispatch->th_disp_buffer =
4221           (dispatch_private_info_t *)__kmp_allocate(disp_size);
4222 
4223       if (__kmp_storage_map) {
4224         __kmp_print_storage_map_gtid(
4225             gtid, &dispatch->th_disp_buffer[0],
4226             &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4227                                           ? 1
4228                                           : __kmp_dispatch_num_buffers],
4229             disp_size,
4230             "th_%d.th_dispatch.th_disp_buffer "
4231             "(team_%d.t_dispatch[%d].th_disp_buffer)",
4232             gtid, team->t.t_id, gtid);
4233       }
4234     } else {
4235       memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4236     }
4237 
4238     dispatch->th_dispatch_pr_current = 0;
4239     dispatch->th_dispatch_sh_current = 0;
4240 
4241     dispatch->th_deo_fcn = 0; /* ORDERED     */
4242     dispatch->th_dxo_fcn = 0; /* END ORDERED */
4243   }
4244 
4245   this_thr->th.th_next_pool = NULL;
4246 
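  // The task-state memo stack records th_task_state per nesting level so it can
  // be restored as nested regions complete; it starts with room for 4 levels.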
4247   if (!this_thr->th.th_task_state_memo_stack) {
4248     size_t i;
4249     this_thr->th.th_task_state_memo_stack =
4250         (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4251     this_thr->th.th_task_state_top = 0;
4252     this_thr->th.th_task_state_stack_sz = 4;
4253     for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4254          ++i) // zero init the stack
4255       this_thr->th.th_task_state_memo_stack[i] = 0;
4256   }
4257 
4258   KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4259   KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4260 
4261   KMP_MB();
4262 }
4263 
4264 /* Allocate a new thread for the requesting team. This is only called from
4265    within a forkjoin critical section. We will first try to get an available
4266    thread from the thread pool. If none is available, we will fork a new one,
4267    assuming we are able to create one; this should be assured, as the caller
4268    should have checked on this first. */
4269 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4270                                   int new_tid) {
4271   kmp_team_t *serial_team;
4272   kmp_info_t *new_thr;
4273   int new_gtid;
4274 
4275   KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4276   KMP_DEBUG_ASSERT(root && team);
4277 #if !KMP_NESTED_HOT_TEAMS
4278   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4279 #endif
4280   KMP_MB();
4281 
4282   /* first, try to get one from the thread pool */
4283   if (__kmp_thread_pool) {
4284     new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4285     __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4286     if (new_thr == __kmp_thread_pool_insert_pt) {
4287       __kmp_thread_pool_insert_pt = NULL;
4288     }
4289     TCW_4(new_thr->th.th_in_pool, FALSE);
4290     __kmp_suspend_initialize_thread(new_thr);
4291     __kmp_lock_suspend_mx(new_thr);
4292     if (new_thr->th.th_active_in_pool == TRUE) {
4293       KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4294       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4295       new_thr->th.th_active_in_pool = FALSE;
4296     }
4297     __kmp_unlock_suspend_mx(new_thr);
4298 
4299     KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4300                   __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4301     KMP_ASSERT(!new_thr->th.th_team);
4302     KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4303 
4304     /* setup the thread structure */
4305     __kmp_initialize_info(new_thr, team, new_tid,
4306                           new_thr->th.th_info.ds.ds_gtid);
4307     KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4308 
4309     TCW_4(__kmp_nth, __kmp_nth + 1);
4310 
4311     new_thr->th.th_task_state = 0;
4312     new_thr->th.th_task_state_top = 0;
4313     new_thr->th.th_task_state_stack_sz = 4;
4314 
4315     if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
4316       // Make sure pool thread has transitioned to waiting on own thread struct
4317       KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0);
4318       // Thread activated in __kmp_allocate_team when increasing team size
4319     }
4320 
4321 #ifdef KMP_ADJUST_BLOCKTIME
4322     /* Adjust blocktime back to zero if necessary */
4323     /* Middle initialization might not have occurred yet */
4324     if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4325       if (__kmp_nth > __kmp_avail_proc) {
4326         __kmp_zero_bt = TRUE;
4327       }
4328     }
4329 #endif /* KMP_ADJUST_BLOCKTIME */
4330 
4331 #if KMP_DEBUG
4332     // If thread entered pool via __kmp_free_thread, wait_flag should !=
4333     // KMP_BARRIER_PARENT_FLAG.
4334     int b;
4335     kmp_balign_t *balign = new_thr->th.th_bar;
4336     for (b = 0; b < bs_last_barrier; ++b)
4337       KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4338 #endif
4339 
4340     KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4341                   __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4342 
4343     KMP_MB();
4344     return new_thr;
4345   }
4346 
4347   /* no thread available in the pool, so we'll fork a new one */
4348   KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4349   KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4350 
4351 #if KMP_USE_MONITOR
4352   // If this is the first worker thread the RTL is creating, then also
4353   // launch the monitor thread.  We try to do this as early as possible.
4354   if (!TCR_4(__kmp_init_monitor)) {
4355     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4356     if (!TCR_4(__kmp_init_monitor)) {
4357       KF_TRACE(10, ("before __kmp_create_monitor\n"));
4358       TCW_4(__kmp_init_monitor, 1);
4359       __kmp_create_monitor(&__kmp_monitor);
4360       KF_TRACE(10, ("after __kmp_create_monitor\n"));
4361 #if KMP_OS_WINDOWS
4362       // AC: wait until the monitor has started. This is a fix for CQ232808.
4363       // The reason is that if the library is loaded/unloaded in a loop with
4364       // small (parallel) work in between, then there is a high probability that
4365       // the monitor thread starts after the library shutdown. At shutdown it is
4366       // too late to cope with the problem, because when the primary thread is
4367       // in DllMain (process detach) the monitor has no chance to start (it is
4368       // blocked), and the primary thread has no means to inform the monitor that
4369       // the library has gone, because all the memory which the monitor can
4370       // access is going to be released/reset.
4371       while (TCR_4(__kmp_init_monitor) < 2) {
4372         KMP_YIELD(TRUE);
4373       }
4374       KF_TRACE(10, ("after monitor thread has started\n"));
4375 #endif
4376     }
4377     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4378   }
4379 #endif
4380 
4381   KMP_MB();
4382 
4383   {
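    // Hidden helper threads occupy the low gtids
    // (1..__kmp_hidden_helper_threads_num), so regular workers are assigned
    // gtids above that range; while the hidden helpers themselves are being
    // created, the search starts at gtid 1.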
4384     int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4385                              ? 1
4386                              : __kmp_hidden_helper_threads_num + 1;
4387 
4388     for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4389          ++new_gtid) {
4390       KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4391     }
4392 
4393     if (TCR_4(__kmp_init_hidden_helper_threads)) {
4394       KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4395     }
4396   }
4397 
4398   /* allocate space for it. */
4399   new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4400 
4401   TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4402 
4403 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4404   // Suppress race condition detection on synchronization flags in debug mode;
4405   // this helps to analyze library internals by eliminating false positives.
4406   __itt_suppress_mark_range(
4407       __itt_suppress_range, __itt_suppress_threading_errors,
4408       &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4409   __itt_suppress_mark_range(
4410       __itt_suppress_range, __itt_suppress_threading_errors,
4411       &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4412 #if KMP_OS_WINDOWS
4413   __itt_suppress_mark_range(
4414       __itt_suppress_range, __itt_suppress_threading_errors,
4415       &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4416 #else
4417   __itt_suppress_mark_range(__itt_suppress_range,
4418                             __itt_suppress_threading_errors,
4419                             &new_thr->th.th_suspend_init_count,
4420                             sizeof(new_thr->th.th_suspend_init_count));
4421 #endif
4422   // TODO: check if we need to also suppress b_arrived flags
4423   __itt_suppress_mark_range(__itt_suppress_range,
4424                             __itt_suppress_threading_errors,
4425                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4426                             sizeof(new_thr->th.th_bar[0].bb.b_go));
4427   __itt_suppress_mark_range(__itt_suppress_range,
4428                             __itt_suppress_threading_errors,
4429                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4430                             sizeof(new_thr->th.th_bar[1].bb.b_go));
4431   __itt_suppress_mark_range(__itt_suppress_range,
4432                             __itt_suppress_threading_errors,
4433                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4434                             sizeof(new_thr->th.th_bar[2].bb.b_go));
4435 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4436   if (__kmp_storage_map) {
4437     __kmp_print_thread_storage_map(new_thr, new_gtid);
4438   }
4439 
4440   // add the reserve serialized team, initialized from the team's primary thread
4441   {
4442     kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4443     KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4444     new_thr->th.th_serial_team = serial_team =
4445         (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4446 #if OMPT_SUPPORT
4447                                           ompt_data_none, // root parallel id
4448 #endif
4449                                           proc_bind_default, &r_icvs,
4450                                           0 USE_NESTED_HOT_ARG(NULL));
4451   }
4452   KMP_ASSERT(serial_team);
4453   serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for
4454   // execution (it is unused for now).
4455   serial_team->t.t_threads[0] = new_thr;
4456   KF_TRACE(10,
4457            ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4458             new_thr));
4459 
4460   /* setup the thread structures */
4461   __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4462 
4463 #if USE_FAST_MEMORY
4464   __kmp_initialize_fast_memory(new_thr);
4465 #endif /* USE_FAST_MEMORY */
4466 
4467 #if KMP_USE_BGET
4468   KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4469   __kmp_initialize_bget(new_thr);
4470 #endif
4471 
4472   __kmp_init_random(new_thr); // Initialize random number generator
4473 
4474   /* Initialize these only once when thread is grabbed for a team allocation */
4475   KA_TRACE(20,
4476            ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4477             __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4478 
4479   int b;
4480   kmp_balign_t *balign = new_thr->th.th_bar;
4481   for (b = 0; b < bs_last_barrier; ++b) {
4482     balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4483     balign[b].bb.team = NULL;
4484     balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4485     balign[b].bb.use_oncore_barrier = 0;
4486   }
4487 
4488   TCW_PTR(new_thr->th.th_sleep_loc, NULL);
4489   new_thr->th.th_sleep_loc_type = flag_unset;
4490 
4491   new_thr->th.th_spin_here = FALSE;
4492   new_thr->th.th_next_waiting = 0;
4493 #if KMP_OS_UNIX
4494   new_thr->th.th_blocking = false;
4495 #endif
4496 
4497 #if KMP_AFFINITY_SUPPORTED
4498   new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4499   new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4500   new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4501   new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4502 #endif
4503   new_thr->th.th_def_allocator = __kmp_def_allocator;
4504   new_thr->th.th_prev_level = 0;
4505   new_thr->th.th_prev_num_threads = 1;
4506 
4507   TCW_4(new_thr->th.th_in_pool, FALSE);
4508   new_thr->th.th_active_in_pool = FALSE;
4509   TCW_4(new_thr->th.th_active, TRUE);
4510 
4511   /* adjust the global counters */
4512   __kmp_all_nth++;
4513   __kmp_nth++;
4514 
4515   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4516   // numbers of procs, and method #2 (keyed API call) for higher numbers.
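  // ("sp search" locates the gtid by finding which thread's stack contains the
  //  current stack pointer; the keyed API reads it from thread-local storage.)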
4517   if (__kmp_adjust_gtid_mode) {
4518     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4519       if (TCR_4(__kmp_gtid_mode) != 2) {
4520         TCW_4(__kmp_gtid_mode, 2);
4521       }
4522     } else {
4523       if (TCR_4(__kmp_gtid_mode) != 1) {
4524         TCW_4(__kmp_gtid_mode, 1);
4525       }
4526     }
4527   }
4528 
4529 #ifdef KMP_ADJUST_BLOCKTIME
4530   /* Adjust blocktime back to zero if necessary       */
4531   /* Middle initialization might not have occurred yet */
4532   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4533     if (__kmp_nth > __kmp_avail_proc) {
4534       __kmp_zero_bt = TRUE;
4535     }
4536   }
4537 #endif /* KMP_ADJUST_BLOCKTIME */
4538 
4539   /* actually fork it and create the new worker thread */
4540   KF_TRACE(
4541       10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4542   __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4543   KF_TRACE(10,
4544            ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4545 
4546   KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4547                 new_gtid));
4548   KMP_MB();
4549   return new_thr;
4550 }
4551 
4552 /* Reinitialize team for reuse.
4553    The hot team code calls this routine at every fork barrier, so EPCC barrier
4554    tests are extremely sensitive to changes in it, esp. writes to the team
4555    struct, which cause a cache invalidation in all threads.
4556    IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4557 static void __kmp_reinitialize_team(kmp_team_t *team,
4558                                     kmp_internal_control_t *new_icvs,
4559                                     ident_t *loc) {
4560   KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4561                 team->t.t_threads[0], team));
4562   KMP_DEBUG_ASSERT(team && new_icvs);
4563   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4564   KMP_CHECK_UPDATE(team->t.t_ident, loc);
4565 
4566   KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4567   // Copy ICVs to the primary thread's implicit taskdata
4568   __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4569   copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4570 
4571   KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4572                 team->t.t_threads[0], team));
4573 }
4574 
4575 /* Initialize the team data structure.
4576    This assumes the t_threads and t_max_nproc are already set.
4577    Also, we don't touch the arguments */
4578 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4579                                   kmp_internal_control_t *new_icvs,
4580                                   ident_t *loc) {
4581   KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4582 
4583   /* verify */
4584   KMP_DEBUG_ASSERT(team);
4585   KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4586   KMP_DEBUG_ASSERT(team->t.t_threads);
4587   KMP_MB();
4588 
4589   team->t.t_master_tid = 0; /* not needed */
4590   /* team->t.t_master_bar;        not needed */
4591   team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4592   team->t.t_nproc = new_nproc;
4593 
4594   /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
4595   team->t.t_next_pool = NULL;
4596   /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4597    * up hot team */
4598 
4599   TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4600   team->t.t_invoke = NULL; /* not needed */
4601 
4602   // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
4603   team->t.t_sched.sched = new_icvs->sched.sched;
4604 
4605 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4606   team->t.t_fp_control_saved = FALSE; /* not needed */
4607   team->t.t_x87_fpu_control_word = 0; /* not needed */
4608   team->t.t_mxcsr = 0; /* not needed */
4609 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4610 
4611   team->t.t_construct = 0;
4612 
4613   team->t.t_ordered.dt.t_value = 0;
4614   team->t.t_master_active = FALSE;
4615 
4616 #ifdef KMP_DEBUG
4617   team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4618 #endif
4619 #if KMP_OS_WINDOWS
4620   team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4621 #endif
4622 
4623   team->t.t_control_stack_top = NULL;
4624 
4625   __kmp_reinitialize_team(team, new_icvs, loc);
4626 
4627   KMP_MB();
4628   KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4629 }
4630 
4631 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
4632 /* Sets full mask for thread and returns old mask, no changes to structures. */
4633 static void
4634 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4635   if (KMP_AFFINITY_CAPABLE()) {
4636     int status;
4637     if (old_mask != NULL) {
4638       status = __kmp_get_system_affinity(old_mask, TRUE);
4639       int error = errno;
4640       if (status != 0) {
4641         __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4642                     __kmp_msg_null);
4643       }
4644     }
4645     __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4646   }
4647 }
4648 #endif
4649 
4650 #if KMP_AFFINITY_SUPPORTED
4651 
4652 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4653 // It calculates the worker + primary thread's partition based upon the parent
4654 // thread's partition, and binds each worker to a thread in their partition.
4655 // The primary thread's partition should already include its current binding.
4656 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4657   // Do not partition places for the hidden helper team
4658   if (KMP_HIDDEN_HELPER_TEAM(team))
4659     return;
4660   // Copy the primary thread's place partition to the team struct
4661   kmp_info_t *master_th = team->t.t_threads[0];
4662   KMP_DEBUG_ASSERT(master_th != NULL);
4663   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4664   int first_place = master_th->th.th_first_place;
4665   int last_place = master_th->th.th_last_place;
4666   int masters_place = master_th->th.th_current_place;
4667   team->t.t_first_place = first_place;
4668   team->t.t_last_place = last_place;
4669 
4670   KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4671                 "bound to place %d partition = [%d,%d]\n",
4672                 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4673                 team->t.t_id, masters_place, first_place, last_place));
4674 
4675   switch (proc_bind) {
4676 
4677   case proc_bind_default:
4678     // Serial teams might have the proc_bind policy set to proc_bind_default.
4679     // Not an issue -- we don't rebind primary thread for any proc_bind policy.
4680     KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4681     break;
4682 
4683   case proc_bind_primary: {
4684     int f;
4685     int n_th = team->t.t_nproc;
4686     for (f = 1; f < n_th; f++) {
4687       kmp_info_t *th = team->t.t_threads[f];
4688       KMP_DEBUG_ASSERT(th != NULL);
4689       th->th.th_first_place = first_place;
4690       th->th.th_last_place = last_place;
4691       th->th.th_new_place = masters_place;
4692       if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4693           team->t.t_display_affinity != 1) {
4694         team->t.t_display_affinity = 1;
4695       }
4696 
4697       KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
4698                      "partition = [%d,%d]\n",
4699                      __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4700                      f, masters_place, first_place, last_place));
4701     }
4702   } break;
4703 
4704   case proc_bind_close: {
4705     int f;
4706     int n_th = team->t.t_nproc;
4707     int n_places;
4708     if (first_place <= last_place) {
4709       n_places = last_place - first_place + 1;
4710     } else {
4711       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4712     }
4713     if (n_th <= n_places) {
4714       int place = masters_place;
4715       for (f = 1; f < n_th; f++) {
4716         kmp_info_t *th = team->t.t_threads[f];
4717         KMP_DEBUG_ASSERT(th != NULL);
4718 
4719         if (place == last_place) {
4720           place = first_place;
4721         } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4722           place = 0;
4723         } else {
4724           place++;
4725         }
4726         th->th.th_first_place = first_place;
4727         th->th.th_last_place = last_place;
4728         th->th.th_new_place = place;
4729         if (__kmp_display_affinity && place != th->th.th_current_place &&
4730             team->t.t_display_affinity != 1) {
4731           team->t.t_display_affinity = 1;
4732         }
4733 
4734         KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4735                        "partition = [%d,%d]\n",
4736                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4737                        team->t.t_id, f, place, first_place, last_place));
4738       }
4739     } else {
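      // More threads than places: each place gets S = n_th / n_places threads,
      // and the remaining rem threads are spread one extra per place, every
      // gap-th place.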
4740       int S, rem, gap, s_count;
4741       S = n_th / n_places;
4742       s_count = 0;
4743       rem = n_th - (S * n_places);
4744       gap = rem > 0 ? n_places / rem : n_places;
4745       int place = masters_place;
4746       int gap_ct = gap;
4747       for (f = 0; f < n_th; f++) {
4748         kmp_info_t *th = team->t.t_threads[f];
4749         KMP_DEBUG_ASSERT(th != NULL);
4750 
4751         th->th.th_first_place = first_place;
4752         th->th.th_last_place = last_place;
4753         th->th.th_new_place = place;
4754         if (__kmp_display_affinity && place != th->th.th_current_place &&
4755             team->t.t_display_affinity != 1) {
4756           team->t.t_display_affinity = 1;
4757         }
4758         s_count++;
4759 
4760         if ((s_count == S) && rem && (gap_ct == gap)) {
4761           // do nothing, add an extra thread to place on next iteration
4762         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4763           // we added an extra thread to this place; move to next place
4764           if (place == last_place) {
4765             place = first_place;
4766           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4767             place = 0;
4768           } else {
4769             place++;
4770           }
4771           s_count = 0;
4772           gap_ct = 1;
4773           rem--;
4774         } else if (s_count == S) { // place full; don't add extra
4775           if (place == last_place) {
4776             place = first_place;
4777           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4778             place = 0;
4779           } else {
4780             place++;
4781           }
4782           gap_ct++;
4783           s_count = 0;
4784         }
4785 
4786         KA_TRACE(100,
4787                  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4788                   "partition = [%d,%d]\n",
4789                   __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4790                   th->th.th_new_place, first_place, last_place));
4791       }
4792       KMP_DEBUG_ASSERT(place == masters_place);
4793     }
4794   } break;
4795 
4796   case proc_bind_spread: {
4797     int f;
4798     int n_th = team->t.t_nproc;
4799     int n_places;
4800     int thidx;
4801     if (first_place <= last_place) {
4802       n_places = last_place - first_place + 1;
4803     } else {
4804       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4805     }
4806     if (n_th <= n_places) {
4807       int place = -1;
4808 
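      // More places than threads: each thread gets a contiguous sub-partition of
      // places and is bound to the first place of its sub-partition.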
4809       if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4810         int S = n_places / n_th;
4811         int s_count, rem, gap, gap_ct;
4812 
4813         place = masters_place;
4814         rem = n_places - n_th * S;
4815         gap = rem ? n_th / rem : 1;
4816         gap_ct = gap;
4817         thidx = n_th;
4818         if (update_master_only == 1)
4819           thidx = 1;
4820         for (f = 0; f < thidx; f++) {
4821           kmp_info_t *th = team->t.t_threads[f];
4822           KMP_DEBUG_ASSERT(th != NULL);
4823 
4824           th->th.th_first_place = place;
4825           th->th.th_new_place = place;
4826           if (__kmp_display_affinity && place != th->th.th_current_place &&
4827               team->t.t_display_affinity != 1) {
4828             team->t.t_display_affinity = 1;
4829           }
4830           s_count = 1;
4831           while (s_count < S) {
4832             if (place == last_place) {
4833               place = first_place;
4834             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4835               place = 0;
4836             } else {
4837               place++;
4838             }
4839             s_count++;
4840           }
4841           if (rem && (gap_ct == gap)) {
4842             if (place == last_place) {
4843               place = first_place;
4844             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4845               place = 0;
4846             } else {
4847               place++;
4848             }
4849             rem--;
4850             gap_ct = 0;
4851           }
4852           th->th.th_last_place = place;
4853           gap_ct++;
4854 
4855           if (place == last_place) {
4856             place = first_place;
4857           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4858             place = 0;
4859           } else {
4860             place++;
4861           }
4862 
4863           KA_TRACE(100,
4864                    ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4865                     "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4866                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4867                     f, th->th.th_new_place, th->th.th_first_place,
4868                     th->th.th_last_place, __kmp_affinity_num_masks));
4869         }
4870       } else {
4871         /* Having a uniform space of available computation places, we can
4872            create T partitions of roughly P/T places each and put each thread
4873            into the first place of its partition. */
4874         double current = static_cast<double>(masters_place);
4875         double spacing =
4876             (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
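        // For example (hypothetical values): masters_place = 0, n_places = 8
        // and n_th = 3 give spacing = 3.0 and partitions [0,2], [3,5], [6,7]
        // (the last partition is clamped to n_places - 1 below).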
4877         int first, last;
4878         kmp_info_t *th;
4879 
4880         thidx = n_th + 1;
4881         if (update_master_only == 1)
4882           thidx = 1;
4883         for (f = 0; f < thidx; f++) {
4884           first = static_cast<int>(current);
4885           last = static_cast<int>(current + spacing) - 1;
4886           KMP_DEBUG_ASSERT(last >= first);
4887           if (first >= n_places) {
4888             if (masters_place) {
4889               first -= n_places;
4890               last -= n_places;
4891               if (first == (masters_place + 1)) {
4892                 KMP_DEBUG_ASSERT(f == n_th);
4893                 first--;
4894               }
4895               if (last == masters_place) {
4896                 KMP_DEBUG_ASSERT(f == (n_th - 1));
4897                 last--;
4898               }
4899             } else {
4900               KMP_DEBUG_ASSERT(f == n_th);
4901               first = 0;
4902               last = 0;
4903             }
4904           }
4905           if (last >= n_places) {
4906             last = (n_places - 1);
4907           }
4908           place = first;
4909           current += spacing;
4910           if (f < n_th) {
4911             KMP_DEBUG_ASSERT(0 <= first);
4912             KMP_DEBUG_ASSERT(n_places > first);
4913             KMP_DEBUG_ASSERT(0 <= last);
4914             KMP_DEBUG_ASSERT(n_places > last);
4915             KMP_DEBUG_ASSERT(last_place >= first_place);
4916             th = team->t.t_threads[f];
4917             KMP_DEBUG_ASSERT(th);
4918             th->th.th_first_place = first;
4919             th->th.th_new_place = place;
4920             th->th.th_last_place = last;
4921             if (__kmp_display_affinity && place != th->th.th_current_place &&
4922                 team->t.t_display_affinity != 1) {
4923               team->t.t_display_affinity = 1;
4924             }
4925             KA_TRACE(100,
4926                      ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4927                       "partition = [%d,%d], spacing = %.4f\n",
4928                       __kmp_gtid_from_thread(team->t.t_threads[f]),
4929                       team->t.t_id, f, th->th.th_new_place,
4930                       th->th.th_first_place, th->th.th_last_place, spacing));
4931           }
4932         }
4933       }
4934       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4935     } else {
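      // More threads than places: same distribution as the over-subscribed
      // proc_bind_close case, except that each thread's partition is just the
      // single place it is bound to.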
4936       int S, rem, gap, s_count;
4937       S = n_th / n_places;
4938       s_count = 0;
4939       rem = n_th - (S * n_places);
4940       gap = rem > 0 ? n_places / rem : n_places;
4941       int place = masters_place;
4942       int gap_ct = gap;
4943       thidx = n_th;
4944       if (update_master_only == 1)
4945         thidx = 1;
4946       for (f = 0; f < thidx; f++) {
4947         kmp_info_t *th = team->t.t_threads[f];
4948         KMP_DEBUG_ASSERT(th != NULL);
4949 
4950         th->th.th_first_place = place;
4951         th->th.th_last_place = place;
4952         th->th.th_new_place = place;
4953         if (__kmp_display_affinity && place != th->th.th_current_place &&
4954             team->t.t_display_affinity != 1) {
4955           team->t.t_display_affinity = 1;
4956         }
4957         s_count++;
4958 
4959         if ((s_count == S) && rem && (gap_ct == gap)) {
4960           // do nothing, add an extra thread to place on next iteration
4961         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4962           // we added an extra thread to this place; move on to next place
4963           if (place == last_place) {
4964             place = first_place;
4965           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4966             place = 0;
4967           } else {
4968             place++;
4969           }
4970           s_count = 0;
4971           gap_ct = 1;
4972           rem--;
4973         } else if (s_count == S) { // place is full; don't add extra thread
4974           if (place == last_place) {
4975             place = first_place;
4976           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4977             place = 0;
4978           } else {
4979             place++;
4980           }
4981           gap_ct++;
4982           s_count = 0;
4983         }
4984 
4985         KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4986                        "partition = [%d,%d]\n",
4987                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4988                        team->t.t_id, f, th->th.th_new_place,
4989                        th->th.th_first_place, th->th.th_last_place));
4990       }
4991       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4992     }
4993   } break;
4994 
4995   default:
4996     break;
4997   }
4998 
4999   KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
5000 }
5001 
5002 #endif // KMP_AFFINITY_SUPPORTED
5003 
5004 /* Allocate a new team data structure to use. Take one off of the free pool if
5005    available. */
5006 kmp_team_t *
5007 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
5008 #if OMPT_SUPPORT
5009                     ompt_data_t ompt_parallel_data,
5010 #endif
5011                     kmp_proc_bind_t new_proc_bind,
5012                     kmp_internal_control_t *new_icvs,
5013                     int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5014   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
5015   int f;
5016   kmp_team_t *team;
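  // The root's hot team can be reused whenever the root is not already inside an
  // active parallel region.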
5017   int use_hot_team = !root->r.r_active;
5018   int level = 0;
5019 
5020   KA_TRACE(20, ("__kmp_allocate_team: called\n"));
5021   KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
5022   KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
5023   KMP_MB();
5024 
5025 #if KMP_NESTED_HOT_TEAMS
5026   kmp_hot_team_ptr_t *hot_teams;
5027   if (master) {
5028     team = master->th.th_team;
5029     level = team->t.t_active_level;
5030     if (master->th.th_teams_microtask) { // in teams construct?
5031       if (master->th.th_teams_size.nteams > 1 &&
5032           ( // #teams > 1
5033               team->t.t_pkfn ==
5034                   (microtask_t)__kmp_teams_master || // inner fork of the teams
5035               master->th.th_teams_level <
5036                   team->t.t_level)) { // or nested parallel inside the teams
5037         ++level; // don't increment if #teams==1, or for the outer fork of the
5038         // teams; increment otherwise
5039       }
5040     }
5041     hot_teams = master->th.th_hot_teams;
5042     if (level < __kmp_hot_teams_max_level && hot_teams &&
5043         hot_teams[level].hot_team) {
5044       // hot team has already been allocated for given level
5045       use_hot_team = 1;
5046     } else {
5047       use_hot_team = 0;
5048     }
5049   } else {
5050     // check we won't access uninitialized hot_teams, just in case
5051     KMP_DEBUG_ASSERT(new_nproc == 1);
5052   }
5053 #endif
5054   // Optimization to use a "hot" team
5055   if (use_hot_team && new_nproc > 1) {
5056     KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
5057 #if KMP_NESTED_HOT_TEAMS
5058     team = hot_teams[level].hot_team;
5059 #else
5060     team = root->r.r_hot_team;
5061 #endif
5062 #if KMP_DEBUG
5063     if (__kmp_tasking_mode != tskm_immediate_exec) {
5064       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5065                     "task_team[1] = %p before reinit\n",
5066                     team->t.t_task_team[0], team->t.t_task_team[1]));
5067     }
5068 #endif
5069 
5070     if (team->t.t_nproc != new_nproc &&
5071         __kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5072       // Distributed barrier may need a resize
5073       int old_nthr = team->t.t_nproc;
5074       __kmp_resize_dist_barrier(team, old_nthr, new_nproc);
5075     }
5076 
5077     // Has the number of threads changed?
5078     /* Let's assume the most common case is that the number of threads is
5079        unchanged, and put that case first. */
5080     if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5081       KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5082       // This case can mean that omp_set_num_threads() was called and the hot
5083       // team size was already reduced, so we check the special flag
5084       if (team->t.t_size_changed == -1) {
5085         team->t.t_size_changed = 1;
5086       } else {
5087         KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5088       }
5089 
5090       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5091       kmp_r_sched_t new_sched = new_icvs->sched;
5092       // set primary thread's schedule as new run-time schedule
5093       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5094 
5095       __kmp_reinitialize_team(team, new_icvs,
5096                               root->r.r_uber_thread->th.th_ident);
5097 
5098       KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5099                     team->t.t_threads[0], team));
5100       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5101 
5102 #if KMP_AFFINITY_SUPPORTED
5103       if ((team->t.t_size_changed == 0) &&
5104           (team->t.t_proc_bind == new_proc_bind)) {
5105         if (new_proc_bind == proc_bind_spread) {
5106           __kmp_partition_places(
5107               team, 1); // add flag to update only master for spread
5108         }
5109         KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5110                        "proc_bind = %d, partition = [%d,%d]\n",
5111                        team->t.t_id, new_proc_bind, team->t.t_first_place,
5112                        team->t.t_last_place));
5113       } else {
5114         KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5115         __kmp_partition_places(team);
5116       }
5117 #else
5118       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5119 #endif /* KMP_AFFINITY_SUPPORTED */
5120     } else if (team->t.t_nproc > new_nproc) {
5121       KA_TRACE(20,
5122                ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5123                 new_nproc));
5124 
5125       team->t.t_size_changed = 1;
5126       if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5127         // Barrier size already reduced earlier in this function
5128         // Activate team threads via th_used_in_team
5129         __kmp_add_threads_to_team(team, new_nproc);
5130       }
5131 #if KMP_NESTED_HOT_TEAMS
5132       if (__kmp_hot_teams_mode == 0) {
5133         // AC: the saved number of threads should match the team's value in
5134         // this mode; it can be bigger in mode 1 (hot team keeps reserve threads)
5135         KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5136         hot_teams[level].hot_team_nth = new_nproc;
5137 #endif // KMP_NESTED_HOT_TEAMS
5138         /* release the extra threads we don't need any more */
5139         for (f = new_nproc; f < team->t.t_nproc; f++) {
5140           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5141           if (__kmp_tasking_mode != tskm_immediate_exec) {
5142             // When decreasing team size, threads no longer in the team should
5143             // unref task team.
5144             team->t.t_threads[f]->th.th_task_team = NULL;
5145           }
5146           __kmp_free_thread(team->t.t_threads[f]);
5147           team->t.t_threads[f] = NULL;
5148         }
5149 #if KMP_NESTED_HOT_TEAMS
5150       } // (__kmp_hot_teams_mode == 0)
5151       else {
5152         // When keeping extra threads in team, switch threads to wait on own
5153         // b_go flag
5154         for (f = new_nproc; f < team->t.t_nproc; ++f) {
5155           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5156           kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5157           for (int b = 0; b < bs_last_barrier; ++b) {
5158             if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5159               balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5160             }
5161             KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5162           }
5163         }
5164       }
5165 #endif // KMP_NESTED_HOT_TEAMS
5166       team->t.t_nproc = new_nproc;
5167       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5168       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5169       __kmp_reinitialize_team(team, new_icvs,
5170                               root->r.r_uber_thread->th.th_ident);
5171 
5172       // Update remaining threads
5173       for (f = 0; f < new_nproc; ++f) {
5174         team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5175       }
5176 
5177       // restore the current task state of the primary thread: should be the
5178       // implicit task
5179       KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5180                     team->t.t_threads[0], team));
5181 
5182       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5183 
5184 #ifdef KMP_DEBUG
5185       for (f = 0; f < team->t.t_nproc; f++) {
5186         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5187                          team->t.t_threads[f]->th.th_team_nproc ==
5188                              team->t.t_nproc);
5189       }
5190 #endif
5191 
5192       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5193 #if KMP_AFFINITY_SUPPORTED
5194       __kmp_partition_places(team);
5195 #endif
5196     } else { // team->t.t_nproc < new_nproc
5197 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5198       kmp_affin_mask_t *old_mask;
5199       if (KMP_AFFINITY_CAPABLE()) {
5200         KMP_CPU_ALLOC(old_mask);
5201       }
5202 #endif
5203 
5204       KA_TRACE(20,
5205                ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5206                 new_nproc));
5207       int old_nproc = team->t.t_nproc; // save old value; update only new threads below
5208       team->t.t_size_changed = 1;
5209 
5210 #if KMP_NESTED_HOT_TEAMS
5211       int avail_threads = hot_teams[level].hot_team_nth;
5212       if (new_nproc < avail_threads)
5213         avail_threads = new_nproc;
5214       kmp_info_t **other_threads = team->t.t_threads;
5215       for (f = team->t.t_nproc; f < avail_threads; ++f) {
5216         // Adjust barrier data of reserved threads (if any) of the team
5217         // Other data will be set in __kmp_initialize_info() below.
5218         int b;
5219         kmp_balign_t *balign = other_threads[f]->th.th_bar;
5220         for (b = 0; b < bs_last_barrier; ++b) {
5221           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5222           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5223 #if USE_DEBUGGER
5224           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5225 #endif
5226         }
5227       }
5228       if (hot_teams[level].hot_team_nth >= new_nproc) {
5229         // we have all needed threads in reserve, no need to allocate any
5230         // this is only possible in mode 1; mode 0 cannot have reserved threads
5231         KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5232         team->t.t_nproc = new_nproc; // just get reserved threads involved
5233       } else {
5234         // We may have some threads in reserve, but not enough;
5235         // get reserved threads involved if any.
5236         team->t.t_nproc = hot_teams[level].hot_team_nth;
5237         hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5238 #endif // KMP_NESTED_HOT_TEAMS
5239         if (team->t.t_max_nproc < new_nproc) {
5240           /* reallocate larger arrays */
5241           __kmp_reallocate_team_arrays(team, new_nproc);
5242           __kmp_reinitialize_team(team, new_icvs, NULL);
5243         }
5244 
5245 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5246         /* Temporarily set the full mask for the primary thread before creating
5247            the workers. The reason is that workers inherit the affinity from the
5248            primary thread, so if a lot of workers are created on a single core
5249            quickly, they don't get a chance to set their own affinity for a long
5250            time. */
5251         __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5252 #endif
5253 
5254         /* allocate new threads for the hot team */
5255         for (f = team->t.t_nproc; f < new_nproc; f++) {
5256           kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5257           KMP_DEBUG_ASSERT(new_worker);
5258           team->t.t_threads[f] = new_worker;
5259 
5260           KA_TRACE(20,
5261                    ("__kmp_allocate_team: team %d init T#%d(%d:%d) arrived: "
5262                     "join=%llu, plain=%llu\n",
5263                     team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5264                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5265                     team->t.t_bar[bs_plain_barrier].b_arrived));
5266 
5267           { // Initialize barrier data for new threads.
5268             int b;
5269             kmp_balign_t *balign = new_worker->th.th_bar;
5270             for (b = 0; b < bs_last_barrier; ++b) {
5271               balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5272               KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5273                                KMP_BARRIER_PARENT_FLAG);
5274 #if USE_DEBUGGER
5275               balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5276 #endif
5277             }
5278           }
5279         }
5280 
5281 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5282         if (KMP_AFFINITY_CAPABLE()) {
5283           /* Restore initial primary thread's affinity mask */
5284           __kmp_set_system_affinity(old_mask, TRUE);
5285           KMP_CPU_FREE(old_mask);
5286         }
5287 #endif
5288 #if KMP_NESTED_HOT_TEAMS
5289       } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5290 #endif // KMP_NESTED_HOT_TEAMS
5291       if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5292         // Barrier size already increased earlier in this function
5293         // Activate team threads via th_used_in_team
5294         __kmp_add_threads_to_team(team, new_nproc);
5295       }
5296       /* make sure everyone is synchronized */
5297       // new threads are initialized below
5298       __kmp_initialize_team(team, new_nproc, new_icvs,
5299                             root->r.r_uber_thread->th.th_ident);
5300 
5301       /* reinitialize the threads */
5302       KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5303       for (f = 0; f < team->t.t_nproc; ++f)
5304         __kmp_initialize_info(team->t.t_threads[f], team, f,
5305                               __kmp_gtid_from_tid(f, team));
5306 
5307       if (level) { // set th_task_state for new threads in nested hot team
5308         // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5309         // only need to set the th_task_state for the new threads. th_task_state
5310         // for primary thread will not be accurate until after this in
5311         // __kmp_fork_call(), so we look to the primary thread's memo_stack to
5312         // get the correct value.
5313         for (f = old_nproc; f < team->t.t_nproc; ++f)
5314           team->t.t_threads[f]->th.th_task_state =
5315               team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5316       } else { // set th_task_state for new threads in non-nested hot team
5317         // copy primary thread's state
5318         kmp_uint8 old_state = team->t.t_threads[0]->th.th_task_state;
5319         for (f = old_nproc; f < team->t.t_nproc; ++f)
5320           team->t.t_threads[f]->th.th_task_state = old_state;
5321       }
5322 
5323 #ifdef KMP_DEBUG
5324       for (f = 0; f < team->t.t_nproc; ++f) {
5325         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5326                          team->t.t_threads[f]->th.th_team_nproc ==
5327                              team->t.t_nproc);
5328       }
5329 #endif
5330 
5331       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5332 #if KMP_AFFINITY_SUPPORTED
5333       __kmp_partition_places(team);
5334 #endif
5335     } // Check changes in number of threads
5336 
5337     kmp_info_t *master = team->t.t_threads[0];
5338     if (master->th.th_teams_microtask) {
5339       for (f = 1; f < new_nproc; ++f) {
5340         // propagate teams construct specific info to workers
5341         kmp_info_t *thr = team->t.t_threads[f];
5342         thr->th.th_teams_microtask = master->th.th_teams_microtask;
5343         thr->th.th_teams_level = master->th.th_teams_level;
5344         thr->th.th_teams_size = master->th.th_teams_size;
5345       }
5346     }
5347 #if KMP_NESTED_HOT_TEAMS
5348     if (level) {
5349       // Sync barrier state for nested hot teams, not needed for outermost hot
5350       // team.
5351       for (f = 1; f < new_nproc; ++f) {
5352         kmp_info_t *thr = team->t.t_threads[f];
5353         int b;
5354         kmp_balign_t *balign = thr->th.th_bar;
5355         for (b = 0; b < bs_last_barrier; ++b) {
5356           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5357           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5358 #if USE_DEBUGGER
5359           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5360 #endif
5361         }
5362       }
5363     }
5364 #endif // KMP_NESTED_HOT_TEAMS
5365 
5366     /* reallocate space for arguments if necessary */
5367     __kmp_alloc_argv_entries(argc, team, TRUE);
5368     KMP_CHECK_UPDATE(team->t.t_argc, argc);
5369     // The hot team re-uses the previous task team,
5370     // if untouched during the previous release->gather phase.
5371 
5372     KF_TRACE(10, (" hot_team = %p\n", team));
5373 
5374 #if KMP_DEBUG
5375     if (__kmp_tasking_mode != tskm_immediate_exec) {
5376       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5377                     "task_team[1] = %p after reinit\n",
5378                     team->t.t_task_team[0], team->t.t_task_team[1]));
5379     }
5380 #endif
5381 
5382 #if OMPT_SUPPORT
5383     __ompt_team_assign_id(team, ompt_parallel_data);
5384 #endif
5385 
5386     KMP_MB();
5387 
5388     return team;
5389   }
5390 
5391   /* next, let's try to take one from the team pool */
5392   KMP_MB();
5393   for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5394     /* TODO: consider resizing undersized teams instead of reaping them, now
5395        that we have a resizing mechanism */
5396     if (team->t.t_max_nproc >= max_nproc) {
5397       /* take this team from the team pool */
5398       __kmp_team_pool = team->t.t_next_pool;
5399 
5400       if (max_nproc > 1 &&
5401           __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5402         if (!team->t.b) { // Allocate barrier structure
5403           team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5404         }
5405       }
5406 
5407       /* setup the team for fresh use */
5408       __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5409 
5410       KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5411                     "task_team[1] %p to NULL\n",
5412                     &team->t.t_task_team[0], &team->t.t_task_team[1]));
5413       team->t.t_task_team[0] = NULL;
5414       team->t.t_task_team[1] = NULL;
5415 
5416       /* reallocate space for arguments if necessary */
5417       __kmp_alloc_argv_entries(argc, team, TRUE);
5418       KMP_CHECK_UPDATE(team->t.t_argc, argc);
5419 
5420       KA_TRACE(
5421           20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5422                team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5423       { // Initialize barrier data.
5424         int b;
5425         for (b = 0; b < bs_last_barrier; ++b) {
5426           team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5427 #if USE_DEBUGGER
5428           team->t.t_bar[b].b_master_arrived = 0;
5429           team->t.t_bar[b].b_team_arrived = 0;
5430 #endif
5431         }
5432       }
5433 
5434       team->t.t_proc_bind = new_proc_bind;
5435 
5436       KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5437                     team->t.t_id));
5438 
5439 #if OMPT_SUPPORT
5440       __ompt_team_assign_id(team, ompt_parallel_data);
5441 #endif
5442 
5443       KMP_MB();
5444 
5445       return team;
5446     }
5447 
5448     /* reap team if it is too small, then loop back and check the next one */
5449     // Not sure if this is wise, but it will be redone during the hot-teams
5450     // rewrite.
5451     /* TODO: Use technique to find the right size hot-team, don't reap them */
5452     team = __kmp_reap_team(team);
5453     __kmp_team_pool = team;
5454   }
5455 
5456   /* nothing available in the pool, no matter, make a new team! */
5457   KMP_MB();
5458   team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5459 
5460   /* and set it up */
5461   team->t.t_max_nproc = max_nproc;
5462   if (max_nproc > 1 &&
5463       __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5464     // Allocate barrier structure
5465     team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5466   }
5467 
5468   /* NOTE well: for some reason, allocating one big buffer and dividing it up
5469      seems to hurt performance significantly on the P4, so let's not use this. */
5470   __kmp_allocate_team_arrays(team, max_nproc);
5471 
5472   KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5473   __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5474 
5475   KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5476                 "%p to NULL\n",
5477                 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5478   team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5479   // memory, no need to duplicate
5480   team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5481   // memory, no need to duplicate
5482 
5483   if (__kmp_storage_map) {
5484     __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5485   }
5486 
5487   /* allocate space for arguments */
5488   __kmp_alloc_argv_entries(argc, team, FALSE);
5489   team->t.t_argc = argc;
5490 
5491   KA_TRACE(20,
5492            ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5493             team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5494   { // Initialize barrier data.
5495     int b;
5496     for (b = 0; b < bs_last_barrier; ++b) {
5497       team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5498 #if USE_DEBUGGER
5499       team->t.t_bar[b].b_master_arrived = 0;
5500       team->t.t_bar[b].b_team_arrived = 0;
5501 #endif
5502     }
5503   }
5504 
5505   team->t.t_proc_bind = new_proc_bind;
5506 
5507 #if OMPT_SUPPORT
5508   __ompt_team_assign_id(team, ompt_parallel_data);
5509   team->t.ompt_serialized_team_info = NULL;
5510 #endif
5511 
5512   KMP_MB();
5513 
5514   KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5515                 team->t.t_id));
5516 
5517   return team;
5518 }
5519 
5520 /* TODO implement hot-teams at all levels */
5521 /* TODO implement lazy thread release on demand (disband request) */
5522 
5523 /* free the team.  return it to the team pool.  release all the threads
5524  * associated with it */
5525 void __kmp_free_team(kmp_root_t *root,
5526                      kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5527   int f;
5528   KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5529                 team->t.t_id));
5530 
5531   /* verify state */
5532   KMP_DEBUG_ASSERT(root);
5533   KMP_DEBUG_ASSERT(team);
5534   KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5535   KMP_DEBUG_ASSERT(team->t.t_threads);
5536 
5537   int use_hot_team = team == root->r.r_hot_team;
5538 #if KMP_NESTED_HOT_TEAMS
5539   int level;
5540   if (master) {
5541     level = team->t.t_active_level - 1;
5542     if (master->th.th_teams_microtask) { // in teams construct?
5543       if (master->th.th_teams_size.nteams > 1) {
5544         ++level; // level was not increased in teams construct for
5545         // team_of_masters
5546       }
5547       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5548           master->th.th_teams_level == team->t.t_level) {
5549         ++level; // level was not increased in teams construct for
5550         // team_of_workers before the parallel
5551       } // team->t.t_level will be increased inside parallel
5552     }
5553 #if KMP_DEBUG
5554     kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams;
5555 #endif
5556     if (level < __kmp_hot_teams_max_level) {
5557       KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5558       use_hot_team = 1;
5559     }
5560   }
5561 #endif // KMP_NESTED_HOT_TEAMS
5562 
5563   /* team is done working */
5564   TCW_SYNC_PTR(team->t.t_pkfn,
5565                NULL); // Important for Debugging Support Library.
5566 #if KMP_OS_WINDOWS
5567   team->t.t_copyin_counter = 0; // init counter for possible reuse
5568 #endif
5569   // Do not reset pointer to parent team to NULL for hot teams.
5570 
5571   /* if we are a non-hot team, release our threads */
5572   if (!use_hot_team) {
5573     if (__kmp_tasking_mode != tskm_immediate_exec) {
5574       // Wait for threads to reach reapable state
5575       for (f = 1; f < team->t.t_nproc; ++f) {
5576         KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5577         kmp_info_t *th = team->t.t_threads[f];
5578         volatile kmp_uint32 *state = &th->th.th_reap_state;
5579         while (*state != KMP_SAFE_TO_REAP) {
5580 #if KMP_OS_WINDOWS
5581           // On Windows a thread can be killed at any time; check for this
5582           DWORD ecode;
5583           if (!__kmp_is_thread_alive(th, &ecode)) {
5584             *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5585             break;
5586           }
5587 #endif
5588           // first check if thread is sleeping
5589           kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5590           if (fl.is_sleeping())
5591             fl.resume(__kmp_gtid_from_thread(th));
5592           KMP_CPU_PAUSE();
5593         }
5594       }
5595 
5596       // Delete task teams
5597       int tt_idx;
5598       for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5599         kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5600         if (task_team != NULL) {
5601           for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5602             KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5603             team->t.t_threads[f]->th.th_task_team = NULL;
5604           }
5605           KA_TRACE(
5606               20,
5607               ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5608                __kmp_get_gtid(), task_team, team->t.t_id));
5609 #if KMP_NESTED_HOT_TEAMS
5610           __kmp_free_task_team(master, task_team);
5611 #endif
5612           team->t.t_task_team[tt_idx] = NULL;
5613         }
5614       }
5615     }
5616 
5617     // Reset pointer to parent team only for non-hot teams.
5618     team->t.t_parent = NULL;
5619     team->t.t_level = 0;
5620     team->t.t_active_level = 0;
5621 
5622     /* free the worker threads */
5623     for (f = 1; f < team->t.t_nproc; ++f) {
5624       KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5625       if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
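        // With the distributed barrier, th_used_in_team acts as a small state
        // machine: moving it from 1 to 2 here marks the worker as being removed
        // from the team before it is handed back to the thread pool.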
5626         KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team),
5627                                     1, 2);
5628       }
5629       __kmp_free_thread(team->t.t_threads[f]);
5630     }
5631 
5632     if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5633       if (team->t.b) {
5634         // wake up thread at old location
5635         team->t.b->go_release();
5636         if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5637           for (f = 1; f < team->t.t_nproc; ++f) {
5638             if (team->t.b->sleep[f].sleep) {
5639               __kmp_atomic_resume_64(
5640                   team->t.t_threads[f]->th.th_info.ds.ds_gtid,
5641                   (kmp_atomic_flag_64<> *)NULL);
5642             }
5643           }
5644         }
5645         // Wait for threads to be removed from team
5646         for (int f = 1; f < team->t.t_nproc; ++f) {
5647           while (team->t.t_threads[f]->th.th_used_in_team.load() != 0)
5648             KMP_CPU_PAUSE();
5649         }
5650       }
5651     }
5652 
5653     for (f = 1; f < team->t.t_nproc; ++f) {
5654       team->t.t_threads[f] = NULL;
5655     }
5656 
5657     if (team->t.t_max_nproc > 1 &&
5658         __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5659       distributedBarrier::deallocate(team->t.b);
5660       team->t.b = NULL;
5661     }
5662     /* put the team back in the team pool */
5663     /* TODO limit size of team pool, call reap_team if pool too large */
5664     team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5665     __kmp_team_pool = (volatile kmp_team_t *)team;
5666   } else { // Check if team was created for primary threads in teams construct
5667     // See if first worker is a CG root
5668     KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5669                      team->t.t_threads[1]->th.th_cg_roots);
5670     if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5671       // Clean up the CG root nodes on workers so that this team can be re-used
5672       for (f = 1; f < team->t.t_nproc; ++f) {
5673         kmp_info_t *thr = team->t.t_threads[f];
5674         KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5675                          thr->th.th_cg_roots->cg_root == thr);
5676         // Pop current CG root off list
5677         kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5678         thr->th.th_cg_roots = tmp->up;
5679         KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5680                        " up to node %p. cg_nthreads was %d\n",
5681                        thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5682         int i = tmp->cg_nthreads--;
5683         if (i == 1) {
5684           __kmp_free(tmp); // free CG if we are the last thread in it
5685         }
5686         // Restore current task's thread_limit from CG root
5687         if (thr->th.th_cg_roots)
5688           thr->th.th_current_task->td_icvs.thread_limit =
5689               thr->th.th_cg_roots->cg_thread_limit;
5690       }
5691     }
5692   }
5693 
5694   KMP_MB();
5695 }
5696 
5697 /* reap the team.  destroy it, reclaim all its resources and free its memory */
5698 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5699   kmp_team_t *next_pool = team->t.t_next_pool;
5700 
5701   KMP_DEBUG_ASSERT(team);
5702   KMP_DEBUG_ASSERT(team->t.t_dispatch);
5703   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5704   KMP_DEBUG_ASSERT(team->t.t_threads);
5705   KMP_DEBUG_ASSERT(team->t.t_argv);
5706 
5707   /* TODO clean the threads that are a part of this? */
5708 
5709   /* free stuff */
5710   __kmp_free_team_arrays(team);
5711   if (team->t.t_argv != &team->t.t_inline_argv[0])
5712     __kmp_free((void *)team->t.t_argv);
5713   __kmp_free(team);
5714 
5715   KMP_MB();
5716   return next_pool;
5717 }
5718 
5719 // Free the thread.  Don't reap it, just place it on the pool of available
5720 // threads.
5721 //
5722 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5723 // binding for the affinity mechanism to be useful.
5724 //
5725 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5726 // However, we want to avoid a potential performance problem by always
5727 // scanning through the list to find the correct point at which to insert
5728 // the thread (potential N**2 behavior).  To do this we keep track of the
5729 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5730 // With single-level parallelism, threads will always be added to the tail
5731 // of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
5732 // parallelism, all bets are off and we may need to scan through the entire
5733 // free list.
5734 //
5735 // This change also has a potentially large performance benefit, for some
5736 // applications.  Previously, as threads were freed from the hot team, they
5737 // would be placed back on the free list in inverse order.  If the hot team
5738 // grew back to its original size, then the freed threads would be placed
5739 // back on the hot team in reverse order.  This could cause bad cache
5740 // locality problems on programs where the size of the hot team regularly
5741 // grew and shrunk.
5742 //
5743 // Now, for single-level parallelism, the OMP tid is always == gtid.
5744 void __kmp_free_thread(kmp_info_t *this_th) {
5745   int gtid;
5746   kmp_info_t **scan;
5747 
5748   KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5749                 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5750 
5751   KMP_DEBUG_ASSERT(this_th);
5752 
5753   // When moving a thread to the pool, switch it to wait on its own b_go flag
5754   // and clear its barrier team pointers (NULL team).
5755   int b;
5756   kmp_balign_t *balign = this_th->th.th_bar;
5757   for (b = 0; b < bs_last_barrier; ++b) {
5758     if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5759       balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5760     balign[b].bb.team = NULL;
5761     balign[b].bb.leaf_kids = 0;
5762   }
5763   this_th->th.th_task_state = 0;
5764   this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5765 
5766   /* put thread back on the free pool */
5767   TCW_PTR(this_th->th.th_team, NULL);
5768   TCW_PTR(this_th->th.th_root, NULL);
5769   TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5770 
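  // Unwind this thread's contention-group (CG) list: drop its reference on the
  // current CG root and free nodes that become unused, stopping once a CG
  // owned by another thread has been handled.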
5771   while (this_th->th.th_cg_roots) {
5772     this_th->th.th_cg_roots->cg_nthreads--;
5773     KA_TRACE(100, ("__kmp_free_thread: Thread %p decremented cg_nthreads on node"
5774                    " %p of thread %p to %d\n",
5775                    this_th, this_th->th.th_cg_roots,
5776                    this_th->th.th_cg_roots->cg_root,
5777                    this_th->th.th_cg_roots->cg_nthreads));
5778     kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5779     if (tmp->cg_root == this_th) { // Thread is a cg_root
5780       KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5781       KA_TRACE(
5782           5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5783       this_th->th.th_cg_roots = tmp->up;
5784       __kmp_free(tmp);
5785     } else { // Worker thread
5786       if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5787         __kmp_free(tmp);
5788       }
5789       this_th->th.th_cg_roots = NULL;
5790       break;
5791     }
5792   }
5793 
5794   /* If the implicit task assigned to this thread can be used by other threads,
5795    * then multiple threads can share the task data and try to free the task in
5796    * __kmp_reap_thread at exit. This duplicate use of the task data is more
5797    * likely when the hot team is disabled, but it can occur even when the hot
5798    * team is enabled. */
5799   __kmp_free_implicit_task(this_th);
5800   this_th->th.th_current_task = NULL;
5801 
5802   // If the __kmp_thread_pool_insert_pt is already past the new insert
5803   // point, then we need to re-scan the entire list.
5804   gtid = this_th->th.th_info.ds.ds_gtid;
5805   if (__kmp_thread_pool_insert_pt != NULL) {
5806     KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5807     if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5808       __kmp_thread_pool_insert_pt = NULL;
5809     }
5810   }
5811 
5812   // Scan down the list to find the place to insert the thread.
5813   // scan is the address of a link in the list, possibly the address of
5814   // __kmp_thread_pool itself.
5815   //
5816   // In the absence of nested parallelism, the for loop will have 0 iterations.
5817   if (__kmp_thread_pool_insert_pt != NULL) {
5818     scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5819   } else {
5820     scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5821   }
5822   for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5823        scan = &((*scan)->th.th_next_pool))
5824     ;
5825 
5826   // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5827   // to its address.
5828   TCW_PTR(this_th->th.th_next_pool, *scan);
5829   __kmp_thread_pool_insert_pt = *scan = this_th;
5830   KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5831                    (this_th->th.th_info.ds.ds_gtid <
5832                     this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5833   TCW_4(this_th->th.th_in_pool, TRUE);
5834   __kmp_suspend_initialize_thread(this_th);
5835   __kmp_lock_suspend_mx(this_th);
5836   if (this_th->th.th_active == TRUE) {
5837     KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5838     this_th->th.th_active_in_pool = TRUE;
5839   }
5840 #if KMP_DEBUG
5841   else {
5842     KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5843   }
5844 #endif
5845   __kmp_unlock_suspend_mx(this_th);
5846 
5847   TCW_4(__kmp_nth, __kmp_nth - 1);
5848 
5849 #ifdef KMP_ADJUST_BLOCKTIME
5850   /* Adjust blocktime back to user setting or default if necessary */
5851   /* Middle initialization might never have occurred                */
5852   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5853     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5854     if (__kmp_nth <= __kmp_avail_proc) {
5855       __kmp_zero_bt = FALSE;
5856     }
5857   }
5858 #endif /* KMP_ADJUST_BLOCKTIME */
5859 
5860   KMP_MB();
5861 }
5862 
5863 /* ------------------------------------------------------------------------ */
5864 
5865 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5866 #if OMP_PROFILING_SUPPORT
5867   ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
5868   // TODO: add a configuration option for time granularity
5869   if (ProfileTraceFile)
5870     llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
5871 #endif
5872 
5873   int gtid = this_thr->th.th_info.ds.ds_gtid;
5874   /*    void                 *stack_data;*/
5875   kmp_team_t **volatile pteam;
5876 
5877   KMP_MB();
5878   KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5879 
5880   if (__kmp_env_consistency_check) {
5881     this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5882   }
5883 
5884 #if OMPD_SUPPORT
5885   if (ompd_state & OMPD_ENABLE_BP)
5886     ompd_bp_thread_begin();
5887 #endif
5888 
5889 #if OMPT_SUPPORT
5890   ompt_data_t *thread_data = nullptr;
5891   if (ompt_enabled.enabled) {
5892     thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5893     *thread_data = ompt_data_none;
5894 
5895     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5896     this_thr->th.ompt_thread_info.wait_id = 0;
5897     this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5898     this_thr->th.ompt_thread_info.parallel_flags = 0;
5899     if (ompt_enabled.ompt_callback_thread_begin) {
5900       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5901           ompt_thread_worker, thread_data);
5902     }
5903     this_thr->th.ompt_thread_info.state = ompt_state_idle;
5904   }
5905 #endif
5906 
5907   /* This is the place where threads wait for work */
5908   while (!TCR_4(__kmp_global.g.g_done)) {
5909     KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5910     KMP_MB();
5911 
5912     /* wait for work to do */
5913     KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5914 
5915     /* No tid yet since not part of a team */
5916     __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5917 
5918 #if OMPT_SUPPORT
5919     if (ompt_enabled.enabled) {
5920       this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5921     }
5922 #endif
5923 
5924     pteam = &this_thr->th.th_team;
5925 
5926     /* have we been allocated? */
5927     if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5928       /* we were just woken up, so run our new task */
5929       if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5930         int rc;
5931         KA_TRACE(20,
5932                  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5933                   gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5934                   (*pteam)->t.t_pkfn));
5935 
5936         updateHWFPControl(*pteam);
5937 
5938 #if OMPT_SUPPORT
5939         if (ompt_enabled.enabled) {
5940           this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5941         }
5942 #endif
5943 
5944         rc = (*pteam)->t.t_invoke(gtid);
5945         KMP_ASSERT(rc);
5946 
5947         KMP_MB();
5948         KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5949                       gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5950                       (*pteam)->t.t_pkfn));
5951       }
5952 #if OMPT_SUPPORT
5953       if (ompt_enabled.enabled) {
5954         /* no frame set while outside task */
5955         __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
5956 
5957         this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5958       }
5959 #endif
5960       /* join barrier after parallel region */
5961       __kmp_join_barrier(gtid);
5962     }
5963   }
5964   TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5965 
5966 #if OMPD_SUPPORT
5967   if (ompd_state & OMPD_ENABLE_BP)
5968     ompd_bp_thread_end();
5969 #endif
5970 
5971 #if OMPT_SUPPORT
5972   if (ompt_enabled.ompt_callback_thread_end) {
5973     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5974   }
5975 #endif
5976 
5977   this_thr->th.th_task_team = NULL;
5978   /* run the destructors for the threadprivate data for this thread */
5979   __kmp_common_destroy_gtid(gtid);
5980 
5981   KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5982   KMP_MB();
5983 
5984 #if OMP_PROFILING_SUPPORT
5985   llvm::timeTraceProfilerFinishThread();
5986 #endif
5987   return this_thr;
5988 }
5989 
5990 /* ------------------------------------------------------------------------ */
5991 
5992 void __kmp_internal_end_dest(void *specific_gtid) {
5993   // Make sure no significant bits are lost
5994   int gtid;
5995   __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
5996 
5997   KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5998   /* NOTE: the gtid is stored as gtid+1 in the thread-local storage
5999    * because 0 is reserved for the nothing-stored case */
6000 
6001   __kmp_internal_end_thread(gtid);
6002 }
6003 
6004 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
6005 
6006 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
6007   __kmp_internal_end_atexit();
6008 }
6009 
6010 #endif
6011 
6012 /* [Windows] josh: when the atexit handler is called, there may still be more
6013    than one thread alive */
6014 void __kmp_internal_end_atexit(void) {
6015   KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
6016   /* [Windows]
6017      josh: ideally, we want to completely shutdown the library in this atexit
6018      handler, but stat code that depends on thread specific data for gtid fails
6019      because that data becomes unavailable at some point during the shutdown, so
6020      we call __kmp_internal_end_thread instead. We should eventually remove the
6021      dependency on __kmp_get_specific_gtid in the stat code and use
6022      __kmp_internal_end_library to cleanly shutdown the library.
6023 
6024      // TODO: Can some of this comment about GVS be removed?
6025      I suspect that the offending stat code is executed when the calling thread
6026      tries to clean up a dead root thread's data structures, resulting in GVS
6027      code trying to close the GVS structures for that thread, but since the stat
6028      code uses __kmp_get_specific_gtid to get the gtid with the assumption that
6029      the calling thread is cleaning up itself instead of another thread, it gets
6030      confused. This happens because allowing a thread to unregister and cleanup
6031      another thread is a recent modification for addressing an issue.
6032      Based on the current design (20050722), a thread may end up
6033      trying to unregister another thread only if thread death does not trigger
6034      the calling of __kmp_internal_end_thread.  For Linux* OS, there is the
6035      thread specific data destructor function to detect thread death. For
6036      Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
6037      is nothing.  Thus, the workaround is applicable only for Windows static
6038      stat library. */
6039   __kmp_internal_end_library(-1);
6040 #if KMP_OS_WINDOWS
6041   __kmp_close_console();
6042 #endif
6043 }
6044 
6045 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
6046   // It is assumed __kmp_forkjoin_lock is acquired.
6047 
6048   int gtid;
6049 
6050   KMP_DEBUG_ASSERT(thread != NULL);
6051 
6052   gtid = thread->th.th_info.ds.ds_gtid;
6053 
6054   if (!is_root) {
6055     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
6056       /* Assume the threads are at the fork barrier here */
6057       KA_TRACE(
6058           20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
6059                gtid));
6060       if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
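        // Distributed barrier: flip th_used_in_team from 0 to 3 so the worker
        // spinning at the fork barrier knows it is being reaped, then wake it.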
6061         while (
6062             !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3))
6063           KMP_CPU_PAUSE();
6064         __kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL);
6065       } else {
6066         /* Need release fence here to prevent seg faults for tree forkjoin
6067            barrier (GEH) */
6068         kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
6069                            thread);
6070         __kmp_release_64(&flag);
6071       }
6072     }
6073 
6074     // Terminate OS thread.
6075     __kmp_reap_worker(thread);
6076 
6077     // The thread was killed asynchronously.  If it was actively
6078     // spinning in the thread pool, decrement the global count.
6079     //
6080     // There is a small timing hole here - if the worker thread was just waking
6081     // up after sleeping in the pool, had reset its th_active_in_pool flag but
6082     // not decremented the global counter __kmp_thread_pool_active_nth yet, then
6083     // the global counter might not get updated.
6084     //
6085     // Currently, this can only happen as the library is unloaded,
6086     // so there are no harmful side effects.
6087     if (thread->th.th_active_in_pool) {
6088       thread->th.th_active_in_pool = FALSE;
6089       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
6090       KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
6091     }
6092   }
6093 
6094   __kmp_free_implicit_task(thread);
6095 
6096 // Free the fast memory for tasking
6097 #if USE_FAST_MEMORY
6098   __kmp_free_fast_memory(thread);
6099 #endif /* USE_FAST_MEMORY */
6100 
6101   __kmp_suspend_uninitialize_thread(thread);
6102 
6103   KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
6104   TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
6105 
6106   --__kmp_all_nth;
6107   // __kmp_nth was decremented when thread is added to the pool.
6108 
6109 #ifdef KMP_ADJUST_BLOCKTIME
6110   /* Adjust blocktime back to user setting or default if necessary */
6111   /* Middle initialization might never have occurred                */
6112   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6113     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6114     if (__kmp_nth <= __kmp_avail_proc) {
6115       __kmp_zero_bt = FALSE;
6116     }
6117   }
6118 #endif /* KMP_ADJUST_BLOCKTIME */
6119 
6120   /* free the memory being used */
6121   if (__kmp_env_consistency_check) {
6122     if (thread->th.th_cons) {
6123       __kmp_free_cons_stack(thread->th.th_cons);
6124       thread->th.th_cons = NULL;
6125     }
6126   }
6127 
6128   if (thread->th.th_pri_common != NULL) {
6129     __kmp_free(thread->th.th_pri_common);
6130     thread->th.th_pri_common = NULL;
6131   }
6132 
6133   if (thread->th.th_task_state_memo_stack != NULL) {
6134     __kmp_free(thread->th.th_task_state_memo_stack);
6135     thread->th.th_task_state_memo_stack = NULL;
6136   }
6137 
6138 #if KMP_USE_BGET
6139   if (thread->th.th_local.bget_data != NULL) {
6140     __kmp_finalize_bget(thread);
6141   }
6142 #endif
6143 
6144 #if KMP_AFFINITY_SUPPORTED
6145   if (thread->th.th_affin_mask != NULL) {
6146     KMP_CPU_FREE(thread->th.th_affin_mask);
6147     thread->th.th_affin_mask = NULL;
6148   }
6149 #endif /* KMP_AFFINITY_SUPPORTED */
6150 
6151 #if KMP_USE_HIER_SCHED
6152   if (thread->th.th_hier_bar_data != NULL) {
6153     __kmp_free(thread->th.th_hier_bar_data);
6154     thread->th.th_hier_bar_data = NULL;
6155   }
6156 #endif
6157 
6158   __kmp_reap_team(thread->th.th_serial_team);
6159   thread->th.th_serial_team = NULL;
6160   __kmp_free(thread);
6161 
6162   KMP_MB();
6163 
6164 } // __kmp_reap_thread
6165 
6166 static void __kmp_itthash_clean(kmp_info_t *th) {
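  // Release the ITT region- and barrier-domain hash entries through this
  // thread's allocator so they are not leaked at shutdown.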
6167 #if USE_ITT_NOTIFY
6168   if (__kmp_itt_region_domains.count > 0) {
6169     for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6170       kmp_itthash_entry_t *bucket = __kmp_itt_region_domains.buckets[i];
6171       while (bucket) {
6172         kmp_itthash_entry_t *next = bucket->next_in_bucket;
6173         __kmp_thread_free(th, bucket);
6174         bucket = next;
6175       }
6176     }
6177   }
6178   if (__kmp_itt_barrier_domains.count > 0) {
6179     for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6180       kmp_itthash_entry_t *bucket = __kmp_itt_barrier_domains.buckets[i];
6181       while (bucket) {
6182         kmp_itthash_entry_t *next = bucket->next_in_bucket;
6183         __kmp_thread_free(th, bucket);
6184         bucket = next;
6185       }
6186     }
6187   }
6188 #endif
6189 }
6190 
6191 static void __kmp_internal_end(void) {
6192   int i;
6193 
6194   /* First, unregister the library */
6195   __kmp_unregister_library();
6196 
6197 #if KMP_OS_WINDOWS
6198   /* In Win static library, we can't tell when a root actually dies, so we
6199      reclaim the data structures for any root threads that have died but not
6200      unregistered themselves, in order to shut down cleanly.
6201      In Win dynamic library we also can't tell when a thread dies.  */
6202   __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6203 // dead roots
6204 #endif
6205 
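  // Scan for a root that is still active. If one is found, only the monitor
  // thread (if enabled) is reaped below; otherwise all pooled worker threads
  // and teams are reaped.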
6206   for (i = 0; i < __kmp_threads_capacity; i++)
6207     if (__kmp_root[i])
6208       if (__kmp_root[i]->r.r_active)
6209         break;
6210   KMP_MB(); /* Flush all pending memory write invalidates.  */
6211   TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6212 
6213   if (i < __kmp_threads_capacity) {
6214 #if KMP_USE_MONITOR
6215     // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6216     KMP_MB(); /* Flush all pending memory write invalidates.  */
6217 
6218     // Need to check that monitor was initialized before reaping it. If we are
6219     // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6220     // __kmp_monitor will appear to contain valid data, but it is only valid in
6221     // the parent process, not the child.
6222     // New behavior (201008): instead of keying off of the flag
6223     // __kmp_init_parallel, the monitor thread creation is keyed off
6224     // of the new flag __kmp_init_monitor.
6225     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6226     if (TCR_4(__kmp_init_monitor)) {
6227       __kmp_reap_monitor(&__kmp_monitor);
6228       TCW_4(__kmp_init_monitor, 0);
6229     }
6230     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6231     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6232 #endif // KMP_USE_MONITOR
6233   } else {
6234 /* TODO move this to cleanup code */
6235 #ifdef KMP_DEBUG
6236     /* make sure that everything has properly ended */
6237     for (i = 0; i < __kmp_threads_capacity; i++) {
6238       if (__kmp_root[i]) {
6239         //                    KMP_ASSERT( ! KMP_UBER_GTID( i ) );         // AC:
6240         //                    there can be uber threads alive here
6241         KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6242       }
6243     }
6244 #endif
6245 
6246     KMP_MB();
6247 
6248     // Reap the worker threads.
6249     // This is valid for now, but be careful if threads are reaped sooner.
6250     while (__kmp_thread_pool != NULL) { // Loop over the threads in the pool.
6251       // Get the next thread from the pool.
6252       kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6253       __kmp_thread_pool = thread->th.th_next_pool;
6254       // Reap it.
6255       KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6256       thread->th.th_next_pool = NULL;
6257       thread->th.th_in_pool = FALSE;
6258       __kmp_reap_thread(thread, 0);
6259     }
6260     __kmp_thread_pool_insert_pt = NULL;
6261 
6262     // Reap teams.
6263     while (__kmp_team_pool != NULL) { // Loop over the teams in the pool.
6264       // Get the next team from the pool.
6265       kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6266       __kmp_team_pool = team->t.t_next_pool;
6267       // Reap it.
6268       team->t.t_next_pool = NULL;
6269       __kmp_reap_team(team);
6270     }
6271 
6272     __kmp_reap_task_teams();
6273 
6274 #if KMP_OS_UNIX
6275     // Threads that are not reaped should not access any resources since they
6276     // are going to be deallocated soon, so the shutdown sequence should wait
6277     // until all threads either exit the final spin-waiting loop or begin
6278     // sleeping after the given blocktime.
6279     for (i = 0; i < __kmp_threads_capacity; i++) {
6280       kmp_info_t *thr = __kmp_threads[i];
6281       while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6282         KMP_CPU_PAUSE();
6283     }
6284 #endif
6285 
6286     for (i = 0; i < __kmp_threads_capacity; ++i) {
6287       // TBD: Add some checking...
6288       // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6289     }
6290 
6291     /* Make sure all threadprivate destructors get run by joining with all
6292        worker threads before resetting this flag */
6293     TCW_SYNC_4(__kmp_init_common, FALSE);
6294 
6295     KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6296     KMP_MB();
6297 
6298 #if KMP_USE_MONITOR
6299     // See note above: One of the possible fixes for CQ138434 / CQ140126
6300     //
6301     // FIXME: push both code fragments down and CSE them?
6302     // push them into __kmp_cleanup() ?
6303     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6304     if (TCR_4(__kmp_init_monitor)) {
6305       __kmp_reap_monitor(&__kmp_monitor);
6306       TCW_4(__kmp_init_monitor, 0);
6307     }
6308     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6309     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6310 #endif
6311   } /* else !__kmp_global.t_active */
6312   TCW_4(__kmp_init_gtid, FALSE);
6313   KMP_MB(); /* Flush all pending memory write invalidates.  */
6314 
6315   __kmp_cleanup();
6316 #if OMPT_SUPPORT
6317   ompt_fini();
6318 #endif
6319 }
6320 
6321 void __kmp_internal_end_library(int gtid_req) {
6322   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6323   /* this shouldn't be a race condition because __kmp_internal_end() is the
6324      only place to clear __kmp_serial_init */
6325   /* we'll check this later too, after we get the lock */
6326   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6327   // redundant, because the next check will work in any case.
6328   if (__kmp_global.g.g_abort) {
6329     KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6330     /* TODO abort? */
6331     return;
6332   }
6333   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6334     KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6335     return;
6336   }
6337 
6338   // If hidden helper team has been initialized, we need to deinit it
6339   if (TCR_4(__kmp_init_hidden_helper) &&
6340       !TCR_4(__kmp_hidden_helper_team_done)) {
6341     TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6342     // First release the main thread to let it continue its work
6343     __kmp_hidden_helper_main_thread_release();
6344     // Wait until the hidden helper team has been destroyed
6345     __kmp_hidden_helper_threads_deinitz_wait();
6346   }
6347 
6348   KMP_MB(); /* Flush all pending memory write invalidates.  */
6349   /* find out who we are and what we should do */
6350   {
6351     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6352     KA_TRACE(
6353         10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req));
6354     if (gtid == KMP_GTID_SHUTDOWN) {
6355       KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6356                     "already shutdown\n"));
6357       return;
6358     } else if (gtid == KMP_GTID_MONITOR) {
6359       KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6360                     "registered, or system shutdown\n"));
6361       return;
6362     } else if (gtid == KMP_GTID_DNE) {
6363       KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6364                     "shutdown\n"));
6365       /* we don't know who we are, but we may still shutdown the library */
6366     } else if (KMP_UBER_GTID(gtid)) {
6367       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6368       if (__kmp_root[gtid]->r.r_active) {
6369         __kmp_global.g.g_abort = -1;
6370         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6371         __kmp_unregister_library();
6372         KA_TRACE(10,
6373                  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6374                   gtid));
6375         return;
6376       } else {
6377         __kmp_itthash_clean(__kmp_threads[gtid]);
6378         KA_TRACE(
6379             10,
6380             ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6381         __kmp_unregister_root_current_thread(gtid);
6382       }
6383     } else {
6384 /* worker threads may call this function through the atexit handler, if they
6385  * call exit() */
6386 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6387    TODO: do a thorough shutdown instead */
6388 #ifdef DUMP_DEBUG_ON_EXIT
6389       if (__kmp_debug_buf)
6390         __kmp_dump_debug_buffer();
6391 #endif
6392       // The unregister-library call was added here when we switched to shared
6393       // memory (shm) on Linux; without it, stale files accumulate in /dev/shm.
6394       // Clean up the shared memory file before exiting.
6395       __kmp_unregister_library();
6396       return;
6397     }
6398   }
6399   /* synchronize the termination process */
6400   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6401 
6402   /* have we already finished */
6403   if (__kmp_global.g.g_abort) {
6404     KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6405     /* TODO abort? */
6406     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6407     return;
6408   }
6409   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6410     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6411     return;
6412   }
6413 
6414   /* We need this lock to enforce mutex between this reading of
6415      __kmp_threads_capacity and the writing by __kmp_register_root.
6416      Alternatively, we can use a counter of roots that is atomically updated by
6417      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6418      __kmp_internal_end_*.  */
6419   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6420 
6421   /* now we can safely conduct the actual termination */
6422   __kmp_internal_end();
6423 
6424   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6425   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6426 
6427   KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6428 
6429 #ifdef DUMP_DEBUG_ON_EXIT
6430   if (__kmp_debug_buf)
6431     __kmp_dump_debug_buffer();
6432 #endif
6433 
6434 #if KMP_OS_WINDOWS
6435   __kmp_close_console();
6436 #endif
6437 
6438   __kmp_fini_allocator();
6439 
6440 } // __kmp_internal_end_library
6441 
6442 void __kmp_internal_end_thread(int gtid_req) {
6443   int i;
6444 
6445   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6446   /* this shouldn't be a race condition because __kmp_internal_end() is the
6447    * only place to clear __kmp_serial_init */
6448   /* we'll check this later too, after we get the lock */
6449   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6450   // redundant, because the next check will work in any case.
6451   if (__kmp_global.g.g_abort) {
6452     KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6453     /* TODO abort? */
6454     return;
6455   }
6456   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6457     KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6458     return;
6459   }
6460 
6461   // If hidden helper team has been initialized, we need to deinit it
6462   if (TCR_4(__kmp_init_hidden_helper) &&
6463       !TCR_4(__kmp_hidden_helper_team_done)) {
6464     TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6465     // First release the main thread to let it continue its work
6466     __kmp_hidden_helper_main_thread_release();
6467     // Wait until the hidden helper team has been destroyed
6468     __kmp_hidden_helper_threads_deinitz_wait();
6469   }
6470 
6471   KMP_MB(); /* Flush all pending memory write invalidates.  */
6472 
6473   /* find out who we are and what we should do */
6474   {
6475     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6476     KA_TRACE(10,
6477              ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req));
6478     if (gtid == KMP_GTID_SHUTDOWN) {
6479       KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6480                     "already shutdown\n"));
6481       return;
6482     } else if (gtid == KMP_GTID_MONITOR) {
6483       KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6484                     "registered, or system shutdown\n"));
6485       return;
6486     } else if (gtid == KMP_GTID_DNE) {
6487       KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6488                     "shutdown\n"));
6489       return;
6490       /* we don't know who we are */
6491     } else if (KMP_UBER_GTID(gtid)) {
6492       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6493       if (__kmp_root[gtid]->r.r_active) {
6494         __kmp_global.g.g_abort = -1;
6495         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6496         KA_TRACE(10,
6497                  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6498                   gtid));
6499         return;
6500       } else {
6501         KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6502                       gtid));
6503         __kmp_unregister_root_current_thread(gtid);
6504       }
6505     } else {
6506       /* just a worker thread, let's leave */
6507       KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6508 
6509       if (gtid >= 0) {
6510         __kmp_threads[gtid]->th.th_task_team = NULL;
6511       }
6512 
6513       KA_TRACE(10,
6514                ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6515                 gtid));
6516       return;
6517     }
6518   }
6519 #if KMP_DYNAMIC_LIB
6520   if (__kmp_pause_status != kmp_hard_paused)
6521   // AC: let's not shut down the dynamic library at the exit of an uber thread,
6522   // because it is better to shut down later, in the library destructor.
6523   {
6524     KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6525     return;
6526   }
6527 #endif
6528   /* synchronize the termination process */
6529   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6530 
6531   /* have we already finished */
6532   if (__kmp_global.g.g_abort) {
6533     KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6534     /* TODO abort? */
6535     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6536     return;
6537   }
6538   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6539     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6540     return;
6541   }
6542 
6543   /* We need this lock to enforce mutex between this reading of
6544      __kmp_threads_capacity and the writing by __kmp_register_root.
6545      Alternatively, we can use a counter of roots that is atomically updated by
6546      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6547      __kmp_internal_end_*.  */
6548 
6549   /* should we finish the run-time?  are all siblings done? */
6550   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6551 
6552   for (i = 0; i < __kmp_threads_capacity; ++i) {
6553     if (KMP_UBER_GTID(i)) {
6554       KA_TRACE(
6555           10,
6556           ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6557       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6558       __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6559       return;
6560     }
6561   }
6562 
6563   /* now we can safely conduct the actual termination */
6564 
6565   __kmp_internal_end();
6566 
6567   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6568   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6569 
6570   KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6571 
6572 #ifdef DUMP_DEBUG_ON_EXIT
6573   if (__kmp_debug_buf)
6574     __kmp_dump_debug_buffer();
6575 #endif
6576 } // __kmp_internal_end_thread
6577 
6578 // -----------------------------------------------------------------------------
6579 // Library registration stuff.
6580 
6581 static long __kmp_registration_flag = 0;
6582 // Random value used to indicate library initialization.
6583 static char *__kmp_registration_str = NULL;
6584 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6585 
6586 static inline char *__kmp_reg_status_name() {
6587 /* On RHEL 3u5 if linked statically, getpid() returns different values in
6588    each thread. If registration and unregistration go in different threads
6589    (omp_misc_other_root_exit.cpp test case), the registered_lib_env env var
6590    cannot be found, because its name will contain a different pid. */
6591 // macOS* complains about name being too long with additional getuid()
6592 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6593   return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6594                           (int)getuid());
6595 #else
6596   return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6597 #endif
6598 } // __kmp_reg_status_name
6599 
6600 void __kmp_register_library_startup(void) {
6601 
6602   char *name = __kmp_reg_status_name(); // Name of the environment variable.
6603   int done = 0;
6604   union {
6605     double dtime;
6606     long ltime;
6607   } time;
6608 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6609   __kmp_initialize_system_tick();
6610 #endif
6611   __kmp_read_system_time(&time.dtime);
6612   __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6613   __kmp_registration_str =
6614       __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6615                        __kmp_registration_flag, KMP_LIBRARY_FILE);
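  // The registration string encodes "<flag address>-<flag value>-<library
  // file>"; a competing copy of the runtime parses it later to decide whether
  // the registering library is still alive.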
6616 
6617   KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6618                 __kmp_registration_str));
6619 
6620   while (!done) {
6621 
6622     char *value = NULL; // Actual value of the environment variable.
6623 
6624 #if defined(KMP_USE_SHM)
6625     char *shm_name = __kmp_str_format("/%s", name);
6626     int shm_preexist = 0;
6627     char *data1;
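    // Try to create the shared-memory segment exclusively; EEXIST means some
    // copy of the runtime has already created it.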
6628     int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6629     if ((fd1 == -1) && (errno == EEXIST)) {
6630       // file didn't open because it already exists.
6631       // try opening existing file
6632       fd1 = shm_open(shm_name, O_RDWR, 0666);
6633       if (fd1 == -1) { // file didn't open
6634         // error out here
6635         __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0),
6636                     __kmp_msg_null);
6637       } else {
6638         // able to open existing file
6639         shm_preexist = 1;
6640       }
6641     } else if (fd1 == -1) { // SHM didn't open due to an error other than
6642       // EEXIST (already exists).
6643       // error out here.
6644       __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno),
6645                   __kmp_msg_null);
6646     }
6647     if (shm_preexist == 0) {
6648       // we created the SHM; now set its size
6649       if (ftruncate(fd1, SHM_SIZE) == -1) {
6650         // error occurred setting the size;
6651         __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"),
6652                     KMP_ERR(errno), __kmp_msg_null);
6653       }
6654     }
6655     data1 =
6656         (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
6657     if (data1 == MAP_FAILED) {
6658       // failed to map shared memory
6659       __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno),
6660                   __kmp_msg_null);
6661     }
6662     if (shm_preexist == 0) { // set data to SHM, set value
6663       KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6664     }
6665     // Read value from either what we just wrote or existing file.
6666     value = __kmp_str_format("%s", data1); // read value from SHM
6667     munmap(data1, SHM_SIZE);
6668     close(fd1);
6669 #else // Windows and unix with static library
6670     // Set the environment variable, but do not overwrite it if it already exists.
6671     __kmp_env_set(name, __kmp_registration_str, 0);
6672     // read value to see if it got set
6673     value = __kmp_env_get(name);
6674 #endif
6675 
6676     if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6677       done = 1; // Ok, environment variable set successfully, exit the loop.
6678     } else {
6679       // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6680       // Check whether it is alive or dead.
6681       int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6682       char *tail = value;
6683       char *flag_addr_str = NULL;
6684       char *flag_val_str = NULL;
6685       char const *file_name = NULL;
6686       __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6687       __kmp_str_split(tail, '-', &flag_val_str, &tail);
6688       file_name = tail;
6689       if (tail != NULL) {
6690         unsigned long *flag_addr = 0;
6691         unsigned long flag_val = 0;
6692         KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6693         KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6694         if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6695           // First, check whether environment-encoded address is mapped into
6696           // addr space.
6697           // If so, dereference it to see if it still has the right value.
6698           if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6699             neighbor = 1;
6700           } else {
6701             // If not, then we know the other copy of the library is no longer
6702             // running.
6703             neighbor = 2;
6704           }
6705         }
6706       }
6707       switch (neighbor) {
6708       case 0: // Cannot parse environment variable -- neighbor status unknown.
6709         // Assume it is the incompatible format of a future version of the
6710         // library, and assume the other library is alive.
6711         // WARN( ... ); // TODO: Issue a warning.
6712         file_name = "unknown library";
6713         KMP_FALLTHROUGH();
6714       // Attention! Falling through to the next case. That's intentional.
6715       case 1: { // Neighbor is alive.
6716         // Check it is allowed.
6717         char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6718         if (!__kmp_str_match_true(duplicate_ok)) {
6719           // That's not allowed. Issue fatal error.
6720           __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6721                       KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6722         }
6723         KMP_INTERNAL_FREE(duplicate_ok);
6724         __kmp_duplicate_library_ok = 1;
6725         done = 1; // Exit the loop.
6726       } break;
6727       case 2: { // Neighbor is dead.
6728 
6729 #if defined(KMP_USE_SHM)
6730         // close shared memory.
6731         shm_unlink(shm_name); // this removes file in /dev/shm
6732 #else
6733         // Clear the variable and try to register library again.
6734         __kmp_env_unset(name);
6735 #endif
6736       } break;
6737       default: {
6738         KMP_DEBUG_ASSERT(0);
6739       } break;
6740       }
6741     }
6742     KMP_INTERNAL_FREE((void *)value);
6743 #if defined(KMP_USE_SHM)
6744     KMP_INTERNAL_FREE((void *)shm_name);
6745 #endif
6746   } // while
6747   KMP_INTERNAL_FREE((void *)name);
6748 
6749 } // func __kmp_register_library_startup
6750 
6751 void __kmp_unregister_library(void) {
6752 
6753   char *name = __kmp_reg_status_name();
6754   char *value = NULL;
6755 
6756 #if defined(KMP_USE_SHM)
6757   char *shm_name = __kmp_str_format("/%s", name);
6758   int fd1 = shm_open(shm_name, O_RDONLY, 0666);
6759   if (fd1 == -1) {
6760     // file did not open. return.
6761     return;
6762   }
6763   char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6764   if (data1 != MAP_FAILED) {
6765     value = __kmp_str_format("%s", data1); // read value from SHM
6766     munmap(data1, SHM_SIZE);
6767   }
6768   close(fd1);
6769 #else
6770   value = __kmp_env_get(name);
6771 #endif
6772 
6773   KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6774   KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6775   if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6776 //  Ok, this is our variable. Delete it.
6777 #if defined(KMP_USE_SHM)
6778     shm_unlink(shm_name); // this removes file in /dev/shm
6779 #else
6780     __kmp_env_unset(name);
6781 #endif
6782   }
6783 
6784 #if defined(KMP_USE_SHM)
6785   KMP_INTERNAL_FREE(shm_name);
6786 #endif
6787 
6788   KMP_INTERNAL_FREE(__kmp_registration_str);
6789   KMP_INTERNAL_FREE(value);
6790   KMP_INTERNAL_FREE(name);
6791 
6792   __kmp_registration_flag = 0;
6793   __kmp_registration_str = NULL;
6794 
6795 } // __kmp_unregister_library
6796 
6797 // End of Library registration stuff.
6798 // -----------------------------------------------------------------------------
6799 
6800 #if KMP_MIC_SUPPORTED
6801 
6802 static void __kmp_check_mic_type() {
6803   kmp_cpuid_t cpuid_state = {0};
6804   kmp_cpuid_t *cs_p = &cpuid_state;
6805   __kmp_x86_cpuid(1, 0, cs_p);
6806   // We don't support mic1 at the moment
6807   if ((cs_p->eax & 0xff0) == 0xB10) {
6808     __kmp_mic_type = mic2;
6809   } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6810     __kmp_mic_type = mic3;
6811   } else {
6812     __kmp_mic_type = non_mic;
6813   }
6814 }
6815 
6816 #endif /* KMP_MIC_SUPPORTED */
6817 
6818 #if KMP_HAVE_UMWAIT
6819 static void __kmp_user_level_mwait_init() {
6820   struct kmp_cpuid buf;
6821   __kmp_x86_cpuid(7, 0, &buf);
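  // CPUID leaf 7, sub-leaf 0: ECX bit 5 is the WAITPKG feature flag
  // (UMONITOR/UMWAIT/TPAUSE).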
6822   __kmp_umwait_enabled = ((buf.ecx >> 5) & 1) && __kmp_user_level_mwait;
6823   KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
6824                 __kmp_umwait_enabled));
6825 }
6826 #elif KMP_HAVE_MWAIT
6827 #ifndef AT_INTELPHIUSERMWAIT
// Spurious, non-existent value that should always fail to return anything.
// Will be replaced with the correct value once it is known.
6830 #define AT_INTELPHIUSERMWAIT 10000
6831 #endif
// The getauxval() function is available in RHEL7 and SLES12. If the RTL is
// built on a system with an earlier OS, the following weak internal definition
// is used when the real entry is not found.
6835 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
6836 unsigned long getauxval(unsigned long) { return 0; }
6837 
6838 static void __kmp_user_level_mwait_init() {
6839   // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available
6840   // use them to find if the user-level mwait is enabled. Otherwise, forcibly
6841   // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable
6842   // KMP_USER_LEVEL_MWAIT was set to TRUE.
6843   if (__kmp_mic_type == mic3) {
6844     unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
6845     if ((res & 0x1) || __kmp_user_level_mwait) {
6846       __kmp_mwait_enabled = TRUE;
6847       if (__kmp_user_level_mwait) {
6848         KMP_INFORM(EnvMwaitWarn);
6849       }
6850     } else {
6851       __kmp_mwait_enabled = FALSE;
6852     }
6853   }
6854   KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
6855                 "__kmp_mwait_enabled = %d\n",
6856                 __kmp_mic_type, __kmp_mwait_enabled));
6857 }
6858 #endif /* KMP_HAVE_UMWAIT */
6859 
6860 static void __kmp_do_serial_initialize(void) {
6861   int i, gtid;
6862   size_t size;
6863 
6864   KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6865 
6866   KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6867   KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6868   KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6869   KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6870   KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6871 
6872 #if OMPT_SUPPORT
6873   ompt_pre_init();
6874 #endif
6875 #if OMPD_SUPPORT
6876   __kmp_env_dump();
6877   ompd_init();
6878 #endif
6879 
6880   __kmp_validate_locks();
6881 
6882   /* Initialize internal memory allocator */
6883   __kmp_init_allocator();
6884 
6885   /* Register the library startup via an environment variable and check to see
6886      whether another copy of the library is already registered. */
6887 
6888   __kmp_register_library_startup();
6889 
6890   /* TODO reinitialization of library */
6891   if (TCR_4(__kmp_global.g.g_done)) {
6892     KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6893   }
6894 
6895   __kmp_global.g.g_abort = 0;
6896   TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6897 
6898 /* initialize the locks */
6899 #if KMP_USE_ADAPTIVE_LOCKS
6900 #if KMP_DEBUG_ADAPTIVE_LOCKS
6901   __kmp_init_speculative_stats();
6902 #endif
6903 #endif
6904 #if KMP_STATS_ENABLED
6905   __kmp_stats_init();
6906 #endif
6907   __kmp_init_lock(&__kmp_global_lock);
6908   __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6909   __kmp_init_lock(&__kmp_debug_lock);
6910   __kmp_init_atomic_lock(&__kmp_atomic_lock);
6911   __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6912   __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6913   __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6914   __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6915   __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6916   __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6917   __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6918   __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6919   __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6920   __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6921   __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6922   __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6923   __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6924   __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6925 #if KMP_USE_MONITOR
6926   __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6927 #endif
6928   __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6929 
6930   /* conduct initialization and initial setup of configuration */
6931 
6932   __kmp_runtime_initialize();
6933 
6934 #if KMP_MIC_SUPPORTED
6935   __kmp_check_mic_type();
6936 #endif
6937 
6938 // Some global variable initialization moved here from kmp_env_initialize()
6939 #ifdef KMP_DEBUG
6940   kmp_diag = 0;
6941 #endif
6942   __kmp_abort_delay = 0;
6943 
6944   // From __kmp_init_dflt_team_nth()
6945   /* assume the entire machine will be used */
6946   __kmp_dflt_team_nth_ub = __kmp_xproc;
6947   if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6948     __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6949   }
6950   if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6951     __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6952   }
6953   __kmp_max_nth = __kmp_sys_max_nth;
6954   __kmp_cg_max_nth = __kmp_sys_max_nth;
6955   __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6956   if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6957     __kmp_teams_max_nth = __kmp_sys_max_nth;
6958   }
6959 
6960   // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6961   // part
6962   __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6963 #if KMP_USE_MONITOR
6964   __kmp_monitor_wakeups =
6965       KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6966   __kmp_bt_intervals =
6967       KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6968 #endif
6969   // From "KMP_LIBRARY" part of __kmp_env_initialize()
6970   __kmp_library = library_throughput;
6971   // From KMP_SCHEDULE initialization
6972   __kmp_static = kmp_sch_static_balanced;
// AC: do not use analytical here, because it is non-monotonic
6974 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6975 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6976 // need to repeat assignment
6977 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6978 // bit control and barrier method control parts
6979 #if KMP_FAST_REDUCTION_BARRIER
6980 #define kmp_reduction_barrier_gather_bb ((int)1)
6981 #define kmp_reduction_barrier_release_bb ((int)1)
6982 #define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt
6983 #define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt
6984 #endif // KMP_FAST_REDUCTION_BARRIER
6985   for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6986     __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6987     __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6988     __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6989     __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6990 #if KMP_FAST_REDUCTION_BARRIER
6991     if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
6992       // lin_64 ): hyper,1
6993       __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6994       __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6995       __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6996       __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6997     }
6998 #endif // KMP_FAST_REDUCTION_BARRIER
6999   }
7000 #if KMP_FAST_REDUCTION_BARRIER
7001 #undef kmp_reduction_barrier_release_pat
7002 #undef kmp_reduction_barrier_gather_pat
7003 #undef kmp_reduction_barrier_release_bb
7004 #undef kmp_reduction_barrier_gather_bb
7005 #endif // KMP_FAST_REDUCTION_BARRIER
7006 #if KMP_MIC_SUPPORTED
7007   if (__kmp_mic_type == mic2) { // KNC
    // AC: plain=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
7009     __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
7010     __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
7011         1; // forkjoin release
7012     __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7013     __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7014   }
7015 #if KMP_FAST_REDUCTION_BARRIER
7016   if (__kmp_mic_type == mic2) { // KNC
7017     __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7018     __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7019   }
7020 #endif // KMP_FAST_REDUCTION_BARRIER
7021 #endif // KMP_MIC_SUPPORTED
7022 
7023 // From KMP_CHECKS initialization
7024 #ifdef KMP_DEBUG
7025   __kmp_env_checks = TRUE; /* development versions have the extra checks */
7026 #else
7027   __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
7028 #endif
7029 
7030   // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
7031   __kmp_foreign_tp = TRUE;
7032 
7033   __kmp_global.g.g_dynamic = FALSE;
7034   __kmp_global.g.g_dynamic_mode = dynamic_default;
7035 
7036   __kmp_init_nesting_mode();
7037 
7038   __kmp_env_initialize(NULL);
7039 
7040 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
7041   __kmp_user_level_mwait_init();
7042 #endif
7043 // Print all messages in message catalog for testing purposes.
7044 #ifdef KMP_DEBUG
7045   char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
7046   if (__kmp_str_match_true(val)) {
7047     kmp_str_buf_t buffer;
7048     __kmp_str_buf_init(&buffer);
7049     __kmp_i18n_dump_catalog(&buffer);
7050     __kmp_printf("%s", buffer.str);
7051     __kmp_str_buf_free(&buffer);
7052   }
7053   __kmp_env_free(&val);
7054 #endif
7055 
7056   __kmp_threads_capacity =
7057       __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
7058   // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
7059   __kmp_tp_capacity = __kmp_default_tp_capacity(
7060       __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
7061 
7062   // If the library is shut down properly, both pools must be NULL. Just in
7063   // case, set them to NULL -- some memory may leak, but subsequent code will
7064   // work even if pools are not freed.
7065   KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
7066   KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
7067   KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
7068   __kmp_thread_pool = NULL;
7069   __kmp_thread_pool_insert_pt = NULL;
7070   __kmp_team_pool = NULL;
7071 
7072   /* Allocate all of the variable sized records */
7073   /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
7074    * expandable */
7075   /* Since allocation is cache-aligned, just add extra padding at the end */
7076   size =
7077       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
7078       CACHE_LINE;
7079   __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
7080   __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
7081                                sizeof(kmp_info_t *) * __kmp_threads_capacity);
7082 
7083   /* init thread counts */
7084   KMP_DEBUG_ASSERT(__kmp_all_nth ==
7085                    0); // Asserts fail if the library is reinitializing and
7086   KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
7087   __kmp_all_nth = 0;
7088   __kmp_nth = 0;
7089 
7090   /* setup the uber master thread and hierarchy */
7091   gtid = __kmp_register_root(TRUE);
7092   KA_TRACE(10, ("__kmp_do_serial_initialize  T#%d\n", gtid));
7093   KMP_ASSERT(KMP_UBER_GTID(gtid));
7094   KMP_ASSERT(KMP_INITIAL_GTID(gtid));
7095 
7096   KMP_MB(); /* Flush all pending memory write invalidates.  */
7097 
7098   __kmp_common_initialize();
7099 
7100 #if KMP_OS_UNIX
7101   /* invoke the child fork handler */
7102   __kmp_register_atfork();
7103 #endif
7104 
7105 #if !KMP_DYNAMIC_LIB
7106   {
7107     /* Invoke the exit handler when the program finishes, only for static
7108        library. For dynamic library, we already have _fini and DllMain. */
7109     int rc = atexit(__kmp_internal_end_atexit);
7110     if (rc != 0) {
7111       __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
7112                   __kmp_msg_null);
7113     }
7114   }
7115 #endif
7116 
7117 #if KMP_HANDLE_SIGNALS
7118 #if KMP_OS_UNIX
7119   /* NOTE: make sure that this is called before the user installs their own
     signal handlers so that the user handlers are called first. This way they
7121      can return false, not call our handler, avoid terminating the library, and
7122      continue execution where they left off. */
7123   __kmp_install_signals(FALSE);
7124 #endif /* KMP_OS_UNIX */
7125 #if KMP_OS_WINDOWS
7126   __kmp_install_signals(TRUE);
7127 #endif /* KMP_OS_WINDOWS */
7128 #endif
7129 
7130   /* we have finished the serial initialization */
7131   __kmp_init_counter++;
7132 
7133   __kmp_init_serial = TRUE;
7134 
7135   if (__kmp_settings) {
7136     __kmp_env_print();
7137   }
7138 
7139   if (__kmp_display_env || __kmp_display_env_verbose) {
7140     __kmp_env_print_2();
7141   }
7142 
7143 #if OMPT_SUPPORT
7144   ompt_post_init();
7145 #endif
7146 
7147   KMP_MB();
7148 
7149   KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
7150 }
7151 
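// Public entry point: double-checked initialization under __kmp_initz_lock so
// that concurrent first calls run __kmp_do_serial_initialize() exactly once.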
7152 void __kmp_serial_initialize(void) {
7153   if (__kmp_init_serial) {
7154     return;
7155   }
7156   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7157   if (__kmp_init_serial) {
7158     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7159     return;
7160   }
7161   __kmp_do_serial_initialize();
7162   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7163 }
7164 
7165 static void __kmp_do_middle_initialize(void) {
7166   int i, j;
7167   int prev_dflt_team_nth;
7168 
7169   if (!__kmp_init_serial) {
7170     __kmp_do_serial_initialize();
7171   }
7172 
7173   KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
7174 
7175   // Save the previous value for the __kmp_dflt_team_nth so that
7176   // we can avoid some reinitialization if it hasn't changed.
7177   prev_dflt_team_nth = __kmp_dflt_team_nth;
7178 
7179 #if KMP_AFFINITY_SUPPORTED
7180   // __kmp_affinity_initialize() will try to set __kmp_ncores to the
7181   // number of cores on the machine.
7182   __kmp_affinity_initialize();
7183 
7184 #endif /* KMP_AFFINITY_SUPPORTED */
7185 
7186   KMP_ASSERT(__kmp_xproc > 0);
7187   if (__kmp_avail_proc == 0) {
7188     __kmp_avail_proc = __kmp_xproc;
7189   }
7190 
7191   // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
7192   // correct them now
7193   j = 0;
7194   while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
7195     __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7196         __kmp_avail_proc;
7197     j++;
7198   }
7199 
7200   if (__kmp_dflt_team_nth == 0) {
7201 #ifdef KMP_DFLT_NTH_CORES
7202     // Default #threads = #cores
7203     __kmp_dflt_team_nth = __kmp_ncores;
7204     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7205                   "__kmp_ncores (%d)\n",
7206                   __kmp_dflt_team_nth));
7207 #else
7208     // Default #threads = #available OS procs
7209     __kmp_dflt_team_nth = __kmp_avail_proc;
7210     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7211                   "__kmp_avail_proc(%d)\n",
7212                   __kmp_dflt_team_nth));
7213 #endif /* KMP_DFLT_NTH_CORES */
7214   }
7215 
7216   if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7217     __kmp_dflt_team_nth = KMP_MIN_NTH;
7218   }
7219   if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7220     __kmp_dflt_team_nth = __kmp_sys_max_nth;
7221   }
7222 
7223   if (__kmp_nesting_mode > 0)
7224     __kmp_set_nesting_mode_threads();
7225 
7226   // There's no harm in continuing if the following check fails,
7227   // but it indicates an error in the previous logic.
7228   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7229 
7230   if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7231     // Run through the __kmp_threads array and set the num threads icv for each
7232     // root thread that is currently registered with the RTL (which has not
7233     // already explicitly set its nthreads-var with a call to
7234     // omp_set_num_threads()).
7235     for (i = 0; i < __kmp_threads_capacity; i++) {
7236       kmp_info_t *thread = __kmp_threads[i];
7237       if (thread == NULL)
7238         continue;
7239       if (thread->th.th_current_task->td_icvs.nproc != 0)
7240         continue;
7241 
7242       set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7243     }
7244   }
7245   KA_TRACE(
7246       20,
7247       ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7248        __kmp_dflt_team_nth));
7249 
7250 #ifdef KMP_ADJUST_BLOCKTIME
7251   /* Adjust blocktime to zero if necessary  now that __kmp_avail_proc is set */
7252   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7253     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7254     if (__kmp_nth > __kmp_avail_proc) {
7255       __kmp_zero_bt = TRUE;
7256     }
7257   }
7258 #endif /* KMP_ADJUST_BLOCKTIME */
7259 
7260   /* we have finished middle initialization */
7261   TCW_SYNC_4(__kmp_init_middle, TRUE);
7262 
7263   KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7264 }
7265 
7266 void __kmp_middle_initialize(void) {
7267   if (__kmp_init_middle) {
7268     return;
7269   }
7270   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7271   if (__kmp_init_middle) {
7272     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7273     return;
7274   }
7275   __kmp_do_middle_initialize();
7276   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7277 }
7278 
7279 void __kmp_parallel_initialize(void) {
7280   int gtid = __kmp_entry_gtid(); // this might be a new root
7281 
7282   /* synchronize parallel initialization (for sibling) */
7283   if (TCR_4(__kmp_init_parallel))
7284     return;
7285   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7286   if (TCR_4(__kmp_init_parallel)) {
7287     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7288     return;
7289   }
7290 
7291   /* TODO reinitialization after we have already shut down */
7292   if (TCR_4(__kmp_global.g.g_done)) {
7293     KA_TRACE(
7294         10,
7295         ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7296     __kmp_infinite_loop();
7297   }
7298 
7299   /* jc: The lock __kmp_initz_lock is already held, so calling
7300      __kmp_serial_initialize would cause a deadlock.  So we call
7301      __kmp_do_serial_initialize directly. */
7302   if (!__kmp_init_middle) {
7303     __kmp_do_middle_initialize();
7304   }
7305   __kmp_assign_root_init_mask();
7306   __kmp_resume_if_hard_paused();
7307 
7308   /* begin initialization */
7309   KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7310   KMP_ASSERT(KMP_UBER_GTID(gtid));
7311 
7312 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7313   // Save the FP control regs.
7314   // Worker threads will set theirs to these values at thread startup.
7315   __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7316   __kmp_store_mxcsr(&__kmp_init_mxcsr);
7317   __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7318 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7319 
7320 #if KMP_OS_UNIX
7321 #if KMP_HANDLE_SIGNALS
7322   /*  must be after __kmp_serial_initialize  */
7323   __kmp_install_signals(TRUE);
7324 #endif
7325 #endif
7326 
7327   __kmp_suspend_initialize();
7328 
7329 #if defined(USE_LOAD_BALANCE)
7330   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7331     __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7332   }
7333 #else
7334   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7335     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7336   }
7337 #endif
7338 
7339   if (__kmp_version) {
7340     __kmp_print_version_2();
7341   }
7342 
7343   /* we have finished parallel initialization */
7344   TCW_SYNC_4(__kmp_init_parallel, TRUE);
7345 
7346   KMP_MB();
7347   KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7348 
7349   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7350 }
7351 
7352 void __kmp_hidden_helper_initialize() {
7353   if (TCR_4(__kmp_init_hidden_helper))
7354     return;
7355 
7356   // __kmp_parallel_initialize is required before we initialize hidden helper
7357   if (!TCR_4(__kmp_init_parallel))
7358     __kmp_parallel_initialize();
7359 
7360   // Double check. Note that this double check should not be placed before
  // __kmp_parallel_initialize as it will cause a deadlock.
7362   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7363   if (TCR_4(__kmp_init_hidden_helper)) {
7364     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7365     return;
7366   }
7367 
7368   // Set the count of hidden helper tasks to be executed to zero
7369   KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7370 
7371   // Set the global variable indicating that we're initializing hidden helper
7372   // team/threads
7373   TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7374 
7375   // Platform independent initialization
7376   __kmp_do_initialize_hidden_helper_threads();
7377 
7378   // Wait here for the finish of initialization of hidden helper teams
7379   __kmp_hidden_helper_threads_initz_wait();
7380 
7381   // We have finished hidden helper initialization
7382   TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7383 
7384   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7385 }
7386 
7387 /* ------------------------------------------------------------------------ */
7388 
7389 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7390                                    kmp_team_t *team) {
7391   kmp_disp_t *dispatch;
7392 
7393   KMP_MB();
7394 
7395   /* none of the threads have encountered any constructs, yet. */
7396   this_thr->th.th_local.this_construct = 0;
7397 #if KMP_CACHE_MANAGE
7398   KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7399 #endif /* KMP_CACHE_MANAGE */
7400   dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7401   KMP_DEBUG_ASSERT(dispatch);
7402   KMP_DEBUG_ASSERT(team->t.t_dispatch);
7403   // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7404   // this_thr->th.th_info.ds.ds_tid ] );
7405 
7406   dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7407   dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7408   if (__kmp_env_consistency_check)
7409     __kmp_push_parallel(gtid, team->t.t_ident);
7410 
7411   KMP_MB(); /* Flush all pending memory write invalidates.  */
7412 }
7413 
7414 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7415                                   kmp_team_t *team) {
7416   if (__kmp_env_consistency_check)
7417     __kmp_pop_parallel(gtid, team->t.t_ident);
7418 
7419   __kmp_finish_implicit_task(this_thr);
7420 }
7421 
7422 int __kmp_invoke_task_func(int gtid) {
7423   int rc;
7424   int tid = __kmp_tid_from_gtid(gtid);
7425   kmp_info_t *this_thr = __kmp_threads[gtid];
7426   kmp_team_t *team = this_thr->th.th_team;
7427 
7428   __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7429 #if USE_ITT_BUILD
7430   if (__itt_stack_caller_create_ptr) {
7431     // inform ittnotify about entering user's code
7432     if (team->t.t_stack_id != NULL) {
7433       __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id);
7434     } else {
7435       KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7436       __kmp_itt_stack_callee_enter(
7437           (__itt_caller)team->t.t_parent->t.t_stack_id);
7438     }
7439   }
7440 #endif /* USE_ITT_BUILD */
7441 #if INCLUDE_SSC_MARKS
7442   SSC_MARK_INVOKING();
7443 #endif
7444 
7445 #if OMPT_SUPPORT
7446   void *dummy;
7447   void **exit_frame_p;
7448   ompt_data_t *my_task_data;
7449   ompt_data_t *my_parallel_data;
7450   int ompt_team_size;
7451 
7452   if (ompt_enabled.enabled) {
7453     exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
7454                          .ompt_task_info.frame.exit_frame.ptr);
7455   } else {
7456     exit_frame_p = &dummy;
7457   }
7458 
7459   my_task_data =
7460       &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7461   my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7462   if (ompt_enabled.ompt_callback_implicit_task) {
7463     ompt_team_size = team->t.t_nproc;
7464     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7465         ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7466         __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7467     OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7468   }
7469 #endif
7470 
7471 #if KMP_STATS_ENABLED
7472   stats_state_e previous_state = KMP_GET_THREAD_STATE();
7473   if (previous_state == stats_state_e::TEAMS_REGION) {
7474     KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7475   } else {
7476     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7477   }
7478   KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7479 #endif
7480 
7481   rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7482                               tid, (int)team->t.t_argc, (void **)team->t.t_argv
7483 #if OMPT_SUPPORT
7484                               ,
7485                               exit_frame_p
7486 #endif
7487   );
7488 #if OMPT_SUPPORT
7489   *exit_frame_p = NULL;
7490   this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7491 #endif
7492 
7493 #if KMP_STATS_ENABLED
7494   if (previous_state == stats_state_e::TEAMS_REGION) {
7495     KMP_SET_THREAD_STATE(previous_state);
7496   }
7497   KMP_POP_PARTITIONED_TIMER();
7498 #endif
7499 
7500 #if USE_ITT_BUILD
7501   if (__itt_stack_caller_create_ptr) {
7502     // inform ittnotify about leaving user's code
7503     if (team->t.t_stack_id != NULL) {
7504       __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id);
7505     } else {
7506       KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7507       __kmp_itt_stack_callee_leave(
7508           (__itt_caller)team->t.t_parent->t.t_stack_id);
7509     }
7510   }
7511 #endif /* USE_ITT_BUILD */
7512   __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7513 
7514   return rc;
7515 }
7516 
7517 void __kmp_teams_master(int gtid) {
7518   // This routine is called by all primary threads in teams construct
7519   kmp_info_t *thr = __kmp_threads[gtid];
7520   kmp_team_t *team = thr->th.th_team;
7521   ident_t *loc = team->t.t_ident;
7522   thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7523   KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7524   KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7525   KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7526                 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7527 
7528   // This thread is a new CG root.  Set up the proper variables.
7529   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7530   tmp->cg_root = thr; // Make thr the CG root
7531   // Init to thread limit stored when league primary threads were forked
7532   tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7533   tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7534   KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7535                  " cg_nthreads to 1\n",
7536                  thr, tmp));
7537   tmp->up = thr->th.th_cg_roots;
7538   thr->th.th_cg_roots = tmp;
7539 
// Launch the league of teams now, but do not let workers execute
// (they hang on the fork barrier until the next parallel)
7542 #if INCLUDE_SSC_MARKS
7543   SSC_MARK_FORKING();
7544 #endif
7545   __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7546                   (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7547                   VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7548 #if INCLUDE_SSC_MARKS
7549   SSC_MARK_JOINING();
7550 #endif
7551   // If the team size was reduced from the limit, set it to the new size
7552   if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7553     thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7554   // AC: last parameter "1" eliminates join barrier which won't work because
7555   // worker threads are in a fork barrier waiting for more parallel regions
7556   __kmp_join_call(loc, gtid
7557 #if OMPT_SUPPORT
7558                   ,
7559                   fork_context_intel
7560 #endif
7561                   ,
7562                   1);
7563 }
7564 
7565 int __kmp_invoke_teams_master(int gtid) {
7566   kmp_info_t *this_thr = __kmp_threads[gtid];
7567   kmp_team_t *team = this_thr->th.th_team;
7568 #if KMP_DEBUG
7569   if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7570     KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7571                      (void *)__kmp_teams_master);
7572 #endif
7573   __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7574 #if OMPT_SUPPORT
7575   int tid = __kmp_tid_from_gtid(gtid);
7576   ompt_data_t *task_data =
7577       &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7578   ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7579   if (ompt_enabled.ompt_callback_implicit_task) {
7580     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7581         ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7582         ompt_task_initial);
7583     OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7584   }
7585 #endif
7586   __kmp_teams_master(gtid);
7587 #if OMPT_SUPPORT
7588   this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7589 #endif
7590   __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7591   return 1;
7592 }
7593 
/* This sets the requested number of threads for the next parallel region
   encountered by this team. Since this should be enclosed in the forkjoin
   critical section, it should avoid race conditions with asymmetrical nested
   parallelism. */
7598 
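// Note (illustrative): this is normally reached through the
// __kmpc_push_num_threads() entry point that the compiler emits for a
// num_threads() clause, just before the corresponding __kmpc_fork_call().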
7599 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7600   kmp_info_t *thr = __kmp_threads[gtid];
7601 
7602   if (num_threads > 0)
7603     thr->th.th_set_nproc = num_threads;
7604 }
7605 
7606 static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
7607                                     int num_threads) {
7608   KMP_DEBUG_ASSERT(thr);
7609   // Remember the number of threads for inner parallel regions
7610   if (!TCR_4(__kmp_init_middle))
7611     __kmp_middle_initialize(); // get internal globals calculated
7612   __kmp_assign_root_init_mask();
7613   KMP_DEBUG_ASSERT(__kmp_avail_proc);
7614   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7615 
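  // Worked example (illustrative): with no thread_limit clause,
  // __kmp_teams_thread_limit unset, __kmp_avail_proc = 64 and num_teams = 8,
  // the starting value below is 64 / 8 = 8, which is then clamped by
  // nthreads-var, thread-limit-var and __kmp_teams_max_nth.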
7616   if (num_threads == 0) {
7617     if (__kmp_teams_thread_limit > 0) {
7618       num_threads = __kmp_teams_thread_limit;
7619     } else {
7620       num_threads = __kmp_avail_proc / num_teams;
7621     }
    // adjust num_threads w/o warning as it is not a user setting
    // num_threads = min(num_threads, nthreads-var, thread-limit-var)
    // no thread_limit clause specified - do not change thread-limit-var ICV
7625     if (num_threads > __kmp_dflt_team_nth) {
7626       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7627     }
7628     if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7629       num_threads = thr->th.th_current_task->td_icvs.thread_limit;
    } // prevent team size from exceeding thread-limit-var
7631     if (num_teams * num_threads > __kmp_teams_max_nth) {
7632       num_threads = __kmp_teams_max_nth / num_teams;
7633     }
7634     if (num_threads == 0) {
7635       num_threads = 1;
7636     }
7637   } else {
    // This thread will be the primary thread of the league's primary threads
7639     // Store new thread limit; old limit is saved in th_cg_roots list
7640     thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7641     // num_threads = min(num_threads, nthreads-var)
7642     if (num_threads > __kmp_dflt_team_nth) {
7643       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7644     }
7645     if (num_teams * num_threads > __kmp_teams_max_nth) {
7646       int new_threads = __kmp_teams_max_nth / num_teams;
7647       if (new_threads == 0) {
7648         new_threads = 1;
7649       }
7650       if (new_threads != num_threads) {
7651         if (!__kmp_reserve_warn) { // user asked for too many threads
7652           __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7653           __kmp_msg(kmp_ms_warning,
7654                     KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7655                     KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7656         }
7657       }
7658       num_threads = new_threads;
7659     }
7660   }
7661   thr->th.th_teams_size.nth = num_threads;
7662 }
7663 
/* This sets the requested number of teams for the teams region and/or
7665    the number of threads for the next parallel region encountered  */
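// For example (illustrative): "#pragma omp teams num_teams(4) thread_limit(8)"
// is expected to reach this routine with num_teams = 4 and num_threads = 8,
// via the __kmpc_push_num_teams() entry point emitted by the compiler.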
7666 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7667                           int num_threads) {
7668   kmp_info_t *thr = __kmp_threads[gtid];
7669   KMP_DEBUG_ASSERT(num_teams >= 0);
7670   KMP_DEBUG_ASSERT(num_threads >= 0);
7671 
7672   if (num_teams == 0) {
7673     if (__kmp_nteams > 0) {
7674       num_teams = __kmp_nteams;
7675     } else {
7676       num_teams = 1; // default number of teams is 1.
7677     }
7678   }
  if (num_teams > __kmp_teams_max_nth) { // too many teams requested?
7680     if (!__kmp_reserve_warn) {
7681       __kmp_reserve_warn = 1;
7682       __kmp_msg(kmp_ms_warning,
7683                 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7684                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7685     }
7686     num_teams = __kmp_teams_max_nth;
7687   }
7688   // Set number of teams (number of threads in the outer "parallel" of the
7689   // teams)
7690   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7691 
7692   __kmp_push_thread_limit(thr, num_teams, num_threads);
7693 }
7694 
7695 /* This sets the requested number of teams for the teams region and/or
7696    the number of threads for the next parallel region encountered  */
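// Worked examples (illustrative, assuming the thread_limit clause value
// arrives here as num_threads): num_teams(2:8) with no thread_limit and a
// large __kmp_teams_max_nth selects the upper bound, 8 teams; num_teams(2:8)
// with thread_limit(16) and __kmp_teams_max_nth = 64 selects 64 / 16 = 4
// teams, which already lies within [2, 8].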
7697 void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
7698                              int num_teams_ub, int num_threads) {
7699   kmp_info_t *thr = __kmp_threads[gtid];
7700   KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
7701   KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
7702   KMP_DEBUG_ASSERT(num_threads >= 0);
7703 
7704   if (num_teams_lb > num_teams_ub) {
7705     __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
7706                 KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
7707   }
7708 
  int num_teams = 1; // default number of teams is 1.
7710 
7711   if (num_teams_lb == 0 && num_teams_ub > 0)
7712     num_teams_lb = num_teams_ub;
7713 
7714   if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
7715     num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
7716     if (num_teams > __kmp_teams_max_nth) {
7717       if (!__kmp_reserve_warn) {
7718         __kmp_reserve_warn = 1;
7719         __kmp_msg(kmp_ms_warning,
7720                   KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7721                   KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7722       }
7723       num_teams = __kmp_teams_max_nth;
7724     }
7725   } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
7726     num_teams = num_teams_ub;
7727   } else { // num_teams_lb <= num_teams <= num_teams_ub
7728     if (num_threads == 0) {
7729       if (num_teams_ub > __kmp_teams_max_nth) {
7730         num_teams = num_teams_lb;
7731       } else {
7732         num_teams = num_teams_ub;
7733       }
7734     } else {
7735       num_teams = (num_threads > __kmp_teams_max_nth)
7736                       ? num_teams
7737                       : __kmp_teams_max_nth / num_threads;
7738       if (num_teams < num_teams_lb) {
7739         num_teams = num_teams_lb;
7740       } else if (num_teams > num_teams_ub) {
7741         num_teams = num_teams_ub;
7742       }
7743     }
7744   }
7745   // Set number of teams (number of threads in the outer "parallel" of the
7746   // teams)
7747   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7748 
7749   __kmp_push_thread_limit(thr, num_teams, num_threads);
7750 }
7751 
7752 // Set the proc_bind var to use in the following parallel region.
7753 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7754   kmp_info_t *thr = __kmp_threads[gtid];
7755   thr->th.th_set_proc_bind = proc_bind;
7756 }
7757 
7758 /* Launch the worker threads into the microtask. */
7759 
7760 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7761   kmp_info_t *this_thr = __kmp_threads[gtid];
7762 
7763 #ifdef KMP_DEBUG
7764   int f;
7765 #endif /* KMP_DEBUG */
7766 
7767   KMP_DEBUG_ASSERT(team);
7768   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7769   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7770   KMP_MB(); /* Flush all pending memory write invalidates.  */
7771 
7772   team->t.t_construct = 0; /* no single directives seen yet */
7773   team->t.t_ordered.dt.t_value =
7774       0; /* thread 0 enters the ordered section first */
7775 
7776   /* Reset the identifiers on the dispatch buffer */
7777   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7778   if (team->t.t_max_nproc > 1) {
7779     int i;
7780     for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7781       team->t.t_disp_buffer[i].buffer_index = i;
7782       team->t.t_disp_buffer[i].doacross_buf_idx = i;
7783     }
7784   } else {
7785     team->t.t_disp_buffer[0].buffer_index = 0;
7786     team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7787   }
7788 
7789   KMP_MB(); /* Flush all pending memory write invalidates.  */
7790   KMP_ASSERT(this_thr->th.th_team == team);
7791 
7792 #ifdef KMP_DEBUG
7793   for (f = 0; f < team->t.t_nproc; f++) {
7794     KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7795                      team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7796   }
7797 #endif /* KMP_DEBUG */
7798 
7799   /* release the worker threads so they may begin working */
7800   __kmp_fork_barrier(gtid, 0);
7801 }
7802 
7803 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7804   kmp_info_t *this_thr = __kmp_threads[gtid];
7805 
7806   KMP_DEBUG_ASSERT(team);
7807   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7808   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7809   KMP_MB(); /* Flush all pending memory write invalidates.  */
7810 
7811   /* Join barrier after fork */
7812 
7813 #ifdef KMP_DEBUG
7814   if (__kmp_threads[gtid] &&
7815       __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7816     __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7817                  __kmp_threads[gtid]);
7818     __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7819                  "team->t.t_nproc=%d\n",
7820                  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7821                  team->t.t_nproc);
7822     __kmp_print_structure();
7823   }
7824   KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7825                    __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7826 #endif /* KMP_DEBUG */
7827 
7828   __kmp_join_barrier(gtid); /* wait for everyone */
7829 #if OMPT_SUPPORT
7830   if (ompt_enabled.enabled &&
7831       this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7832     int ds_tid = this_thr->th.th_info.ds.ds_tid;
7833     ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7834     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7835 #if OMPT_OPTIONAL
7836     void *codeptr = NULL;
7837     if (KMP_MASTER_TID(ds_tid) &&
7838         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7839          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7840       codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7841 
7842     if (ompt_enabled.ompt_callback_sync_region_wait) {
7843       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7844           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7845           codeptr);
7846     }
7847     if (ompt_enabled.ompt_callback_sync_region) {
7848       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7849           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7850           codeptr);
7851     }
7852 #endif
7853     if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7854       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7855           ompt_scope_end, NULL, task_data, 0, ds_tid,
7856           ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7857     }
7858   }
7859 #endif
7860 
7861   KMP_MB(); /* Flush all pending memory write invalidates.  */
7862   KMP_ASSERT(this_thr->th.th_team == team);
7863 }
7864 
7865 /* ------------------------------------------------------------------------ */
7866 
7867 #ifdef USE_LOAD_BALANCE
7868 
7869 // Return the worker threads actively spinning in the hot team, if we
7870 // are at the outermost level of parallelism.  Otherwise, return 0.
7871 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7872   int i;
7873   int retval;
7874   kmp_team_t *hot_team;
7875 
7876   if (root->r.r_active) {
7877     return 0;
7878   }
7879   hot_team = root->r.r_hot_team;
7880   if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7881     return hot_team->t.t_nproc - 1; // Don't count primary thread
7882   }
7883 
7884   // Skip the primary thread - it is accounted for elsewhere.
7885   retval = 0;
7886   for (i = 1; i < hot_team->t.t_nproc; i++) {
7887     if (hot_team->t.t_threads[i]->th.th_active) {
7888       retval++;
7889     }
7890   }
7891   return retval;
7892 }
7893 
7894 // Perform an automatic adjustment to the number of
7895 // threads used by the next parallel region.
7896 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7897   int retval;
7898   int pool_active;
7899   int hot_team_active;
7900   int team_curr_active;
7901   int system_active;
7902 
7903   KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7904                 set_nproc));
7905   KMP_DEBUG_ASSERT(root);
7906   KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7907                        ->th.th_current_task->td_icvs.dynamic == TRUE);
7908   KMP_DEBUG_ASSERT(set_nproc > 1);
7909 
7910   if (set_nproc == 1) {
7911     KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7912     return 1;
7913   }
7914 
7915   // Threads that are active in the thread pool, active in the hot team for this
7916   // particular root (if we are at the outer par level), and the currently
7917   // executing thread (to become the primary thread) are available to add to the
7918   // new team, but are currently contributing to the system load, and must be
7919   // accounted for.
7920   pool_active = __kmp_thread_pool_active_nth;
7921   hot_team_active = __kmp_active_hot_team_nproc(root);
7922   team_curr_active = pool_active + hot_team_active + 1;
7923 
7924   // Check the system load.
7925   system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7926   KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7927                 "hot team active = %d\n",
7928                 system_active, pool_active, hot_team_active));
7929 
7930   if (system_active < 0) {
7931     // There was an error reading the necessary info from /proc, so use the
7932     // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7933     // = dynamic_thread_limit, we shouldn't wind up getting back here.
7934     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7935     KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7936 
7937     // Make this call behave like the thread limit algorithm.
7938     retval = __kmp_avail_proc - __kmp_nth +
7939              (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7940     if (retval > set_nproc) {
7941       retval = set_nproc;
7942     }
7943     if (retval < KMP_MIN_NTH) {
7944       retval = KMP_MIN_NTH;
7945     }
7946 
7947     KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7948                   retval));
7949     return retval;
7950   }
7951 
7952   // There is a slight delay in the load balance algorithm in detecting new
7953   // running procs. The real system load at this instant should be at least as
  // large as the #active omp threads that are available to add to the team.
7955   if (system_active < team_curr_active) {
7956     system_active = team_curr_active;
7957   }
7958   retval = __kmp_avail_proc - system_active + team_curr_active;
7959   if (retval > set_nproc) {
7960     retval = set_nproc;
7961   }
7962   if (retval < KMP_MIN_NTH) {
7963     retval = KMP_MIN_NTH;
7964   }
7965 
7966   KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7967   return retval;
7968 } // __kmp_load_balance_nproc()
7969 
7970 #endif /* USE_LOAD_BALANCE */
7971 
7972 /* ------------------------------------------------------------------------ */
7973 
7974 /* NOTE: this is called with the __kmp_init_lock held */
7975 void __kmp_cleanup(void) {
7976   int f;
7977 
7978   KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7979 
7980   if (TCR_4(__kmp_init_parallel)) {
7981 #if KMP_HANDLE_SIGNALS
7982     __kmp_remove_signals();
7983 #endif
7984     TCW_4(__kmp_init_parallel, FALSE);
7985   }
7986 
7987   if (TCR_4(__kmp_init_middle)) {
7988 #if KMP_AFFINITY_SUPPORTED
7989     __kmp_affinity_uninitialize();
7990 #endif /* KMP_AFFINITY_SUPPORTED */
7991     __kmp_cleanup_hierarchy();
7992     TCW_4(__kmp_init_middle, FALSE);
7993   }
7994 
7995   KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7996 
7997   if (__kmp_init_serial) {
7998     __kmp_runtime_destroy();
7999     __kmp_init_serial = FALSE;
8000   }
8001 
8002   __kmp_cleanup_threadprivate_caches();
8003 
8004   for (f = 0; f < __kmp_threads_capacity; f++) {
8005     if (__kmp_root[f] != NULL) {
8006       __kmp_free(__kmp_root[f]);
8007       __kmp_root[f] = NULL;
8008     }
8009   }
8010   __kmp_free(__kmp_threads);
  // __kmp_threads and __kmp_root were allocated at once, as a single block, so
  // there is no need to free __kmp_root separately.
8013   __kmp_threads = NULL;
8014   __kmp_root = NULL;
8015   __kmp_threads_capacity = 0;
8016 
8017 #if KMP_USE_DYNAMIC_LOCK
8018   __kmp_cleanup_indirect_user_locks();
8019 #else
8020   __kmp_cleanup_user_locks();
8021 #endif
8022 #if OMPD_SUPPORT
8023   if (ompd_state) {
8024     __kmp_free(ompd_env_block);
8025     ompd_env_block = NULL;
8026     ompd_env_block_size = 0;
8027   }
8028 #endif
8029 
8030 #if KMP_AFFINITY_SUPPORTED
8031   KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
8032   __kmp_cpuinfo_file = NULL;
8033 #endif /* KMP_AFFINITY_SUPPORTED */
8034 
8035 #if KMP_USE_ADAPTIVE_LOCKS
8036 #if KMP_DEBUG_ADAPTIVE_LOCKS
8037   __kmp_print_speculative_stats();
8038 #endif
8039 #endif
8040   KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
8041   __kmp_nested_nth.nth = NULL;
8042   __kmp_nested_nth.size = 0;
8043   __kmp_nested_nth.used = 0;
8044   KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
8045   __kmp_nested_proc_bind.bind_types = NULL;
8046   __kmp_nested_proc_bind.size = 0;
8047   __kmp_nested_proc_bind.used = 0;
8048   if (__kmp_affinity_format) {
8049     KMP_INTERNAL_FREE(__kmp_affinity_format);
8050     __kmp_affinity_format = NULL;
8051   }
8052 
8053   __kmp_i18n_catclose();
8054 
8055 #if KMP_USE_HIER_SCHED
8056   __kmp_hier_scheds.deallocate();
8057 #endif
8058 
8059 #if KMP_STATS_ENABLED
8060   __kmp_stats_fini();
8061 #endif
8062 
8063   KA_TRACE(10, ("__kmp_cleanup: exit\n"));
8064 }
8065 
8066 /* ------------------------------------------------------------------------ */
8067 
8068 int __kmp_ignore_mppbeg(void) {
8069   char *env;
8070 
8071   if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
8072     if (__kmp_str_match_false(env))
8073       return FALSE;
8074   }
8075   // By default __kmpc_begin() is no-op.
8076   return TRUE;
8077 }
8078 
8079 int __kmp_ignore_mppend(void) {
8080   char *env;
8081 
8082   if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
8083     if (__kmp_str_match_false(env))
8084       return FALSE;
8085   }
8086   // By default __kmpc_end() is no-op.
8087   return TRUE;
8088 }
8089 
8090 void __kmp_internal_begin(void) {
8091   int gtid;
8092   kmp_root_t *root;
8093 
  /* This is a very important step as it will register new sibling threads
     and assign these new uber threads a new gtid. */
8096   gtid = __kmp_entry_gtid();
8097   root = __kmp_threads[gtid]->th.th_root;
8098   KMP_ASSERT(KMP_UBER_GTID(gtid));
8099 
8100   if (root->r.r_begin)
8101     return;
8102   __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
8103   if (root->r.r_begin) {
8104     __kmp_release_lock(&root->r.r_begin_lock, gtid);
8105     return;
8106   }
8107 
8108   root->r.r_begin = TRUE;
8109 
8110   __kmp_release_lock(&root->r.r_begin_lock, gtid);
8111 }
8112 
8113 /* ------------------------------------------------------------------------ */
8114 
8115 void __kmp_user_set_library(enum library_type arg) {
8116   int gtid;
8117   kmp_root_t *root;
8118   kmp_info_t *thread;
8119 
8120   /* first, make sure we are initialized so we can get our gtid */
8121 
8122   gtid = __kmp_entry_gtid();
8123   thread = __kmp_threads[gtid];
8124 
8125   root = thread->th.th_root;
8126 
8127   KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
8128                 library_serial));
8129   if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
8130                                   thread */
8131     KMP_WARNING(SetLibraryIncorrectCall);
8132     return;
8133   }
8134 
8135   switch (arg) {
8136   case library_serial:
8137     thread->th.th_set_nproc = 0;
8138     set__nproc(thread, 1);
8139     break;
8140   case library_turnaround:
8141     thread->th.th_set_nproc = 0;
8142     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8143                                            : __kmp_dflt_team_nth_ub);
8144     break;
8145   case library_throughput:
8146     thread->th.th_set_nproc = 0;
8147     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8148                                            : __kmp_dflt_team_nth_ub);
8149     break;
8150   default:
8151     KMP_FATAL(UnknownLibraryType, arg);
8152   }
8153 
8154   __kmp_aux_set_library(arg);
8155 }
8156 
8157 void __kmp_aux_set_stacksize(size_t arg) {
8158   if (!__kmp_init_serial)
8159     __kmp_serial_initialize();
8160 
8161 #if KMP_OS_DARWIN
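  // Round the requested size up to a 4 KiB page boundary. Worked example
  // (illustrative): arg = 0x12345 is masked to 0x12000, then rounded up to
  // 0x13000.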
8162   if (arg & (0x1000 - 1)) {
8163     arg &= ~(0x1000 - 1);
8164     if (arg + 0x1000) /* check for overflow if we round up */
8165       arg += 0x1000;
8166   }
8167 #endif
8168   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8169 
8170   /* only change the default stacksize before the first parallel region */
8171   if (!TCR_4(__kmp_init_parallel)) {
8172     size_t value = arg; /* argument is in bytes */
8173 
8174     if (value < __kmp_sys_min_stksize)
8175       value = __kmp_sys_min_stksize;
8176     else if (value > KMP_MAX_STKSIZE)
8177       value = KMP_MAX_STKSIZE;
8178 
8179     __kmp_stksize = value;
8180 
8181     __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
8182   }
8183 
8184   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8185 }
8186 
8187 /* set the behaviour of the runtime library */
8188 /* TODO this can cause some odd behaviour with sibling parallelism... */
8189 void __kmp_aux_set_library(enum library_type arg) {
8190   __kmp_library = arg;
8191 
8192   switch (__kmp_library) {
8193   case library_serial: {
8194     KMP_INFORM(LibraryIsSerial);
8195   } break;
8196   case library_turnaround:
8197     if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
8198       __kmp_use_yield = 2; // only yield when oversubscribed
8199     break;
8200   case library_throughput:
8201     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
8202       __kmp_dflt_blocktime = 200;
8203     break;
8204   default:
8205     KMP_FATAL(UnknownLibraryType, arg);
8206   }
8207 }
8208 
8209 /* Getting team information common for all team API */
8210 // Returns NULL if not in teams construct
8211 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
8212   kmp_info_t *thr = __kmp_entry_thread();
8213   teams_serialized = 0;
8214   if (thr->th.th_teams_microtask) {
8215     kmp_team_t *team = thr->th.th_team;
8216     int tlevel = thr->th.th_teams_level; // the level of the teams construct
8217     int ii = team->t.t_level;
8218     teams_serialized = team->t.t_serialized;
8219     int level = tlevel + 1;
8220     KMP_DEBUG_ASSERT(ii >= tlevel);
8221     while (ii > level) {
8222       for (teams_serialized = team->t.t_serialized;
8223            (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
8224       }
8225       if (team->t.t_serialized && (!teams_serialized)) {
8226         team = team->t.t_parent;
8227         continue;
8228       }
8229       if (ii > level) {
8230         team = team->t.t_parent;
8231         ii--;
8232       }
8233     }
8234     return team;
8235   }
8236   return NULL;
8237 }
8238 
8239 int __kmp_aux_get_team_num() {
8240   int serialized;
8241   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8242   if (team) {
8243     if (serialized > 1) {
8244       return 0; // teams region is serialized ( 1 team of 1 thread ).
8245     } else {
8246       return team->t.t_master_tid;
8247     }
8248   }
8249   return 0;
8250 }
8251 
8252 int __kmp_aux_get_num_teams() {
8253   int serialized;
8254   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8255   if (team) {
8256     if (serialized > 1) {
8257       return 1;
8258     } else {
8259       return team->t.t_parent->t.t_nproc;
8260     }
8261   }
8262   return 1;
8263 }
8264 
8265 /* ------------------------------------------------------------------------ */
8266 
8267 /*
8268  * Affinity Format Parser
8269  *
8270  * Field is in form of: %[[[0].]size]type
8271  * % and type are required (%% means print a literal '%')
8272  * type is either single char or long name surrounded by {},
8273  * e.g., N or {num_threads}
8274  * 0 => leading zeros
8275  * . => right justified when size is specified
8276  * by default output is left justified
8277  * size is the *minimum* field length
8278  * All other characters are printed as is
8279  *
8280  * Available field types:
8281  * L {thread_level}      - omp_get_level()
8282  * n {thread_num}        - omp_get_thread_num()
8283  * h {host}              - name of host machine
8284  * P {process_id}        - process id (integer)
8285  * T {thread_identifier} - native thread identifier (integer)
8286  * N {num_threads}       - omp_get_num_threads()
8287  * A {ancestor_tnum}     - omp_get_ancestor_thread_num(omp_get_level()-1)
8288  * a {thread_affinity}   - comma separated list of integers or integer ranges
8289  *                         (values of affinity mask)
8290  *
8291  * Implementation-specific field types can be added
8292  * If a type is unknown, print "undefined"
8293  */
8294 
8295 // Structure holding the short name, long name, and corresponding data type
8296 // for snprintf.  A table of these will represent the entire valid keyword
8297 // field types.
8298 typedef struct kmp_affinity_format_field_t {
8299   char short_name; // from spec e.g., L -> thread level
8300   const char *long_name; // from spec thread_level -> thread level
8301   char field_format; // data type for snprintf (typically 'd' or 's'
8302   // for integer or string)
8303 } kmp_affinity_format_field_t;
8304 
8305 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
8306 #if KMP_AFFINITY_SUPPORTED
8307     {'A', "thread_affinity", 's'},
8308 #endif
8309     {'t', "team_num", 'd'},
8310     {'T', "num_teams", 'd'},
8311     {'L', "nesting_level", 'd'},
8312     {'n', "thread_num", 'd'},
8313     {'N', "num_threads", 'd'},
8314     {'a', "ancestor_tnum", 'd'},
8315     {'H', "host", 's'},
8316     {'P', "process_id", 'd'},
8317     {'i', "native_thread_id", 'd'}};
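// Illustrative note: an implementation-specific field can be added by
// appending an entry here (e.g., a hypothetical {'q', "queue_id", 'd'}) and
// handling the new short name in the switch inside
// __kmp_aux_capture_affinity_field() below; unknown field types print
// "undefined".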
8318 
// Parse one %-field at *ptr into field_buffer and return the number of
// characters produced for it
8320 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8321                                             const char **ptr,
8322                                             kmp_str_buf_t *field_buffer) {
8323   int rc, format_index, field_value;
8324   const char *width_left, *width_right;
8325   bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8326   static const int FORMAT_SIZE = 20;
8327   char format[FORMAT_SIZE] = {0};
8328   char absolute_short_name = 0;
8329 
8330   KMP_DEBUG_ASSERT(gtid >= 0);
8331   KMP_DEBUG_ASSERT(th);
8332   KMP_DEBUG_ASSERT(**ptr == '%');
8333   KMP_DEBUG_ASSERT(field_buffer);
8334 
8335   __kmp_str_buf_clear(field_buffer);
8336 
8337   // Skip the initial %
8338   (*ptr)++;
8339 
8340   // Check for %% first
8341   if (**ptr == '%') {
8342     __kmp_str_buf_cat(field_buffer, "%", 1);
8343     (*ptr)++; // skip over the second %
8344     return 1;
8345   }
8346 
8347   // Parse field modifiers if they are present
8348   pad_zeros = false;
8349   if (**ptr == '0') {
8350     pad_zeros = true;
8351     (*ptr)++; // skip over 0
8352   }
8353   right_justify = false;
8354   if (**ptr == '.') {
8355     right_justify = true;
8356     (*ptr)++; // skip over .
8357   }
8358   // Parse width of field: [width_left, width_right)
8359   width_left = width_right = NULL;
8360   if (**ptr >= '0' && **ptr <= '9') {
8361     width_left = *ptr;
8362     SKIP_DIGITS(*ptr);
8363     width_right = *ptr;
8364   }
8365 
8366   // Create the format for KMP_SNPRINTF based on flags parsed above
8367   format_index = 0;
8368   format[format_index++] = '%';
8369   if (!right_justify)
8370     format[format_index++] = '-';
8371   if (pad_zeros)
8372     format[format_index++] = '0';
8373   if (width_left && width_right) {
8374     int i = 0;
    // Only allow widths of up to 8 digits; this also prevents overflowing the
    // format buffer.
8377     while (i < 8 && width_left < width_right) {
8378       format[format_index++] = *width_left;
8379       width_left++;
8380       i++;
8381     }
8382   }
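  // Illustrative: for the field "%0.4n" the flags and width parsed above give
  // "%04" so far; once the field name is matched below and its data type is
  // appended, the final snprintf format is "%04d". A field such as "%5N"
  // becomes "%-5d" (output is left justified by default).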
8383 
8384   // Parse a name (long or short)
8385   // Canonicalize the name into absolute_short_name
8386   found_valid_name = false;
8387   parse_long_name = (**ptr == '{');
8388   if (parse_long_name)
8389     (*ptr)++; // skip initial left brace
8390   for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8391                              sizeof(__kmp_affinity_format_table[0]);
8392        ++i) {
8393     char short_name = __kmp_affinity_format_table[i].short_name;
8394     const char *long_name = __kmp_affinity_format_table[i].long_name;
8395     char field_format = __kmp_affinity_format_table[i].field_format;
8396     if (parse_long_name) {
8397       size_t length = KMP_STRLEN(long_name);
8398       if (strncmp(*ptr, long_name, length) == 0) {
8399         found_valid_name = true;
8400         (*ptr) += length; // skip the long name
8401       }
8402     } else if (**ptr == short_name) {
8403       found_valid_name = true;
8404       (*ptr)++; // skip the short name
8405     }
8406     if (found_valid_name) {
8407       format[format_index++] = field_format;
8408       format[format_index++] = '\0';
8409       absolute_short_name = short_name;
8410       break;
8411     }
8412   }
8413   if (parse_long_name) {
8414     if (**ptr != '}') {
8415       absolute_short_name = 0;
8416     } else {
8417       (*ptr)++; // skip over the right brace
8418     }
8419   }
8420 
8421   // Attempt to fill the buffer with the requested
8422   // value using snprintf within __kmp_str_buf_print()
8423   switch (absolute_short_name) {
8424   case 't':
8425     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8426     break;
8427   case 'T':
8428     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8429     break;
8430   case 'L':
8431     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8432     break;
8433   case 'n':
8434     rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8435     break;
8436   case 'H': {
8437     static const int BUFFER_SIZE = 256;
8438     char buf[BUFFER_SIZE];
8439     __kmp_expand_host_name(buf, BUFFER_SIZE);
8440     rc = __kmp_str_buf_print(field_buffer, format, buf);
8441   } break;
8442   case 'P':
8443     rc = __kmp_str_buf_print(field_buffer, format, getpid());
8444     break;
8445   case 'i':
8446     rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8447     break;
8448   case 'N':
8449     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8450     break;
8451   case 'a':
8452     field_value =
8453         __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8454     rc = __kmp_str_buf_print(field_buffer, format, field_value);
8455     break;
8456 #if KMP_AFFINITY_SUPPORTED
8457   case 'A': {
8458     kmp_str_buf_t buf;
8459     __kmp_str_buf_init(&buf);
8460     __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8461     rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8462     __kmp_str_buf_free(&buf);
8463   } break;
8464 #endif
8465   default:
    // According to the spec, if an implementation does not have info for a
    // field type, then "undefined" is printed
8468     rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8469     // Skip the field
8470     if (parse_long_name) {
8471       SKIP_TOKEN(*ptr);
8472       if (**ptr == '}')
8473         (*ptr)++;
8474     } else {
8475       (*ptr)++;
8476     }
8477   }
8478 
8479   KMP_ASSERT(format_index <= FORMAT_SIZE);
8480   return rc;
8481 }
8482 
8483 /*
 * Return the number of characters needed to hold the affinity string
 * (not including the terminating null byte).
 * The resulting string is printed to buffer, which the caller can then
 * process as needed.
8488  */
8489 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8490                                   kmp_str_buf_t *buffer) {
8491   const char *parse_ptr;
8492   size_t retval;
8493   const kmp_info_t *th;
8494   kmp_str_buf_t field;
8495 
8496   KMP_DEBUG_ASSERT(buffer);
8497   KMP_DEBUG_ASSERT(gtid >= 0);
8498 
8499   __kmp_str_buf_init(&field);
8500   __kmp_str_buf_clear(buffer);
8501 
8502   th = __kmp_threads[gtid];
8503   retval = 0;
8504 
8505   // If format is NULL or zero-length string, then we use
8506   // affinity-format-var ICV
8507   parse_ptr = format;
8508   if (parse_ptr == NULL || *parse_ptr == '\0') {
8509     parse_ptr = __kmp_affinity_format;
8510   }
8511   KMP_DEBUG_ASSERT(parse_ptr);
8512 
8513   while (*parse_ptr != '\0') {
8514     // Parse a field
8515     if (*parse_ptr == '%') {
8516       // Put field in the buffer
8517       int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8518       __kmp_str_buf_catbuf(buffer, &field);
8519       retval += rc;
8520     } else {
8521       // Put literal character in buffer
8522       __kmp_str_buf_cat(buffer, parse_ptr, 1);
8523       retval++;
8524       parse_ptr++;
8525     }
8526   }
8527   __kmp_str_buf_free(&field);
8528   return retval;
8529 }
8530 
8531 // Displays the affinity string to stdout
8532 void __kmp_aux_display_affinity(int gtid, const char *format) {
8533   kmp_str_buf_t buf;
8534   __kmp_str_buf_init(&buf);
8535   __kmp_aux_capture_affinity(gtid, format, &buf);
8536   __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8537   __kmp_str_buf_free(&buf);
8538 }
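// Illustrative user-level usage (assuming the standard OpenMP 5.0 entry points
// that forward to the two routines above):
//   #pragma omp parallel
//   omp_display_affinity("T#%n of %N on host %H");
// might print, e.g., "T#2 of 8 on host node01" for one of the threads, while
// omp_capture_affinity() instead returns the required length and writes the
// string into a user-supplied buffer.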
8539 
8540 /* ------------------------------------------------------------------------ */
8541 
8542 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8543   int blocktime = arg; /* argument is in milliseconds */
8544 #if KMP_USE_MONITOR
8545   int bt_intervals;
8546 #endif
8547   kmp_int8 bt_set;
8548 
8549   __kmp_save_internal_controls(thread);
8550 
8551   /* Normalize and set blocktime for the teams */
8552   if (blocktime < KMP_MIN_BLOCKTIME)
8553     blocktime = KMP_MIN_BLOCKTIME;
8554   else if (blocktime > KMP_MAX_BLOCKTIME)
8555     blocktime = KMP_MAX_BLOCKTIME;
8556 
8557   set__blocktime_team(thread->th.th_team, tid, blocktime);
8558   set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8559 
8560 #if KMP_USE_MONITOR
8561   /* Calculate and set blocktime intervals for the teams */
8562   bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8563 
8564   set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8565   set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8566 #endif
8567 
8568   /* Set whether blocktime has been set to "TRUE" */
8569   bt_set = TRUE;
8570 
8571   set__bt_set_team(thread->th.th_team, tid, bt_set);
8572   set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8573 #if KMP_USE_MONITOR
8574   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8575                 "bt_intervals=%d, monitor_updates=%d\n",
8576                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8577                 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8578                 __kmp_monitor_wakeups));
8579 #else
8580   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8581                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8582                 thread->th.th_team->t.t_id, tid, blocktime));
8583 #endif
8584 }
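// Illustrative: a user call such as kmp_set_blocktime(0) (assumed here to be
// the libomp extension entry point that reaches this routine) is clamped to
// the [KMP_MIN_BLOCKTIME, KMP_MAX_BLOCKTIME] range above and then applied to
// both the calling thread's current team and its serial team.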
8585 
8586 void __kmp_aux_set_defaults(char const *str, size_t len) {
8587   if (!__kmp_init_serial) {
8588     __kmp_serial_initialize();
8589   }
8590   __kmp_env_initialize(str);
8591 
8592   if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8593     __kmp_env_print();
8594   }
8595 } // __kmp_aux_set_defaults
8596 
8597 /* ------------------------------------------------------------------------ */
8598 /* internal fast reduction routines */
8599 
8600 PACKED_REDUCTION_METHOD_T
8601 __kmp_determine_reduction_method(
8602     ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8603     void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8604     kmp_critical_name *lck) {
8605 
  // Default reduction method: the critical construct ( lck != NULL, as in the
  // current compiler-generated code (PAROPT) ).
  // If ( reduce_data != NULL && reduce_func != NULL ), the tree-reduction
  // method can be selected by the RTL.
  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic-reduce method
  // can be selected by the RTL.
  // Finally, it is up to the RTL to decide which of the generated methods to
  // use.
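  // Illustrative outcome of the selection below (hypothetical inputs): on a
  // 64-bit Linux target with a compiler-generated tree-reduction routine
  // (reduce_data/reduce_func non-NULL) and a team of 16 threads, the
  // tree-with-reduction-barrier method is chosen; with a team of 4 or fewer
  // threads and the KMP_IDENT_ATOMIC_REDUCE flag set, the atomic method is
  // chosen instead.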
8614 
8615   PACKED_REDUCTION_METHOD_T retval;
8616 
8617   int team_size;
8618 
8619   KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8620   KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8621 
8622 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
8623   ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
8624 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8625 
8626   retval = critical_reduce_block;
8627 
  // an alternative way of getting the team size (with one extra dynamic
  // dereference) is slower
8629   team_size = __kmp_get_team_num_threads(global_tid);
8630   if (team_size == 1) {
8631 
8632     retval = empty_reduce_block;
8633 
8634   } else {
8635 
8636     int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8637 
8638 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 ||                   \
8639     KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64
8640 
8641 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
8642     KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8643 
8644     int teamsize_cutoff = 4;
8645 
8646 #if KMP_MIC_SUPPORTED
8647     if (__kmp_mic_type != non_mic) {
8648       teamsize_cutoff = 8;
8649     }
8650 #endif
8651     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8652     if (tree_available) {
8653       if (team_size <= teamsize_cutoff) {
8654         if (atomic_available) {
8655           retval = atomic_reduce_block;
8656         }
8657       } else {
8658         retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8659       }
8660     } else if (atomic_available) {
8661       retval = atomic_reduce_block;
8662     }
8663 #else
8664 #error "Unknown or unsupported OS"
8665 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8666        // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8667 
8668 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8669 
8670 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD
8671 
8672     // basic tuning
8673 
8674     if (atomic_available) {
8675       if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8676         retval = atomic_reduce_block;
8677       }
8678     } // otherwise: use critical section
8679 
8680 #elif KMP_OS_DARWIN
8681 
8682     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8683     if (atomic_available && (num_vars <= 3)) {
8684       retval = atomic_reduce_block;
8685     } else if (tree_available) {
8686       if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8687           (reduce_size < (2000 * sizeof(kmp_real64)))) {
8688         retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8689       }
8690     } // otherwise: use critical section
8691 
8692 #else
8693 #error "Unknown or unsupported OS"
8694 #endif
8695 
8696 #else
8697 #error "Unknown or unsupported architecture"
8698 #endif
8699   }
8700 
8701   // KMP_FORCE_REDUCTION
8702 
8703   // If the team is serialized (team_size == 1), ignore the forced reduction
8704   // method and stay with the unsynchronized method (empty_reduce_block)
8705   if (__kmp_force_reduction_method != reduction_method_not_defined &&
8706       team_size != 1) {
8707 
8708     PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8709 
8710     int atomic_available, tree_available;
8711 
8712     switch ((forced_retval = __kmp_force_reduction_method)) {
8713     case critical_reduce_block:
8714       KMP_ASSERT(lck); // lck should be != 0
8715       break;
8716 
8717     case atomic_reduce_block:
8718       atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8719       if (!atomic_available) {
8720         KMP_WARNING(RedMethodNotSupported, "atomic");
8721         forced_retval = critical_reduce_block;
8722       }
8723       break;
8724 
8725     case tree_reduce_block:
8726       tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8727       if (!tree_available) {
8728         KMP_WARNING(RedMethodNotSupported, "tree");
8729         forced_retval = critical_reduce_block;
8730       } else {
8731 #if KMP_FAST_REDUCTION_BARRIER
8732         forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8733 #endif
8734       }
8735       break;
8736 
8737     default:
8738       KMP_ASSERT(0); // "unsupported method specified"
8739     }
8740 
8741     retval = forced_retval;
8742   }
8743 
8744   KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8745 
8746 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
8747 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8748 
8749   return (retval);
8750 }
8751 // this function is for testing set/get/determine reduce method
8752 kmp_int32 __kmp_get_reduce_method(void) {
8753   return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8754 }
8755 
8756 // Soft pause sets up threads to ignore blocktime and just go to sleep.
8757 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
8758 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8759 
8760 // Hard pause shuts down the runtime completely.  Resume happens naturally when
8761 // OpenMP is used subsequently.
8762 void __kmp_hard_pause() {
8763   __kmp_pause_status = kmp_hard_paused;
8764   __kmp_internal_end_thread(-1);
8765 }
8766 
8767 // Soft resume sets __kmp_pause_status, and wakes up all threads.
8768 void __kmp_resume_if_soft_paused() {
8769   if (__kmp_pause_status == kmp_soft_paused) {
8770     __kmp_pause_status = kmp_not_paused;
8771 
8772     for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8773       kmp_info_t *thread = __kmp_threads[gtid];
8774       if (thread) { // Wake it if sleeping
8775         kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
8776                          thread);
8777         if (fl.is_sleeping())
8778           fl.resume(gtid);
8779         else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8780           __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8781         } else { // thread holds the lock and may sleep soon
8782           do { // until either the thread sleeps, or we can get the lock
8783             if (fl.is_sleeping()) {
8784               fl.resume(gtid);
8785               break;
8786             } else if (__kmp_try_suspend_mx(thread)) {
8787               __kmp_unlock_suspend_mx(thread);
8788               break;
8789             }
8790           } while (1);
8791         }
8792       }
8793     }
8794   }
8795 }
8796 
8797 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
8798 // TODO: add warning messages
8799 int __kmp_pause_resource(kmp_pause_status_t level) {
8800   if (level == kmp_not_paused) { // requesting resume
8801     if (__kmp_pause_status == kmp_not_paused) {
8802       // error message about runtime not being paused, so can't resume
8803       return 1;
8804     } else {
8805       KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
8806                        __kmp_pause_status == kmp_hard_paused);
8807       __kmp_pause_status = kmp_not_paused;
8808       return 0;
8809     }
8810   } else if (level == kmp_soft_paused) { // requesting soft pause
8811     if (__kmp_pause_status != kmp_not_paused) {
8812       // error message about already being paused
8813       return 1;
8814     } else {
8815       __kmp_soft_pause();
8816       return 0;
8817     }
8818   } else if (level == kmp_hard_paused) { // requesting hard pause
8819     if (__kmp_pause_status != kmp_not_paused) {
8820       // error message about already being paused
8821       return 1;
8822     } else {
8823       __kmp_hard_pause();
8824       return 0;
8825     }
8826   } else {
8827     // error message about invalid level
8828     return 1;
8829   }
8830 }
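// Illustrative user-level usage (assuming the standard OpenMP 5.0 entry points
// that route to __kmp_pause_resource above):
//   if (omp_pause_resource_all(omp_pause_soft) == 0) {
//     // runtime threads now ignore blocktime and go to sleep
//   }
//   ...
//   #pragma omp parallel  // subsequent OpenMP use resumes the runtime
// Requesting a pause while already paused, or a resume while not paused,
// returns a nonzero value, matching the checks above.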
8831 
8832 void __kmp_omp_display_env(int verbose) {
8833   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8834   if (__kmp_init_serial == 0)
8835     __kmp_do_serial_initialize();
8836   __kmp_display_env_impl(!verbose, verbose);
8837   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8838 }
8839 
// The team size is changing, so the distributed barrier must be resized to
// match
8841 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
8842                                int new_nthreads) {
8843   KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
8844                    bp_dist_bar);
8845   kmp_info_t **other_threads = team->t.t_threads;
8846 
8847   // We want all the workers to stop waiting on the barrier while we adjust the
8848   // size of the team.
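  // As used here and in __kmp_add_threads_to_team below, th_used_in_team acts
  // as a small per-thread state machine: 0 = not part of the team, 1 = in the
  // team, 2 = asked to leave (set here), 3 = asked to (re)join (set when
  // threads are added back); workers observe 2/3 and move themselves to 0/1.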
8849   for (int f = 1; f < old_nthreads; ++f) {
8850     KMP_DEBUG_ASSERT(other_threads[f] != NULL);
8851     // Ignore threads that are already inactive or not present in the team
8852     if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) {
8853       // teams construct causes thread_limit to get passed in, and some of
8854       // those could be inactive; just ignore them
8855       continue;
8856     }
8857     // If thread is transitioning still to in_use state, wait for it
8858     if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) {
8859       while (team->t.t_threads[f]->th.th_used_in_team.load() == 3)
8860         KMP_CPU_PAUSE();
8861     }
8862     // The thread should be in_use now
8863     KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1);
8864     // Transition to unused state
8865     team->t.t_threads[f]->th.th_used_in_team.store(2);
8866     KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2);
8867   }
8868   // Release all the workers
8869   kmp_uint64 new_value; // new value for go
8870   new_value = team->t.b->go_release();
8871 
8872   KMP_MFENCE();
8873 
  // Workers should see transition status 2 and move to 0, but they may need to
  // be woken up first
8876   size_t my_go_index;
8877   int count = old_nthreads - 1;
8878   while (count > 0) {
8879     count = old_nthreads - 1;
8880     for (int f = 1; f < old_nthreads; ++f) {
8881       my_go_index = f / team->t.b->threads_per_go;
8882       if (other_threads[f]->th.th_used_in_team.load() != 0) {
8883         if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers
8884           kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST(
8885               void *, other_threads[f]->th.th_sleep_loc);
8886           __kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag);
8887         }
8888       } else {
8889         KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0);
8890         count--;
8891       }
8892     }
8893   }
8894   // Now update the barrier size
8895   team->t.b->update_num_threads(new_nthreads);
8896   team->t.b->go_reset();
8897 }
8898 
8899 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) {
8900   // Add the threads back to the team
8901   KMP_DEBUG_ASSERT(team);
8902   // Threads were paused and pointed at th_used_in_team temporarily during a
8903   // resize of the team. We're going to set th_used_in_team to 3 to indicate to
8904   // the thread that it should transition itself back into the team. Then, if
8905   // blocktime isn't infinite, the thread could be sleeping, so we send a resume
8906   // to wake it up.
8907   for (int f = 1; f < new_nthreads; ++f) {
8908     KMP_DEBUG_ASSERT(team->t.t_threads[f]);
8909     KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 0,
8910                                 3);
8911     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads
8912       __kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid,
8913                       (kmp_flag_32<false, false> *)NULL);
8914     }
8915   }
  // The threads should be transitioning to the team; when they are done, they
  // should have set th_used_in_team to 1. This loop forces the primary thread
  // to wait until all workers have moved into the team and are waiting in the
  // barrier.
8919   int count = new_nthreads - 1;
8920   while (count > 0) {
8921     count = new_nthreads - 1;
8922     for (int f = 1; f < new_nthreads; ++f) {
8923       if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) {
8924         count--;
8925       }
8926     }
8927   }
8928 }
8929 
8930 // Globals and functions for hidden helper task
8931 kmp_info_t **__kmp_hidden_helper_threads;
8932 kmp_info_t *__kmp_hidden_helper_main_thread;
8933 std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
8934 #if KMP_OS_LINUX
8935 kmp_int32 __kmp_hidden_helper_threads_num = 8;
8936 kmp_int32 __kmp_enable_hidden_helper = TRUE;
8937 #else
8938 kmp_int32 __kmp_hidden_helper_threads_num = 0;
8939 kmp_int32 __kmp_enable_hidden_helper = FALSE;
8940 #endif
8941 
8942 namespace {
8943 std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
8944 
8945 void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
  // This is an explicit synchronization across all hidden helper threads,
  // needed because a regular thread may push a hidden helper task to a hidden
  // helper thread that has not yet been woken since the main thread released
  // it after creating the team.
8950   KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
8951   while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
8952          __kmp_hidden_helper_threads_num)
8953     ;
8954 
8955   // If main thread, then wait for signal
8956   if (__kmpc_master(nullptr, *gtid)) {
8957     // First, unset the initial state and release the initial thread
8958     TCW_4(__kmp_init_hidden_helper_threads, FALSE);
8959     __kmp_hidden_helper_initz_release();
8960     __kmp_hidden_helper_main_thread_wait();
8961     // Now wake up all worker threads
8962     for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
8963       __kmp_hidden_helper_worker_thread_signal();
8964     }
8965   }
8966 }
8967 } // namespace
8968 
8969 void __kmp_hidden_helper_threads_initz_routine() {
8970   // Create a new root for hidden helper team/threads
8971   const int gtid = __kmp_register_root(TRUE);
8972   __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
8973   __kmp_hidden_helper_threads = &__kmp_threads[gtid];
8974   __kmp_hidden_helper_main_thread->th.th_set_nproc =
8975       __kmp_hidden_helper_threads_num;
8976 
8977   KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
8978 
8979   __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
8980 
8981   // Set the initialization flag to FALSE
8982   TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
8983 
8984   __kmp_hidden_helper_threads_deinitz_release();
8985 }
8986 
8987 /* Nesting Mode:
8988    Set via KMP_NESTING_MODE, which takes an integer.
8989    Note: we skip duplicate topology levels, and skip levels with only
8990       one entity.
8991    KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
8992    KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
8993       in the topology, and initializes the number of threads at each of those
8994       levels to the number of entities at each level, respectively, below the
8995       entity at the parent level.
8996    KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
8997       but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
      the user to turn nesting on explicitly. This option is even more
      experimental than the feature itself and may change or go away in the
      future.
9001 */
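/* Illustrative example (hypothetical machine): on a 2-socket system with
   8 cores per socket and 2 hardware threads per core, KMP_NESTING_MODE=1
   would set up three nesting levels with nthreads-var values of 2, 8, and 2
   (socket, core, hw thread); levels that duplicate the one above them or
   contribute only a single entity are skipped, as noted above. */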
9002 
9003 // Allocate space to store nesting levels
9004 void __kmp_init_nesting_mode() {
9005   int levels = KMP_HW_LAST;
9006   __kmp_nesting_mode_nlevels = levels;
9007   __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
9008   for (int i = 0; i < levels; ++i)
9009     __kmp_nesting_nth_level[i] = 0;
9010   if (__kmp_nested_nth.size < levels) {
9011     __kmp_nested_nth.nth =
9012         (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
9013     __kmp_nested_nth.size = levels;
9014   }
9015 }
9016 
// Set the number of threads for the top levels of nesting; must be called
// after the topology has been determined
9018 void __kmp_set_nesting_mode_threads() {
9019   kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];
9020 
9021   if (__kmp_nesting_mode == 1)
9022     __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
9023   else if (__kmp_nesting_mode > 1)
9024     __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9025 
9026   if (__kmp_topology) { // use topology info
9027     int loc, hw_level;
9028     for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
9029                                 loc < __kmp_nesting_mode_nlevels;
9030          loc++, hw_level++) {
9031       __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
9032       if (__kmp_nesting_nth_level[loc] == 1)
9033         loc--;
9034     }
9035     // Make sure all cores are used
9036     if (__kmp_nesting_mode > 1 && loc > 1) {
9037       int core_level = __kmp_topology->get_level(KMP_HW_CORE);
9038       int num_cores = __kmp_topology->get_count(core_level);
9039       int upper_levels = 1;
9040       for (int level = 0; level < loc - 1; ++level)
9041         upper_levels *= __kmp_nesting_nth_level[level];
9042       if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
9043         __kmp_nesting_nth_level[loc - 1] =
9044             num_cores / __kmp_nesting_nth_level[loc - 2];
9045     }
9046     __kmp_nesting_mode_nlevels = loc;
9047     __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
  } else { // no topology info available; provide a reasonable guess
9049     if (__kmp_avail_proc >= 4) {
9050       __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
9051       __kmp_nesting_nth_level[1] = 2;
9052       __kmp_nesting_mode_nlevels = 2;
9053     } else {
9054       __kmp_nesting_nth_level[0] = __kmp_avail_proc;
9055       __kmp_nesting_mode_nlevels = 1;
9056     }
9057     __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9058   }
9059   for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
9060     __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
9061   }
9062   set__nproc(thread, __kmp_nesting_nth_level[0]);
9063   if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
9064     __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9065   if (get__max_active_levels(thread) > 1) {
9066     // if max levels was set, set nesting mode levels to same
9067     __kmp_nesting_mode_nlevels = get__max_active_levels(thread);
9068   }
9069   if (__kmp_nesting_mode == 1) // turn on nesting for this case only
9070     set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
9071 }
9072