1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
29 #endif
30 
31 #if OMPT_SUPPORT
32 #include "ompt-specific.h"
33 #endif
34 #if OMPD_SUPPORT
35 #include "ompd-specific.h"
36 #endif
37 
38 #if OMP_PROFILING_SUPPORT
39 #include "llvm/Support/TimeProfiler.h"
40 static char *ProfileTraceFile = nullptr;
41 #endif
42 
43 /* these are temporary issues to be dealt with */
44 #define KMP_USE_PRCTL 0
45 
46 #if KMP_OS_WINDOWS
47 #include <process.h>
48 #endif
49 
50 #include "tsan_annotations.h"
51 
52 #if KMP_OS_WINDOWS
// Windows does not need these include files as it doesn't use shared memory
54 #else
55 #include <sys/mman.h>
56 #include <sys/stat.h>
57 #include <fcntl.h>
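// Size of the shared memory segment used by the library registration code.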
58 #define SHM_SIZE 1024
59 #endif
60 
61 #if defined(KMP_GOMP_COMPAT)
62 char const __kmp_version_alt_comp[] =
63     KMP_VERSION_PREFIX "alternative compiler support: yes";
64 #endif /* defined(KMP_GOMP_COMPAT) */
65 
66 char const __kmp_version_omp_api[] =
67     KMP_VERSION_PREFIX "API version: 5.0 (201611)";
68 
69 #ifdef KMP_DEBUG
70 char const __kmp_version_lock[] =
71     KMP_VERSION_PREFIX "lock type: run time selectable";
72 #endif /* KMP_DEBUG */
73 
74 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
75 
76 /* ------------------------------------------------------------------------ */
77 
78 #if KMP_USE_MONITOR
79 kmp_info_t __kmp_monitor;
80 #endif
81 
82 /* Forward declarations */
83 
84 void __kmp_cleanup(void);
85 
86 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
87                                   int gtid);
88 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
89                                   kmp_internal_control_t *new_icvs,
90                                   ident_t *loc);
91 #if KMP_AFFINITY_SUPPORTED
92 static void __kmp_partition_places(kmp_team_t *team,
93                                    int update_master_only = 0);
94 #endif
95 static void __kmp_do_serial_initialize(void);
96 void __kmp_fork_barrier(int gtid, int tid);
97 void __kmp_join_barrier(int gtid);
98 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
99                           kmp_internal_control_t *new_icvs, ident_t *loc);
100 
101 #ifdef USE_LOAD_BALANCE
102 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
103 #endif
104 
105 static int __kmp_expand_threads(int nNeed);
106 #if KMP_OS_WINDOWS
107 static int __kmp_unregister_root_other_thread(int gtid);
108 #endif
109 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
110 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
111 
/* Calculate the identifier of the current thread. */
/* A fast (and somewhat portable) way to get a unique identifier for the
   executing thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
115 int __kmp_get_global_thread_id() {
116   int i;
117   kmp_info_t **other_threads;
118   size_t stack_data;
119   char *stack_addr;
120   size_t stack_size;
121   char *stack_base;
122 
123   KA_TRACE(
124       1000,
125       ("*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
126        __kmp_nth, __kmp_all_nth));
127 
  /* JPH - To handle the case where __kmpc_end(0) is called immediately prior
     to a parallel region, this returns KMP_GTID_DNE to force serial_initialize
     by the caller. KMP_GTID_DNE must be handled at all call sites, or else
     __kmp_init_gtid must be guaranteed, for this to work. */
132 
133   if (!TCR_4(__kmp_init_gtid))
134     return KMP_GTID_DNE;
135 
136 #ifdef KMP_TDATA_GTID
137   if (TCR_4(__kmp_gtid_mode) >= 3) {
138     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
139     return __kmp_gtid;
140   }
141 #endif
142   if (TCR_4(__kmp_gtid_mode) >= 2) {
143     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
144     return __kmp_gtid_get_specific();
145   }
146   KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
147 
148   stack_addr = (char *)&stack_data;
149   other_threads = __kmp_threads;
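  /* Use the address of a local variable (stack_data) as a proxy for the
     current stack position, then search the registered threads for the one
     whose stack region contains that address. */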
150 
151   /* ATT: The code below is a source of potential bugs due to unsynchronized
152      access to __kmp_threads array. For example:
153      1. Current thread loads other_threads[i] to thr and checks it, it is
154         non-NULL.
155      2. Current thread is suspended by OS.
156      3. Another thread unregisters and finishes (debug versions of free()
157         may fill memory with something like 0xEF).
158      4. Current thread is resumed.
159      5. Current thread reads junk from *thr.
160      TODO: Fix it.  --ln  */
161 
162   for (i = 0; i < __kmp_threads_capacity; i++) {
163 
164     kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
165     if (!thr)
166       continue;
167 
168     stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
169     stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
170 
171     /* stack grows down -- search through all of the active threads */
172 
173     if (stack_addr <= stack_base) {
174       size_t stack_diff = stack_base - stack_addr;
175 
176       if (stack_diff <= stack_size) {
        /* The only way we can be closer than the allocated stack size is if
           we are running on this thread. */
179         KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
180         return i;
181       }
182     }
183   }
184 
  /* fall back to __kmp_gtid_get_specific() to try to determine our gtid */
186   KA_TRACE(1000,
187            ("*** __kmp_get_global_thread_id: internal alg. failed to find "
188             "thread, using TLS\n"));
189   i = __kmp_gtid_get_specific();
190 
191   /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
192 
  /* if we haven't been assigned a gtid, return the error code */
194   if (i < 0)
195     return i;
196 
197   /* dynamically updated stack window for uber threads to avoid get_specific
198      call */
199   if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
200     KMP_FATAL(StackOverflow, i);
201   }
202 
203   stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
204   if (stack_addr > stack_base) {
205     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
206     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
207             other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
208                 stack_base);
209   } else {
210     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
211             stack_base - stack_addr);
212   }
213 
214   /* Reprint stack bounds for ubermaster since they have been refined */
215   if (__kmp_storage_map) {
216     char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
217     char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
218     __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
219                                  other_threads[i]->th.th_info.ds.ds_stacksize,
220                                  "th_%d stack (refinement)", i);
221   }
222   return i;
223 }
224 
225 int __kmp_get_global_thread_id_reg() {
226   int gtid;
227 
228   if (!__kmp_init_serial) {
229     gtid = KMP_GTID_DNE;
230   } else
231 #ifdef KMP_TDATA_GTID
232       if (TCR_4(__kmp_gtid_mode) >= 3) {
233     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
234     gtid = __kmp_gtid;
235   } else
236 #endif
237       if (TCR_4(__kmp_gtid_mode) >= 2) {
238     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
239     gtid = __kmp_gtid_get_specific();
240   } else {
241     KA_TRACE(1000,
242              ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
243     gtid = __kmp_get_global_thread_id();
244   }
245 
246   /* we must be a new uber master sibling thread */
247   if (gtid == KMP_GTID_DNE) {
248     KA_TRACE(10,
249              ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
250               "Registering a new gtid.\n"));
251     __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
252     if (!__kmp_init_serial) {
253       __kmp_do_serial_initialize();
254       gtid = __kmp_gtid_get_specific();
255     } else {
256       gtid = __kmp_register_root(FALSE);
257     }
258     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
259     /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
260   }
261 
262   KMP_DEBUG_ASSERT(gtid >= 0);
263 
264   return gtid;
265 }
266 
267 /* caller must hold forkjoin_lock */
268 void __kmp_check_stack_overlap(kmp_info_t *th) {
269   int f;
270   char *stack_beg = NULL;
271   char *stack_end = NULL;
272   int gtid;
273 
274   KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
275   if (__kmp_storage_map) {
276     stack_end = (char *)th->th.th_info.ds.ds_stackbase;
277     stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
278 
279     gtid = __kmp_gtid_from_thread(th);
280 
281     if (gtid == KMP_GTID_MONITOR) {
282       __kmp_print_storage_map_gtid(
283           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
284           "th_%s stack (%s)", "mon",
285           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
286     } else {
287       __kmp_print_storage_map_gtid(
288           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
289           "th_%d stack (%s)", gtid,
290           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
291     }
292   }
293 
294   /* No point in checking ubermaster threads since they use refinement and
295    * cannot overlap */
296   gtid = __kmp_gtid_from_thread(th);
297   if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
298     KA_TRACE(10,
299              ("__kmp_check_stack_overlap: performing extensive checking\n"));
300     if (stack_beg == NULL) {
301       stack_end = (char *)th->th.th_info.ds.ds_stackbase;
302       stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
303     }
304 
305     for (f = 0; f < __kmp_threads_capacity; f++) {
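      // Compare this thread's stack range against each other registered
      // thread's range; any intersection is a fatal error.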
306       kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
307 
308       if (f_th && f_th != th) {
309         char *other_stack_end =
310             (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
311         char *other_stack_beg =
312             other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
313         if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
314             (stack_end > other_stack_beg && stack_end < other_stack_end)) {
315 
316           /* Print the other stack values before the abort */
317           if (__kmp_storage_map)
318             __kmp_print_storage_map_gtid(
319                 -1, other_stack_beg, other_stack_end,
320                 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
321                 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
322 
323           __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
324                       __kmp_msg_null);
325         }
326       }
327     }
328   }
329   KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
330 }
331 
332 /* ------------------------------------------------------------------------ */
333 
334 void __kmp_infinite_loop(void) {
335   static int done = FALSE;
336 
337   while (!done) {
338     KMP_YIELD(TRUE);
339   }
340 }
341 
342 #define MAX_MESSAGE 512
343 
344 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
345                                   char const *format, ...) {
346   char buffer[MAX_MESSAGE];
347   va_list ap;
348 
349   va_start(ap, format);
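  // Expand the pointer range and size into a prefix and append the caller's
  // format string; the caller's varargs are then consumed when __kmp_vprintf
  // formats the combined string below.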
350   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
351                p2, (unsigned long)size, format);
352   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
353   __kmp_vprintf(kmp_err, buffer, ap);
354 #if KMP_PRINT_DATA_PLACEMENT
355   int node;
356   if (gtid >= 0) {
357     if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
358       if (__kmp_storage_map_verbose) {
359         node = __kmp_get_host_node(p1);
360         if (node < 0) /* doesn't work, so don't try this next time */
361           __kmp_storage_map_verbose = FALSE;
362         else {
363           char *last;
364           int lastNode;
365           int localProc = __kmp_get_cpu_from_gtid(gtid);
366 
367           const int page_size = KMP_GET_PAGE_SIZE();
368 
369           p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
370           p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
371           if (localProc >= 0)
372             __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
373                                  localProc >> 1);
374           else
375             __kmp_printf_no_lock("  GTID %d\n", gtid);
376 #if KMP_USE_PRCTL
377           /* The more elaborate format is disabled for now because of the prctl
378            * hanging bug. */
379           do {
            last = (char *)p1;
381             lastNode = node;
382             /* This loop collates adjacent pages with the same host node. */
383             do {
              p1 = (char *)p1 + page_size;
385             } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
386             __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
387                                  lastNode);
388           } while (p1 <= p2);
389 #else
390           __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
391                                (char *)p1 + (page_size - 1),
392                                __kmp_get_host_node(p1));
393           if (p1 < p2) {
394             __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
395                                  (char *)p2 + (page_size - 1),
396                                  __kmp_get_host_node(p2));
397           }
398 #endif
399         }
400       }
401     } else
402       __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
403   }
404 #endif /* KMP_PRINT_DATA_PLACEMENT */
  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
  va_end(ap);
406 }
407 
408 void __kmp_warn(char const *format, ...) {
409   char buffer[MAX_MESSAGE];
410   va_list ap;
411 
412   if (__kmp_generate_warnings == kmp_warnings_off) {
413     return;
414   }
415 
416   va_start(ap, format);
417 
418   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
419   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
420   __kmp_vprintf(kmp_err, buffer, ap);
421   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
422 
423   va_end(ap);
424 }
425 
426 void __kmp_abort_process() {
427   // Later threads may stall here, but that's ok because abort() will kill them.
428   __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
429 
430   if (__kmp_debug_buf) {
431     __kmp_dump_debug_buffer();
432   }
433 
434   if (KMP_OS_WINDOWS) {
435     // Let other threads know of abnormal termination and prevent deadlock
436     // if abort happened during library initialization or shutdown
437     __kmp_global.g.g_abort = SIGABRT;
438 
    /* On Windows* OS, abort() by default causes a pop-up error box, which
       stalls nightly testing. Unfortunately, we cannot reliably suppress
       pop-up error boxes. _set_abort_behavior() works well, but this function
       is not available in VS7 (this is not a problem for the DLL, but it is a
       problem for the static OpenMP RTL). SetErrorMode (and so, the timelimit
       utility) does not help, at least in some versions of the MS C RTL.

       It seems the following sequence is the only way to simulate abort() and
       avoid the pop-up error box. */
448     raise(SIGABRT);
449     _exit(3); // Just in case, if signal ignored, exit anyway.
450   } else {
451     __kmp_unregister_library();
452     abort();
453   }
454 
455   __kmp_infinite_loop();
456   __kmp_release_bootstrap_lock(&__kmp_exit_lock);
457 
458 } // __kmp_abort_process
459 
460 void __kmp_abort_thread(void) {
461   // TODO: Eliminate g_abort global variable and this function.
462   // In case of abort just call abort(), it will kill all the threads.
463   __kmp_infinite_loop();
464 } // __kmp_abort_thread
465 
466 /* Print out the storage map for the major kmp_info_t thread data structures
467    that are allocated together. */
468 
469 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
470   __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
471                                gtid);
472 
473   __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
474                                sizeof(kmp_desc_t), "th_%d.th_info", gtid);
475 
476   __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
477                                sizeof(kmp_local_t), "th_%d.th_local", gtid);
478 
479   __kmp_print_storage_map_gtid(
480       gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
481       sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
482 
483   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
484                                &thr->th.th_bar[bs_plain_barrier + 1],
485                                sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
486                                gtid);
487 
488   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
489                                &thr->th.th_bar[bs_forkjoin_barrier + 1],
490                                sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
491                                gtid);
492 
493 #if KMP_FAST_REDUCTION_BARRIER
494   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
495                                &thr->th.th_bar[bs_reduction_barrier + 1],
496                                sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
497                                gtid);
498 #endif // KMP_FAST_REDUCTION_BARRIER
499 }
500 
501 /* Print out the storage map for the major kmp_team_t team data structures
502    that are allocated together. */
503 
504 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
505                                          int team_id, int num_thr) {
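  // Serialized teams (t_max_nproc == 1) keep only 2 dispatch buffers; larger
  // teams use the full set of __kmp_dispatch_num_buffers.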
506   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
507   __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
508                                header, team_id);
509 
510   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
511                                &team->t.t_bar[bs_last_barrier],
512                                sizeof(kmp_balign_team_t) * bs_last_barrier,
513                                "%s_%d.t_bar", header, team_id);
514 
515   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
516                                &team->t.t_bar[bs_plain_barrier + 1],
517                                sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
518                                header, team_id);
519 
520   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
521                                &team->t.t_bar[bs_forkjoin_barrier + 1],
522                                sizeof(kmp_balign_team_t),
523                                "%s_%d.t_bar[forkjoin]", header, team_id);
524 
525 #if KMP_FAST_REDUCTION_BARRIER
526   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
527                                &team->t.t_bar[bs_reduction_barrier + 1],
528                                sizeof(kmp_balign_team_t),
529                                "%s_%d.t_bar[reduction]", header, team_id);
530 #endif // KMP_FAST_REDUCTION_BARRIER
531 
532   __kmp_print_storage_map_gtid(
533       -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
534       sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
535 
536   __kmp_print_storage_map_gtid(
537       -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
538       sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
539 
540   __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
541                                &team->t.t_disp_buffer[num_disp_buff],
542                                sizeof(dispatch_shared_info_t) * num_disp_buff,
543                                "%s_%d.t_disp_buffer", header, team_id);
544 }
545 
546 static void __kmp_init_allocator() {
547   __kmp_init_memkind();
548   __kmp_init_target_mem();
549 }
550 static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
551 
552 /* ------------------------------------------------------------------------ */
553 
554 #if KMP_DYNAMIC_LIB
555 #if KMP_OS_WINDOWS
556 
557 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
558   //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
559 
560   switch (fdwReason) {
561 
562   case DLL_PROCESS_ATTACH:
563     KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
564 
565     return TRUE;
566 
567   case DLL_PROCESS_DETACH:
568     KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
569 
570     // According to Windows* documentation for DllMain entry point:
571     // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
572     //   lpReserved == NULL when FreeLibrary() is called,
573     //   lpReserved != NULL when the process is terminated.
574     // When FreeLibrary() is called, worker threads remain alive. So the
575     // runtime's state is consistent and executing proper shutdown is OK.
576     // When the process is terminated, worker threads have exited or been
577     // forcefully terminated by the OS and only the shutdown thread remains.
578     // This can leave the runtime in an inconsistent state.
579     // Hence, only attempt proper cleanup when FreeLibrary() is called.
580     // Otherwise, rely on OS to reclaim resources.
581     if (lpReserved == NULL)
582       __kmp_internal_end_library(__kmp_gtid_get_specific());
583 
584     return TRUE;
585 
586   case DLL_THREAD_ATTACH:
587     KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
588 
    /* if we want to register new siblings all the time, call
     * __kmp_get_gtid() here */
591     return TRUE;
592 
593   case DLL_THREAD_DETACH:
594     KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
595 
596     __kmp_internal_end_thread(__kmp_gtid_get_specific());
597     return TRUE;
598   }
599 
600   return TRUE;
601 }
602 
603 #endif /* KMP_OS_WINDOWS */
604 #endif /* KMP_DYNAMIC_LIB */
605 
606 /* __kmp_parallel_deo -- Wait until it's our turn. */
607 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
608   int gtid = *gtid_ref;
609 #ifdef BUILD_PARALLEL_ORDERED
610   kmp_team_t *team = __kmp_team_from_gtid(gtid);
611 #endif /* BUILD_PARALLEL_ORDERED */
612 
613   if (__kmp_env_consistency_check) {
614     if (__kmp_threads[gtid]->th.th_root->r.r_active)
615 #if KMP_USE_DYNAMIC_LOCK
616       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
617 #else
618       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
619 #endif
620   }
621 #ifdef BUILD_PARALLEL_ORDERED
622   if (!team->t.t_serialized) {
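    /* Spin until the team's ordered counter equals this thread's tid, i.e.,
       until it is our turn to enter the ordered section. */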
623     KMP_MB();
624     KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
625              NULL);
626     KMP_MB();
627   }
628 #endif /* BUILD_PARALLEL_ORDERED */
629 }
630 
631 /* __kmp_parallel_dxo -- Signal the next task. */
632 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
633   int gtid = *gtid_ref;
634 #ifdef BUILD_PARALLEL_ORDERED
635   int tid = __kmp_tid_from_gtid(gtid);
636   kmp_team_t *team = __kmp_team_from_gtid(gtid);
637 #endif /* BUILD_PARALLEL_ORDERED */
638 
639   if (__kmp_env_consistency_check) {
640     if (__kmp_threads[gtid]->th.th_root->r.r_active)
641       __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
642   }
643 #ifdef BUILD_PARALLEL_ORDERED
644   if (!team->t.t_serialized) {
645     KMP_MB(); /* Flush all pending memory write invalidates.  */
646 
647     /* use the tid of the next thread in this team */
648     /* TODO replace with general release procedure */
649     team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
650 
651     KMP_MB(); /* Flush all pending memory write invalidates.  */
652   }
653 #endif /* BUILD_PARALLEL_ORDERED */
654 }
655 
656 /* ------------------------------------------------------------------------ */
657 /* The BARRIER for a SINGLE process section is always explicit   */
658 
659 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
660   int status;
661   kmp_info_t *th;
662   kmp_team_t *team;
663 
664   if (!TCR_4(__kmp_init_parallel))
665     __kmp_parallel_initialize();
666   __kmp_resume_if_soft_paused();
667 
668   th = __kmp_threads[gtid];
669   team = th->th.th_team;
670   status = 0;
671 
672   th->th.th_ident = id_ref;
673 
674   if (team->t.t_serialized) {
675     status = 1;
676   } else {
677     kmp_int32 old_this = th->th.th_local.this_construct;
678 
679     ++th->th.th_local.this_construct;
680     /* try to set team count to thread count--success means thread got the
681        single block */
682     /* TODO: Should this be acquire or release? */
683     if (team->t.t_construct == old_this) {
684       status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
685                                               th->th.th_local.this_construct);
686     }
687 #if USE_ITT_BUILD
688     if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
689         KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
690         team->t.t_active_level == 1) {
691       // Only report metadata by primary thread of active team at level 1
692       __kmp_itt_metadata_single(id_ref);
693     }
694 #endif /* USE_ITT_BUILD */
695   }
696 
697   if (__kmp_env_consistency_check) {
698     if (status && push_ws) {
699       __kmp_push_workshare(gtid, ct_psingle, id_ref);
700     } else {
701       __kmp_check_workshare(gtid, ct_psingle, id_ref);
702     }
703   }
704 #if USE_ITT_BUILD
705   if (status) {
706     __kmp_itt_single_start(gtid);
707   }
708 #endif /* USE_ITT_BUILD */
709   return status;
710 }
711 
712 void __kmp_exit_single(int gtid) {
713 #if USE_ITT_BUILD
714   __kmp_itt_single_end(gtid);
715 #endif /* USE_ITT_BUILD */
716   if (__kmp_env_consistency_check)
717     __kmp_pop_workshare(gtid, ct_psingle, NULL);
718 }
719 
/* Determine whether we can go parallel or must use a serialized parallel
 * region, and how many threads we can use.
 * set_nthreads is the number of threads requested for the team.
 * Returns 1 if we should serialize or only use one thread,
 * otherwise the number of threads to use.
 * The forkjoin lock is held by the caller. */
726 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
727                                  int master_tid, int set_nthreads,
728                                  int enter_teams) {
729   int capacity;
730   int new_nthreads;
731   KMP_DEBUG_ASSERT(__kmp_init_serial);
732   KMP_DEBUG_ASSERT(root && parent_team);
733   kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
734 
735   // If dyn-var is set, dynamically adjust the number of desired threads,
736   // according to the method specified by dynamic_mode.
737   new_nthreads = set_nthreads;
738   if (!get__dynamic_2(parent_team, master_tid)) {
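    // dyn-var is off: keep the requested number of threads unchanged.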
739     ;
740   }
741 #ifdef USE_LOAD_BALANCE
742   else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
743     new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
744     if (new_nthreads == 1) {
745       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
746                     "reservation to 1 thread\n",
747                     master_tid));
748       return 1;
749     }
750     if (new_nthreads < set_nthreads) {
751       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
752                     "reservation to %d threads\n",
753                     master_tid, new_nthreads));
754     }
755   }
756 #endif /* USE_LOAD_BALANCE */
757   else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
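    // Cap the reservation at the number of available processors; the ternary
    // credits back threads that will be reused from this root (just the
    // master if the root is active, otherwise its whole hot team).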
758     new_nthreads = __kmp_avail_proc - __kmp_nth +
759                    (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
760     if (new_nthreads <= 1) {
761       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
762                     "reservation to 1 thread\n",
763                     master_tid));
764       return 1;
765     }
766     if (new_nthreads < set_nthreads) {
767       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
768                     "reservation to %d threads\n",
769                     master_tid, new_nthreads));
770     } else {
771       new_nthreads = set_nthreads;
772     }
773   } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
774     if (set_nthreads > 2) {
775       new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
776       new_nthreads = (new_nthreads % set_nthreads) + 1;
777       if (new_nthreads == 1) {
778         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
779                       "reservation to 1 thread\n",
780                       master_tid));
781         return 1;
782       }
783       if (new_nthreads < set_nthreads) {
784         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
785                       "reservation to %d threads\n",
786                       master_tid, new_nthreads));
787       }
788     }
789   } else {
790     KMP_ASSERT(0);
791   }
792 
793   // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
794   if (__kmp_nth + new_nthreads -
795           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
796       __kmp_max_nth) {
797     int tl_nthreads = __kmp_max_nth - __kmp_nth +
798                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
799     if (tl_nthreads <= 0) {
800       tl_nthreads = 1;
801     }
802 
803     // If dyn-var is false, emit a 1-time warning.
804     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
805       __kmp_reserve_warn = 1;
806       __kmp_msg(kmp_ms_warning,
807                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
808                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
809     }
810     if (tl_nthreads == 1) {
811       KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
812                     "reduced reservation to 1 thread\n",
813                     master_tid));
814       return 1;
815     }
816     KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
817                   "reservation to %d threads\n",
818                   master_tid, tl_nthreads));
819     new_nthreads = tl_nthreads;
820   }
821 
822   // Respect OMP_THREAD_LIMIT
823   int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
824   int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
825   if (cg_nthreads + new_nthreads -
826           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
827       max_cg_threads) {
828     int tl_nthreads = max_cg_threads - cg_nthreads +
829                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
830     if (tl_nthreads <= 0) {
831       tl_nthreads = 1;
832     }
833 
834     // If dyn-var is false, emit a 1-time warning.
835     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
836       __kmp_reserve_warn = 1;
837       __kmp_msg(kmp_ms_warning,
838                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
839                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
840     }
841     if (tl_nthreads == 1) {
842       KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
843                     "reduced reservation to 1 thread\n",
844                     master_tid));
845       return 1;
846     }
847     KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
848                   "reservation to %d threads\n",
849                   master_tid, tl_nthreads));
850     new_nthreads = tl_nthreads;
851   }
852 
853   // Check if the threads array is large enough, or needs expanding.
854   // See comment in __kmp_register_root() about the adjustment if
855   // __kmp_threads[0] == NULL.
856   capacity = __kmp_threads_capacity;
857   if (TCR_PTR(__kmp_threads[0]) == NULL) {
858     --capacity;
859   }
860   // If it is not for initializing the hidden helper team, we need to take
861   // __kmp_hidden_helper_threads_num out of the capacity because it is included
862   // in __kmp_threads_capacity.
863   if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
864     capacity -= __kmp_hidden_helper_threads_num;
865   }
866   if (__kmp_nth + new_nthreads -
867           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
868       capacity) {
869     // Expand the threads array.
870     int slotsRequired = __kmp_nth + new_nthreads -
871                         (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
872                         capacity;
873     int slotsAdded = __kmp_expand_threads(slotsRequired);
874     if (slotsAdded < slotsRequired) {
875       // The threads array was not expanded enough.
876       new_nthreads -= (slotsRequired - slotsAdded);
877       KMP_ASSERT(new_nthreads >= 1);
878 
879       // If dyn-var is false, emit a 1-time warning.
880       if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
881         __kmp_reserve_warn = 1;
882         if (__kmp_tp_cached) {
883           __kmp_msg(kmp_ms_warning,
884                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
885                     KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
886                     KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
887         } else {
888           __kmp_msg(kmp_ms_warning,
889                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
890                     KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
891         }
892       }
893     }
894   }
895 
896 #ifdef KMP_DEBUG
897   if (new_nthreads == 1) {
898     KC_TRACE(10,
899              ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
900               "dead roots and rechecking; requested %d threads\n",
901               __kmp_get_gtid(), set_nthreads));
902   } else {
903     KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
904                   " %d threads\n",
905                   __kmp_get_gtid(), new_nthreads, set_nthreads));
906   }
907 #endif // KMP_DEBUG
908   return new_nthreads;
909 }
910 
/* Allocate threads from the thread pool and assign them to the new team. We
   are assured that there are enough threads available, because we checked
   earlier while holding the forkjoin lock. */
914 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
915                                     kmp_info_t *master_th, int master_gtid) {
916   int i;
917   int use_hot_team;
918 
919   KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
920   KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
921   KMP_MB();
922 
923   /* first, let's setup the primary thread */
924   master_th->th.th_info.ds.ds_tid = 0;
925   master_th->th.th_team = team;
926   master_th->th.th_team_nproc = team->t.t_nproc;
927   master_th->th.th_team_master = master_th;
928   master_th->th.th_team_serialized = FALSE;
929   master_th->th.th_dispatch = &team->t.t_dispatch[0];
930 
931 /* make sure we are not the optimized hot team */
932 #if KMP_NESTED_HOT_TEAMS
933   use_hot_team = 0;
934   kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
935   if (hot_teams) { // hot teams array is not allocated if
936     // KMP_HOT_TEAMS_MAX_LEVEL=0
937     int level = team->t.t_active_level - 1; // index in array of hot teams
938     if (master_th->th.th_teams_microtask) { // are we inside the teams?
939       if (master_th->th.th_teams_size.nteams > 1) {
940         ++level; // level was not increased in teams construct for
941         // team_of_masters
942       }
943       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
944           master_th->th.th_teams_level == team->t.t_level) {
945         ++level; // level was not increased in teams construct for
946         // team_of_workers before the parallel
947       } // team->t.t_level will be increased inside parallel
948     }
949     if (level < __kmp_hot_teams_max_level) {
950       if (hot_teams[level].hot_team) {
951         // hot team has already been allocated for given level
952         KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
953         use_hot_team = 1; // the team is ready to use
954       } else {
955         use_hot_team = 0; // AC: threads are not allocated yet
956         hot_teams[level].hot_team = team; // remember new hot team
957         hot_teams[level].hot_team_nth = team->t.t_nproc;
958       }
959     } else {
960       use_hot_team = 0;
961     }
962   }
963 #else
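  // Without nested hot teams, the only reusable hot team is the root's.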
964   use_hot_team = team == root->r.r_hot_team;
965 #endif
966   if (!use_hot_team) {
967 
968     /* install the primary thread */
969     team->t.t_threads[0] = master_th;
970     __kmp_initialize_info(master_th, team, 0, master_gtid);
971 
972     /* now, install the worker threads */
973     for (i = 1; i < team->t.t_nproc; i++) {
974 
975       /* fork or reallocate a new thread and install it in team */
976       kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
977       team->t.t_threads[i] = thr;
978       KMP_DEBUG_ASSERT(thr);
979       KMP_DEBUG_ASSERT(thr->th.th_team == team);
980       /* align team and thread arrived states */
981       KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
982                     "T#%d(%d:%d) join =%llu, plain=%llu\n",
983                     __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
984                     __kmp_gtid_from_tid(i, team), team->t.t_id, i,
985                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
986                     team->t.t_bar[bs_plain_barrier].b_arrived));
987       thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
988       thr->th.th_teams_level = master_th->th.th_teams_level;
989       thr->th.th_teams_size = master_th->th.th_teams_size;
990       { // Initialize threads' barrier data.
991         int b;
992         kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
993         for (b = 0; b < bs_last_barrier; ++b) {
994           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
995           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
996 #if USE_DEBUGGER
997           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
998 #endif
999         }
1000       }
1001     }
1002 
1003 #if KMP_AFFINITY_SUPPORTED
1004     __kmp_partition_places(team);
1005 #endif
1006   }
1007 
1008   if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
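    // If any thread's previous team size or nesting level differs from the
    // new team's, mark the team so the affinity format is displayed again.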
1009     for (i = 0; i < team->t.t_nproc; i++) {
1010       kmp_info_t *thr = team->t.t_threads[i];
1011       if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1012           thr->th.th_prev_level != team->t.t_level) {
1013         team->t.t_display_affinity = 1;
1014         break;
1015       }
1016     }
1017   }
1018 
1019   KMP_MB();
1020 }
1021 
1022 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
// Propagate any changes to the floating point control registers out to the
// team. We try to avoid unnecessary writes to the relevant cache line in the
// team structure, so we don't make changes unless they are needed.
1026 inline static void propagateFPControl(kmp_team_t *team) {
1027   if (__kmp_inherit_fp_control) {
1028     kmp_int16 x87_fpu_control_word;
1029     kmp_uint32 mxcsr;
1030 
1031     // Get primary thread's values of FPU control flags (both X87 and vector)
1032     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1033     __kmp_store_mxcsr(&mxcsr);
1034     mxcsr &= KMP_X86_MXCSR_MASK;
1035 
1036     // There is no point looking at t_fp_control_saved here.
1037     // If it is TRUE, we still have to update the values if they are different
1038     // from those we now have. If it is FALSE we didn't save anything yet, but
1039     // our objective is the same. We have to ensure that the values in the team
1040     // are the same as those we have.
1041     // So, this code achieves what we need whether or not t_fp_control_saved is
1042     // true. By checking whether the value needs updating we avoid unnecessary
1043     // writes that would put the cache-line into a written state, causing all
1044     // threads in the team to have to read it again.
1045     KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1046     KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1047     // Although we don't use this value, other code in the runtime wants to know
1048     // whether it should restore them. So we must ensure it is correct.
1049     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1050   } else {
1051     // Similarly here. Don't write to this cache-line in the team structure
1052     // unless we have to.
1053     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1054   }
1055 }
1056 
1057 // Do the opposite, setting the hardware registers to the updated values from
1058 // the team.
1059 inline static void updateHWFPControl(kmp_team_t *team) {
1060   if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
    // Only reset the fp control regs if they have been changed in the team,
    // i.e., by the parallel region that we are exiting.
1063     kmp_int16 x87_fpu_control_word;
1064     kmp_uint32 mxcsr;
1065     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1066     __kmp_store_mxcsr(&mxcsr);
1067     mxcsr &= KMP_X86_MXCSR_MASK;
1068 
1069     if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1070       __kmp_clear_x87_fpu_status_word();
1071       __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1072     }
1073 
1074     if (team->t.t_mxcsr != mxcsr) {
1075       __kmp_load_mxcsr(&team->t.t_mxcsr);
1076     }
1077   }
1078 }
1079 #else
1080 #define propagateFPControl(x) ((void)0)
1081 #define updateHWFPControl(x) ((void)0)
1082 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1083 
1084 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1085                                      int realloc); // forward declaration
1086 
/* Run a parallel region that has been serialized, so it runs only in a team
   of the single primary thread. */
1089 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1090   kmp_info_t *this_thr;
1091   kmp_team_t *serial_team;
1092 
1093   KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1094 
1095   /* Skip all this code for autopar serialized loops since it results in
1096      unacceptable overhead */
1097   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1098     return;
1099 
1100   if (!TCR_4(__kmp_init_parallel))
1101     __kmp_parallel_initialize();
1102   __kmp_resume_if_soft_paused();
1103 
1104   this_thr = __kmp_threads[global_tid];
1105   serial_team = this_thr->th.th_serial_team;
1106 
1107   /* utilize the serialized team held by this thread */
1108   KMP_DEBUG_ASSERT(serial_team);
1109   KMP_MB();
1110 
1111   if (__kmp_tasking_mode != tskm_immediate_exec) {
1112     KMP_DEBUG_ASSERT(
1113         this_thr->th.th_task_team ==
1114         this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1115     KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1116                      NULL);
1117     KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1118                   "team %p, new task_team = NULL\n",
1119                   global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1120     this_thr->th.th_task_team = NULL;
1121   }
1122 
1123   kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1124   if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1125     proc_bind = proc_bind_false;
1126   } else if (proc_bind == proc_bind_default) {
1127     // No proc_bind clause was specified, so use the current value
1128     // of proc-bind-var for this parallel region.
1129     proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1130   }
1131   // Reset for next parallel region
1132   this_thr->th.th_set_proc_bind = proc_bind_default;
1133 
1134 #if OMPT_SUPPORT
1135   ompt_data_t ompt_parallel_data = ompt_data_none;
1136   ompt_data_t *implicit_task_data;
1137   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1138   if (ompt_enabled.enabled &&
1139       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1140 
1141     ompt_task_info_t *parent_task_info;
1142     parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1143 
1144     parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1145     if (ompt_enabled.ompt_callback_parallel_begin) {
1146       int team_size = 1;
1147 
1148       ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1149           &(parent_task_info->task_data), &(parent_task_info->frame),
1150           &ompt_parallel_data, team_size,
1151           ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1152     }
1153   }
1154 #endif // OMPT_SUPPORT
1155 
1156   if (this_thr->th.th_team != serial_team) {
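    // This thread is entering a serialized region from a non-serial team:
    // install the cached serial team, allocating a fresh one if the cached
    // team is already serialized at an outer level. (The else branch below
    // just adds another nested level to the serial team already in use.)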
1157     // Nested level will be an index in the nested nthreads array
1158     int level = this_thr->th.th_team->t.t_level;
1159 
1160     if (serial_team->t.t_serialized) {
      /* this serial team was already used
         TODO: increase performance by making these locks more specific */
1163       kmp_team_t *new_team;
1164 
1165       __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1166 
1167       new_team =
1168           __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1169 #if OMPT_SUPPORT
1170                               ompt_parallel_data,
1171 #endif
1172                               proc_bind, &this_thr->th.th_current_task->td_icvs,
1173                               0 USE_NESTED_HOT_ARG(NULL));
1174       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1175       KMP_ASSERT(new_team);
1176 
1177       /* setup new serialized team and install it */
1178       new_team->t.t_threads[0] = this_thr;
1179       new_team->t.t_parent = this_thr->th.th_team;
1180       serial_team = new_team;
1181       this_thr->th.th_serial_team = serial_team;
1182 
1183       KF_TRACE(
1184           10,
1185           ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1186            global_tid, serial_team));
1187 
1188       /* TODO the above breaks the requirement that if we run out of resources,
1189          then we can still guarantee that serialized teams are ok, since we may
1190          need to allocate a new one */
1191     } else {
1192       KF_TRACE(
1193           10,
1194           ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1195            global_tid, serial_team));
1196     }
1197 
1198     /* we have to initialize this serial team */
1199     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1200     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1201     KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1202     serial_team->t.t_ident = loc;
1203     serial_team->t.t_serialized = 1;
1204     serial_team->t.t_nproc = 1;
1205     serial_team->t.t_parent = this_thr->th.th_team;
1206     serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1207     this_thr->th.th_team = serial_team;
1208     serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1209 
1210     KF_TRACE(10, ("__kmpc_serialized_parallel: T#d curtask=%p\n", global_tid,
1211                   this_thr->th.th_current_task));
1212     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1213     this_thr->th.th_current_task->td_flags.executing = 0;
1214 
1215     __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1216 
1217     /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1218        implicit task for each serialized task represented by
1219        team->t.t_serialized? */
1220     copy_icvs(&this_thr->th.th_current_task->td_icvs,
1221               &this_thr->th.th_current_task->td_parent->td_icvs);
1222 
1223     // Thread value exists in the nested nthreads array for the next nested
1224     // level
1225     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1226       this_thr->th.th_current_task->td_icvs.nproc =
1227           __kmp_nested_nth.nth[level + 1];
1228     }
1229 
1230     if (__kmp_nested_proc_bind.used &&
1231         (level + 1 < __kmp_nested_proc_bind.used)) {
1232       this_thr->th.th_current_task->td_icvs.proc_bind =
1233           __kmp_nested_proc_bind.bind_types[level + 1];
1234     }
1235 
1236 #if USE_DEBUGGER
1237     serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1238 #endif
1239     this_thr->th.th_info.ds.ds_tid = 0;
1240 
1241     /* set thread cache values */
1242     this_thr->th.th_team_nproc = 1;
1243     this_thr->th.th_team_master = this_thr;
1244     this_thr->th.th_team_serialized = 1;
1245 
1246     serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1247     serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1248     serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1249 
1250     propagateFPControl(serial_team);
1251 
1252     /* check if we need to allocate dispatch buffers stack */
1253     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1254     if (!serial_team->t.t_dispatch->th_disp_buffer) {
1255       serial_team->t.t_dispatch->th_disp_buffer =
1256           (dispatch_private_info_t *)__kmp_allocate(
1257               sizeof(dispatch_private_info_t));
1258     }
1259     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1260 
1261     KMP_MB();
1262 
1263   } else {
1264     /* this serialized team is already being used,
1265      * that's fine, just add another nested level */
1266     KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1267     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1268     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1269     ++serial_team->t.t_serialized;
1270     this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1271 
1272     // Nested level will be an index in the nested nthreads array
1273     int level = this_thr->th.th_team->t.t_level;
1274     // Thread value exists in the nested nthreads array for the next nested
1275     // level
1276     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1277       this_thr->th.th_current_task->td_icvs.nproc =
1278           __kmp_nested_nth.nth[level + 1];
1279     }
1280     serial_team->t.t_level++;
1281     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1282                   "of serial team %p to %d\n",
1283                   global_tid, serial_team, serial_team->t.t_level));
1284 
1285     /* allocate/push dispatch buffers stack */
1286     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1287     {
1288       dispatch_private_info_t *disp_buffer =
1289           (dispatch_private_info_t *)__kmp_allocate(
1290               sizeof(dispatch_private_info_t));
1291       disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1292       serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1293     }
1294     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1295 
1296     KMP_MB();
1297   }
1298   KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1299 
1300   // Perform the display affinity functionality for
1301   // serialized parallel regions
1302   if (__kmp_display_affinity) {
1303     if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1304         this_thr->th.th_prev_num_threads != 1) {
1305       // NULL means use the affinity-format-var ICV
1306       __kmp_aux_display_affinity(global_tid, NULL);
1307       this_thr->th.th_prev_level = serial_team->t.t_level;
1308       this_thr->th.th_prev_num_threads = 1;
1309     }
1310   }
1311 
1312   if (__kmp_env_consistency_check)
1313     __kmp_push_parallel(global_tid, NULL);
1314 #if OMPT_SUPPORT
1315   serial_team->t.ompt_team_info.master_return_address = codeptr;
1316   if (ompt_enabled.enabled &&
1317       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1318     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1319         OMPT_GET_FRAME_ADDRESS(0);
1320 
1321     ompt_lw_taskteam_t lw_taskteam;
1322     __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1323                             &ompt_parallel_data, codeptr);
1324 
1325     __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
    // don't use lw_taskteam after linking; its content was swapped
1327 
1328     /* OMPT implicit task begin */
1329     implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
1330     if (ompt_enabled.ompt_callback_implicit_task) {
1331       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1332           ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1333           OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
1334           ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1335       OMPT_CUR_TASK_INFO(this_thr)->thread_num =
1336           __kmp_tid_from_gtid(global_tid);
1337     }
1338 
1339     /* OMPT state */
1340     this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1341     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1342         OMPT_GET_FRAME_ADDRESS(0);
1343   }
1344 #endif
1345 }
1346 
1347 /* most of the work for a fork */
1348 /* return true if we really went parallel, false if serialized */
1349 int __kmp_fork_call(ident_t *loc, int gtid,
1350                     enum fork_context_e call_context, // Intel, GNU, ...
1351                     kmp_int32 argc, microtask_t microtask, launch_t invoker,
1352                     kmp_va_list ap) {
1353   void **argv;
1354   int i;
1355   int master_tid;
1356   int master_this_cons;
1357   kmp_team_t *team;
1358   kmp_team_t *parent_team;
1359   kmp_info_t *master_th;
1360   kmp_root_t *root;
1361   int nthreads;
1362   int master_active;
1363   int master_set_numthreads;
1364   int level;
1365   int active_level;
1366   int teams_level;
1367 #if KMP_NESTED_HOT_TEAMS
1368   kmp_hot_team_ptr_t **p_hot_teams;
1369 #endif
1370   { // KMP_TIME_BLOCK
1371     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1372     KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1373 
1374     KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1375     if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1376       /* Some systems prefer the stack for the root thread(s) to start with */
1377       /* some gap from the parent stack to prevent false sharing. */
1378       void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1379       /* These 2 lines below are so this does not get optimized out */
1380       if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1381         __kmp_stkpadding += (short)((kmp_int64)dummy);
1382     }
1383 
1384     /* initialize if needed */
1385     KMP_DEBUG_ASSERT(
1386         __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1387     if (!TCR_4(__kmp_init_parallel))
1388       __kmp_parallel_initialize();
1389     __kmp_resume_if_soft_paused();
1390 
1391     /* setup current data */
1392     master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1393     // shutdown
1394     parent_team = master_th->th.th_team;
1395     master_tid = master_th->th.th_info.ds.ds_tid;
1396     master_this_cons = master_th->th.th_local.this_construct;
1397     root = master_th->th.th_root;
1398     master_active = root->r.r_active;
1399     master_set_numthreads = master_th->th.th_set_nproc;
1400 
1401 #if OMPT_SUPPORT
1402     ompt_data_t ompt_parallel_data = ompt_data_none;
1403     ompt_data_t *parent_task_data;
1404     ompt_frame_t *ompt_frame;
1405     ompt_data_t *implicit_task_data;
1406     void *return_address = NULL;
1407 
1408     if (ompt_enabled.enabled) {
1409       __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1410                                     NULL, NULL);
1411       return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1412     }
1413 #endif
1414 
1415     // Assign affinity to root thread if it hasn't happened yet
1416     __kmp_assign_root_init_mask();
1417 
1418     // Nested level will be an index in the nested nthreads array
1419     level = parent_team->t.t_level;
1420     // used to launch non-serial teams even if nested is not allowed
1421     active_level = parent_team->t.t_active_level;
1422     // needed to check nesting inside the teams
1423     teams_level = master_th->th.th_teams_level;
1424 #if KMP_NESTED_HOT_TEAMS
1425     p_hot_teams = &master_th->th.th_hot_teams;
1426     if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1427       *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1428           sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1429       (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1430       // either the actual hot team or not needed (when active_level > 0)
1431       (*p_hot_teams)[0].hot_team_nth = 1;
1432     }
1433 #endif
1434 
1435 #if OMPT_SUPPORT
1436     if (ompt_enabled.enabled) {
1437       if (ompt_enabled.ompt_callback_parallel_begin) {
1438         int team_size = master_set_numthreads
1439                             ? master_set_numthreads
1440                             : get__nproc_2(parent_team, master_tid);
1441         int flags = OMPT_INVOKER(call_context) |
1442                     ((microtask == (microtask_t)__kmp_teams_master)
1443                          ? ompt_parallel_league
1444                          : ompt_parallel_team);
1445         ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1446             parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1447             return_address);
1448       }
1449       master_th->th.ompt_thread_info.state = ompt_state_overhead;
1450     }
1451 #endif
1452 
1453     master_th->th.th_ident = loc;
1454 
1455     if (master_th->th.th_teams_microtask && ap &&
1456         microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1457       // AC: This is start of parallel that is nested inside teams construct.
1458       // The team is actual (hot), all workers are ready at the fork barrier.
1459       // No lock needed to initialize the team a bit, then free workers.
1460       parent_team->t.t_ident = loc;
1461       __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1462       parent_team->t.t_argc = argc;
1463       argv = (void **)parent_team->t.t_argv;
1464       for (i = argc - 1; i >= 0; --i)
1465         *argv++ = va_arg(kmp_va_deref(ap), void *);
1466       // Increment our nested depth levels, but do not increase serialization.
1467       if (parent_team == master_th->th.th_serial_team) {
1468         // AC: we are in serialized parallel
1469         __kmpc_serialized_parallel(loc, gtid);
1470         KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1471 
1472         if (call_context == fork_context_gnu) {
1473           // AC: need to decrement t_serialized for enquiry functions to work
1474           // correctly, will restore at join time
1475           parent_team->t.t_serialized--;
1476           return TRUE;
1477         }
1478 
1479 #if OMPD_SUPPORT
1480         parent_team->t.t_pkfn = microtask;
1481 #endif
1482 
1483 #if OMPT_SUPPORT
1484         void *dummy;
1485         void **exit_frame_p;
1486 
1487         ompt_lw_taskteam_t lw_taskteam;
1488 
1489         if (ompt_enabled.enabled) {
1490           __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1491                                   &ompt_parallel_data, return_address);
1492           exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1493 
1494           __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1495           // Don't use lw_taskteam after linking; its content was swapped.
1496 
1497           /* OMPT implicit task begin */
1498           implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1499           if (ompt_enabled.ompt_callback_implicit_task) {
1500             OMPT_CUR_TASK_INFO(master_th)->thread_num =
1501                 __kmp_tid_from_gtid(gtid);
1502             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1503                 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1504                 implicit_task_data, 1,
1505                 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1506           }
1507 
1508           /* OMPT state */
1509           master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1510         } else {
1511           exit_frame_p = &dummy;
1512         }
1513 #endif
1514         // AC: need to decrement t_serialized for enquiry functions to work
1515         // correctly, will restore at join time
1516         parent_team->t.t_serialized--;
1517 
1518         {
1519           KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1520           KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1521           __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1522 #if OMPT_SUPPORT
1523                                  ,
1524                                  exit_frame_p
1525 #endif
1526           );
1527         }
1528 
1529 #if OMPT_SUPPORT
1530         if (ompt_enabled.enabled) {
1531           *exit_frame_p = NULL;
1532           OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1533           if (ompt_enabled.ompt_callback_implicit_task) {
1534             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1535                 ompt_scope_end, NULL, implicit_task_data, 1,
1536                 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1537           }
1538           ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1539           __ompt_lw_taskteam_unlink(master_th);
1540           if (ompt_enabled.ompt_callback_parallel_end) {
1541             ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1542                 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1543                 OMPT_INVOKER(call_context) | ompt_parallel_team,
1544                 return_address);
1545           }
1546           master_th->th.ompt_thread_info.state = ompt_state_overhead;
1547         }
1548 #endif
1549         return TRUE;
1550       }
1551 
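      // Non-serialized case: reuse the parent (hot) team directly; set its
      // microtask and invoker, bump the nesting levels, and release the workers
      // already waiting at the fork barrier.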
1552       parent_team->t.t_pkfn = microtask;
1553       parent_team->t.t_invoke = invoker;
1554       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1555       parent_team->t.t_active_level++;
1556       parent_team->t.t_level++;
1557       parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1558 
1559 #if OMPT_SUPPORT
1560       if (ompt_enabled.enabled) {
1561         ompt_lw_taskteam_t lw_taskteam;
1562         __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1563                                 &ompt_parallel_data, return_address);
1564         __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1565       }
1566 #endif
1567 
1568       /* Change number of threads in the team if requested */
1569       if (master_set_numthreads) { // The parallel has num_threads clause
1570         if (master_set_numthreads < master_th->th.th_teams_size.nth) {
1571           // AC: can only reduce the number of threads dynamically, not increase
1572           kmp_info_t **other_threads = parent_team->t.t_threads;
1573           parent_team->t.t_nproc = master_set_numthreads;
1574           for (i = 0; i < master_set_numthreads; ++i) {
1575             other_threads[i]->th.th_team_nproc = master_set_numthreads;
1576           }
1577           // Keep extra threads hot in the team for possible next parallels
1578         }
1579         master_th->th.th_set_nproc = 0;
1580       }
1581 
1582 #if USE_DEBUGGER
1583       if (__kmp_debugging) { // Let debugger override number of threads.
1584         int nth = __kmp_omp_num_threads(loc);
1585         if (nth > 0) { // 0 means debugger doesn't want to change num threads
1586           master_set_numthreads = nth;
1587         }
1588       }
1589 #endif
1590 
1591 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1592       if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1593            KMP_ITT_DEBUG) &&
1594           __kmp_forkjoin_frames_mode == 3 &&
1595           parent_team->t.t_active_level == 1 // only report frames at level 1
1596           && master_th->th.th_teams_size.nteams == 1) {
1597         kmp_uint64 tmp_time = __itt_get_timestamp();
1598         master_th->th.th_frame_time = tmp_time;
1599         parent_team->t.t_region_time = tmp_time;
1600       }
1601       if (__itt_stack_caller_create_ptr) {
1602         KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
1603         // create new stack stitching id before entering fork barrier
1604         parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1605       }
1606 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1607 
1608       KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1609                     "master_th=%p, gtid=%d\n",
1610                     root, parent_team, master_th, gtid));
1611       __kmp_internal_fork(loc, gtid, parent_team);
1612       KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1613                     "master_th=%p, gtid=%d\n",
1614                     root, parent_team, master_th, gtid));
1615 
1616       if (call_context == fork_context_gnu)
1617         return TRUE;
1618 
1619       /* Invoke microtask for PRIMARY thread */
1620       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1621                     parent_team->t.t_id, parent_team->t.t_pkfn));
1622 
1623       if (!parent_team->t.t_invoke(gtid)) {
1624         KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
1625       }
1626       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1627                     parent_team->t.t_id, parent_team->t.t_pkfn));
1628       KMP_MB(); /* Flush all pending memory write invalidates.  */
1629 
1630       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1631 
1632       return TRUE;
1633     } // Parallel closely nested in teams construct
1634 
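    // General path (not a parallel closely nested in a teams construct):
    // determine the team size, then either serialize the region or allocate a
    // full team of workers below.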
1635 #if KMP_DEBUG
1636     if (__kmp_tasking_mode != tskm_immediate_exec) {
1637       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1638                        parent_team->t.t_task_team[master_th->th.th_task_state]);
1639     }
1640 #endif
1641 
1642     int enter_teams = 0;
1643     if (parent_team->t.t_active_level >=
1644         master_th->th.th_current_task->td_icvs.max_active_levels) {
1645       nthreads = 1;
1646     } else {
1647       enter_teams = ((ap == NULL && active_level == 0) ||
1648                      (ap && teams_level > 0 && teams_level == level));
1649       nthreads =
1650           master_set_numthreads
1651               ? master_set_numthreads
1652               : get__nproc_2(
1653                     parent_team,
1654                     master_tid); // TODO: get nproc directly from current task
1655 
1656       // Check whether we need the forkjoin lock (no need for a serialized
1657       // parallel out of a teams construct). This code was moved here from
1658       // __kmp_reserve_threads() to speed up nested serialized parallels.
1659       if (nthreads > 1) {
1660         if ((get__max_active_levels(master_th) == 1 &&
1661              (root->r.r_in_parallel && !enter_teams)) ||
1662             (__kmp_library == library_serial)) {
1663           KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1664                         " threads\n",
1665                         gtid, nthreads));
1666           nthreads = 1;
1667         }
1668       }
1669       if (nthreads > 1) {
1670         /* determine how many new threads we can use */
1671         __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1672         /* AC: If we execute teams from parallel region (on host), then teams
1673            should be created but each can only have 1 thread if nesting is
1674            disabled. If teams called from serial region, then teams and their
1675            threads should be created regardless of the nesting setting. */
1676         nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
1677                                          nthreads, enter_teams);
1678         if (nthreads == 1) {
1679           // Free lock for single thread execution here; for multi-thread
1680           // execution it will be freed later after team of threads created
1681           // and initialized
1682           __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1683         }
1684       }
1685     }
1686     KMP_DEBUG_ASSERT(nthreads > 0);
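    // At this point nthreads is the final team size for the new region;
    // nthreads == 1 means the parallel region will be serialized.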
1687 
1688     // If we temporarily changed the set number of threads then restore it now
1689     master_th->th.th_set_nproc = 0;
1690 
1691     /* create a serialized parallel region? */
1692     if (nthreads == 1) {
1693 /* josh todo: hypothetical question: what do we do for OS X*? */
1694 #if KMP_OS_LINUX &&                                                            \
1695     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1696       void *args[argc];
1697 #else
1698       void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1699 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1700           KMP_ARCH_AARCH64) */
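      // 'args' holds stack-allocated copies of the microtask arguments used
      // when the region is invoked serially below.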
1701 
1702       KA_TRACE(20,
1703                ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1704 
1705       __kmpc_serialized_parallel(loc, gtid);
1706 
1707 #if OMPD_SUPPORT
1708       master_th->th.th_serial_team->t.t_pkfn = microtask;
1709 #endif
1710 
1711       if (call_context == fork_context_intel) {
1712         /* TODO this sucks, use the compiler itself to pass args! :) */
1713         master_th->th.th_serial_team->t.t_ident = loc;
1714         if (!ap) {
1715           // revert change made in __kmpc_serialized_parallel()
1716           master_th->th.th_serial_team->t.t_level--;
1717           // Get args from parent team for teams construct
1718 
1719 #if OMPT_SUPPORT
1720           void *dummy;
1721           void **exit_frame_p;
1722           ompt_task_info_t *task_info;
1723 
1724           ompt_lw_taskteam_t lw_taskteam;
1725 
1726           if (ompt_enabled.enabled) {
1727             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1728                                     &ompt_parallel_data, return_address);
1729 
1730             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1731             // Don't use lw_taskteam after linking; its content was swapped.
1732 
1733             task_info = OMPT_CUR_TASK_INFO(master_th);
1734             exit_frame_p = &(task_info->frame.exit_frame.ptr);
1735             if (ompt_enabled.ompt_callback_implicit_task) {
1736               OMPT_CUR_TASK_INFO(master_th)->thread_num =
1737                   __kmp_tid_from_gtid(gtid);
1738               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1739                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1740                   &(task_info->task_data), 1,
1741                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1742                   ompt_task_implicit);
1743             }
1744 
1745             /* OMPT state */
1746             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1747           } else {
1748             exit_frame_p = &dummy;
1749           }
1750 #endif
1751 
1752           {
1753             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1754             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1755             __kmp_invoke_microtask(microtask, gtid, 0, argc,
1756                                    parent_team->t.t_argv
1757 #if OMPT_SUPPORT
1758                                    ,
1759                                    exit_frame_p
1760 #endif
1761             );
1762           }
1763 
1764 #if OMPT_SUPPORT
1765           if (ompt_enabled.enabled) {
1766             *exit_frame_p = NULL;
1767             if (ompt_enabled.ompt_callback_implicit_task) {
1768               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1769                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1770                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1771                   ompt_task_implicit);
1772             }
1773             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1774             __ompt_lw_taskteam_unlink(master_th);
1775             if (ompt_enabled.ompt_callback_parallel_end) {
1776               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1777                   &ompt_parallel_data, parent_task_data,
1778                   OMPT_INVOKER(call_context) | ompt_parallel_team,
1779                   return_address);
1780             }
1781             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1782           }
1783 #endif
1784         } else if (microtask == (microtask_t)__kmp_teams_master) {
1785           KMP_DEBUG_ASSERT(master_th->th.th_team ==
1786                            master_th->th.th_serial_team);
1787           team = master_th->th.th_team;
1788           // team->t.t_pkfn = microtask;
1789           team->t.t_invoke = invoker;
1790           __kmp_alloc_argv_entries(argc, team, TRUE);
1791           team->t.t_argc = argc;
1792           argv = (void **)team->t.t_argv;
1793           if (ap) {
1794             for (i = argc - 1; i >= 0; --i)
1795               *argv++ = va_arg(kmp_va_deref(ap), void *);
1796           } else {
1797             for (i = 0; i < argc; ++i)
1798               // Get args from parent team for teams construct
1799               argv[i] = parent_team->t.t_argv[i];
1800           }
1801           // AC: revert change made in __kmpc_serialized_parallel()
1802           //     because initial code in teams should have level=0
1803           team->t.t_level--;
1804           // AC: call special invoker for outer "parallel" of teams construct
1805           invoker(gtid);
1806 #if OMPT_SUPPORT
1807           if (ompt_enabled.enabled) {
1808             ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1809             if (ompt_enabled.ompt_callback_implicit_task) {
1810               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1811                   ompt_scope_end, NULL, &(task_info->task_data), 0,
1812                   OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1813             }
1814             if (ompt_enabled.ompt_callback_parallel_end) {
1815               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1816                   &ompt_parallel_data, parent_task_data,
1817                   OMPT_INVOKER(call_context) | ompt_parallel_league,
1818                   return_address);
1819             }
1820             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1821           }
1822 #endif
1823         } else {
1824           argv = args;
1825           for (i = argc - 1; i >= 0; --i)
1826             *argv++ = va_arg(kmp_va_deref(ap), void *);
1827           KMP_MB();
1828 
1829 #if OMPT_SUPPORT
1830           void *dummy;
1831           void **exit_frame_p;
1832           ompt_task_info_t *task_info;
1833 
1834           ompt_lw_taskteam_t lw_taskteam;
1835 
1836           if (ompt_enabled.enabled) {
1837             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1838                                     &ompt_parallel_data, return_address);
1839             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1840             // Don't use lw_taskteam after linking; its content was swapped.
1841             task_info = OMPT_CUR_TASK_INFO(master_th);
1842             exit_frame_p = &(task_info->frame.exit_frame.ptr);
1843 
1844             /* OMPT implicit task begin */
1845             implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1846             if (ompt_enabled.ompt_callback_implicit_task) {
1847               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1848                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1849                   implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1850                   ompt_task_implicit);
1851               OMPT_CUR_TASK_INFO(master_th)->thread_num =
1852                   __kmp_tid_from_gtid(gtid);
1853             }
1854 
1855             /* OMPT state */
1856             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1857           } else {
1858             exit_frame_p = &dummy;
1859           }
1860 #endif
1861 
1862           {
1863             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1864             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1865             __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1866 #if OMPT_SUPPORT
1867                                    ,
1868                                    exit_frame_p
1869 #endif
1870             );
1871           }
1872 
1873 #if OMPT_SUPPORT
1874           if (ompt_enabled.enabled) {
1875             *exit_frame_p = NULL;
1876             if (ompt_enabled.ompt_callback_implicit_task) {
1877               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1878                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1879                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1880                   ompt_task_implicit);
1881             }
1882 
1883             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1884             __ompt_lw_taskteam_unlink(master_th);
1885             if (ompt_enabled.ompt_callback_parallel_end) {
1886               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1887                   &ompt_parallel_data, parent_task_data,
1888                   OMPT_INVOKER(call_context) | ompt_parallel_team,
1889                   return_address);
1890             }
1891             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1892           }
1893 #endif
1894         }
1895       } else if (call_context == fork_context_gnu) {
1896 #if OMPT_SUPPORT
1897         ompt_lw_taskteam_t lwt;
1898         __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1899                                 return_address);
1900 
1901         lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1902         __ompt_lw_taskteam_link(&lwt, master_th, 1);
1903 // Don't use lw_taskteam after linking; its content was swapped.
1904 #endif
1905 
1906         // we were called from GNU native code
1907         KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1908         return FALSE;
1909       } else {
1910         KMP_ASSERT2(call_context < fork_context_last,
1911                     "__kmp_fork_call: unknown fork_context parameter");
1912       }
1913 
1914       KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1915       KMP_MB();
1916       return FALSE;
1917     } // if (nthreads == 1)
1918 
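    // Multi-threaded case: allocate (or reuse) a team of nthreads threads,
    // propagate ICVs, fork the workers, and let the primary thread invoke the
    // microtask itself.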
1919     // GEH: only modify the executing flag in the case when not serialized;
1920     //      the serialized case is handled in __kmpc_serialized_parallel
1921     KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1922                   "curtask=%p, curtask_max_aclevel=%d\n",
1923                   parent_team->t.t_active_level, master_th,
1924                   master_th->th.th_current_task,
1925                   master_th->th.th_current_task->td_icvs.max_active_levels));
1926     // TODO: GEH - cannot do this assertion because root thread not set up as
1927     // executing
1928     // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1929     master_th->th.th_current_task->td_flags.executing = 0;
1930 
1931     if (!master_th->th.th_teams_microtask || level > teams_level) {
1932       /* Increment our nested depth level */
1933       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1934     }
1935 
1936     // See if we need to make a copy of the ICVs.
1937     int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1938     if ((level + 1 < __kmp_nested_nth.used) &&
1939         (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1940       nthreads_icv = __kmp_nested_nth.nth[level + 1];
1941     } else {
1942       nthreads_icv = 0; // don't update
1943     }
1944 
1945     // Figure out the proc_bind_policy for the new team.
1946     kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1947     kmp_proc_bind_t proc_bind_icv =
1948         proc_bind_default; // proc_bind_default means don't update
1949     if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1950       proc_bind = proc_bind_false;
1951     } else {
1952       if (proc_bind == proc_bind_default) {
1953         // No proc_bind clause specified; use current proc-bind-var for this
1954         // parallel region
1955         proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1956       }
1957       /* else: The proc_bind policy was specified explicitly on parallel clause.
1958          This overrides proc-bind-var for this parallel region, but does not
1959          change proc-bind-var. */
1960       // Figure the value of proc-bind-var for the child threads.
1961       if ((level + 1 < __kmp_nested_proc_bind.used) &&
1962           (__kmp_nested_proc_bind.bind_types[level + 1] !=
1963            master_th->th.th_current_task->td_icvs.proc_bind)) {
1964         proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1965       }
1966     }
1967 
1968     // Reset for next parallel region
1969     master_th->th.th_set_proc_bind = proc_bind_default;
1970 
1971     if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
1972       kmp_internal_control_t new_icvs;
1973       copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
1974       new_icvs.next = NULL;
1975       if (nthreads_icv > 0) {
1976         new_icvs.nproc = nthreads_icv;
1977       }
1978       if (proc_bind_icv != proc_bind_default) {
1979         new_icvs.proc_bind = proc_bind_icv;
1980       }
1981 
1982       /* allocate a new parallel team */
1983       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
1984       team = __kmp_allocate_team(root, nthreads, nthreads,
1985 #if OMPT_SUPPORT
1986                                  ompt_parallel_data,
1987 #endif
1988                                  proc_bind, &new_icvs,
1989                                  argc USE_NESTED_HOT_ARG(master_th));
1990     } else {
1991       /* allocate a new parallel team */
1992       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
1993       team = __kmp_allocate_team(root, nthreads, nthreads,
1994 #if OMPT_SUPPORT
1995                                  ompt_parallel_data,
1996 #endif
1997                                  proc_bind,
1998                                  &master_th->th.th_current_task->td_icvs,
1999                                  argc USE_NESTED_HOT_ARG(master_th));
2000     }
2001     KF_TRACE(
2002         10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2003 
2004     /* setup the new team */
2005     KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2006     KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2007     KMP_CHECK_UPDATE(team->t.t_ident, loc);
2008     KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2009     KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2010 #if OMPT_SUPPORT
2011     KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2012                           return_address);
2013 #endif
2014     KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2015     // TODO: parent_team->t.t_level == INT_MAX ???
2016     if (!master_th->th.th_teams_microtask || level > teams_level) {
2017       int new_level = parent_team->t.t_level + 1;
2018       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2019       new_level = parent_team->t.t_active_level + 1;
2020       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2021     } else {
2022       // AC: Do not increase parallel level at start of the teams construct
2023       int new_level = parent_team->t.t_level;
2024       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2025       new_level = parent_team->t.t_active_level;
2026       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2027     }
2028     kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2029     // set primary thread's schedule as new run-time schedule
2030     KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2031 
2032     KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2033     KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2034 
2035     // Update the floating point rounding in the team if required.
2036     propagateFPControl(team);
2037 #if OMPD_SUPPORT
2038     if (ompd_state & OMPD_ENABLE_BP)
2039       ompd_bp_parallel_begin();
2040 #endif
2041 
2042     if (__kmp_tasking_mode != tskm_immediate_exec) {
2043       // Set the primary thread's task team to the team's task team. Unless
2044       // this is the hot team, it should be NULL.
2045       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2046                        parent_team->t.t_task_team[master_th->th.th_task_state]);
2047       KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team "
2048                     "%p, new task_team %p / team %p\n",
2049                     __kmp_gtid_from_thread(master_th),
2050                     master_th->th.th_task_team, parent_team,
2051                     team->t.t_task_team[master_th->th.th_task_state], team));
2052 
2053       if (active_level || master_th->th.th_task_team) {
2054         // Save the primary thread's task_state on the memo stack
2055         KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2056         if (master_th->th.th_task_state_top >=
2057             master_th->th.th_task_state_stack_sz) { // increase size
2058           kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2059           kmp_uint8 *old_stack, *new_stack;
2060           kmp_uint32 i;
2061           new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2062           for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2063             new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2064           }
2065           for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2066                ++i) { // zero-init rest of stack
2067             new_stack[i] = 0;
2068           }
2069           old_stack = master_th->th.th_task_state_memo_stack;
2070           master_th->th.th_task_state_memo_stack = new_stack;
2071           master_th->th.th_task_state_stack_sz = new_size;
2072           __kmp_free(old_stack);
2073         }
2074         // Store primary thread's task_state on stack
2075         master_th->th
2076             .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2077             master_th->th.th_task_state;
2078         master_th->th.th_task_state_top++;
2079 #if KMP_NESTED_HOT_TEAMS
2080         if (master_th->th.th_hot_teams &&
2081             active_level < __kmp_hot_teams_max_level &&
2082             team == master_th->th.th_hot_teams[active_level].hot_team) {
2083           // Restore primary thread's nested state if nested hot team
2084           master_th->th.th_task_state =
2085               master_th->th
2086                   .th_task_state_memo_stack[master_th->th.th_task_state_top];
2087         } else {
2088 #endif
2089           master_th->th.th_task_state = 0;
2090 #if KMP_NESTED_HOT_TEAMS
2091         }
2092 #endif
2093       }
2094 #if !KMP_NESTED_HOT_TEAMS
2095       KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2096                        (team == root->r.r_hot_team));
2097 #endif
2098     }
2099 
2100     KA_TRACE(
2101         20,
2102         ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2103          gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2104          team->t.t_nproc));
2105     KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2106                      (team->t.t_master_tid == 0 &&
2107                       (team->t.t_parent == root->r.r_root_team ||
2108                        team->t.t_parent->t.t_serialized)));
2109     KMP_MB();
2110 
2111     /* now, setup the arguments */
2112     argv = (void **)team->t.t_argv;
2113     if (ap) {
2114       for (i = argc - 1; i >= 0; --i) {
2115         void *new_argv = va_arg(kmp_va_deref(ap), void *);
2116         KMP_CHECK_UPDATE(*argv, new_argv);
2117         argv++;
2118       }
2119     } else {
2120       for (i = 0; i < argc; ++i) {
2121         // Get args from parent team for teams construct
2122         KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2123       }
2124     }
2125 
2126     /* now actually fork the threads */
2127     KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2128     if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2129       root->r.r_active = TRUE;
2130 
2131     __kmp_fork_team_threads(root, team, master_th, gtid);
2132     __kmp_setup_icv_copy(team, nthreads,
2133                          &master_th->th.th_current_task->td_icvs, loc);
2134 
2135 #if OMPT_SUPPORT
2136     master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2137 #endif
2138 
2139     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2140 
2141 #if USE_ITT_BUILD
2142     if (team->t.t_active_level == 1 // only report frames at level 1
2143         && !master_th->th.th_teams_microtask) { // not in teams construct
2144 #if USE_ITT_NOTIFY
2145       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2146           (__kmp_forkjoin_frames_mode == 3 ||
2147            __kmp_forkjoin_frames_mode == 1)) {
2148         kmp_uint64 tmp_time = 0;
2149         if (__itt_get_timestamp_ptr)
2150           tmp_time = __itt_get_timestamp();
2151         // Internal fork - report frame begin
2152         master_th->th.th_frame_time = tmp_time;
2153         if (__kmp_forkjoin_frames_mode == 3)
2154           team->t.t_region_time = tmp_time;
2155       } else
2156 // only one notification scheme (either "submit" or "forking/joined", not both)
2157 #endif /* USE_ITT_NOTIFY */
2158           if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2159               __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2160         // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2161         __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2162       }
2163     }
2164 #endif /* USE_ITT_BUILD */
2165 
2166     /* now go on and do the work */
2167     KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2168     KMP_MB();
2169     KF_TRACE(10,
2170              ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2171               root, team, master_th, gtid));
2172 
2173 #if USE_ITT_BUILD
2174     if (__itt_stack_caller_create_ptr) {
2175       // create new stack stitching id before entering fork barrier
2176       if (!enter_teams) {
2177         KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
2178         team->t.t_stack_id = __kmp_itt_stack_caller_create();
2179       } else if (parent_team->t.t_serialized) {
2180         // keep stack stitching id in the serialized parent_team;
2181         // current team will be used for parallel inside the teams;
2182         // if parent_team is active, then it already keeps stack stitching id
2183         // for the league of teams
2184         KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
2185         parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
2186       }
2187     }
2188 #endif /* USE_ITT_BUILD */
2189 
2190     // AC: skip __kmp_internal_fork at teams construct, let only primary
2191     // threads execute
2192     if (ap) {
2193       __kmp_internal_fork(loc, gtid, team);
2194       KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2195                     "master_th=%p, gtid=%d\n",
2196                     root, team, master_th, gtid));
2197     }
2198 
2199     if (call_context == fork_context_gnu) {
2200       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2201       return TRUE;
2202     }
2203 
2204     /* Invoke microtask for PRIMARY thread */
2205     KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2206                   team->t.t_id, team->t.t_pkfn));
2207   } // END of timer KMP_fork_call block
2208 
2209 #if KMP_STATS_ENABLED
2210   // If beginning a teams construct, then change thread state
2211   stats_state_e previous_state = KMP_GET_THREAD_STATE();
2212   if (!ap) {
2213     KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2214   }
2215 #endif
2216 
2217   if (!team->t.t_invoke(gtid)) {
2218     KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
2219   }
2220 
2221 #if KMP_STATS_ENABLED
2222   // If was beginning of a teams construct, then reset thread state
2223   if (!ap) {
2224     KMP_SET_THREAD_STATE(previous_state);
2225   }
2226 #endif
2227 
2228   KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2229                 team->t.t_id, team->t.t_pkfn));
2230   KMP_MB(); /* Flush all pending memory write invalidates.  */
2231 
2232   KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2233 #if OMPT_SUPPORT
2234   if (ompt_enabled.enabled) {
2235     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2236   }
2237 #endif
2238 
2239   return TRUE;
2240 }
2241 
2242 #if OMPT_SUPPORT
2243 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2244                                             kmp_team_t *team) {
2245   // restore state outside the region
2246   thread->th.ompt_thread_info.state =
2247       ((team->t.t_serialized) ? ompt_state_work_serial
2248                               : ompt_state_work_parallel);
2249 }
2250 
2251 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2252                                    kmp_team_t *team, ompt_data_t *parallel_data,
2253                                    int flags, void *codeptr) {
2254   ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2255   if (ompt_enabled.ompt_callback_parallel_end) {
2256     ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2257         parallel_data, &(task_info->task_data), flags, codeptr);
2258   }
2259 
2260   task_info->frame.enter_frame = ompt_data_none;
2261   __kmp_join_restore_state(thread, team);
2262 }
2263 #endif
2264 
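/* Counterpart of __kmp_fork_call, executed by the primary thread at the end of
   a parallel region: it joins the workers at the join barrier (or unwinds the
   serialized case), emits OMPT/ITT events, and restores the parent team as the
   current team. */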
2265 void __kmp_join_call(ident_t *loc, int gtid
2266 #if OMPT_SUPPORT
2267                      ,
2268                      enum fork_context_e fork_context
2269 #endif
2270                      ,
2271                      int exit_teams) {
2272   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2273   kmp_team_t *team;
2274   kmp_team_t *parent_team;
2275   kmp_info_t *master_th;
2276   kmp_root_t *root;
2277   int master_active;
2278 
2279   KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2280 
2281   /* setup current data */
2282   master_th = __kmp_threads[gtid];
2283   root = master_th->th.th_root;
2284   team = master_th->th.th_team;
2285   parent_team = team->t.t_parent;
2286 
2287   master_th->th.th_ident = loc;
2288 
2289 #if OMPT_SUPPORT
2290   void *team_microtask = (void *)team->t.t_pkfn;
2291   // For GOMP interface with serialized parallel, need the
2292   // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task
2293   // and end-parallel events.
2294   if (ompt_enabled.enabled &&
2295       !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2296     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2297   }
2298 #endif
2299 
2300 #if KMP_DEBUG
2301   if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2302     KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2303                   "th_task_team = %p\n",
2304                   __kmp_gtid_from_thread(master_th), team,
2305                   team->t.t_task_team[master_th->th.th_task_state],
2306                   master_th->th.th_task_team));
2307     KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2308                      team->t.t_task_team[master_th->th.th_task_state]);
2309   }
2310 #endif
2311 
2312   if (team->t.t_serialized) {
2313     if (master_th->th.th_teams_microtask) {
2314       // We are in teams construct
2315       int level = team->t.t_level;
2316       int tlevel = master_th->th.th_teams_level;
2317       if (level == tlevel) {
2318         // AC: we haven't incremented it earlier at start of teams construct,
2319         //     so do it here - at the end of teams construct
2320         team->t.t_level++;
2321       } else if (level == tlevel + 1) {
2322         // AC: we are exiting parallel inside teams, need to increment
2323         // serialization in order to restore it in the next call to
2324         // __kmpc_end_serialized_parallel
2325         team->t.t_serialized++;
2326       }
2327     }
2328     __kmpc_end_serialized_parallel(loc, gtid);
2329 
2330 #if OMPT_SUPPORT
2331     if (ompt_enabled.enabled) {
2332       __kmp_join_restore_state(master_th, parent_team);
2333     }
2334 #endif
2335 
2336     return;
2337   }
2338 
2339   master_active = team->t.t_master_active;
2340 
2341   if (!exit_teams) {
2342     // AC: No barrier for internal teams at exit from teams construct.
2343     //     But there is barrier for external team (league).
2344     __kmp_internal_join(loc, gtid, team);
2345 #if USE_ITT_BUILD
2346     if (__itt_stack_caller_create_ptr) {
2347       KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
2348       // destroy the stack stitching id after join barrier
2349       __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2350       team->t.t_stack_id = NULL;
2351     }
2352 #endif
2353   } else {
2354     master_th->th.th_task_state =
2355         0; // AC: no tasking in teams (out of any parallel)
2356 #if USE_ITT_BUILD
2357     if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
2358       KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
2359       // destroy the stack stitching id on exit from the teams construct
2360       // if parent_team is active, then the id will be destroyed later on
2361       // by master of the league of teams
2362       __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
2363       parent_team->t.t_stack_id = NULL;
2364     }
2365 #endif
2366   }
2367 
2368   KMP_MB();
2369 
2370 #if OMPT_SUPPORT
2371   ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2372   void *codeptr = team->t.ompt_team_info.master_return_address;
2373 #endif
2374 
2375 #if USE_ITT_BUILD
2376   // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2377   if (team->t.t_active_level == 1 &&
2378       (!master_th->th.th_teams_microtask || /* not in teams construct */
2379        master_th->th.th_teams_size.nteams == 1)) {
2380     master_th->th.th_ident = loc;
2381     // only one notification scheme (either "submit" or "forking/joined", not
2382     // both)
2383     if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2384         __kmp_forkjoin_frames_mode == 3)
2385       __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2386                              master_th->th.th_frame_time, 0, loc,
2387                              master_th->th.th_team_nproc, 1);
2388     else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2389              !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2390       __kmp_itt_region_joined(gtid);
2391   } // active_level == 1
2392 #endif /* USE_ITT_BUILD */
2393 
2394   if (master_th->th.th_teams_microtask && !exit_teams &&
2395       team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2396       team->t.t_level == master_th->th.th_teams_level + 1) {
2397 // AC: We need to leave the team structure intact at the end of a parallel
2398 // inside the teams construct, so that the next parallel reuses the same (hot)
2399 // team; only adjust the nesting levels.
2400 #if OMPT_SUPPORT
2401     ompt_data_t ompt_parallel_data = ompt_data_none;
2402     if (ompt_enabled.enabled) {
2403       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2404       if (ompt_enabled.ompt_callback_implicit_task) {
2405         int ompt_team_size = team->t.t_nproc;
2406         ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2407             ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2408             OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2409       }
2410       task_info->frame.exit_frame = ompt_data_none;
2411       task_info->task_data = ompt_data_none;
2412       ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2413       __ompt_lw_taskteam_unlink(master_th);
2414     }
2415 #endif
2416     /* Decrement our nested depth level */
2417     team->t.t_level--;
2418     team->t.t_active_level--;
2419     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2420 
2421     // Restore number of threads in the team if needed. This code relies on
2422     // the proper adjustment of th_teams_size.nth after the fork in
2423     // __kmp_teams_master on each teams primary thread in the case that
2424     // __kmp_reserve_threads reduced it.
2425     if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2426       int old_num = master_th->th.th_team_nproc;
2427       int new_num = master_th->th.th_teams_size.nth;
2428       kmp_info_t **other_threads = team->t.t_threads;
2429       team->t.t_nproc = new_num;
2430       for (int i = 0; i < old_num; ++i) {
2431         other_threads[i]->th.th_team_nproc = new_num;
2432       }
2433       // Adjust states of non-used threads of the team
2434       for (int i = old_num; i < new_num; ++i) {
2435         // Re-initialize thread's barrier data.
2436         KMP_DEBUG_ASSERT(other_threads[i]);
2437         kmp_balign_t *balign = other_threads[i]->th.th_bar;
2438         for (int b = 0; b < bs_last_barrier; ++b) {
2439           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2440           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2441 #if USE_DEBUGGER
2442           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2443 #endif
2444         }
2445         if (__kmp_tasking_mode != tskm_immediate_exec) {
2446           // Synchronize thread's task state
2447           other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2448         }
2449       }
2450     }
2451 
2452 #if OMPT_SUPPORT
2453     if (ompt_enabled.enabled) {
2454       __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2455                       OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2456     }
2457 #endif
2458 
2459     return;
2460   }
2461 
2462   /* do cleanup and restore the parent team */
2463   master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2464   master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2465 
2466   master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2467 
2468   /* jc: The following lock has instructions with REL and ACQ semantics,
2469      separating the parallel user code called in this parallel region
2470      from the serial user code called after this function returns. */
2471   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2472 
2473   if (!master_th->th.th_teams_microtask ||
2474       team->t.t_level > master_th->th.th_teams_level) {
2475     /* Decrement our nested depth level */
2476     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2477   }
2478   KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2479 
2480 #if OMPT_SUPPORT
2481   if (ompt_enabled.enabled) {
2482     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2483     if (ompt_enabled.ompt_callback_implicit_task) {
2484       int flags = (team_microtask == (void *)__kmp_teams_master)
2485                       ? ompt_task_initial
2486                       : ompt_task_implicit;
2487       int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2488       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2489           ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2490           OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2491     }
2492     task_info->frame.exit_frame = ompt_data_none;
2493     task_info->task_data = ompt_data_none;
2494   }
2495 #endif
2496 
2497   KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2498                 master_th, team));
2499   __kmp_pop_current_task_from_thread(master_th);
2500 
2501 #if KMP_AFFINITY_SUPPORTED
2502   // Restore master thread's partition.
2503   master_th->th.th_first_place = team->t.t_first_place;
2504   master_th->th.th_last_place = team->t.t_last_place;
2505 #endif // KMP_AFFINITY_SUPPORTED
2506   master_th->th.th_def_allocator = team->t.t_def_allocator;
2507 
2508 #if OMPD_SUPPORT
2509   if (ompd_state & OMPD_ENABLE_BP)
2510     ompd_bp_parallel_end();
2511 #endif
2512   updateHWFPControl(team);
2513 
2514   if (root->r.r_active != master_active)
2515     root->r.r_active = master_active;
2516 
2517   __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2518                             master_th)); // this will free worker threads
2519 
2520   /* This race was fun to find. Make sure the following stays inside the
2521      critical region; otherwise assertions may fail occasionally, since the old
2522      team may be reallocated and the hierarchy appears inconsistent. It is
2523      actually safe to run and won't cause bugs, but it will trigger those
2524      assertion failures. It's only one deref & assign, so keep it here. */
2525   master_th->th.th_team = parent_team;
2526   master_th->th.th_team_nproc = parent_team->t.t_nproc;
2527   master_th->th.th_team_master = parent_team->t.t_threads[0];
2528   master_th->th.th_team_serialized = parent_team->t.t_serialized;
2529 
2530   /* restore serialized team, if need be */
2531   if (parent_team->t.t_serialized &&
2532       parent_team != master_th->th.th_serial_team &&
2533       parent_team != root->r.r_root_team) {
2534     __kmp_free_team(root,
2535                     master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2536     master_th->th.th_serial_team = parent_team;
2537   }
2538 
2539   if (__kmp_tasking_mode != tskm_immediate_exec) {
2540     if (master_th->th.th_task_state_top >
2541         0) { // Restore task state from memo stack
2542       KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2543       // Remember primary thread's state if we re-use this nested hot team
2544       master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2545           master_th->th.th_task_state;
2546       --master_th->th.th_task_state_top; // pop
2547       // Now restore state at this level
2548       master_th->th.th_task_state =
2549           master_th->th
2550               .th_task_state_memo_stack[master_th->th.th_task_state_top];
2551     }
2552     // Copy the task team from the parent team to the primary thread
2553     master_th->th.th_task_team =
2554         parent_team->t.t_task_team[master_th->th.th_task_state];
2555     KA_TRACE(20,
2556              ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
2557               __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2558               parent_team));
2559   }
2560 
2561   // TODO: GEH - cannot do this assertion because root thread not set up as
2562   // executing
2563   // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2564   master_th->th.th_current_task->td_flags.executing = 1;
2565 
2566   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2567 
2568 #if OMPT_SUPPORT
2569   int flags =
2570       OMPT_INVOKER(fork_context) |
2571       ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2572                                                       : ompt_parallel_team);
2573   if (ompt_enabled.enabled) {
2574     __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2575                     codeptr);
2576   }
2577 #endif
2578 
2579   KMP_MB();
2580   KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2581 }
2582 
2583 /* Check whether we should push an internal control record onto the
2584    serial team stack.  If so, do it.  */
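/* With the checks below, a record is pushed only while executing a nested,
   serialized parallel region on the serial team (t_serialized > 1), so that ICV
   changes made inside that nested region can be restored when it ends. */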
2585 void __kmp_save_internal_controls(kmp_info_t *thread) {
2586 
2587   if (thread->th.th_team != thread->th.th_serial_team) {
2588     return;
2589   }
2590   if (thread->th.th_team->t.t_serialized > 1) {
2591     int push = 0;
2592 
2593     if (thread->th.th_team->t.t_control_stack_top == NULL) {
2594       push = 1;
2595     } else {
2596       if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2597           thread->th.th_team->t.t_serialized) {
2598         push = 1;
2599       }
2600     }
2601     if (push) { /* push a record on the serial team's stack */
2602       kmp_internal_control_t *control =
2603           (kmp_internal_control_t *)__kmp_allocate(
2604               sizeof(kmp_internal_control_t));
2605 
2606       copy_icvs(control, &thread->th.th_current_task->td_icvs);
2607 
2608       control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2609 
2610       control->next = thread->th.th_team->t.t_control_stack_top;
2611       thread->th.th_team->t.t_control_stack_top = control;
2612     }
2613   }
2614 }
2615 
2616 /* Changes set_nproc */
2617 void __kmp_set_num_threads(int new_nth, int gtid) {
2618   kmp_info_t *thread;
2619   kmp_root_t *root;
2620 
2621   KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2622   KMP_DEBUG_ASSERT(__kmp_init_serial);
2623 
2624   if (new_nth < 1)
2625     new_nth = 1;
2626   else if (new_nth > __kmp_max_nth)
2627     new_nth = __kmp_max_nth;
2628 
2629   KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2630   thread = __kmp_threads[gtid];
2631   if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2632     return; // nothing to do
2633 
2634   __kmp_save_internal_controls(thread);
2635 
2636   set__nproc(thread, new_nth);
2637 
2638   // If this omp_set_num_threads() call will cause the hot team size to be
2639   // reduced (in the absence of a num_threads clause), then reduce it now,
2640   // rather than waiting for the next parallel region.
2641   root = thread->th.th_root;
2642   if (__kmp_init_parallel && (!root->r.r_active) &&
2643       (root->r.r_hot_team->t.t_nproc > new_nth)
2644 #if KMP_NESTED_HOT_TEAMS
2645       && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2646 #endif
2647   ) {
2648     kmp_team_t *hot_team = root->r.r_hot_team;
2649     int f;
2650 
2651     __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2652 
2653     // Release the extra threads we don't need any more.
2654     for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2655       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2656       if (__kmp_tasking_mode != tskm_immediate_exec) {
2657         // When decreasing team size, threads no longer in the team should unref
2658         // task team.
2659         hot_team->t.t_threads[f]->th.th_task_team = NULL;
2660       }
2661       __kmp_free_thread(hot_team->t.t_threads[f]);
2662       hot_team->t.t_threads[f] = NULL;
2663     }
2664     hot_team->t.t_nproc = new_nth;
2665 #if KMP_NESTED_HOT_TEAMS
2666     if (thread->th.th_hot_teams) {
2667       KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2668       thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2669     }
2670 #endif
2671 
2672     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2673 
2674     // Update the t_nproc field in the threads that are still active.
2675     for (f = 0; f < new_nth; f++) {
2676       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2677       hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2678     }
2679     // Special flag to mark that omp_set_num_threads() changed the team size
2680     hot_team->t.t_size_changed = -1;
2681   }
2682 }
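/* Illustrative sketch (not part of the runtime): user code such as
       omp_set_num_threads(4);
       #pragma omp parallel
       { ... }
   reaches __kmp_set_num_threads() above through the library's public API layer,
   updating the nproc ICV and possibly shrinking the hot team before the next
   parallel region starts. */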
2683 
2684 /* Changes max_active_levels */
2685 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2686   kmp_info_t *thread;
2687 
2688   KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2689                 "%d = (%d)\n",
2690                 gtid, max_active_levels));
2691   KMP_DEBUG_ASSERT(__kmp_init_serial);
2692 
2693   // validate max_active_levels
2694   if (max_active_levels < 0) {
2695     KMP_WARNING(ActiveLevelsNegative, max_active_levels);
    // We ignore this call if the user specified a negative value; the current
    // setting is left unchanged, so the last valid setting remains in effect.
    // A warning is issued (if warnings are allowed, as controlled by the
    // KMP_WARNINGS env var).
2700     KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2701                   "max_active_levels for thread %d = (%d)\n",
2702                   gtid, max_active_levels));
2703     return;
2704   }
2705   if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
    // OK: max_active_levels is within the valid range
    // [0, KMP_MAX_ACTIVE_LEVELS_LIMIT].
    // We allow a zero value (implementation-defined behavior).
2709   } else {
2710     KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2711                 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2712     max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
    // The current upper limit is MAX_INT (implementation-defined behavior):
    // if the input exceeds the upper limit, we clamp it to the upper limit
    // (implementation-defined behavior).
    // In practice the flow should never reach here while the limit is MAX_INT.
2717   }
2718   KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2719                 "max_active_levels for thread %d = (%d)\n",
2720                 gtid, max_active_levels));
2721 
2722   thread = __kmp_threads[gtid];
2723 
2724   __kmp_save_internal_controls(thread);
2725 
2726   set__max_active_levels(thread, max_active_levels);
2727 }
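// Validation behavior, illustrated (hypothetical calls through the standard
// omp_set_max_active_levels()/omp_get_max_active_levels() API, which is
// assumed to forward to the routines in this file):
//
//   omp_set_max_active_levels(-2); // warning; ignored, previous value kept
//   omp_set_max_active_levels(0);  // accepted (zero is allowed,
//                                  // implementation-defined)
//   omp_set_max_active_levels(4);  // accepted, stored in the per-task ICV
//   omp_get_max_active_levels();   // returns the last accepted value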
2728 
2729 /* Gets max_active_levels */
2730 int __kmp_get_max_active_levels(int gtid) {
2731   kmp_info_t *thread;
2732 
2733   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2734   KMP_DEBUG_ASSERT(__kmp_init_serial);
2735 
2736   thread = __kmp_threads[gtid];
2737   KMP_DEBUG_ASSERT(thread->th.th_current_task);
2738   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2739                 "curtask_maxaclevel=%d\n",
2740                 gtid, thread->th.th_current_task,
2741                 thread->th.th_current_task->td_icvs.max_active_levels));
2742   return thread->th.th_current_task->td_icvs.max_active_levels;
2743 }
2744 
2745 // nteams-var per-device ICV
2746 void __kmp_set_num_teams(int num_teams) {
2747   if (num_teams > 0)
2748     __kmp_nteams = num_teams;
2749 }
2750 int __kmp_get_max_teams(void) { return __kmp_nteams; }
2751 // teams-thread-limit-var per-device ICV
2752 void __kmp_set_teams_thread_limit(int limit) {
2753   if (limit > 0)
2754     __kmp_teams_thread_limit = limit;
2755 }
2756 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
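// These per-device ICV accessors back the OpenMP 5.1 routines
// omp_set_num_teams()/omp_get_max_teams() and omp_set_teams_thread_limit()/
// omp_get_teams_thread_limit(); that mapping is assumed here -- the entry
// points live elsewhere. Illustrative use:
//
//   omp_set_num_teams(4);          // __kmp_nteams = 4
//   omp_set_teams_thread_limit(8); // __kmp_teams_thread_limit = 8
//   #pragma omp teams              // league of up to 4 teams, each limited to
//   { /* ... */ }                  // 8 threads
//
// Non-positive arguments are ignored by the setters above.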
2757 
2758 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2759 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2760 
2761 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2762 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2763   kmp_info_t *thread;
2764   kmp_sched_t orig_kind;
2765   //    kmp_team_t *team;
2766 
2767   KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2768                 gtid, (int)kind, chunk));
2769   KMP_DEBUG_ASSERT(__kmp_init_serial);
2770 
2771   // Check if the kind parameter is valid, correct if needed.
2772   // Valid parameters should fit in one of two intervals - standard or extended:
2773   //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2774   // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
2775   orig_kind = kind;
2776   kind = __kmp_sched_without_mods(kind);
2777 
2778   if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2779       (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2780     // TODO: Hint needs attention in case we change the default schedule.
2781     __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2782               KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2783               __kmp_msg_null);
2784     kind = kmp_sched_default;
2785     chunk = 0; // ignore chunk value in case of bad kind
2786   }
2787 
2788   thread = __kmp_threads[gtid];
2789 
2790   __kmp_save_internal_controls(thread);
2791 
2792   if (kind < kmp_sched_upper_std) {
2793     if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
      // distinguish static chunked vs. unchunked: an invalid chunk value
      // indicates the unchunked schedule (which is the default)
2796       thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2797     } else {
2798       thread->th.th_current_task->td_icvs.sched.r_sched_type =
2799           __kmp_sch_map[kind - kmp_sched_lower - 1];
2800     }
2801   } else {
2802     //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2803     //    kmp_sched_lower - 2 ];
2804     thread->th.th_current_task->td_icvs.sched.r_sched_type =
2805         __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2806                       kmp_sched_lower - 2];
2807   }
2808   __kmp_sched_apply_mods_intkind(
2809       orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2810   if (kind == kmp_sched_auto || chunk < 1) {
2811     // ignore parameter chunk for schedule auto
2812     thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2813   } else {
2814     thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2815   }
2816 }
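// Mapping examples (illustrative; it is assumed the library's
// omp_set_schedule() entry point forwards here):
//
//   omp_set_schedule(omp_sched_dynamic, 4);
//     // -> r_sched_type = kmp_sch_dynamic_chunked, chunk = 4
//   omp_set_schedule(omp_sched_static, 0);
//     // -> chunk < KMP_DEFAULT_CHUNK, so plain (unchunked) kmp_sch_static
//   omp_set_schedule(omp_sched_auto, 7);
//     // -> kmp_sch_auto; the chunk argument is ignored and the ICV chunk is
//     //    reset to KMP_DEFAULT_CHUNK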
2817 
2818 /* Gets def_sched_var ICV values */
2819 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2820   kmp_info_t *thread;
2821   enum sched_type th_type;
2822 
2823   KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2824   KMP_DEBUG_ASSERT(__kmp_init_serial);
2825 
2826   thread = __kmp_threads[gtid];
2827 
2828   th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2829   switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2830   case kmp_sch_static:
2831   case kmp_sch_static_greedy:
2832   case kmp_sch_static_balanced:
2833     *kind = kmp_sched_static;
2834     __kmp_sched_apply_mods_stdkind(kind, th_type);
    *chunk = 0; // chunk was not set; report that fact via a zero value
2836     return;
2837   case kmp_sch_static_chunked:
2838     *kind = kmp_sched_static;
2839     break;
2840   case kmp_sch_dynamic_chunked:
2841     *kind = kmp_sched_dynamic;
2842     break;
2843   case kmp_sch_guided_chunked:
2844   case kmp_sch_guided_iterative_chunked:
2845   case kmp_sch_guided_analytical_chunked:
2846     *kind = kmp_sched_guided;
2847     break;
2848   case kmp_sch_auto:
2849     *kind = kmp_sched_auto;
2850     break;
2851   case kmp_sch_trapezoidal:
2852     *kind = kmp_sched_trapezoidal;
2853     break;
2854 #if KMP_STATIC_STEAL_ENABLED
2855   case kmp_sch_static_steal:
2856     *kind = kmp_sched_static_steal;
2857     break;
2858 #endif
2859   default:
2860     KMP_FATAL(UnknownSchedulingType, th_type);
2861   }
2862 
2863   __kmp_sched_apply_mods_stdkind(kind, th_type);
2864   *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2865 }
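// Round-trip example (illustrative; it is assumed omp_get_schedule() forwards
// here):
//
//   omp_sched_t kind;
//   int chunk;
//   omp_get_schedule(&kind, &chunk);
//   // For a static schedule with no chunk set, kind reports "static" and
//   // chunk reports 0; otherwise chunk is the stored ICV value.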
2866 
2867 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2868 
2869   int ii, dd;
2870   kmp_team_t *team;
2871   kmp_info_t *thr;
2872 
2873   KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2874   KMP_DEBUG_ASSERT(__kmp_init_serial);
2875 
2876   // validate level
2877   if (level == 0)
2878     return 0;
2879   if (level < 0)
2880     return -1;
2881   thr = __kmp_threads[gtid];
2882   team = thr->th.th_team;
2883   ii = team->t.t_level;
2884   if (level > ii)
2885     return -1;
2886 
2887   if (thr->th.th_teams_microtask) {
    // AC: in a teams region, multiple nested teams share the same level
2889     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2890     if (level <=
2891         tlevel) { // otherwise usual algorithm works (will not touch the teams)
2892       KMP_DEBUG_ASSERT(ii >= tlevel);
      // AC: since we need to step through the teams construct levels, we
      // artificially increase ii
2895       if (ii == tlevel) {
2896         ii += 2; // three teams have same level
2897       } else {
2898         ii++; // two teams have same level
2899       }
2900     }
2901   }
2902 
2903   if (ii == level)
2904     return __kmp_tid_from_gtid(gtid);
2905 
2906   dd = team->t.t_serialized;
2907   level++;
2908   while (ii > level) {
2909     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2910     }
2911     if ((team->t.t_serialized) && (!dd)) {
2912       team = team->t.t_parent;
2913       continue;
2914     }
2915     if (ii > level) {
2916       team = team->t.t_parent;
2917       dd = team->t.t_serialized;
2918       ii--;
2919     }
2920   }
2921 
2922   return (dd > 1) ? (0) : (team->t.t_master_tid);
2923 }
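// Behavior sketch for omp_get_ancestor_thread_num(), which is assumed to call
// the routine above (nesting enabled; thread counts are illustrative):
//
//   #pragma omp parallel num_threads(2)   // level 1
//   #pragma omp parallel num_threads(3)   // level 2
//   {
//     omp_get_ancestor_thread_num(2);  // this thread's tid in the inner team
//     omp_get_ancestor_thread_num(1);  // tid of the enclosing outer thread
//     omp_get_ancestor_thread_num(0);  // 0 (the initial thread's level)
//     omp_get_ancestor_thread_num(3);  // -1: deeper than the current nesting
//   }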
2924 
2925 int __kmp_get_team_size(int gtid, int level) {
2926 
2927   int ii, dd;
2928   kmp_team_t *team;
2929   kmp_info_t *thr;
2930 
2931   KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2932   KMP_DEBUG_ASSERT(__kmp_init_serial);
2933 
2934   // validate level
2935   if (level == 0)
2936     return 1;
2937   if (level < 0)
2938     return -1;
2939   thr = __kmp_threads[gtid];
2940   team = thr->th.th_team;
2941   ii = team->t.t_level;
2942   if (level > ii)
2943     return -1;
2944 
2945   if (thr->th.th_teams_microtask) {
    // AC: in a teams region, multiple nested teams share the same level
2947     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2948     if (level <=
2949         tlevel) { // otherwise usual algorithm works (will not touch the teams)
2950       KMP_DEBUG_ASSERT(ii >= tlevel);
      // AC: since we need to step through the teams construct levels, we
      // artificially increase ii
2953       if (ii == tlevel) {
2954         ii += 2; // three teams have same level
2955       } else {
2956         ii++; // two teams have same level
2957       }
2958     }
2959   }
2960 
2961   while (ii > level) {
2962     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2963     }
2964     if (team->t.t_serialized && (!dd)) {
2965       team = team->t.t_parent;
2966       continue;
2967     }
2968     if (ii > level) {
2969       team = team->t.t_parent;
2970       ii--;
2971     }
2972   }
2973 
2974   return team->t.t_nproc;
2975 }
2976 
2977 kmp_r_sched_t __kmp_get_schedule_global() {
  // This routine exists because the pairs (__kmp_sched, __kmp_chunk) and
  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
  // independently, so the up-to-date schedule can be obtained here.
2981 
2982   kmp_r_sched_t r_sched;
2983 
2984   // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
2985   // __kmp_guided. __kmp_sched should keep original value, so that user can set
2986   // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
2987   // different roots (even in OMP 2.5)
2988   enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
2989   enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
2990   if (s == kmp_sch_static) {
2991     // replace STATIC with more detailed schedule (balanced or greedy)
2992     r_sched.r_sched_type = __kmp_static;
2993   } else if (s == kmp_sch_guided_chunked) {
2994     // replace GUIDED with more detailed schedule (iterative or analytical)
2995     r_sched.r_sched_type = __kmp_guided;
2996   } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2997     r_sched.r_sched_type = __kmp_sched;
2998   }
2999   SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
3000 
3001   if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
    // __kmp_chunk may be wrong here (if it was never set)
3003     r_sched.chunk = KMP_DEFAULT_CHUNK;
3004   } else {
3005     r_sched.chunk = __kmp_chunk;
3006   }
3007 
3008   return r_sched;
3009 }
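// Examples of the substitution performed above (the globals are assumed to
// have been filled in by the settings code, e.g. from OMP_SCHEDULE):
//
//   __kmp_sched == kmp_sch_static, __kmp_static == kmp_sch_static_greedy
//     -> r_sched.r_sched_type == kmp_sch_static_greedy
//   __kmp_sched == kmp_sch_guided_chunked,
//   __kmp_guided == kmp_sch_guided_iterative_chunked
//     -> r_sched.r_sched_type == kmp_sch_guided_iterative_chunked
//   __kmp_chunk below KMP_DEFAULT_CHUNK (e.g. never set)
//     -> r_sched.chunk == KMP_DEFAULT_CHUNK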
3010 
/* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
   at least argc number of *t_argv entries for the requested team. */
3013 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3014 
3015   KMP_DEBUG_ASSERT(team);
3016   if (!realloc || argc > team->t.t_max_argc) {
3017 
3018     KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3019                    "current entries=%d\n",
3020                    team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3021     /* if previously allocated heap space for args, free them */
3022     if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3023       __kmp_free((void *)team->t.t_argv);
3024 
3025     if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3026       /* use unused space in the cache line for arguments */
3027       team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3028       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3029                      "argv entries\n",
3030                      team->t.t_id, team->t.t_max_argc));
3031       team->t.t_argv = &team->t.t_inline_argv[0];
3032       if (__kmp_storage_map) {
3033         __kmp_print_storage_map_gtid(
3034             -1, &team->t.t_inline_argv[0],
3035             &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3036             (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3037             team->t.t_id);
3038       }
3039     } else {
3040       /* allocate space for arguments in the heap */
3041       team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3042                                ? KMP_MIN_MALLOC_ARGV_ENTRIES
3043                                : 2 * argc;
3044       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3045                      "argv entries\n",
3046                      team->t.t_id, team->t.t_max_argc));
3047       team->t.t_argv =
3048           (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3049       if (__kmp_storage_map) {
3050         __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3051                                      &team->t.t_argv[team->t.t_max_argc],
3052                                      sizeof(void *) * team->t.t_max_argc,
3053                                      "team_%d.t_argv", team->t.t_id);
3054       }
3055     }
3056   }
3057 }
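// Sizing policy illustrated (exact numbers depend on the build-time constants
// KMP_INLINE_ARGV_ENTRIES and KMP_MIN_MALLOC_ARGV_ENTRIES):
//
//   argc <= KMP_INLINE_ARGV_ENTRIES        -> reuse t_inline_argv, no heap
//   argc <= KMP_MIN_MALLOC_ARGV_ENTRIES/2  -> heap, KMP_MIN_MALLOC_ARGV_ENTRIES
//                                             entries
//   larger argc                            -> heap, 2 * argc entries
//
// When realloc == TRUE and the existing t_max_argc already covers argc, the
// function does nothing.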
3058 
3059 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3060   int i;
3061   int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3062   team->t.t_threads =
3063       (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3064   team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3065       sizeof(dispatch_shared_info_t) * num_disp_buff);
3066   team->t.t_dispatch =
3067       (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3068   team->t.t_implicit_task_taskdata =
3069       (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3070   team->t.t_max_nproc = max_nth;
3071 
3072   /* setup dispatch buffers */
3073   for (i = 0; i < num_disp_buff; ++i) {
3074     team->t.t_disp_buffer[i].buffer_index = i;
3075     team->t.t_disp_buffer[i].doacross_buf_idx = i;
3076   }
3077 }
3078 
3079 static void __kmp_free_team_arrays(kmp_team_t *team) {
3080   /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3081   int i;
3082   for (i = 0; i < team->t.t_max_nproc; ++i) {
3083     if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3084       __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3085       team->t.t_dispatch[i].th_disp_buffer = NULL;
3086     }
3087   }
3088 #if KMP_USE_HIER_SCHED
3089   __kmp_dispatch_free_hierarchies(team);
3090 #endif
3091   __kmp_free(team->t.t_threads);
3092   __kmp_free(team->t.t_disp_buffer);
3093   __kmp_free(team->t.t_dispatch);
3094   __kmp_free(team->t.t_implicit_task_taskdata);
3095   team->t.t_threads = NULL;
3096   team->t.t_disp_buffer = NULL;
3097   team->t.t_dispatch = NULL;
  team->t.t_implicit_task_taskdata = NULL;
3099 }
3100 
3101 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3102   kmp_info_t **oldThreads = team->t.t_threads;
3103 
3104   __kmp_free(team->t.t_disp_buffer);
3105   __kmp_free(team->t.t_dispatch);
3106   __kmp_free(team->t.t_implicit_task_taskdata);
3107   __kmp_allocate_team_arrays(team, max_nth);
3108 
3109   KMP_MEMCPY(team->t.t_threads, oldThreads,
3110              team->t.t_nproc * sizeof(kmp_info_t *));
3111 
3112   __kmp_free(oldThreads);
3113 }
3114 
3115 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3116 
3117   kmp_r_sched_t r_sched =
3118       __kmp_get_schedule_global(); // get current state of scheduling globals
3119 
3120   KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3121 
3122   kmp_internal_control_t g_icvs = {
3123     0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3124     (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3125     // adjustment of threads (per thread)
3126     (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3127     // whether blocktime is explicitly set
3128     __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3129 #if KMP_USE_MONITOR
3130     __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3131 // intervals
3132 #endif
3133     __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3134     // next parallel region (per thread)
3135     // (use a max ub on value if __kmp_parallel_initialize not called yet)
3136     __kmp_cg_max_nth, // int thread_limit;
3137     __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3138     // for max_active_levels
3139     r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3140     // {sched,chunk} pair
3141     __kmp_nested_proc_bind.bind_types[0],
3142     __kmp_default_device,
3143     NULL // struct kmp_internal_control *next;
3144   };
3145 
3146   return g_icvs;
3147 }
3148 
3149 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3150 
3151   kmp_internal_control_t gx_icvs;
  gx_icvs.serial_nesting_level =
      0; // probably = team->t.t_serialized (cf. __kmp_save_internal_controls)
3154   copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3155   gx_icvs.next = NULL;
3156 
3157   return gx_icvs;
3158 }
3159 
3160 static void __kmp_initialize_root(kmp_root_t *root) {
3161   int f;
3162   kmp_team_t *root_team;
3163   kmp_team_t *hot_team;
3164   int hot_team_max_nth;
3165   kmp_r_sched_t r_sched =
3166       __kmp_get_schedule_global(); // get current state of scheduling globals
3167   kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3168   KMP_DEBUG_ASSERT(root);
3169   KMP_ASSERT(!root->r.r_begin);
3170 
3171   /* setup the root state structure */
3172   __kmp_init_lock(&root->r.r_begin_lock);
3173   root->r.r_begin = FALSE;
3174   root->r.r_active = FALSE;
3175   root->r.r_in_parallel = 0;
3176   root->r.r_blocktime = __kmp_dflt_blocktime;
3177 #if KMP_AFFINITY_SUPPORTED
3178   root->r.r_affinity_assigned = FALSE;
3179 #endif
3180 
3181   /* setup the root team for this task */
3182   /* allocate the root team structure */
3183   KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3184 
3185   root_team =
3186       __kmp_allocate_team(root,
3187                           1, // new_nproc
3188                           1, // max_nproc
3189 #if OMPT_SUPPORT
3190                           ompt_data_none, // root parallel id
3191 #endif
3192                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3193                           0 // argc
3194                           USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3195                           );
3196 #if USE_DEBUGGER
3197   // Non-NULL value should be assigned to make the debugger display the root
3198   // team.
3199   TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3200 #endif
3201 
3202   KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3203 
3204   root->r.r_root_team = root_team;
3205   root_team->t.t_control_stack_top = NULL;
3206 
3207   /* initialize root team */
3208   root_team->t.t_threads[0] = NULL;
3209   root_team->t.t_nproc = 1;
3210   root_team->t.t_serialized = 1;
3211   // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3212   root_team->t.t_sched.sched = r_sched.sched;
3213   KA_TRACE(
3214       20,
3215       ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3216        root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3217 
3218   /* setup the  hot team for this task */
3219   /* allocate the hot team structure */
3220   KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3221 
3222   hot_team =
3223       __kmp_allocate_team(root,
3224                           1, // new_nproc
3225                           __kmp_dflt_team_nth_ub * 2, // max_nproc
3226 #if OMPT_SUPPORT
3227                           ompt_data_none, // root parallel id
3228 #endif
3229                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3230                           0 // argc
3231                           USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3232                           );
3233   KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3234 
3235   root->r.r_hot_team = hot_team;
3236   root_team->t.t_control_stack_top = NULL;
3237 
3238   /* first-time initialization */
3239   hot_team->t.t_parent = root_team;
3240 
3241   /* initialize hot team */
3242   hot_team_max_nth = hot_team->t.t_max_nproc;
3243   for (f = 0; f < hot_team_max_nth; ++f) {
3244     hot_team->t.t_threads[f] = NULL;
3245   }
3246   hot_team->t.t_nproc = 1;
3247   // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3248   hot_team->t.t_sched.sched = r_sched.sched;
3249   hot_team->t.t_size_changed = 0;
3250 }
3251 
3252 #ifdef KMP_DEBUG
3253 
3254 typedef struct kmp_team_list_item {
3255   kmp_team_p const *entry;
3256   struct kmp_team_list_item *next;
3257 } kmp_team_list_item_t;
3258 typedef kmp_team_list_item_t *kmp_team_list_t;
3259 
3260 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3261     kmp_team_list_t list, // List of teams.
3262     kmp_team_p const *team // Team to add.
3263 ) {
3264 
3265   // List must terminate with item where both entry and next are NULL.
3266   // Team is added to the list only once.
3267   // List is sorted in ascending order by team id.
3268   // Team id is *not* a key.
3269 
3270   kmp_team_list_t l;
3271 
3272   KMP_DEBUG_ASSERT(list != NULL);
3273   if (team == NULL) {
3274     return;
3275   }
3276 
3277   __kmp_print_structure_team_accum(list, team->t.t_parent);
3278   __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3279 
3280   // Search list for the team.
3281   l = list;
3282   while (l->next != NULL && l->entry != team) {
3283     l = l->next;
3284   }
3285   if (l->next != NULL) {
3286     return; // Team has been added before, exit.
3287   }
3288 
3289   // Team is not found. Search list again for insertion point.
3290   l = list;
3291   while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3292     l = l->next;
3293   }
3294 
3295   // Insert team.
3296   {
3297     kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3298         sizeof(kmp_team_list_item_t));
3299     *item = *l;
3300     l->entry = team;
3301     l->next = item;
3302   }
3303 }
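// The list is kept sorted by t_id and terminated by a sentinel item whose
// entry and next are both NULL. Insertion copies the node at the insertion
// point into a fresh item and then overwrites the original node in place,
// e.g. adding team 5 to
//   [2] -> [7] -> [NULL/NULL]
// yields
//   [2] -> [5] -> [7] -> [NULL/NULL]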
3304 
static void __kmp_print_structure_team(char const *title,
                                       kmp_team_p const *team) {
3308   __kmp_printf("%s", title);
3309   if (team != NULL) {
3310     __kmp_printf("%2x %p\n", team->t.t_id, team);
3311   } else {
3312     __kmp_printf(" - (nil)\n");
3313   }
3314 }
3315 
3316 static void __kmp_print_structure_thread(char const *title,
3317                                          kmp_info_p const *thread) {
3318   __kmp_printf("%s", title);
3319   if (thread != NULL) {
3320     __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3321   } else {
3322     __kmp_printf(" - (nil)\n");
3323   }
3324 }
3325 
3326 void __kmp_print_structure(void) {
3327 
3328   kmp_team_list_t list;
3329 
3330   // Initialize list of teams.
3331   list =
3332       (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3333   list->entry = NULL;
3334   list->next = NULL;
3335 
3336   __kmp_printf("\n------------------------------\nGlobal Thread "
3337                "Table\n------------------------------\n");
3338   {
3339     int gtid;
3340     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3341       __kmp_printf("%2d", gtid);
3342       if (__kmp_threads != NULL) {
3343         __kmp_printf(" %p", __kmp_threads[gtid]);
3344       }
3345       if (__kmp_root != NULL) {
3346         __kmp_printf(" %p", __kmp_root[gtid]);
3347       }
3348       __kmp_printf("\n");
3349     }
3350   }
3351 
3352   // Print out __kmp_threads array.
3353   __kmp_printf("\n------------------------------\nThreads\n--------------------"
3354                "----------\n");
3355   if (__kmp_threads != NULL) {
3356     int gtid;
3357     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3358       kmp_info_t const *thread = __kmp_threads[gtid];
3359       if (thread != NULL) {
3360         __kmp_printf("GTID %2d %p:\n", gtid, thread);
3361         __kmp_printf("    Our Root:        %p\n", thread->th.th_root);
3362         __kmp_print_structure_team("    Our Team:     ", thread->th.th_team);
3363         __kmp_print_structure_team("    Serial Team:  ",
3364                                    thread->th.th_serial_team);
3365         __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
3366         __kmp_print_structure_thread("    Primary:      ",
3367                                      thread->th.th_team_master);
3368         __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
3369         __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
3370         __kmp_printf("    Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3371         __kmp_print_structure_thread("    Next in pool: ",
3372                                      thread->th.th_next_pool);
3373         __kmp_printf("\n");
3374         __kmp_print_structure_team_accum(list, thread->th.th_team);
3375         __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3376       }
3377     }
3378   } else {
3379     __kmp_printf("Threads array is not allocated.\n");
3380   }
3381 
3382   // Print out __kmp_root array.
3383   __kmp_printf("\n------------------------------\nUbers\n----------------------"
3384                "--------\n");
3385   if (__kmp_root != NULL) {
3386     int gtid;
3387     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3388       kmp_root_t const *root = __kmp_root[gtid];
3389       if (root != NULL) {
3390         __kmp_printf("GTID %2d %p:\n", gtid, root);
3391         __kmp_print_structure_team("    Root Team:    ", root->r.r_root_team);
3392         __kmp_print_structure_team("    Hot Team:     ", root->r.r_hot_team);
3393         __kmp_print_structure_thread("    Uber Thread:  ",
3394                                      root->r.r_uber_thread);
3395         __kmp_printf("    Active?:      %2d\n", root->r.r_active);
3396         __kmp_printf("    In Parallel:  %2d\n",
3397                      KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3398         __kmp_printf("\n");
3399         __kmp_print_structure_team_accum(list, root->r.r_root_team);
3400         __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3401       }
3402     }
3403   } else {
3404     __kmp_printf("Ubers array is not allocated.\n");
3405   }
3406 
3407   __kmp_printf("\n------------------------------\nTeams\n----------------------"
3408                "--------\n");
3409   while (list->next != NULL) {
3410     kmp_team_p const *team = list->entry;
3411     int i;
3412     __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3413     __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
3414     __kmp_printf("    Primary TID:      %2d\n", team->t.t_master_tid);
3415     __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
3416     __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
3417     __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
3418     for (i = 0; i < team->t.t_nproc; ++i) {
3419       __kmp_printf("    Thread %2d:      ", i);
3420       __kmp_print_structure_thread("", team->t.t_threads[i]);
3421     }
3422     __kmp_print_structure_team("    Next in pool:     ", team->t.t_next_pool);
3423     __kmp_printf("\n");
3424     list = list->next;
3425   }
3426 
3427   // Print out __kmp_thread_pool and __kmp_team_pool.
3428   __kmp_printf("\n------------------------------\nPools\n----------------------"
3429                "--------\n");
3430   __kmp_print_structure_thread("Thread pool:          ",
3431                                CCAST(kmp_info_t *, __kmp_thread_pool));
3432   __kmp_print_structure_team("Team pool:            ",
3433                              CCAST(kmp_team_t *, __kmp_team_pool));
3434   __kmp_printf("\n");
3435 
3436   // Free team list.
3437   while (list != NULL) {
3438     kmp_team_list_item_t *item = list;
3439     list = list->next;
3440     KMP_INTERNAL_FREE(item);
3441   }
3442 }
3443 
3444 #endif
3445 
3446 //---------------------------------------------------------------------------
3447 //  Stuff for per-thread fast random number generator
3448 //  Table of primes
3449 static const unsigned __kmp_primes[] = {
3450     0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3451     0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3452     0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3453     0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3454     0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3455     0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3456     0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3457     0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3458     0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3459     0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3460     0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3461 
3462 //---------------------------------------------------------------------------
3463 //  __kmp_get_random: Get a random number using a linear congruential method.
3464 unsigned short __kmp_get_random(kmp_info_t *thread) {
3465   unsigned x = thread->th.th_x;
3466   unsigned short r = (unsigned short)(x >> 16);
3467 
3468   thread->th.th_x = x * thread->th.th_a + 1;
3469 
3470   KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3471                 thread->th.th_info.ds.ds_tid, r));
3472 
3473   return r;
3474 }
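// The generator is a per-thread 32-bit linear congruential recurrence,
//   x_{n+1} = a * x_n + 1 (mod 2^32),
// where the multiplier a (th_a) is picked from __kmp_primes in
// __kmp_init_random() below; each call returns only the upper 16 bits of the
// state, since the low-order bits of such a recurrence have short periods.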
3475 //--------------------------------------------------------
3476 // __kmp_init_random: Initialize a random number generator
3477 void __kmp_init_random(kmp_info_t *thread) {
3478   unsigned seed = thread->th.th_info.ds.ds_tid;
3479 
3480   thread->th.th_a =
3481       __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3482   thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3483   KA_TRACE(30,
3484            ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3485 }
3486 
3487 #if KMP_OS_WINDOWS
3488 /* reclaim array entries for root threads that are already dead, returns number
3489  * reclaimed */
3490 static int __kmp_reclaim_dead_roots(void) {
3491   int i, r = 0;
3492 
3493   for (i = 0; i < __kmp_threads_capacity; ++i) {
3494     if (KMP_UBER_GTID(i) &&
3495         !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3496         !__kmp_root[i]
3497              ->r.r_active) { // AC: reclaim only roots died in non-active state
3498       r += __kmp_unregister_root_other_thread(i);
3499     }
3500   }
3501   return r;
3502 }
3503 #endif
3504 
3505 /* This function attempts to create free entries in __kmp_threads and
3506    __kmp_root, and returns the number of free entries generated.
3507 
3508    For Windows* OS static library, the first mechanism used is to reclaim array
3509    entries for root threads that are already dead.
3510 
   On all platforms, expansion is attempted on the arrays __kmp_threads and
   __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3513    capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3514    threadprivate cache array has been created. Synchronization with
3515    __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3516 
3517    After any dead root reclamation, if the clipping value allows array expansion
3518    to result in the generation of a total of nNeed free slots, the function does
3519    that expansion. If not, nothing is done beyond the possible initial root
3520    thread reclamation.
3521 
   If the argument is negative, the behavior is undefined. */
3523 static int __kmp_expand_threads(int nNeed) {
3524   int added = 0;
3525   int minimumRequiredCapacity;
3526   int newCapacity;
3527   kmp_info_t **newThreads;
3528   kmp_root_t **newRoot;
3529 
3530   // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3531   // resizing __kmp_threads does not need additional protection if foreign
3532   // threads are present
3533 
3534 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3535   /* only for Windows static library */
3536   /* reclaim array entries for root threads that are already dead */
3537   added = __kmp_reclaim_dead_roots();
3538 
3539   if (nNeed) {
3540     nNeed -= added;
3541     if (nNeed < 0)
3542       nNeed = 0;
3543   }
3544 #endif
3545   if (nNeed <= 0)
3546     return added;
3547 
3548   // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3549   // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3550   // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3551   // > __kmp_max_nth in one of two ways:
3552   //
3553   // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
3554   //    may not be reused by another thread, so we may need to increase
3555   //    __kmp_threads_capacity to __kmp_max_nth + 1.
3556   //
3557   // 2) New foreign root(s) are encountered.  We always register new foreign
3558   //    roots. This may cause a smaller # of threads to be allocated at
3559   //    subsequent parallel regions, but the worker threads hang around (and
3560   //    eventually go to sleep) and need slots in the __kmp_threads[] array.
3561   //
3562   // Anyway, that is the reason for moving the check to see if
3563   // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3564   // instead of having it performed here. -BB
3565 
3566   KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3567 
3568   /* compute expansion headroom to check if we can expand */
3569   if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3570     /* possible expansion too small -- give up */
3571     return added;
3572   }
3573   minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3574 
3575   newCapacity = __kmp_threads_capacity;
3576   do {
3577     newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3578                                                           : __kmp_sys_max_nth;
3579   } while (newCapacity < minimumRequiredCapacity);
3580   newThreads = (kmp_info_t **)__kmp_allocate(
3581       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3582   newRoot =
3583       (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3584   KMP_MEMCPY(newThreads, __kmp_threads,
3585              __kmp_threads_capacity * sizeof(kmp_info_t *));
3586   KMP_MEMCPY(newRoot, __kmp_root,
3587              __kmp_threads_capacity * sizeof(kmp_root_t *));
3588 
3589   kmp_info_t **temp_threads = __kmp_threads;
3590   *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3591   *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3592   __kmp_free(temp_threads);
3593   added += newCapacity - __kmp_threads_capacity;
3594   *(volatile int *)&__kmp_threads_capacity = newCapacity;
3595 
3596   if (newCapacity > __kmp_tp_capacity) {
3597     __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3598     if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3599       __kmp_threadprivate_resize_cache(newCapacity);
3600     } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3601       *(volatile int *)&__kmp_tp_capacity = newCapacity;
3602     }
3603     __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3604   }
3605 
3606   return added;
3607 }
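// Growth example (illustrative numbers, assuming __kmp_sys_max_nth is large
// enough): with __kmp_threads_capacity == 32 and nNeed == 40,
// minimumRequiredCapacity is 72 and the doubling loop yields
// newCapacity == 128 (32 -> 64 -> 128). The combined threads/root block is
// reallocated and copied, and __kmp_threads_capacity is published last so a
// concurrent reader never observes a capacity larger than the arrays that
// back it.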
3608 
3609 /* Register the current thread as a root thread and obtain our gtid. We must
3610    have the __kmp_initz_lock held at this point. Argument TRUE only if are the
3611    thread that calls from __kmp_do_serial_initialize() */
3612 int __kmp_register_root(int initial_thread) {
3613   kmp_info_t *root_thread;
3614   kmp_root_t *root;
3615   int gtid;
3616   int capacity;
3617   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3618   KA_TRACE(20, ("__kmp_register_root: entered\n"));
3619   KMP_MB();
3620 
  /* 2007-03-02:
     If the initial thread did not invoke the OpenMP RTL yet, and this thread
     is not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity"
     condition does not work as expected -- it may return false (meaning there
     is at least one empty slot in the __kmp_threads array), but it is possible
     that the only free slot is #0, which is reserved for the initial thread
     and so cannot be used for this one. The following code works around this
     bug.

     However, the right solution seems to be not reserving slot #0 for the
     initial thread because:
     (1) there is no magic in slot #0,
     (2) we cannot detect the initial thread reliably (the first thread that
         does serial initialization may not be a real initial thread).
  */
3635   capacity = __kmp_threads_capacity;
3636   if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3637     --capacity;
3638   }
3639 
3640   // If it is not for initializing the hidden helper team, we need to take
3641   // __kmp_hidden_helper_threads_num out of the capacity because it is included
3642   // in __kmp_threads_capacity.
3643   if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
3644     capacity -= __kmp_hidden_helper_threads_num;
3645   }
3646 
3647   /* see if there are too many threads */
3648   if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3649     if (__kmp_tp_cached) {
3650       __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3651                   KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3652                   KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3653     } else {
3654       __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3655                   __kmp_msg_null);
3656     }
3657   }
3658 
  // With hidden helper tasks enabled, __kmp_threads is organized as follows:
3660   // 0: initial thread, also a regular OpenMP thread.
3661   // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3662   // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3663   // regular OpenMP threads.
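  // For example, with __kmp_hidden_helper_threads_num == 8 and a capacity of
  // 64, the layout is (illustrative numbers):
  //   gtid 0        : initial thread
  //   gtid 1  .. 8  : hidden helper threads
  //   gtid 9  .. 63 : regular OpenMP threads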
3664   if (TCR_4(__kmp_init_hidden_helper_threads)) {
3665     // Find an available thread slot for hidden helper thread. Slots for hidden
3666     // helper threads start from 1 to __kmp_hidden_helper_threads_num.
3667     for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3668                    gtid <= __kmp_hidden_helper_threads_num;
3669          gtid++)
3670       ;
3671     KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3672     KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3673                  "hidden helper thread: T#%d\n",
3674                  gtid));
3675   } else {
3676     /* find an available thread slot */
3677     // Don't reassign the zero slot since we need that to only be used by
3678     // initial thread. Slots for hidden helper threads should also be skipped.
3679     if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3680       gtid = 0;
3681     } else {
3682       for (gtid = __kmp_hidden_helper_threads_num + 1;
3683            TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3684         ;
3685     }
3686     KA_TRACE(
3687         1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3688     KMP_ASSERT(gtid < __kmp_threads_capacity);
3689   }
3690 
3691   /* update global accounting */
3692   __kmp_all_nth++;
3693   TCW_4(__kmp_nth, __kmp_nth + 1);
3694 
3695   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3696   // numbers of procs, and method #2 (keyed API call) for higher numbers.
3697   if (__kmp_adjust_gtid_mode) {
3698     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3699       if (TCR_4(__kmp_gtid_mode) != 2) {
3700         TCW_4(__kmp_gtid_mode, 2);
3701       }
3702     } else {
3703       if (TCR_4(__kmp_gtid_mode) != 1) {
3704         TCW_4(__kmp_gtid_mode, 1);
3705       }
3706     }
3707   }
3708 
3709 #ifdef KMP_ADJUST_BLOCKTIME
3710   /* Adjust blocktime to zero if necessary            */
3711   /* Middle initialization might not have occurred yet */
3712   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3713     if (__kmp_nth > __kmp_avail_proc) {
3714       __kmp_zero_bt = TRUE;
3715     }
3716   }
3717 #endif /* KMP_ADJUST_BLOCKTIME */
3718 
3719   /* setup this new hierarchy */
3720   if (!(root = __kmp_root[gtid])) {
3721     root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3722     KMP_DEBUG_ASSERT(!root->r.r_root_team);
3723   }
3724 
3725 #if KMP_STATS_ENABLED
3726   // Initialize stats as soon as possible (right after gtid assignment).
3727   __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3728   __kmp_stats_thread_ptr->startLife();
3729   KMP_SET_THREAD_STATE(SERIAL_REGION);
3730   KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3731 #endif
3732   __kmp_initialize_root(root);
3733 
3734   /* setup new root thread structure */
3735   if (root->r.r_uber_thread) {
3736     root_thread = root->r.r_uber_thread;
3737   } else {
3738     root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3739     if (__kmp_storage_map) {
3740       __kmp_print_thread_storage_map(root_thread, gtid);
3741     }
3742     root_thread->th.th_info.ds.ds_gtid = gtid;
3743 #if OMPT_SUPPORT
3744     root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3745 #endif
3746     root_thread->th.th_root = root;
3747     if (__kmp_env_consistency_check) {
3748       root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3749     }
3750 #if USE_FAST_MEMORY
3751     __kmp_initialize_fast_memory(root_thread);
3752 #endif /* USE_FAST_MEMORY */
3753 
3754 #if KMP_USE_BGET
3755     KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3756     __kmp_initialize_bget(root_thread);
3757 #endif
3758     __kmp_init_random(root_thread); // Initialize random number generator
3759   }
3760 
3761   /* setup the serial team held in reserve by the root thread */
3762   if (!root_thread->th.th_serial_team) {
3763     kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3764     KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3765     root_thread->th.th_serial_team = __kmp_allocate_team(
3766         root, 1, 1,
3767 #if OMPT_SUPPORT
3768         ompt_data_none, // root parallel id
3769 #endif
3770         proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3771   }
3772   KMP_ASSERT(root_thread->th.th_serial_team);
3773   KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3774                 root_thread->th.th_serial_team));
3775 
3776   /* drop root_thread into place */
3777   TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3778 
3779   root->r.r_root_team->t.t_threads[0] = root_thread;
3780   root->r.r_hot_team->t.t_threads[0] = root_thread;
3781   root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
  // AC: this team is created in reserve, not for execution (unused for now).
3783   root_thread->th.th_serial_team->t.t_serialized = 0;
3784   root->r.r_uber_thread = root_thread;
3785 
3786   /* initialize the thread, get it ready to go */
3787   __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3788   TCW_4(__kmp_init_gtid, TRUE);
3789 
3790   /* prepare the primary thread for get_gtid() */
3791   __kmp_gtid_set_specific(gtid);
3792 
3793 #if USE_ITT_BUILD
3794   __kmp_itt_thread_name(gtid);
3795 #endif /* USE_ITT_BUILD */
3796 
3797 #ifdef KMP_TDATA_GTID
3798   __kmp_gtid = gtid;
3799 #endif
3800   __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3801   KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3802 
3803   KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3804                 "plain=%u\n",
3805                 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3806                 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3807                 KMP_INIT_BARRIER_STATE));
3808   { // Initialize barrier data.
3809     int b;
3810     for (b = 0; b < bs_last_barrier; ++b) {
3811       root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3812 #if USE_DEBUGGER
3813       root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3814 #endif
3815     }
3816   }
3817   KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3818                    KMP_INIT_BARRIER_STATE);
3819 
3820 #if KMP_AFFINITY_SUPPORTED
3821   root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3822   root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3823   root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3824   root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3825 #endif /* KMP_AFFINITY_SUPPORTED */
3826   root_thread->th.th_def_allocator = __kmp_def_allocator;
3827   root_thread->th.th_prev_level = 0;
3828   root_thread->th.th_prev_num_threads = 1;
3829 
3830   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3831   tmp->cg_root = root_thread;
3832   tmp->cg_thread_limit = __kmp_cg_max_nth;
3833   tmp->cg_nthreads = 1;
3834   KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3835                  " cg_nthreads init to 1\n",
3836                  root_thread, tmp));
3837   tmp->up = NULL;
3838   root_thread->th.th_cg_roots = tmp;
3839 
3840   __kmp_root_counter++;
3841 
3842 #if OMPT_SUPPORT
3843   if (!initial_thread && ompt_enabled.enabled) {
3844 
3845     kmp_info_t *root_thread = ompt_get_thread();
3846 
3847     ompt_set_thread_state(root_thread, ompt_state_overhead);
3848 
3849     if (ompt_enabled.ompt_callback_thread_begin) {
3850       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3851           ompt_thread_initial, __ompt_get_thread_data_internal());
3852     }
3853     ompt_data_t *task_data;
3854     ompt_data_t *parallel_data;
3855     __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
3856                                   NULL);
3857     if (ompt_enabled.ompt_callback_implicit_task) {
3858       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3859           ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3860     }
3861 
3862     ompt_set_thread_state(root_thread, ompt_state_work_serial);
3863   }
3864 #endif
3865 #if OMPD_SUPPORT
3866   if (ompd_state & OMPD_ENABLE_BP)
3867     ompd_bp_thread_begin();
3868 #endif
3869 
3870   KMP_MB();
3871   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3872 
3873   return gtid;
3874 }
3875 
3876 #if KMP_NESTED_HOT_TEAMS
3877 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3878                                 const int max_level) {
3879   int i, n, nth;
3880   kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3881   if (!hot_teams || !hot_teams[level].hot_team) {
3882     return 0;
3883   }
3884   KMP_DEBUG_ASSERT(level < max_level);
3885   kmp_team_t *team = hot_teams[level].hot_team;
3886   nth = hot_teams[level].hot_team_nth;
3887   n = nth - 1; // primary thread is not freed
3888   if (level < max_level - 1) {
3889     for (i = 0; i < nth; ++i) {
3890       kmp_info_t *th = team->t.t_threads[i];
3891       n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3892       if (i > 0 && th->th.th_hot_teams) {
3893         __kmp_free(th->th.th_hot_teams);
3894         th->th.th_hot_teams = NULL;
3895       }
3896     }
3897   }
3898   __kmp_free_team(root, team, NULL);
3899   return n;
3900 }
3901 #endif
3902 
// Resets a root thread and clears its root and hot teams.
3904 // Returns the number of __kmp_threads entries directly and indirectly freed.
3905 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3906   kmp_team_t *root_team = root->r.r_root_team;
3907   kmp_team_t *hot_team = root->r.r_hot_team;
3908   int n = hot_team->t.t_nproc;
3909   int i;
3910 
3911   KMP_DEBUG_ASSERT(!root->r.r_active);
3912 
3913   root->r.r_root_team = NULL;
3914   root->r.r_hot_team = NULL;
  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
  // before the call to __kmp_free_team().
3917   __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3918 #if KMP_NESTED_HOT_TEAMS
3919   if (__kmp_hot_teams_max_level >
3920       0) { // need to free nested hot teams and their threads if any
3921     for (i = 0; i < hot_team->t.t_nproc; ++i) {
3922       kmp_info_t *th = hot_team->t.t_threads[i];
3923       if (__kmp_hot_teams_max_level > 1) {
3924         n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3925       }
3926       if (th->th.th_hot_teams) {
3927         __kmp_free(th->th.th_hot_teams);
3928         th->th.th_hot_teams = NULL;
3929       }
3930     }
3931   }
3932 #endif
3933   __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3934 
3935   // Before we can reap the thread, we need to make certain that all other
3936   // threads in the teams that had this root as ancestor have stopped trying to
3937   // steal tasks.
3938   if (__kmp_tasking_mode != tskm_immediate_exec) {
3939     __kmp_wait_to_unref_task_teams();
3940   }
3941 
3942 #if KMP_OS_WINDOWS
3943   /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3944   KA_TRACE(
3945       10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3946            "\n",
3947            (LPVOID) & (root->r.r_uber_thread->th),
3948            root->r.r_uber_thread->th.th_info.ds.ds_thread));
3949   __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3950 #endif /* KMP_OS_WINDOWS */
3951 
3952 #if OMPD_SUPPORT
3953   if (ompd_state & OMPD_ENABLE_BP)
3954     ompd_bp_thread_end();
3955 #endif
3956 
3957 #if OMPT_SUPPORT
3958   ompt_data_t *task_data;
3959   ompt_data_t *parallel_data;
3960   __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
3961                                 NULL);
3962   if (ompt_enabled.ompt_callback_implicit_task) {
3963     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3964         ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
3965   }
3966   if (ompt_enabled.ompt_callback_thread_end) {
3967     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3968         &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3969   }
3970 #endif
3971 
3972   TCW_4(__kmp_nth,
3973         __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3974   i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
3975   KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
3976                  " to %d\n",
3977                  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
3978                  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
3979   if (i == 1) {
3980     // need to free contention group structure
3981     KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
3982                      root->r.r_uber_thread->th.th_cg_roots->cg_root);
3983     KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
3984     __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
3985     root->r.r_uber_thread->th.th_cg_roots = NULL;
3986   }
3987   __kmp_reap_thread(root->r.r_uber_thread, 1);
3988 
  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
  // it instead of freeing it.
3991   root->r.r_uber_thread = NULL;
3992   /* mark root as no longer in use */
3993   root->r.r_begin = FALSE;
3994 
3995   return n;
3996 }
3997 
3998 void __kmp_unregister_root_current_thread(int gtid) {
3999   KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
  /* This lock should be OK, since unregister_root_current_thread is never
     called during an abort, only during a normal close. Furthermore, if you
     have the forkjoin lock, you should never try to get the initz lock. */
4003   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
4004   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
4005     KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
4006                   "exiting T#%d\n",
4007                   gtid));
4008     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4009     return;
4010   }
4011   kmp_root_t *root = __kmp_root[gtid];
4012 
4013   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4014   KMP_ASSERT(KMP_UBER_GTID(gtid));
4015   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4016   KMP_ASSERT(root->r.r_active == FALSE);
4017 
4018   KMP_MB();
4019 
4020   kmp_info_t *thread = __kmp_threads[gtid];
4021   kmp_team_t *team = thread->th.th_team;
4022   kmp_task_team_t *task_team = thread->th.th_task_team;
4023 
4024   // we need to wait for the proxy tasks before finishing the thread
4025   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
4026 #if OMPT_SUPPORT
4027     // the runtime is shutting down so we won't report any events
4028     thread->th.ompt_thread_info.state = ompt_state_undefined;
4029 #endif
4030     __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4031   }
4032 
4033   __kmp_reset_root(gtid, root);
4034 
4035   KMP_MB();
4036   KC_TRACE(10,
4037            ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4038 
4039   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4040 }
4041 
4042 #if KMP_OS_WINDOWS
4043 /* __kmp_forkjoin_lock must be already held
4044    Unregisters a root thread that is not the current thread.  Returns the number
4045    of __kmp_threads entries freed as a result. */
4046 static int __kmp_unregister_root_other_thread(int gtid) {
4047   kmp_root_t *root = __kmp_root[gtid];
4048   int r;
4049 
4050   KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4051   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4052   KMP_ASSERT(KMP_UBER_GTID(gtid));
4053   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4054   KMP_ASSERT(root->r.r_active == FALSE);
4055 
4056   r = __kmp_reset_root(gtid, root);
4057   KC_TRACE(10,
4058            ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4059   return r;
4060 }
4061 #endif
4062 
4063 #if KMP_DEBUG
4064 void __kmp_task_info() {
4065 
4066   kmp_int32 gtid = __kmp_entry_gtid();
4067   kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4068   kmp_info_t *this_thr = __kmp_threads[gtid];
4069   kmp_team_t *steam = this_thr->th.th_serial_team;
4070   kmp_team_t *team = this_thr->th.th_team;
4071 
4072   __kmp_printf(
4073       "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4074       "ptask=%p\n",
4075       gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4076       team->t.t_implicit_task_taskdata[tid].td_parent);
4077 }
4078 #endif // KMP_DEBUG
4079 
4080 /* TODO optimize with one big memclr, take out what isn't needed, split
4081    responsibility to workers as much as possible, and delay initialization of
4082    features as much as possible  */
4083 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4084                                   int tid, int gtid) {
  /* this_thr->th.th_info.ds.ds_gtid is set up in
     kmp_allocate_thread/create_worker.
     this_thr->th.th_serial_team is set up in __kmp_allocate_thread. */
4088   KMP_DEBUG_ASSERT(this_thr != NULL);
4089   KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4090   KMP_DEBUG_ASSERT(team);
4091   KMP_DEBUG_ASSERT(team->t.t_threads);
4092   KMP_DEBUG_ASSERT(team->t.t_dispatch);
4093   kmp_info_t *master = team->t.t_threads[0];
4094   KMP_DEBUG_ASSERT(master);
4095   KMP_DEBUG_ASSERT(master->th.th_root);
4096 
4097   KMP_MB();
4098 
4099   TCW_SYNC_PTR(this_thr->th.th_team, team);
4100 
4101   this_thr->th.th_info.ds.ds_tid = tid;
4102   this_thr->th.th_set_nproc = 0;
4103   if (__kmp_tasking_mode != tskm_immediate_exec)
    // When tasking is possible, threads are not safe to reap until they are
    // done tasking; this flag is set when the tasking code is exited in the
    // wait routine.
4106     this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4107   else // no tasking --> always safe to reap
4108     this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4109   this_thr->th.th_set_proc_bind = proc_bind_default;
4110 #if KMP_AFFINITY_SUPPORTED
4111   this_thr->th.th_new_place = this_thr->th.th_current_place;
4112 #endif
4113   this_thr->th.th_root = master->th.th_root;
4114 
4115   /* setup the thread's cache of the team structure */
4116   this_thr->th.th_team_nproc = team->t.t_nproc;
4117   this_thr->th.th_team_master = master;
4118   this_thr->th.th_team_serialized = team->t.t_serialized;
4119   TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4120 
4121   KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4122 
4123   KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4124                 tid, gtid, this_thr, this_thr->th.th_current_task));
4125 
4126   __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4127                            team, tid, TRUE);
4128 
4129   KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4130                 tid, gtid, this_thr, this_thr->th.th_current_task));
4131   // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4132   // __kmp_initialize_team()?
4133 
4134   /* TODO no worksharing in speculative threads */
4135   this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4136 
4137   this_thr->th.th_local.this_construct = 0;
4138 
4139   if (!this_thr->th.th_pri_common) {
4140     this_thr->th.th_pri_common =
4141         (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4142     if (__kmp_storage_map) {
4143       __kmp_print_storage_map_gtid(
4144           gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4145           sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4146     }
4147     this_thr->th.th_pri_head = NULL;
4148   }
4149 
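  // Contention-group (CG) bookkeeping: th_cg_roots points at the CG this
  // thread belongs to, which carries the thread-limit ICV shared by the group.
  // A worker joining this team leaves its old CG (freeing it if it was the
  // last member) and adopts the primary thread's CG, as done below.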
4150   if (this_thr != master && // Primary thread's CG root is initialized elsewhere
4151       this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4152     // Make new thread's CG root same as primary thread's
4153     KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4154     kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4155     if (tmp) {
4156       // worker changes CG, need to check if old CG should be freed
4157       int i = tmp->cg_nthreads--;
4158       KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4159                      " on node %p of thread %p to %d\n",
4160                      this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4161       if (i == 1) {
4162         __kmp_free(tmp); // last thread left CG --> free it
4163       }
4164     }
4165     this_thr->th.th_cg_roots = master->th.th_cg_roots;
4166     // Increment new thread's CG root's counter to add the new thread
4167     this_thr->th.th_cg_roots->cg_nthreads++;
4168     KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4169                    " node %p of thread %p to %d\n",
4170                    this_thr, this_thr->th.th_cg_roots,
4171                    this_thr->th.th_cg_roots->cg_root,
4172                    this_thr->th.th_cg_roots->cg_nthreads));
4173     this_thr->th.th_current_task->td_icvs.thread_limit =
4174         this_thr->th.th_cg_roots->cg_thread_limit;
4175   }
4176 
4177   /* Initialize dynamic dispatch */
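  // A serialized team (max_nproc == 1) needs only one private-info buffer;
  // otherwise __kmp_dispatch_num_buffers buffers are allocated so that several
  // consecutive dynamically scheduled (nowait) loops can be in flight at once.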
4178   {
4179     volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4180     // Use team max_nproc since this will never change for the team.
4181     size_t disp_size =
4182         sizeof(dispatch_private_info_t) *
4183         (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4184     KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4185                   team->t.t_max_nproc));
4186     KMP_ASSERT(dispatch);
4187     KMP_DEBUG_ASSERT(team->t.t_dispatch);
4188     KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4189 
4190     dispatch->th_disp_index = 0;
4191     dispatch->th_doacross_buf_idx = 0;
4192     if (!dispatch->th_disp_buffer) {
4193       dispatch->th_disp_buffer =
4194           (dispatch_private_info_t *)__kmp_allocate(disp_size);
4195 
4196       if (__kmp_storage_map) {
4197         __kmp_print_storage_map_gtid(
4198             gtid, &dispatch->th_disp_buffer[0],
4199             &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4200                                           ? 1
4201                                           : __kmp_dispatch_num_buffers],
4202             disp_size,
4203             "th_%d.th_dispatch.th_disp_buffer "
4204             "(team_%d.t_dispatch[%d].th_disp_buffer)",
4205             gtid, team->t.t_id, gtid);
4206       }
4207     } else {
4208       memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4209     }
4210 
4211     dispatch->th_dispatch_pr_current = 0;
4212     dispatch->th_dispatch_sh_current = 0;
4213 
4214     dispatch->th_deo_fcn = 0; /* ORDERED     */
4215     dispatch->th_dxo_fcn = 0; /* END ORDERED */
4216   }
4217 
4218   this_thr->th.th_next_pool = NULL;
4219 
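  // The task-state memo stack saves th_task_state across nested (hot) parallel
  // levels; it starts with a small default capacity and is grown on demand
  // elsewhere in the runtime.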
4220   if (!this_thr->th.th_task_state_memo_stack) {
4221     size_t i;
4222     this_thr->th.th_task_state_memo_stack =
4223         (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4224     this_thr->th.th_task_state_top = 0;
4225     this_thr->th.th_task_state_stack_sz = 4;
4226     for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4227          ++i) // zero init the stack
4228       this_thr->th.th_task_state_memo_stack[i] = 0;
4229   }
4230 
4231   KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4232   KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4233 
4234   KMP_MB();
4235 }
4236 
/* Allocate a new thread for the requesting team. This is only called from
   within a forkjoin critical section. We first try to get an available thread
   from the thread pool; if none is available, we fork a new one, assuming we
   are able to create one. This should be assured, as the caller should have
   checked on this first. */
4242 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4243                                   int new_tid) {
4244   kmp_team_t *serial_team;
4245   kmp_info_t *new_thr;
4246   int new_gtid;
4247 
4248   KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4249   KMP_DEBUG_ASSERT(root && team);
4250 #if !KMP_NESTED_HOT_TEAMS
4251   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4252 #endif
4253   KMP_MB();
4254 
4255   /* first, try to get one from the thread pool */
4256   if (__kmp_thread_pool) {
4257     new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4258     __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4259     if (new_thr == __kmp_thread_pool_insert_pt) {
4260       __kmp_thread_pool_insert_pt = NULL;
4261     }
4262     TCW_4(new_thr->th.th_in_pool, FALSE);
4263     __kmp_suspend_initialize_thread(new_thr);
4264     __kmp_lock_suspend_mx(new_thr);
4265     if (new_thr->th.th_active_in_pool == TRUE) {
4266       KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4267       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4268       new_thr->th.th_active_in_pool = FALSE;
4269     }
4270     __kmp_unlock_suspend_mx(new_thr);
4271 
4272     KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4273                   __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4274     KMP_ASSERT(!new_thr->th.th_team);
4275     KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4276 
4277     /* setup the thread structure */
4278     __kmp_initialize_info(new_thr, team, new_tid,
4279                           new_thr->th.th_info.ds.ds_gtid);
4280     KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4281 
4282     TCW_4(__kmp_nth, __kmp_nth + 1);
4283 
4284     new_thr->th.th_task_state = 0;
4285     new_thr->th.th_task_state_top = 0;
4286     new_thr->th.th_task_state_stack_sz = 4;
4287 
4288 #ifdef KMP_ADJUST_BLOCKTIME
4289     /* Adjust blocktime back to zero if necessary */
4290     /* Middle initialization might not have occurred yet */
4291     if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4292       if (__kmp_nth > __kmp_avail_proc) {
4293         __kmp_zero_bt = TRUE;
4294       }
4295     }
4296 #endif /* KMP_ADJUST_BLOCKTIME */
4297 
4298 #if KMP_DEBUG
    // If the thread entered the pool via __kmp_free_thread, wait_flag should
    // not equal KMP_BARRIER_PARENT_FLAG.
4301     int b;
4302     kmp_balign_t *balign = new_thr->th.th_bar;
4303     for (b = 0; b < bs_last_barrier; ++b)
4304       KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4305 #endif
4306 
4307     KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4308                   __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4309 
4310     KMP_MB();
4311     return new_thr;
4312   }
4313 
  /* no, we'll fork a new one */
4315   KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4316   KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4317 
4318 #if KMP_USE_MONITOR
4319   // If this is the first worker thread the RTL is creating, then also
4320   // launch the monitor thread.  We try to do this as early as possible.
4321   if (!TCR_4(__kmp_init_monitor)) {
4322     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4323     if (!TCR_4(__kmp_init_monitor)) {
4324       KF_TRACE(10, ("before __kmp_create_monitor\n"));
4325       TCW_4(__kmp_init_monitor, 1);
4326       __kmp_create_monitor(&__kmp_monitor);
4327       KF_TRACE(10, ("after __kmp_create_monitor\n"));
4328 #if KMP_OS_WINDOWS
      // AC: wait until the monitor has started. This is a fix for CQ232808.
      // The reason is that if the library is loaded/unloaded in a loop with
      // small (parallel) work in between, then there is a high probability
      // that the monitor thread starts after the library has shut down. At
      // shutdown it is too late to cope with the problem, because when the
      // primary thread is in DllMain (process detach) the monitor has no
      // chance to start (it is blocked), and the primary thread has no means
      // to inform the monitor that the library has gone, because all the
      // memory which the monitor can access is about to be released/reset.
4338       while (TCR_4(__kmp_init_monitor) < 2) {
4339         KMP_YIELD(TRUE);
4340       }
4341       KF_TRACE(10, ("after monitor thread has started\n"));
4342 #endif
4343     }
4344     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4345   }
4346 #endif
4347 
4348   KMP_MB();
4349 
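  // Find a free gtid slot. Hidden helper threads occupy gtids
  // 1..__kmp_hidden_helper_threads_num, so while they are being initialized
  // the search starts at 1; afterwards, regular workers start searching just
  // past that reserved range.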
4350   {
4351     int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4352                              ? 1
4353                              : __kmp_hidden_helper_threads_num + 1;
4354 
4355     for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4356          ++new_gtid) {
4357       KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4358     }
4359 
4360     if (TCR_4(__kmp_init_hidden_helper_threads)) {
4361       KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4362     }
4363   }
4364 
4365   /* allocate space for it. */
4366   new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4367 
4368   TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4369 
4370 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4371   // suppress race conditions detection on synchronization flags in debug mode
4372   // this helps to analyze library internals eliminating false positives
4373   __itt_suppress_mark_range(
4374       __itt_suppress_range, __itt_suppress_threading_errors,
4375       &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4376   __itt_suppress_mark_range(
4377       __itt_suppress_range, __itt_suppress_threading_errors,
4378       &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4379 #if KMP_OS_WINDOWS
4380   __itt_suppress_mark_range(
4381       __itt_suppress_range, __itt_suppress_threading_errors,
4382       &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4383 #else
4384   __itt_suppress_mark_range(__itt_suppress_range,
4385                             __itt_suppress_threading_errors,
4386                             &new_thr->th.th_suspend_init_count,
4387                             sizeof(new_thr->th.th_suspend_init_count));
4388 #endif
4389   // TODO: check if we need to also suppress b_arrived flags
4390   __itt_suppress_mark_range(__itt_suppress_range,
4391                             __itt_suppress_threading_errors,
4392                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4393                             sizeof(new_thr->th.th_bar[0].bb.b_go));
4394   __itt_suppress_mark_range(__itt_suppress_range,
4395                             __itt_suppress_threading_errors,
4396                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4397                             sizeof(new_thr->th.th_bar[1].bb.b_go));
4398   __itt_suppress_mark_range(__itt_suppress_range,
4399                             __itt_suppress_threading_errors,
4400                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4401                             sizeof(new_thr->th.th_bar[2].bb.b_go));
4402 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4403   if (__kmp_storage_map) {
4404     __kmp_print_thread_storage_map(new_thr, new_gtid);
4405   }
4406 
4407   // add the reserve serialized team, initialized from the team's primary thread
4408   {
4409     kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4410     KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4411     new_thr->th.th_serial_team = serial_team =
4412         (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4413 #if OMPT_SUPPORT
4414                                           ompt_data_none, // root parallel id
4415 #endif
4416                                           proc_bind_default, &r_icvs,
4417                                           0 USE_NESTED_HOT_ARG(NULL));
4418   }
4419   KMP_ASSERT(serial_team);
  serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not
  // for execution (it is unused for now).
4422   serial_team->t.t_threads[0] = new_thr;
4423   KF_TRACE(10,
4424            ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4425             new_thr));
4426 
4427   /* setup the thread structures */
4428   __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4429 
4430 #if USE_FAST_MEMORY
4431   __kmp_initialize_fast_memory(new_thr);
4432 #endif /* USE_FAST_MEMORY */
4433 
4434 #if KMP_USE_BGET
4435   KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4436   __kmp_initialize_bget(new_thr);
4437 #endif
4438 
4439   __kmp_init_random(new_thr); // Initialize random number generator
4440 
4441   /* Initialize these only once when thread is grabbed for a team allocation */
4442   KA_TRACE(20,
4443            ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4444             __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4445 
4446   int b;
4447   kmp_balign_t *balign = new_thr->th.th_bar;
4448   for (b = 0; b < bs_last_barrier; ++b) {
4449     balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4450     balign[b].bb.team = NULL;
4451     balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4452     balign[b].bb.use_oncore_barrier = 0;
4453   }
4454 
4455   new_thr->th.th_spin_here = FALSE;
4456   new_thr->th.th_next_waiting = 0;
4457 #if KMP_OS_UNIX
4458   new_thr->th.th_blocking = false;
4459 #endif
4460 
4461 #if KMP_AFFINITY_SUPPORTED
4462   new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4463   new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4464   new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4465   new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4466 #endif
4467   new_thr->th.th_def_allocator = __kmp_def_allocator;
4468   new_thr->th.th_prev_level = 0;
4469   new_thr->th.th_prev_num_threads = 1;
4470 
4471   TCW_4(new_thr->th.th_in_pool, FALSE);
4472   new_thr->th.th_active_in_pool = FALSE;
4473   TCW_4(new_thr->th.th_active, TRUE);
4474 
4475   /* adjust the global counters */
4476   __kmp_all_nth++;
4477   __kmp_nth++;
4478 
4479   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4480   // numbers of procs, and method #2 (keyed API call) for higher numbers.
4481   if (__kmp_adjust_gtid_mode) {
4482     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4483       if (TCR_4(__kmp_gtid_mode) != 2) {
4484         TCW_4(__kmp_gtid_mode, 2);
4485       }
4486     } else {
4487       if (TCR_4(__kmp_gtid_mode) != 1) {
4488         TCW_4(__kmp_gtid_mode, 1);
4489       }
4490     }
4491   }
4492 
4493 #ifdef KMP_ADJUST_BLOCKTIME
4494   /* Adjust blocktime back to zero if necessary       */
4495   /* Middle initialization might not have occurred yet */
4496   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4497     if (__kmp_nth > __kmp_avail_proc) {
4498       __kmp_zero_bt = TRUE;
4499     }
4500   }
4501 #endif /* KMP_ADJUST_BLOCKTIME */
4502 
4503   /* actually fork it and create the new worker thread */
4504   KF_TRACE(
4505       10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4506   __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4507   KF_TRACE(10,
4508            ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4509 
4510   KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4511                 new_gtid));
4512   KMP_MB();
4513   return new_thr;
4514 }
4515 
/* Reinitialize team for reuse.
   The hot team code calls this routine at every fork barrier, so the EPCC
   barrier tests are extremely sensitive to changes in it, esp. writes to the
   team struct, which cause a cache invalidation in all threads.
   IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4521 static void __kmp_reinitialize_team(kmp_team_t *team,
4522                                     kmp_internal_control_t *new_icvs,
4523                                     ident_t *loc) {
4524   KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4525                 team->t.t_threads[0], team));
4526   KMP_DEBUG_ASSERT(team && new_icvs);
4527   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4528   KMP_CHECK_UPDATE(team->t.t_ident, loc);
4529 
4530   KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4531   // Copy ICVs to the primary thread's implicit taskdata
4532   __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4533   copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4534 
4535   KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4536                 team->t.t_threads[0], team));
4537 }
4538 
4539 /* Initialize the team data structure.
4540    This assumes the t_threads and t_max_nproc are already set.
4541    Also, we don't touch the arguments */
4542 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4543                                   kmp_internal_control_t *new_icvs,
4544                                   ident_t *loc) {
4545   KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4546 
4547   /* verify */
4548   KMP_DEBUG_ASSERT(team);
4549   KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4550   KMP_DEBUG_ASSERT(team->t.t_threads);
4551   KMP_MB();
4552 
4553   team->t.t_master_tid = 0; /* not needed */
4554   /* team->t.t_master_bar;        not needed */
4555   team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4556   team->t.t_nproc = new_nproc;
4557 
4558   /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
4559   team->t.t_next_pool = NULL;
4560   /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4561    * up hot team */
4562 
4563   TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4564   team->t.t_invoke = NULL; /* not needed */
4565 
4566   // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
4567   team->t.t_sched.sched = new_icvs->sched.sched;
4568 
4569 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4570   team->t.t_fp_control_saved = FALSE; /* not needed */
4571   team->t.t_x87_fpu_control_word = 0; /* not needed */
4572   team->t.t_mxcsr = 0; /* not needed */
4573 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4574 
4575   team->t.t_construct = 0;
4576 
4577   team->t.t_ordered.dt.t_value = 0;
4578   team->t.t_master_active = FALSE;
4579 
4580 #ifdef KMP_DEBUG
4581   team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4582 #endif
4583 #if KMP_OS_WINDOWS
4584   team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4585 #endif
4586 
4587   team->t.t_control_stack_top = NULL;
4588 
4589   __kmp_reinitialize_team(team, new_icvs, loc);
4590 
4591   KMP_MB();
4592   KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4593 }
4594 
4595 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
/* Sets the full mask for the current thread and saves the old mask in
   *old_mask (if non-NULL); makes no changes to the affinity structures. */
4597 static void
4598 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4599   if (KMP_AFFINITY_CAPABLE()) {
4600     int status;
4601     if (old_mask != NULL) {
4602       status = __kmp_get_system_affinity(old_mask, TRUE);
4603       int error = errno;
4604       if (status != 0) {
4605         __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4606                     __kmp_msg_null);
4607       }
4608     }
4609     __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4610   }
4611 }
4612 #endif
4613 
4614 #if KMP_AFFINITY_SUPPORTED
4615 
// __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
// It calculates the worker + primary thread's partition based upon the parent
// thread's partition, and binds each worker to a place in its partition.
// The primary thread's partition should already include its current binding.
4620 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4621   // Do not partition places for the hidden helper team
4622   if (KMP_HIDDEN_HELPER_TEAM(team))
4623     return;
4624   // Copy the primary thread's place partition to the team struct
4625   kmp_info_t *master_th = team->t.t_threads[0];
4626   KMP_DEBUG_ASSERT(master_th != NULL);
4627   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4628   int first_place = master_th->th.th_first_place;
4629   int last_place = master_th->th.th_last_place;
4630   int masters_place = master_th->th.th_current_place;
4631   team->t.t_first_place = first_place;
4632   team->t.t_last_place = last_place;
4633 
4634   KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4635                 "bound to place %d partition = [%d,%d]\n",
4636                 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4637                 team->t.t_id, masters_place, first_place, last_place));
4638 
4639   switch (proc_bind) {
4640 
4641   case proc_bind_default:
4642     // Serial teams might have the proc_bind policy set to proc_bind_default.
4643     // Not an issue -- we don't rebind primary thread for any proc_bind policy.
4644     KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4645     break;
4646 
4647   case proc_bind_primary: {
4648     int f;
4649     int n_th = team->t.t_nproc;
4650     for (f = 1; f < n_th; f++) {
4651       kmp_info_t *th = team->t.t_threads[f];
4652       KMP_DEBUG_ASSERT(th != NULL);
4653       th->th.th_first_place = first_place;
4654       th->th.th_last_place = last_place;
4655       th->th.th_new_place = masters_place;
4656       if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4657           team->t.t_display_affinity != 1) {
4658         team->t.t_display_affinity = 1;
4659       }
4660 
4661       KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
4662                      "partition = [%d,%d]\n",
4663                      __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4664                      f, masters_place, first_place, last_place));
4665     }
4666   } break;
4667 
4668   case proc_bind_close: {
4669     int f;
4670     int n_th = team->t.t_nproc;
4671     int n_places;
4672     if (first_place <= last_place) {
4673       n_places = last_place - first_place + 1;
4674     } else {
4675       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4676     }
4677     if (n_th <= n_places) {
4678       int place = masters_place;
4679       for (f = 1; f < n_th; f++) {
4680         kmp_info_t *th = team->t.t_threads[f];
4681         KMP_DEBUG_ASSERT(th != NULL);
4682 
4683         if (place == last_place) {
4684           place = first_place;
4685         } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4686           place = 0;
4687         } else {
4688           place++;
4689         }
4690         th->th.th_first_place = first_place;
4691         th->th.th_last_place = last_place;
4692         th->th.th_new_place = place;
4693         if (__kmp_display_affinity && place != th->th.th_current_place &&
4694             team->t.t_display_affinity != 1) {
4695           team->t.t_display_affinity = 1;
4696         }
4697 
4698         KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4699                        "partition = [%d,%d]\n",
4700                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4701                        team->t.t_id, f, place, first_place, last_place));
4702       }
4703     } else {
4704       int S, rem, gap, s_count;
4705       S = n_th / n_places;
4706       s_count = 0;
4707       rem = n_th - (S * n_places);
4708       gap = rem > 0 ? n_places / rem : n_places;
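      // Threads are dealt out round-robin, S per place, with the rem leftover
      // threads spread every 'gap' places. E.g. (illustrative): n_th=10 and
      // n_places=4 give S=2, rem=2, gap=2, yielding 3,2,3,2 threads per place.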
4709       int place = masters_place;
4710       int gap_ct = gap;
4711       for (f = 0; f < n_th; f++) {
4712         kmp_info_t *th = team->t.t_threads[f];
4713         KMP_DEBUG_ASSERT(th != NULL);
4714 
4715         th->th.th_first_place = first_place;
4716         th->th.th_last_place = last_place;
4717         th->th.th_new_place = place;
4718         if (__kmp_display_affinity && place != th->th.th_current_place &&
4719             team->t.t_display_affinity != 1) {
4720           team->t.t_display_affinity = 1;
4721         }
4722         s_count++;
4723 
4724         if ((s_count == S) && rem && (gap_ct == gap)) {
4725           // do nothing, add an extra thread to place on next iteration
4726         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4727           // we added an extra thread to this place; move to next place
4728           if (place == last_place) {
4729             place = first_place;
4730           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4731             place = 0;
4732           } else {
4733             place++;
4734           }
4735           s_count = 0;
4736           gap_ct = 1;
4737           rem--;
4738         } else if (s_count == S) { // place full; don't add extra
4739           if (place == last_place) {
4740             place = first_place;
4741           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4742             place = 0;
4743           } else {
4744             place++;
4745           }
4746           gap_ct++;
4747           s_count = 0;
4748         }
4749 
4750         KA_TRACE(100,
4751                  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4752                   "partition = [%d,%d]\n",
4753                   __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4754                   th->th.th_new_place, first_place, last_place));
4755       }
4756       KMP_DEBUG_ASSERT(place == masters_place);
4757     }
4758   } break;
4759 
4760   case proc_bind_spread: {
4761     int f;
4762     int n_th = team->t.t_nproc;
4763     int n_places;
4764     int thidx;
4765     if (first_place <= last_place) {
4766       n_places = last_place - first_place + 1;
4767     } else {
4768       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4769     }
4770     if (n_th <= n_places) {
4771       int place = -1;
4772 
4773       if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4774         int S = n_places / n_th;
4775         int s_count, rem, gap, gap_ct;
4776 
4777         place = masters_place;
4778         rem = n_places - n_th * S;
4779         gap = rem ? n_th / rem : 1;
4780         gap_ct = gap;
4781         thidx = n_th;
4782         if (update_master_only == 1)
4783           thidx = 1;
4784         for (f = 0; f < thidx; f++) {
4785           kmp_info_t *th = team->t.t_threads[f];
4786           KMP_DEBUG_ASSERT(th != NULL);
4787 
4788           th->th.th_first_place = place;
4789           th->th.th_new_place = place;
4790           if (__kmp_display_affinity && place != th->th.th_current_place &&
4791               team->t.t_display_affinity != 1) {
4792             team->t.t_display_affinity = 1;
4793           }
4794           s_count = 1;
4795           while (s_count < S) {
4796             if (place == last_place) {
4797               place = first_place;
4798             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4799               place = 0;
4800             } else {
4801               place++;
4802             }
4803             s_count++;
4804           }
4805           if (rem && (gap_ct == gap)) {
4806             if (place == last_place) {
4807               place = first_place;
4808             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4809               place = 0;
4810             } else {
4811               place++;
4812             }
4813             rem--;
4814             gap_ct = 0;
4815           }
4816           th->th.th_last_place = place;
4817           gap_ct++;
4818 
4819           if (place == last_place) {
4820             place = first_place;
4821           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4822             place = 0;
4823           } else {
4824             place++;
4825           }
4826 
4827           KA_TRACE(100,
4828                    ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4829                     "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4830                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4831                     f, th->th.th_new_place, th->th.th_first_place,
4832                     th->th.th_last_place, __kmp_affinity_num_masks));
4833         }
4834       } else {
        /* Given a uniform space of available computation places, we can
           create T partitions of roughly P/T places each and put each thread
           into the first place of its partition. */
4838         double current = static_cast<double>(masters_place);
4839         double spacing =
4840             (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
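        // Illustrative example: with masters_place=0, n_places=8 and n_th=4,
        // spacing is 9/4 = 2.25, producing partitions [0,1], [2,3], [4,5] and
        // [6,7], with each thread bound to the first place of its partition.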
4841         int first, last;
4842         kmp_info_t *th;
4843 
4844         thidx = n_th + 1;
4845         if (update_master_only == 1)
4846           thidx = 1;
4847         for (f = 0; f < thidx; f++) {
4848           first = static_cast<int>(current);
4849           last = static_cast<int>(current + spacing) - 1;
4850           KMP_DEBUG_ASSERT(last >= first);
4851           if (first >= n_places) {
4852             if (masters_place) {
4853               first -= n_places;
4854               last -= n_places;
4855               if (first == (masters_place + 1)) {
4856                 KMP_DEBUG_ASSERT(f == n_th);
4857                 first--;
4858               }
4859               if (last == masters_place) {
4860                 KMP_DEBUG_ASSERT(f == (n_th - 1));
4861                 last--;
4862               }
4863             } else {
4864               KMP_DEBUG_ASSERT(f == n_th);
4865               first = 0;
4866               last = 0;
4867             }
4868           }
4869           if (last >= n_places) {
4870             last = (n_places - 1);
4871           }
4872           place = first;
4873           current += spacing;
4874           if (f < n_th) {
4875             KMP_DEBUG_ASSERT(0 <= first);
4876             KMP_DEBUG_ASSERT(n_places > first);
4877             KMP_DEBUG_ASSERT(0 <= last);
4878             KMP_DEBUG_ASSERT(n_places > last);
4879             KMP_DEBUG_ASSERT(last_place >= first_place);
4880             th = team->t.t_threads[f];
4881             KMP_DEBUG_ASSERT(th);
4882             th->th.th_first_place = first;
4883             th->th.th_new_place = place;
4884             th->th.th_last_place = last;
4885             if (__kmp_display_affinity && place != th->th.th_current_place &&
4886                 team->t.t_display_affinity != 1) {
4887               team->t.t_display_affinity = 1;
4888             }
4889             KA_TRACE(100,
4890                      ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4891                       "partition = [%d,%d], spacing = %.4f\n",
4892                       __kmp_gtid_from_thread(team->t.t_threads[f]),
4893                       team->t.t_id, f, th->th.th_new_place,
4894                       th->th.th_first_place, th->th.th_last_place, spacing));
4895           }
4896         }
4897       }
4898       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4899     } else {
4900       int S, rem, gap, s_count;
4901       S = n_th / n_places;
4902       s_count = 0;
4903       rem = n_th - (S * n_places);
4904       gap = rem > 0 ? n_places / rem : n_places;
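      // More threads than places: reuse the same round-robin scheme as in the
      // oversubscribed proc_bind_close case above, except that each thread's
      // partition collapses to the single place it is assigned to.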
4905       int place = masters_place;
4906       int gap_ct = gap;
4907       thidx = n_th;
4908       if (update_master_only == 1)
4909         thidx = 1;
4910       for (f = 0; f < thidx; f++) {
4911         kmp_info_t *th = team->t.t_threads[f];
4912         KMP_DEBUG_ASSERT(th != NULL);
4913 
4914         th->th.th_first_place = place;
4915         th->th.th_last_place = place;
4916         th->th.th_new_place = place;
4917         if (__kmp_display_affinity && place != th->th.th_current_place &&
4918             team->t.t_display_affinity != 1) {
4919           team->t.t_display_affinity = 1;
4920         }
4921         s_count++;
4922 
4923         if ((s_count == S) && rem && (gap_ct == gap)) {
4924           // do nothing, add an extra thread to place on next iteration
4925         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4926           // we added an extra thread to this place; move on to next place
4927           if (place == last_place) {
4928             place = first_place;
4929           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4930             place = 0;
4931           } else {
4932             place++;
4933           }
4934           s_count = 0;
4935           gap_ct = 1;
4936           rem--;
4937         } else if (s_count == S) { // place is full; don't add extra thread
4938           if (place == last_place) {
4939             place = first_place;
4940           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4941             place = 0;
4942           } else {
4943             place++;
4944           }
4945           gap_ct++;
4946           s_count = 0;
4947         }
4948 
4949         KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4950                        "partition = [%d,%d]\n",
4951                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4952                        team->t.t_id, f, th->th.th_new_place,
4953                        th->th.th_first_place, th->th.th_last_place));
4954       }
4955       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4956     }
4957   } break;
4958 
4959   default:
4960     break;
4961   }
4962 
4963   KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4964 }
4965 
4966 #endif // KMP_AFFINITY_SUPPORTED
4967 
4968 /* allocate a new team data structure to use.  take one off of the free pool if
4969    available */
4970 kmp_team_t *
4971 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4972 #if OMPT_SUPPORT
4973                     ompt_data_t ompt_parallel_data,
4974 #endif
4975                     kmp_proc_bind_t new_proc_bind,
4976                     kmp_internal_control_t *new_icvs,
4977                     int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4978   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4979   int f;
4980   kmp_team_t *team;
4981   int use_hot_team = !root->r.r_active;
4982   int level = 0;
4983 
4984   KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4985   KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4986   KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4987   KMP_MB();
4988 
4989 #if KMP_NESTED_HOT_TEAMS
4990   kmp_hot_team_ptr_t *hot_teams;
4991   if (master) {
4992     team = master->th.th_team;
4993     level = team->t.t_active_level;
4994     if (master->th.th_teams_microtask) { // in teams construct?
4995       if (master->th.th_teams_size.nteams > 1 &&
4996           ( // #teams > 1
4997               team->t.t_pkfn ==
4998                   (microtask_t)__kmp_teams_master || // inner fork of the teams
4999               master->th.th_teams_level <
5000                   team->t.t_level)) { // or nested parallel inside the teams
        ++level; // do not increment if #teams==1 or for the outer fork of the
        // teams; increment otherwise
5003       }
5004     }
5005     hot_teams = master->th.th_hot_teams;
5006     if (level < __kmp_hot_teams_max_level && hot_teams &&
5007         hot_teams[level].hot_team) {
5008       // hot team has already been allocated for given level
5009       use_hot_team = 1;
5010     } else {
5011       use_hot_team = 0;
5012     }
5013   } else {
    // Check that we won't access uninitialized hot_teams, just in case.
5015     KMP_DEBUG_ASSERT(new_nproc == 1);
5016   }
5017 #endif
5018   // Optimization to use a "hot" team
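  // A hot team is the team kept alive across parallel regions (one per root,
  // or one per nesting level with KMP_NESTED_HOT_TEAMS) so its threads,
  // barrier state, and arrays can be reused rather than reallocated each fork.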
5019   if (use_hot_team && new_nproc > 1) {
5020     KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
5021 #if KMP_NESTED_HOT_TEAMS
5022     team = hot_teams[level].hot_team;
5023 #else
5024     team = root->r.r_hot_team;
5025 #endif
5026 #if KMP_DEBUG
5027     if (__kmp_tasking_mode != tskm_immediate_exec) {
5028       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5029                     "task_team[1] = %p before reinit\n",
5030                     team->t.t_task_team[0], team->t.t_task_team[1]));
5031     }
5032 #endif
5033 
5034     // Has the number of threads changed?
5035     /* Let's assume the most common case is that the number of threads is
5036        unchanged, and put that case first. */
5037     if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5038       KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5039       // This case can mean that omp_set_num_threads() was called and the hot
5040       // team size was already reduced, so we check the special flag
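      // (-1 is recorded when omp_set_num_threads() shrinks the hot team;
      // convert it to 1 here so the team still registers as resized, otherwise
      // clear the flag.)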
5041       if (team->t.t_size_changed == -1) {
5042         team->t.t_size_changed = 1;
5043       } else {
5044         KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5045       }
5046 
5047       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5048       kmp_r_sched_t new_sched = new_icvs->sched;
5049       // set primary thread's schedule as new run-time schedule
5050       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5051 
5052       __kmp_reinitialize_team(team, new_icvs,
5053                               root->r.r_uber_thread->th.th_ident);
5054 
5055       KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5056                     team->t.t_threads[0], team));
5057       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5058 
5059 #if KMP_AFFINITY_SUPPORTED
5060       if ((team->t.t_size_changed == 0) &&
5061           (team->t.t_proc_bind == new_proc_bind)) {
5062         if (new_proc_bind == proc_bind_spread) {
5063           __kmp_partition_places(
5064               team, 1); // add flag to update only master for spread
5065         }
5066         KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5067                        "proc_bind = %d, partition = [%d,%d]\n",
5068                        team->t.t_id, new_proc_bind, team->t.t_first_place,
5069                        team->t.t_last_place));
5070       } else {
5071         KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5072         __kmp_partition_places(team);
5073       }
5074 #else
5075       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5076 #endif /* KMP_AFFINITY_SUPPORTED */
5077     } else if (team->t.t_nproc > new_nproc) {
5078       KA_TRACE(20,
5079                ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5080                 new_nproc));
5081 
5082       team->t.t_size_changed = 1;
5083 #if KMP_NESTED_HOT_TEAMS
5084       if (__kmp_hot_teams_mode == 0) {
        // AC: the saved number of threads should correspond to the team's
        // value in this mode; it can be bigger in mode 1, when the hot team
        // has threads in reserve.
5087         KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5088         hot_teams[level].hot_team_nth = new_nproc;
5089 #endif // KMP_NESTED_HOT_TEAMS
5090         /* release the extra threads we don't need any more */
5091         for (f = new_nproc; f < team->t.t_nproc; f++) {
5092           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5093           if (__kmp_tasking_mode != tskm_immediate_exec) {
5094             // When decreasing team size, threads no longer in the team should
5095             // unref task team.
5096             team->t.t_threads[f]->th.th_task_team = NULL;
5097           }
5098           __kmp_free_thread(team->t.t_threads[f]);
5099           team->t.t_threads[f] = NULL;
5100         }
5101 #if KMP_NESTED_HOT_TEAMS
5102       } // (__kmp_hot_teams_mode == 0)
5103       else {
5104         // When keeping extra threads in team, switch threads to wait on own
5105         // b_go flag
5106         for (f = new_nproc; f < team->t.t_nproc; ++f) {
5107           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5108           kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5109           for (int b = 0; b < bs_last_barrier; ++b) {
5110             if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5111               balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5112             }
5113             KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5114           }
5115         }
5116       }
5117 #endif // KMP_NESTED_HOT_TEAMS
5118       team->t.t_nproc = new_nproc;
5119       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5120       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5121       __kmp_reinitialize_team(team, new_icvs,
5122                               root->r.r_uber_thread->th.th_ident);
5123 
5124       // Update remaining threads
5125       for (f = 0; f < new_nproc; ++f) {
5126         team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5127       }
5128 
5129       // restore the current task state of the primary thread: should be the
5130       // implicit task
5131       KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5132                     team->t.t_threads[0], team));
5133 
5134       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5135 
5136 #ifdef KMP_DEBUG
5137       for (f = 0; f < team->t.t_nproc; f++) {
5138         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5139                          team->t.t_threads[f]->th.th_team_nproc ==
5140                              team->t.t_nproc);
5141       }
5142 #endif
5143 
5144       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5145 #if KMP_AFFINITY_SUPPORTED
5146       __kmp_partition_places(team);
5147 #endif
5148     } else { // team->t.t_nproc < new_nproc
5149 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5150       kmp_affin_mask_t *old_mask;
5151       if (KMP_AFFINITY_CAPABLE()) {
5152         KMP_CPU_ALLOC(old_mask);
5153       }
5154 #endif
5155 
5156       KA_TRACE(20,
5157                ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5158                 new_nproc));
5159 
5160       team->t.t_size_changed = 1;
5161 
5162 #if KMP_NESTED_HOT_TEAMS
5163       int avail_threads = hot_teams[level].hot_team_nth;
5164       if (new_nproc < avail_threads)
5165         avail_threads = new_nproc;
5166       kmp_info_t **other_threads = team->t.t_threads;
5167       for (f = team->t.t_nproc; f < avail_threads; ++f) {
5168         // Adjust barrier data of reserved threads (if any) of the team
5169         // Other data will be set in __kmp_initialize_info() below.
5170         int b;
5171         kmp_balign_t *balign = other_threads[f]->th.th_bar;
5172         for (b = 0; b < bs_last_barrier; ++b) {
5173           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5174           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5175 #if USE_DEBUGGER
5176           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5177 #endif
5178         }
5179       }
5180       if (hot_teams[level].hot_team_nth >= new_nproc) {
        // We have all needed threads in reserve; no need to allocate any.
        // This is only possible in mode 1; there cannot be reserved threads in
        // mode 0.
5183         KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5184         team->t.t_nproc = new_nproc; // just get reserved threads involved
5185       } else {
5186         // we may have some threads in reserve, but not enough
5187         team->t.t_nproc =
5188             hot_teams[level]
5189                 .hot_team_nth; // get reserved threads involved if any
5190         hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5191 #endif // KMP_NESTED_HOT_TEAMS
5192         if (team->t.t_max_nproc < new_nproc) {
5193           /* reallocate larger arrays */
5194           __kmp_reallocate_team_arrays(team, new_nproc);
5195           __kmp_reinitialize_team(team, new_icvs, NULL);
5196         }
5197 
5198 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
        /* Temporarily set the full mask for the primary thread before creation
           of the workers. The reason is that workers inherit the affinity from
           the primary thread, so if a lot of workers are created quickly on a
           single core, they don't get a chance to set their own affinity for a
           long time. */
5204         __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5205 #endif
5206 
5207         /* allocate new threads for the hot team */
5208         for (f = team->t.t_nproc; f < new_nproc; f++) {
5209           kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5210           KMP_DEBUG_ASSERT(new_worker);
5211           team->t.t_threads[f] = new_worker;
5212 
5213           KA_TRACE(20,
5214                    ("__kmp_allocate_team: team %d init T#%d arrived: "
5215                     "join=%llu, plain=%llu\n",
5216                     team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5217                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5218                     team->t.t_bar[bs_plain_barrier].b_arrived));
5219 
5220           { // Initialize barrier data for new threads.
5221             int b;
5222             kmp_balign_t *balign = new_worker->th.th_bar;
5223             for (b = 0; b < bs_last_barrier; ++b) {
5224               balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5225               KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5226                                KMP_BARRIER_PARENT_FLAG);
5227 #if USE_DEBUGGER
5228               balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5229 #endif
5230             }
5231           }
5232         }
5233 
5234 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5235         if (KMP_AFFINITY_CAPABLE()) {
5236           /* Restore initial primary thread's affinity mask */
5237           __kmp_set_system_affinity(old_mask, TRUE);
5238           KMP_CPU_FREE(old_mask);
5239         }
5240 #endif
5241 #if KMP_NESTED_HOT_TEAMS
5242       } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5243 #endif // KMP_NESTED_HOT_TEAMS
      /* make sure everyone is synchronized */
5245       int old_nproc = team->t.t_nproc; // save old value and use to update only
5246       // new threads below
5247       __kmp_initialize_team(team, new_nproc, new_icvs,
5248                             root->r.r_uber_thread->th.th_ident);
5249 
5250       /* reinitialize the threads */
5251       KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5252       for (f = 0; f < team->t.t_nproc; ++f)
5253         __kmp_initialize_info(team->t.t_threads[f], team, f,
5254                               __kmp_gtid_from_tid(f, team));
5255 
5256       if (level) { // set th_task_state for new threads in nested hot team
5257         // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5258         // only need to set the th_task_state for the new threads. th_task_state
5259         // for primary thread will not be accurate until after this in
5260         // __kmp_fork_call(), so we look to the primary thread's memo_stack to
5261         // get the correct value.
5262         for (f = old_nproc; f < team->t.t_nproc; ++f)
5263           team->t.t_threads[f]->th.th_task_state =
5264               team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5265       } else { // set th_task_state for new threads in non-nested hot team
5266         // copy primary thread's state
5267         kmp_uint8 old_state = team->t.t_threads[0]->th.th_task_state;
5268         for (f = old_nproc; f < team->t.t_nproc; ++f)
5269           team->t.t_threads[f]->th.th_task_state = old_state;
5270       }
5271 
5272 #ifdef KMP_DEBUG
5273       for (f = 0; f < team->t.t_nproc; ++f) {
5274         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5275                          team->t.t_threads[f]->th.th_team_nproc ==
5276                              team->t.t_nproc);
5277       }
5278 #endif
5279 
5280       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5281 #if KMP_AFFINITY_SUPPORTED
5282       __kmp_partition_places(team);
5283 #endif
5284     } // Check changes in number of threads
5285 
5286     kmp_info_t *master = team->t.t_threads[0];
5287     if (master->th.th_teams_microtask) {
5288       for (f = 1; f < new_nproc; ++f) {
5289         // propagate teams construct specific info to workers
5290         kmp_info_t *thr = team->t.t_threads[f];
5291         thr->th.th_teams_microtask = master->th.th_teams_microtask;
5292         thr->th.th_teams_level = master->th.th_teams_level;
5293         thr->th.th_teams_size = master->th.th_teams_size;
5294       }
5295     }
5296 #if KMP_NESTED_HOT_TEAMS
5297     if (level) {
5298       // Sync barrier state for nested hot teams, not needed for outermost hot
5299       // team.
5300       for (f = 1; f < new_nproc; ++f) {
5301         kmp_info_t *thr = team->t.t_threads[f];
5302         int b;
5303         kmp_balign_t *balign = thr->th.th_bar;
5304         for (b = 0; b < bs_last_barrier; ++b) {
5305           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5306           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5307 #if USE_DEBUGGER
5308           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5309 #endif
5310         }
5311       }
5312     }
5313 #endif // KMP_NESTED_HOT_TEAMS
5314 
5315     /* reallocate space for arguments if necessary */
5316     __kmp_alloc_argv_entries(argc, team, TRUE);
5317     KMP_CHECK_UPDATE(team->t.t_argc, argc);
5318     // The hot team re-uses the previous task team,
5319     // if untouched during the previous release->gather phase.
5320 
5321     KF_TRACE(10, (" hot_team = %p\n", team));
5322 
5323 #if KMP_DEBUG
5324     if (__kmp_tasking_mode != tskm_immediate_exec) {
5325       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5326                     "task_team[1] = %p after reinit\n",
5327                     team->t.t_task_team[0], team->t.t_task_team[1]));
5328     }
5329 #endif
5330 
5331 #if OMPT_SUPPORT
5332     __ompt_team_assign_id(team, ompt_parallel_data);
5333 #endif
5334 
5335     KMP_MB();
5336 
5337     return team;
5338   }
5339 
5340   /* next, let's try to take one from the team pool */
5341   KMP_MB();
5342   for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5343     /* TODO: consider resizing undersized teams instead of reaping them, now
5344        that we have a resizing mechanism */
5345     if (team->t.t_max_nproc >= max_nproc) {
5346       /* take this team from the team pool */
5347       __kmp_team_pool = team->t.t_next_pool;
5348 
5349       /* setup the team for fresh use */
5350       __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5351 
5352       KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5353                     "task_team[1] %p to NULL\n",
5354                     &team->t.t_task_team[0], &team->t.t_task_team[1]));
5355       team->t.t_task_team[0] = NULL;
5356       team->t.t_task_team[1] = NULL;
5357 
5358       /* reallocate space for arguments if necessary */
5359       __kmp_alloc_argv_entries(argc, team, TRUE);
5360       KMP_CHECK_UPDATE(team->t.t_argc, argc);
5361 
5362       KA_TRACE(
5363           20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5364                team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5365       { // Initialize barrier data.
5366         int b;
5367         for (b = 0; b < bs_last_barrier; ++b) {
5368           team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5369 #if USE_DEBUGGER
5370           team->t.t_bar[b].b_master_arrived = 0;
5371           team->t.t_bar[b].b_team_arrived = 0;
5372 #endif
5373         }
5374       }
5375 
5376       team->t.t_proc_bind = new_proc_bind;
5377 
5378       KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5379                     team->t.t_id));
5380 
5381 #if OMPT_SUPPORT
5382       __ompt_team_assign_id(team, ompt_parallel_data);
5383 #endif
5384 
5385       KMP_MB();
5386 
5387       return team;
5388     }
5389 
    /* reap the team if it is too small, then loop back and check the next one */
    // Not sure if this is wise, but it will be redone during the hot-teams
    // rewrite.
5393     /* TODO: Use technique to find the right size hot-team, don't reap them */
5394     team = __kmp_reap_team(team);
5395     __kmp_team_pool = team;
5396   }
5397 
5398   /* nothing available in the pool, no matter, make a new team! */
5399   KMP_MB();
5400   team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5401 
5402   /* and set it up */
5403   team->t.t_max_nproc = max_nproc;
  /* NOTE well: for some reason allocating one big buffer and dividing it up
     seems to really hurt performance on the P4, so let's not use this. */
5406   __kmp_allocate_team_arrays(team, max_nproc);
5407 
5408   KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5409   __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5410 
5411   KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5412                 "%p to NULL\n",
5413                 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5414   team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5415   // memory, no need to duplicate
5416   team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5417   // memory, no need to duplicate
5418 
5419   if (__kmp_storage_map) {
5420     __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5421   }
5422 
5423   /* allocate space for arguments */
5424   __kmp_alloc_argv_entries(argc, team, FALSE);
5425   team->t.t_argc = argc;
5426 
5427   KA_TRACE(20,
5428            ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5429             team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5430   { // Initialize barrier data.
5431     int b;
5432     for (b = 0; b < bs_last_barrier; ++b) {
5433       team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5434 #if USE_DEBUGGER
5435       team->t.t_bar[b].b_master_arrived = 0;
5436       team->t.t_bar[b].b_team_arrived = 0;
5437 #endif
5438     }
5439   }
5440 
5441   team->t.t_proc_bind = new_proc_bind;
5442 
5443 #if OMPT_SUPPORT
5444   __ompt_team_assign_id(team, ompt_parallel_data);
5445   team->t.ompt_serialized_team_info = NULL;
5446 #endif
5447 
5448   KMP_MB();
5449 
5450   KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5451                 team->t.t_id));
5452 
5453   return team;
5454 }
5455 
5456 /* TODO implement hot-teams at all levels */
5457 /* TODO implement lazy thread release on demand (disband request) */
5458 
5459 /* free the team.  return it to the team pool.  release all the threads
5460  * associated with it */
5461 void __kmp_free_team(kmp_root_t *root,
5462                      kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5463   int f;
5464   KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5465                 team->t.t_id));
5466 
5467   /* verify state */
5468   KMP_DEBUG_ASSERT(root);
5469   KMP_DEBUG_ASSERT(team);
5470   KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5471   KMP_DEBUG_ASSERT(team->t.t_threads);
5472 
5473   int use_hot_team = team == root->r.r_hot_team;
5474 #if KMP_NESTED_HOT_TEAMS
5475   int level;
5476   kmp_hot_team_ptr_t *hot_teams;
5477   if (master) {
5478     level = team->t.t_active_level - 1;
5479     if (master->th.th_teams_microtask) { // in teams construct?
5480       if (master->th.th_teams_size.nteams > 1) {
5481         ++level; // level was not increased in teams construct for
5482         // team_of_masters
5483       }
5484       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5485           master->th.th_teams_level == team->t.t_level) {
5486         ++level; // level was not increased in teams construct for
5487         // team_of_workers before the parallel
5488       } // team->t.t_level will be increased inside parallel
5489     }
5490     hot_teams = master->th.th_hot_teams;
5491     if (level < __kmp_hot_teams_max_level) {
5492       KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5493       use_hot_team = 1;
5494     }
5495   }
5496 #endif // KMP_NESTED_HOT_TEAMS
5497 
5498   /* team is done working */
5499   TCW_SYNC_PTR(team->t.t_pkfn,
5500                NULL); // Important for Debugging Support Library.
5501 #if KMP_OS_WINDOWS
5502   team->t.t_copyin_counter = 0; // init counter for possible reuse
5503 #endif
5504   // Do not reset pointer to parent team to NULL for hot teams.
5505 
5506   /* if we are non-hot team, release our threads */
5507   if (!use_hot_team) {
5508     if (__kmp_tasking_mode != tskm_immediate_exec) {
5509       // Wait for threads to reach reapable state
5510       for (f = 1; f < team->t.t_nproc; ++f) {
5511         KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5512         kmp_info_t *th = team->t.t_threads[f];
5513         volatile kmp_uint32 *state = &th->th.th_reap_state;
5514         while (*state != KMP_SAFE_TO_REAP) {
5515 #if KMP_OS_WINDOWS
5516           // On Windows a thread can be killed at any time, check this
5517           DWORD ecode;
5518           if (!__kmp_is_thread_alive(th, &ecode)) {
5519             *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5520             break;
5521           }
5522 #endif
5523           // first check if thread is sleeping
5524           kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5525           if (fl.is_sleeping())
5526             fl.resume(__kmp_gtid_from_thread(th));
5527           KMP_CPU_PAUSE();
5528         }
5529       }
5530 
5531       // Delete task teams
5532       int tt_idx;
5533       for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5534         kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5535         if (task_team != NULL) {
5536           for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5537             KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5538             team->t.t_threads[f]->th.th_task_team = NULL;
5539           }
5540           KA_TRACE(
5541               20,
5542               ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5543                __kmp_get_gtid(), task_team, team->t.t_id));
5544 #if KMP_NESTED_HOT_TEAMS
5545           __kmp_free_task_team(master, task_team);
5546 #endif
5547           team->t.t_task_team[tt_idx] = NULL;
5548         }
5549       }
5550     }
5551 
5552     // Reset pointer to parent team only for non-hot teams.
5553     team->t.t_parent = NULL;
5554     team->t.t_level = 0;
5555     team->t.t_active_level = 0;
5556 
5557     /* free the worker threads */
5558     for (f = 1; f < team->t.t_nproc; ++f) {
5559       KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5560       __kmp_free_thread(team->t.t_threads[f]);
5561       team->t.t_threads[f] = NULL;
5562     }
5563 
5564     /* put the team back in the team pool */
5565     /* TODO limit size of team pool, call reap_team if pool too large */
5566     team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5567     __kmp_team_pool = (volatile kmp_team_t *)team;
5568   } else { // Check if team was created for primary threads in teams construct
5569     // See if first worker is a CG root
5570     KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5571                      team->t.t_threads[1]->th.th_cg_roots);
5572     if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5573       // Clean up the CG root nodes on workers so that this team can be re-used
5574       for (f = 1; f < team->t.t_nproc; ++f) {
5575         kmp_info_t *thr = team->t.t_threads[f];
5576         KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5577                          thr->th.th_cg_roots->cg_root == thr);
5578         // Pop current CG root off list
5579         kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5580         thr->th.th_cg_roots = tmp->up;
5581         KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5582                        " up to node %p. cg_nthreads was %d\n",
5583                        thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
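        // Note: the post-decrement below leaves the pre-decrement count in i,
        // so i == 1 means this thread was the last member of the group.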
5584         int i = tmp->cg_nthreads--;
5585         if (i == 1) {
5586           __kmp_free(tmp); // free CG if we are the last thread in it
5587         }
5588         // Restore current task's thread_limit from CG root
5589         if (thr->th.th_cg_roots)
5590           thr->th.th_current_task->td_icvs.thread_limit =
5591               thr->th.th_cg_roots->cg_thread_limit;
5592       }
5593     }
5594   }
5595 
5596   KMP_MB();
5597 }
5598 
5599 /* reap the team.  destroy it, reclaim all its resources and free its memory */
5600 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5601   kmp_team_t *next_pool = team->t.t_next_pool;
5602 
5603   KMP_DEBUG_ASSERT(team);
5604   KMP_DEBUG_ASSERT(team->t.t_dispatch);
5605   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5606   KMP_DEBUG_ASSERT(team->t.t_threads);
5607   KMP_DEBUG_ASSERT(team->t.t_argv);
5608 
5609   /* TODO clean the threads that are a part of this? */
5610 
5611   /* free stuff */
5612   __kmp_free_team_arrays(team);
5613   if (team->t.t_argv != &team->t.t_inline_argv[0])
5614     __kmp_free((void *)team->t.t_argv);
5615   __kmp_free(team);
5616 
5617   KMP_MB();
5618   return next_pool;
5619 }
5620 
5621 // Free the thread.  Don't reap it, just place it on the pool of available
5622 // threads.
5623 //
5624 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5625 // binding for the affinity mechanism to be useful.
5626 //
5627 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5628 // However, we want to avoid a potential performance problem by always
5629 // scanning through the list to find the correct point at which to insert
5630 // the thread (potential N**2 behavior).  To do this we keep track of the
5631 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5632 // With single-level parallelism, threads will always be added to the tail
5633 // of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
5634 // parallelism, all bets are off and we may need to scan through the entire
5635 // free list.
5636 //
5637 // This change also has a potentially large performance benefit, for some
5638 // applications.  Previously, as threads were freed from the hot team, they
5639 // would be placed back on the free list in inverse order.  If the hot team
// grew back to its original size, then the freed threads would be placed
// back on the hot team in reverse order.  This could cause bad cache
5642 // locality problems on programs where the size of the hot team regularly
5643 // grew and shrunk.
5644 //
5645 // Now, for single-level parallelism, the OMP tid is always == gtid.
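//
// Illustrative sketch (not executed): the pool is a singly linked list of
// kmp_info_t kept sorted by gtid, e.g.
//
//   __kmp_thread_pool -> T#2 -> T#3 -> T#5 -> NULL
//
// __kmp_thread_pool_insert_pt caches the link used by the last insertion, so
// inserting T#6 starts the scan at T#5 instead of at the head; the code below
// falls back to scanning from __kmp_thread_pool only when the cached point
// already lies past the new gtid (possible with nested parallelism).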
5646 void __kmp_free_thread(kmp_info_t *this_th) {
5647   int gtid;
5648   kmp_info_t **scan;
5649 
5650   KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5651                 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5652 
5653   KMP_DEBUG_ASSERT(this_th);
5654 
5655   // When moving thread to pool, switch thread to wait on own b_go flag, and
5656   // uninitialized (NULL team).
5657   int b;
5658   kmp_balign_t *balign = this_th->th.th_bar;
5659   for (b = 0; b < bs_last_barrier; ++b) {
5660     if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5661       balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5662     balign[b].bb.team = NULL;
5663     balign[b].bb.leaf_kids = 0;
5664   }
5665   this_th->th.th_task_state = 0;
5666   this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5667 
5668   /* put thread back on the free pool */
5669   TCW_PTR(this_th->th.th_team, NULL);
5670   TCW_PTR(this_th->th.th_root, NULL);
5671   TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5672 
5673   while (this_th->th.th_cg_roots) {
5674     this_th->th.th_cg_roots->cg_nthreads--;
5675     KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5676                    " %p of thread  %p to %d\n",
5677                    this_th, this_th->th.th_cg_roots,
5678                    this_th->th.th_cg_roots->cg_root,
5679                    this_th->th.th_cg_roots->cg_nthreads));
5680     kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5681     if (tmp->cg_root == this_th) { // Thread is a cg_root
5682       KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5683       KA_TRACE(
5684           5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5685       this_th->th.th_cg_roots = tmp->up;
5686       __kmp_free(tmp);
5687     } else { // Worker thread
5688       if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5689         __kmp_free(tmp);
5690       }
5691       this_th->th.th_cg_roots = NULL;
5692       break;
5693     }
5694   }
5695 
  /* If the implicit task assigned to this thread can be used by other threads,
   * then multiple threads may share the task data and try to free it in
   * __kmp_reap_thread at exit. This duplicate use of the task data happens
   * with higher probability when the hot team is disabled, but it can occur
   * even when the hot team is enabled. */
5701   __kmp_free_implicit_task(this_th);
5702   this_th->th.th_current_task = NULL;
5703 
5704   // If the __kmp_thread_pool_insert_pt is already past the new insert
5705   // point, then we need to re-scan the entire list.
5706   gtid = this_th->th.th_info.ds.ds_gtid;
5707   if (__kmp_thread_pool_insert_pt != NULL) {
5708     KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5709     if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5710       __kmp_thread_pool_insert_pt = NULL;
5711     }
5712   }
5713 
5714   // Scan down the list to find the place to insert the thread.
5715   // scan is the address of a link in the list, possibly the address of
5716   // __kmp_thread_pool itself.
5717   //
5718   // In the absence of nested parallelism, the for loop will have 0 iterations.
5719   if (__kmp_thread_pool_insert_pt != NULL) {
5720     scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5721   } else {
5722     scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5723   }
5724   for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5725        scan = &((*scan)->th.th_next_pool))
5726     ;
5727 
5728   // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5729   // to its address.
5730   TCW_PTR(this_th->th.th_next_pool, *scan);
5731   __kmp_thread_pool_insert_pt = *scan = this_th;
5732   KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5733                    (this_th->th.th_info.ds.ds_gtid <
5734                     this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5735   TCW_4(this_th->th.th_in_pool, TRUE);
5736   __kmp_suspend_initialize_thread(this_th);
5737   __kmp_lock_suspend_mx(this_th);
5738   if (this_th->th.th_active == TRUE) {
5739     KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5740     this_th->th.th_active_in_pool = TRUE;
5741   }
5742 #if KMP_DEBUG
5743   else {
5744     KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5745   }
5746 #endif
5747   __kmp_unlock_suspend_mx(this_th);
5748 
5749   TCW_4(__kmp_nth, __kmp_nth - 1);
5750 
5751 #ifdef KMP_ADJUST_BLOCKTIME
5752   /* Adjust blocktime back to user setting or default if necessary */
5753   /* Middle initialization might never have occurred                */
5754   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5755     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5756     if (__kmp_nth <= __kmp_avail_proc) {
5757       __kmp_zero_bt = FALSE;
5758     }
5759   }
5760 #endif /* KMP_ADJUST_BLOCKTIME */
5761 
5762   KMP_MB();
5763 }
5764 
5765 /* ------------------------------------------------------------------------ */
5766 
5767 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5768 #if OMP_PROFILING_SUPPORT
5769   ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
5770   // TODO: add a configuration option for time granularity
5771   if (ProfileTraceFile)
5772     llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
5773 #endif
5774 
5775   int gtid = this_thr->th.th_info.ds.ds_gtid;
5776   /*    void                 *stack_data;*/
5777   kmp_team_t **volatile pteam;
5778 
5779   KMP_MB();
5780   KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5781 
5782   if (__kmp_env_consistency_check) {
5783     this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5784   }
5785 
5786 #if OMPD_SUPPORT
5787   if (ompd_state & OMPD_ENABLE_BP)
5788     ompd_bp_thread_begin();
5789 #endif
5790 
5791 #if OMPT_SUPPORT
5792   ompt_data_t *thread_data = nullptr;
5793   if (ompt_enabled.enabled) {
5794     thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5795     *thread_data = ompt_data_none;
5796 
5797     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5798     this_thr->th.ompt_thread_info.wait_id = 0;
5799     this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5800     this_thr->th.ompt_thread_info.parallel_flags = 0;
5801     if (ompt_enabled.ompt_callback_thread_begin) {
5802       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5803           ompt_thread_worker, thread_data);
5804     }
5805     this_thr->th.ompt_thread_info.state = ompt_state_idle;
5806   }
5807 #endif
5808 
5809   /* This is the place where threads wait for work */
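  // In outline (a summary of the loop below): each worker parks at the fork
  // barrier until a team is assigned, invokes the team's microtask through
  // t_invoke, then synchronizes at the join barrier; the loop exits once
  // __kmp_global.g.g_done is set at shutdown.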
5810   while (!TCR_4(__kmp_global.g.g_done)) {
5811     KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5812     KMP_MB();
5813 
5814     /* wait for work to do */
5815     KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5816 
5817     /* No tid yet since not part of a team */
5818     __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5819 
5820 #if OMPT_SUPPORT
5821     if (ompt_enabled.enabled) {
5822       this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5823     }
5824 #endif
5825 
5826     pteam = &this_thr->th.th_team;
5827 
5828     /* have we been allocated? */
5829     if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5830       /* we were just woken up, so run our new task */
5831       if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5832         int rc;
5833         KA_TRACE(20,
5834                  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5835                   gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5836                   (*pteam)->t.t_pkfn));
5837 
5838         updateHWFPControl(*pteam);
5839 
5840 #if OMPT_SUPPORT
5841         if (ompt_enabled.enabled) {
5842           this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5843         }
5844 #endif
5845 
5846         rc = (*pteam)->t.t_invoke(gtid);
5847         KMP_ASSERT(rc);
5848 
5849         KMP_MB();
5850         KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5851                       gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5852                       (*pteam)->t.t_pkfn));
5853       }
5854 #if OMPT_SUPPORT
5855       if (ompt_enabled.enabled) {
5856         /* no frame set while outside task */
5857         __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
5858 
5859         this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5860       }
5861 #endif
5862       /* join barrier after parallel region */
5863       __kmp_join_barrier(gtid);
5864     }
5865   }
5866   TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5867 
5868 #if OMPD_SUPPORT
5869   if (ompd_state & OMPD_ENABLE_BP)
5870     ompd_bp_thread_end();
5871 #endif
5872 
5873 #if OMPT_SUPPORT
5874   if (ompt_enabled.ompt_callback_thread_end) {
5875     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5876   }
5877 #endif
5878 
5879   this_thr->th.th_task_team = NULL;
5880   /* run the destructors for the threadprivate data for this thread */
5881   __kmp_common_destroy_gtid(gtid);
5882 
5883   KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5884   KMP_MB();
5885 
5886 #if OMP_PROFILING_SUPPORT
5887   llvm::timeTraceProfilerFinishThread();
5888 #endif
5889   return this_thr;
5890 }
5891 
5892 /* ------------------------------------------------------------------------ */
5893 
5894 void __kmp_internal_end_dest(void *specific_gtid) {
5895   // Make sure no significant bits are lost
5896   int gtid;
5897   __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
5898 
5899   KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
  /* NOTE: the gtid is stored as gtid+1 in the thread-local-storage
5901    * this is because 0 is reserved for the nothing-stored case */
5902 
5903   __kmp_internal_end_thread(gtid);
5904 }
5905 
5906 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5907 
5908 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5909   __kmp_internal_end_atexit();
5910 }
5911 
5912 #endif
5913 
5914 /* [Windows] josh: when the atexit handler is called, there may still be more
5915    than one thread alive */
5916 void __kmp_internal_end_atexit(void) {
5917   KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5918   /* [Windows]
5919      josh: ideally, we want to completely shutdown the library in this atexit
5920      handler, but stat code that depends on thread specific data for gtid fails
5921      because that data becomes unavailable at some point during the shutdown, so
5922      we call __kmp_internal_end_thread instead. We should eventually remove the
5923      dependency on __kmp_get_specific_gtid in the stat code and use
5924      __kmp_internal_end_library to cleanly shutdown the library.
5925 
5926      // TODO: Can some of this comment about GVS be removed?
5927      I suspect that the offending stat code is executed when the calling thread
5928      tries to clean up a dead root thread's data structures, resulting in GVS
5929      code trying to close the GVS structures for that thread, but since the stat
5930      code uses __kmp_get_specific_gtid to get the gtid with the assumption that
     the calling thread is cleaning up itself instead of another thread, it gets
5932      confused. This happens because allowing a thread to unregister and cleanup
5933      another thread is a recent modification for addressing an issue.
5934      Based on the current design (20050722), a thread may end up
5935      trying to unregister another thread only if thread death does not trigger
5936      the calling of __kmp_internal_end_thread.  For Linux* OS, there is the
5937      thread specific data destructor function to detect thread death. For
5938      Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5939      is nothing.  Thus, the workaround is applicable only for Windows static
5940      stat library. */
5941   __kmp_internal_end_library(-1);
5942 #if KMP_OS_WINDOWS
5943   __kmp_close_console();
5944 #endif
5945 }
5946 
5947 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5948   // It is assumed __kmp_forkjoin_lock is acquired.
5949 
5950   int gtid;
5951 
5952   KMP_DEBUG_ASSERT(thread != NULL);
5953 
5954   gtid = thread->th.th_info.ds.ds_gtid;
5955 
5956   if (!is_root) {
5957     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5958       /* Assume the threads are at the fork barrier here */
5959       KA_TRACE(
5960           20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5961                gtid));
5962       /* Need release fence here to prevent seg faults for tree forkjoin barrier
5963        * (GEH) */
5964       ANNOTATE_HAPPENS_BEFORE(thread);
5965       kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
5966                          thread);
5967       __kmp_release_64(&flag);
5968     }
5969 
5970     // Terminate OS thread.
5971     __kmp_reap_worker(thread);
5972 
5973     // The thread was killed asynchronously.  If it was actively
5974     // spinning in the thread pool, decrement the global count.
5975     //
5976     // There is a small timing hole here - if the worker thread was just waking
    // up after sleeping in the pool, had reset its th_active_in_pool flag but
5978     // not decremented the global counter __kmp_thread_pool_active_nth yet, then
5979     // the global counter might not get updated.
5980     //
5981     // Currently, this can only happen as the library is unloaded,
5982     // so there are no harmful side effects.
5983     if (thread->th.th_active_in_pool) {
5984       thread->th.th_active_in_pool = FALSE;
5985       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
5986       KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
5987     }
5988   }
5989 
5990   __kmp_free_implicit_task(thread);
5991 
5992 // Free the fast memory for tasking
5993 #if USE_FAST_MEMORY
5994   __kmp_free_fast_memory(thread);
5995 #endif /* USE_FAST_MEMORY */
5996 
5997   __kmp_suspend_uninitialize_thread(thread);
5998 
5999   KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
6000   TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
6001 
6002   --__kmp_all_nth;
6003   // __kmp_nth was decremented when thread is added to the pool.
6004 
6005 #ifdef KMP_ADJUST_BLOCKTIME
6006   /* Adjust blocktime back to user setting or default if necessary */
6007   /* Middle initialization might never have occurred                */
6008   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6009     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6010     if (__kmp_nth <= __kmp_avail_proc) {
6011       __kmp_zero_bt = FALSE;
6012     }
6013   }
6014 #endif /* KMP_ADJUST_BLOCKTIME */
6015 
6016   /* free the memory being used */
6017   if (__kmp_env_consistency_check) {
6018     if (thread->th.th_cons) {
6019       __kmp_free_cons_stack(thread->th.th_cons);
6020       thread->th.th_cons = NULL;
6021     }
6022   }
6023 
6024   if (thread->th.th_pri_common != NULL) {
6025     __kmp_free(thread->th.th_pri_common);
6026     thread->th.th_pri_common = NULL;
6027   }
6028 
6029   if (thread->th.th_task_state_memo_stack != NULL) {
6030     __kmp_free(thread->th.th_task_state_memo_stack);
6031     thread->th.th_task_state_memo_stack = NULL;
6032   }
6033 
6034 #if KMP_USE_BGET
6035   if (thread->th.th_local.bget_data != NULL) {
6036     __kmp_finalize_bget(thread);
6037   }
6038 #endif
6039 
6040 #if KMP_AFFINITY_SUPPORTED
6041   if (thread->th.th_affin_mask != NULL) {
6042     KMP_CPU_FREE(thread->th.th_affin_mask);
6043     thread->th.th_affin_mask = NULL;
6044   }
6045 #endif /* KMP_AFFINITY_SUPPORTED */
6046 
6047 #if KMP_USE_HIER_SCHED
6048   if (thread->th.th_hier_bar_data != NULL) {
6049     __kmp_free(thread->th.th_hier_bar_data);
6050     thread->th.th_hier_bar_data = NULL;
6051   }
6052 #endif
6053 
6054   __kmp_reap_team(thread->th.th_serial_team);
6055   thread->th.th_serial_team = NULL;
6056   __kmp_free(thread);
6057 
6058   KMP_MB();
6059 
6060 } // __kmp_reap_thread
6061 
6062 static void __kmp_internal_end(void) {
6063   int i;
6064 
6065   /* First, unregister the library */
6066   __kmp_unregister_library();
6067 
6068 #if KMP_OS_WINDOWS
6069   /* In Win static library, we can't tell when a root actually dies, so we
6070      reclaim the data structures for any root threads that have died but not
6071      unregistered themselves, in order to shut down cleanly.
6072      In Win dynamic library we also can't tell when a thread dies.  */
  // AC: moved here to always clean resources of dead roots.
  __kmp_reclaim_dead_roots();
6075 #endif
6076 
6077   for (i = 0; i < __kmp_threads_capacity; i++)
6078     if (__kmp_root[i])
6079       if (__kmp_root[i]->r.r_active)
6080         break;
6081   KMP_MB(); /* Flush all pending memory write invalidates.  */
6082   TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6083 
6084   if (i < __kmp_threads_capacity) {
6085 #if KMP_USE_MONITOR
6086     // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6087     KMP_MB(); /* Flush all pending memory write invalidates.  */
6088 
6089     // Need to check that monitor was initialized before reaping it. If we are
    // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6091     // __kmp_monitor will appear to contain valid data, but it is only valid in
6092     // the parent process, not the child.
6093     // New behavior (201008): instead of keying off of the flag
6094     // __kmp_init_parallel, the monitor thread creation is keyed off
6095     // of the new flag __kmp_init_monitor.
6096     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6097     if (TCR_4(__kmp_init_monitor)) {
6098       __kmp_reap_monitor(&__kmp_monitor);
6099       TCW_4(__kmp_init_monitor, 0);
6100     }
6101     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6102     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6103 #endif // KMP_USE_MONITOR
6104   } else {
6105 /* TODO move this to cleanup code */
6106 #ifdef KMP_DEBUG
6107     /* make sure that everything has properly ended */
6108     for (i = 0; i < __kmp_threads_capacity; i++) {
6109       if (__kmp_root[i]) {
        // KMP_ASSERT( ! KMP_UBER_GTID( i ) );
        // AC: there can be uber threads alive here
6112         KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6113       }
6114     }
6115 #endif
6116 
6117     KMP_MB();
6118 
6119     // Reap the worker threads.
6120     // This is valid for now, but be careful if threads are reaped sooner.
    while (__kmp_thread_pool != NULL) { // Loop through all threads in the pool.
6122       // Get the next thread from the pool.
6123       kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6124       __kmp_thread_pool = thread->th.th_next_pool;
6125       // Reap it.
6126       KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6127       thread->th.th_next_pool = NULL;
6128       thread->th.th_in_pool = FALSE;
6129       __kmp_reap_thread(thread, 0);
6130     }
6131     __kmp_thread_pool_insert_pt = NULL;
6132 
6133     // Reap teams.
    while (__kmp_team_pool != NULL) { // Loop through all teams in the pool.
6135       // Get the next team from the pool.
6136       kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6137       __kmp_team_pool = team->t.t_next_pool;
6138       // Reap it.
6139       team->t.t_next_pool = NULL;
6140       __kmp_reap_team(team);
6141     }
6142 
6143     __kmp_reap_task_teams();
6144 
6145 #if KMP_OS_UNIX
6146     // Threads that are not reaped should not access any resources since they
6147     // are going to be deallocated soon, so the shutdown sequence should wait
6148     // until all threads either exit the final spin-waiting loop or begin
6149     // sleeping after the given blocktime.
6150     for (i = 0; i < __kmp_threads_capacity; i++) {
6151       kmp_info_t *thr = __kmp_threads[i];
6152       while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6153         KMP_CPU_PAUSE();
6154     }
6155 #endif
6156 
6157     for (i = 0; i < __kmp_threads_capacity; ++i) {
6158       // TBD: Add some checking...
6159       // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6160     }
6161 
6162     /* Make sure all threadprivate destructors get run by joining with all
6163        worker threads before resetting this flag */
6164     TCW_SYNC_4(__kmp_init_common, FALSE);
6165 
6166     KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6167     KMP_MB();
6168 
6169 #if KMP_USE_MONITOR
6170     // See note above: One of the possible fixes for CQ138434 / CQ140126
6171     //
6172     // FIXME: push both code fragments down and CSE them?
6173     // push them into __kmp_cleanup() ?
6174     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6175     if (TCR_4(__kmp_init_monitor)) {
6176       __kmp_reap_monitor(&__kmp_monitor);
6177       TCW_4(__kmp_init_monitor, 0);
6178     }
6179     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6180     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6181 #endif
6182   } /* else !__kmp_global.t_active */
6183   TCW_4(__kmp_init_gtid, FALSE);
6184   KMP_MB(); /* Flush all pending memory write invalidates.  */
6185 
6186   __kmp_cleanup();
6187 #if OMPT_SUPPORT
6188   ompt_fini();
6189 #endif
6190 }
6191 
6192 void __kmp_internal_end_library(int gtid_req) {
6193   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6194   /* this shouldn't be a race condition because __kmp_internal_end() is the
6195      only place to clear __kmp_serial_init */
6196   /* we'll check this later too, after we get the lock */
6197   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6198   // redundant, because the next check will work in any case.
6199   if (__kmp_global.g.g_abort) {
6200     KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6201     /* TODO abort? */
6202     return;
6203   }
6204   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6205     KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6206     return;
6207   }
6208 
6209   KMP_MB(); /* Flush all pending memory write invalidates.  */
6210   /* find out who we are and what we should do */
6211   {
6212     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6213     KA_TRACE(
6214         10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req));
6215     if (gtid == KMP_GTID_SHUTDOWN) {
6216       KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6217                     "already shutdown\n"));
6218       return;
6219     } else if (gtid == KMP_GTID_MONITOR) {
6220       KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6221                     "registered, or system shutdown\n"));
6222       return;
6223     } else if (gtid == KMP_GTID_DNE) {
6224       KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6225                     "shutdown\n"));
6226       /* we don't know who we are, but we may still shutdown the library */
6227     } else if (KMP_UBER_GTID(gtid)) {
6228       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6229       if (__kmp_root[gtid]->r.r_active) {
6230         __kmp_global.g.g_abort = -1;
6231         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6232         __kmp_unregister_library();
6233         KA_TRACE(10,
6234                  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6235                   gtid));
6236         return;
6237       } else {
6238         KA_TRACE(
6239             10,
6240             ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6241         __kmp_unregister_root_current_thread(gtid);
6242       }
6243     } else {
6244 /* worker threads may call this function through the atexit handler, if they
6245  * call exit() */
6246 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6247    TODO: do a thorough shutdown instead */
6248 #ifdef DUMP_DEBUG_ON_EXIT
6249       if (__kmp_debug_buf)
6250         __kmp_dump_debug_buffer();
6251 #endif
      // The unregister-library call was added here when we switched to shared
      // memory on Linux; without it lots of files would be left in /dev/shm.
      // Clean up the shared memory file before exiting.
6255       __kmp_unregister_library();
6256       return;
6257     }
6258   }
6259   /* synchronize the termination process */
6260   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6261 
6262   /* have we already finished */
6263   if (__kmp_global.g.g_abort) {
6264     KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6265     /* TODO abort? */
6266     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6267     return;
6268   }
6269   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6270     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6271     return;
6272   }
6273 
6274   /* We need this lock to enforce mutex between this reading of
6275      __kmp_threads_capacity and the writing by __kmp_register_root.
6276      Alternatively, we can use a counter of roots that is atomically updated by
6277      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6278      __kmp_internal_end_*.  */
6279   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6280 
6281   /* now we can safely conduct the actual termination */
6282   __kmp_internal_end();
6283 
6284   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6285   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6286 
6287   KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6288 
6289 #ifdef DUMP_DEBUG_ON_EXIT
6290   if (__kmp_debug_buf)
6291     __kmp_dump_debug_buffer();
6292 #endif
6293 
6294 #if KMP_OS_WINDOWS
6295   __kmp_close_console();
6296 #endif
6297 
6298   __kmp_fini_allocator();
6299 
6300 } // __kmp_internal_end_library
6301 
6302 void __kmp_internal_end_thread(int gtid_req) {
6303   int i;
6304 
6305   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6306   /* this shouldn't be a race condition because __kmp_internal_end() is the
6307    * only place to clear __kmp_serial_init */
6308   /* we'll check this later too, after we get the lock */
6309   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6310   // redundant, because the next check will work in any case.
6311   if (__kmp_global.g.g_abort) {
6312     KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6313     /* TODO abort? */
6314     return;
6315   }
6316   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6317     KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6318     return;
6319   }
6320 
6321   // If hidden helper team has been initialized, we need to deinit it
6322   if (TCR_4(__kmp_init_hidden_helper)) {
6323     TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6324     // First release the main thread to let it continue its work
6325     __kmp_hidden_helper_main_thread_release();
6326     // Wait until the hidden helper team has been destroyed
6327     __kmp_hidden_helper_threads_deinitz_wait();
6328   }
6329 
6330   KMP_MB(); /* Flush all pending memory write invalidates.  */
6331 
6332   /* find out who we are and what we should do */
6333   {
6334     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6335     KA_TRACE(10,
6336              ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req));
6337     if (gtid == KMP_GTID_SHUTDOWN) {
6338       KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6339                     "already shutdown\n"));
6340       return;
6341     } else if (gtid == KMP_GTID_MONITOR) {
6342       KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6343                     "registered, or system shutdown\n"));
6344       return;
6345     } else if (gtid == KMP_GTID_DNE) {
6346       KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6347                     "shutdown\n"));
6348       return;
6349       /* we don't know who we are */
6350     } else if (KMP_UBER_GTID(gtid)) {
6351       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6352       if (__kmp_root[gtid]->r.r_active) {
6353         __kmp_global.g.g_abort = -1;
6354         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6355         KA_TRACE(10,
6356                  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6357                   gtid));
6358         return;
6359       } else {
6360         KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6361                       gtid));
6362         __kmp_unregister_root_current_thread(gtid);
6363       }
6364     } else {
6365       /* just a worker thread, let's leave */
6366       KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6367 
6368       if (gtid >= 0) {
6369         __kmp_threads[gtid]->th.th_task_team = NULL;
6370       }
6371 
6372       KA_TRACE(10,
6373                ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6374                 gtid));
6375       return;
6376     }
6377   }
6378 #if KMP_DYNAMIC_LIB
6379   if (__kmp_pause_status != kmp_hard_paused)
  // AC: let's not shut down the dynamic library at the exit of an uber
  // thread; it is better to shut it down later in the library destructor.
6382   {
6383     KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6384     return;
6385   }
6386 #endif
6387   /* synchronize the termination process */
6388   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6389 
6390   /* have we already finished */
6391   if (__kmp_global.g.g_abort) {
6392     KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6393     /* TODO abort? */
6394     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6395     return;
6396   }
6397   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6398     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6399     return;
6400   }
6401 
6402   /* We need this lock to enforce mutex between this reading of
6403      __kmp_threads_capacity and the writing by __kmp_register_root.
6404      Alternatively, we can use a counter of roots that is atomically updated by
6405      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6406      __kmp_internal_end_*.  */
6407 
6408   /* should we finish the run-time?  are all siblings done? */
6409   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6410 
6411   for (i = 0; i < __kmp_threads_capacity; ++i) {
6412     if (KMP_UBER_GTID(i)) {
6413       KA_TRACE(
6414           10,
6415           ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6416       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6417       __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6418       return;
6419     }
6420   }
6421 
6422   /* now we can safely conduct the actual termination */
6423 
6424   __kmp_internal_end();
6425 
6426   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6427   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6428 
6429   KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6430 
6431 #ifdef DUMP_DEBUG_ON_EXIT
6432   if (__kmp_debug_buf)
6433     __kmp_dump_debug_buffer();
6434 #endif
6435 } // __kmp_internal_end_thread
6436 
6437 // -----------------------------------------------------------------------------
6438 // Library registration stuff.
6439 
6440 static long __kmp_registration_flag = 0;
6441 // Random value used to indicate library initialization.
6442 static char *__kmp_registration_str = NULL;
6443 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6444 
6445 static inline char *__kmp_reg_status_name() {
6446 /* On RHEL 3u5 if linked statically, getpid() returns different values in
6447    each thread. If registration and unregistration go in different threads
   (omp_misc_other_root_exit.cpp test case), the registered_lib_env env var
   cannot be found, because its name will contain a different pid. */
6450 // macOS* complains about name being too long with additional getuid()
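// For example (with a hypothetical pid/uid), the resulting name looks like
// "__KMP_REGISTERED_LIB_12345_1000" on non-Darwin Unix with the dynamic
// library, and "__KMP_REGISTERED_LIB_12345" otherwise.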
6451 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6452   return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6453                           (int)getuid());
6454 #else
6455   return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6456 #endif
} // __kmp_reg_status_name
6458 
6459 void __kmp_register_library_startup(void) {
6460 
6461   char *name = __kmp_reg_status_name(); // Name of the environment variable.
6462   int done = 0;
6463   union {
6464     double dtime;
6465     long ltime;
6466   } time;
6467 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6468   __kmp_initialize_system_tick();
6469 #endif
6470   __kmp_read_system_time(&time.dtime);
6471   __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6472   __kmp_registration_str =
6473       __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6474                        __kmp_registration_flag, KMP_LIBRARY_FILE);
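  // The registration string encodes the flag's address, the flag's value, and
  // the library file name, e.g. (hypothetical values)
  //   "0x7f0123456789-cafe0042-libomp.so"
  // so another copy of the runtime can later locate and verify this flag.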
6475 
6476   KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6477                 __kmp_registration_str));
6478 
6479   while (!done) {
6480 
6481     char *value = NULL; // Actual value of the environment variable.
6482 
6483 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6484     char *shm_name = __kmp_str_format("/%s", name);
6485     int shm_preexist = 0;
6486     char *data1;
6487     int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6488     if ((fd1 == -1) && (errno == EEXIST)) {
6489       // file didn't open because it already exists.
6490       // try opening existing file
6491       fd1 = shm_open(shm_name, O_RDWR, 0666);
6492       if (fd1 == -1) { // file didn't open
6493         // error out here
6494         __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0),
6495                     __kmp_msg_null);
6496       } else {
6497         // able to open existing file
6498         shm_preexist = 1;
6499       }
    } else if (fd1 == -1) {
      // SHM didn't open due to an error other than "already exists";
      // error out here.
6503       __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno),
6504                   __kmp_msg_null);
6505     }
6506     if (shm_preexist == 0) {
6507       // we created SHM now set size
6508       if (ftruncate(fd1, SHM_SIZE) == -1) {
        // error occurred setting size
6510         __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"),
6511                     KMP_ERR(errno), __kmp_msg_null);
6512       }
6513     }
6514     data1 =
6515         (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
6516     if (data1 == MAP_FAILED) {
6517       // failed to map shared memory
6518       __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno),
6519                   __kmp_msg_null);
6520     }
6521     if (shm_preexist == 0) { // set data to SHM, set value
6522       KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6523     }
6524     // Read value from either what we just wrote or existing file.
6525     value = __kmp_str_format("%s", data1); // read value from SHM
6526     munmap(data1, SHM_SIZE);
6527     close(fd1);
6528 #else // Windows and unix with static library
    // Set environment variable, but do not overwrite it if it already exists.
6530     __kmp_env_set(name, __kmp_registration_str, 0);
6531     // read value to see if it got set
6532     value = __kmp_env_get(name);
6533 #endif
6534 
6535     if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6536       done = 1; // Ok, environment variable set successfully, exit the loop.
6537     } else {
6538       // Oops. Write failed. Another copy of OpenMP RTL is in memory.
      // Check whether it is alive or dead.
6540       int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6541       char *tail = value;
6542       char *flag_addr_str = NULL;
6543       char *flag_val_str = NULL;
6544       char const *file_name = NULL;
6545       __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6546       __kmp_str_split(tail, '-', &flag_val_str, &tail);
6547       file_name = tail;
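      // These three fields mirror the "%p-%lx-%s" registration string written
      // above: the flag's address, the flag's value, and the library file
      // name of whichever runtime copy registered first.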
6548       if (tail != NULL) {
6549         long *flag_addr = 0;
6550         unsigned long flag_val = 0;
6551         KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6552         KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6553         if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6554           // First, check whether environment-encoded address is mapped into
6555           // addr space.
6556           // If so, dereference it to see if it still has the right value.
6557           if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6558             neighbor = 1;
6559           } else {
6560             // If not, then we know the other copy of the library is no longer
6561             // running.
6562             neighbor = 2;
6563           }
6564         }
6565       }
6566       switch (neighbor) {
6567       case 0: // Cannot parse environment variable -- neighbor status unknown.
        // Assume it is the incompatible format of a future version of the
6569         // library. Assume the other library is alive.
6570         // WARN( ... ); // TODO: Issue a warning.
6571         file_name = "unknown library";
6572         KMP_FALLTHROUGH();
      // Attention! Falling through to the next case. That's intentional.
6574       case 1: { // Neighbor is alive.
6575         // Check it is allowed.
6576         char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6577         if (!__kmp_str_match_true(duplicate_ok)) {
6578           // That's not allowed. Issue fatal error.
6579           __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6580                       KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6581         }
6582         KMP_INTERNAL_FREE(duplicate_ok);
6583         __kmp_duplicate_library_ok = 1;
6584         done = 1; // Exit the loop.
6585       } break;
6586       case 2: { // Neighbor is dead.
6587 
6588 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6589         // close shared memory.
6590         shm_unlink(shm_name); // this removes file in /dev/shm
6591 #else
6592         // Clear the variable and try to register library again.
6593         __kmp_env_unset(name);
6594 #endif
6595       } break;
6596       default: {
6597         KMP_DEBUG_ASSERT(0);
6598       } break;
6599       }
6600     }
6601     KMP_INTERNAL_FREE((void *)value);
6602 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6603     KMP_INTERNAL_FREE((void *)shm_name);
6604 #endif
6605   } // while
6606   KMP_INTERNAL_FREE((void *)name);
6607 
6608 } // func __kmp_register_library_startup
6609 
6610 void __kmp_unregister_library(void) {
6611 
6612   char *name = __kmp_reg_status_name();
6613   char *value = NULL;
6614 
6615 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6616   char *shm_name = __kmp_str_format("/%s", name);
6617   int fd1 = shm_open(shm_name, O_RDONLY, 0666);
6618   if (fd1 == -1) {
6619     // file did not open. return.
6620     return;
6621   }
6622   char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6623   if (data1 != MAP_FAILED) {
6624     value = __kmp_str_format("%s", data1); // read value from SHM
6625     munmap(data1, SHM_SIZE);
6626   }
6627   close(fd1);
6628 #else
6629   value = __kmp_env_get(name);
6630 #endif
6631 
6632   KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6633   KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6634   if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6635 //  Ok, this is our variable. Delete it.
6636 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6637     shm_unlink(shm_name); // this removes file in /dev/shm
6638 #else
6639     __kmp_env_unset(name);
6640 #endif
6641   }
6642 
6643 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6644   KMP_INTERNAL_FREE(shm_name);
6645 #endif
6646 
6647   KMP_INTERNAL_FREE(__kmp_registration_str);
6648   KMP_INTERNAL_FREE(value);
6649   KMP_INTERNAL_FREE(name);
6650 
6651   __kmp_registration_flag = 0;
6652   __kmp_registration_str = NULL;
6653 
6654 } // __kmp_unregister_library
6655 
6656 // End of Library registration stuff.
6657 // -----------------------------------------------------------------------------
6658 
6659 #if KMP_MIC_SUPPORTED
6660 
6661 static void __kmp_check_mic_type() {
6662   kmp_cpuid_t cpuid_state = {0};
6663   kmp_cpuid_t *cs_p = &cpuid_state;
6664   __kmp_x86_cpuid(1, 0, cs_p);
6665   // We don't support mic1 at the moment
6666   if ((cs_p->eax & 0xff0) == 0xB10) {
6667     __kmp_mic_type = mic2;
6668   } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6669     __kmp_mic_type = mic3;
6670   } else {
6671     __kmp_mic_type = non_mic;
6672   }
6673 }
6674 
6675 #endif /* KMP_MIC_SUPPORTED */
6676 
6677 #if KMP_HAVE_UMWAIT
6678 static void __kmp_user_level_mwait_init() {
6679   struct kmp_cpuid buf;
6680   __kmp_x86_cpuid(7, 0, &buf);
6681   __kmp_umwait_enabled = ((buf.ecx >> 5) & 1) && __kmp_user_level_mwait;
6682   KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
6683                 __kmp_umwait_enabled));
6684 }
6685 #elif KMP_HAVE_MWAIT
6686 #ifndef AT_INTELPHIUSERMWAIT
6687 // Spurious, non-existent value that should always fail to return anything.
// Will be replaced with the correct value once it is known.
6689 #define AT_INTELPHIUSERMWAIT 10000
6690 #endif
6691 // getauxval() function is available in RHEL7 and SLES12. If a system with an
6692 // earlier OS is used to build the RTL, we'll use the following internal
6693 // function when the entry is not found.
6694 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
6695 unsigned long getauxval(unsigned long) { return 0; }
6696 
6697 static void __kmp_user_level_mwait_init() {
6698   // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available
6699   // use them to find if the user-level mwait is enabled. Otherwise, forcibly
6700   // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable
6701   // KMP_USER_LEVEL_MWAIT was set to TRUE.
6702   if (__kmp_mic_type == mic3) {
6703     unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
6704     if ((res & 0x1) || __kmp_user_level_mwait) {
6705       __kmp_mwait_enabled = TRUE;
6706       if (__kmp_user_level_mwait) {
6707         KMP_INFORM(EnvMwaitWarn);
6708       }
6709     } else {
6710       __kmp_mwait_enabled = FALSE;
6711     }
6712   }
6713   KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
6714                 "__kmp_mwait_enabled = %d\n",
6715                 __kmp_mic_type, __kmp_mwait_enabled));
6716 }
6717 #endif /* KMP_HAVE_UMWAIT */
6718 
6719 static void __kmp_do_serial_initialize(void) {
6720   int i, gtid;
6721   size_t size;
6722 
6723   KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6724 
6725   KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6726   KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6727   KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6728   KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6729   KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6730 
6731 #if OMPT_SUPPORT
6732   ompt_pre_init();
6733 #endif
6734 #if OMPD_SUPPORT
6735   __kmp_env_dump();
6736   ompd_init();
6737 #endif
6738 
6739   __kmp_validate_locks();
6740 
6741   /* Initialize internal memory allocator */
6742   __kmp_init_allocator();
6743 
6744   /* Register the library startup via an environment variable and check to see
6745      whether another copy of the library is already registered. */
6746 
6747   __kmp_register_library_startup();
6748 
6749   /* TODO reinitialization of library */
6750   if (TCR_4(__kmp_global.g.g_done)) {
6751     KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6752   }
6753 
6754   __kmp_global.g.g_abort = 0;
6755   TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6756 
6757 /* initialize the locks */
6758 #if KMP_USE_ADAPTIVE_LOCKS
6759 #if KMP_DEBUG_ADAPTIVE_LOCKS
6760   __kmp_init_speculative_stats();
6761 #endif
6762 #endif
6763 #if KMP_STATS_ENABLED
6764   __kmp_stats_init();
6765 #endif
6766   __kmp_init_lock(&__kmp_global_lock);
6767   __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6768   __kmp_init_lock(&__kmp_debug_lock);
6769   __kmp_init_atomic_lock(&__kmp_atomic_lock);
6770   __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6771   __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6772   __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6773   __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6774   __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6775   __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6776   __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6777   __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6778   __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6779   __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6780   __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6781   __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6782   __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6783   __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6784 #if KMP_USE_MONITOR
6785   __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6786 #endif
6787   __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6788 
6789   /* conduct initialization and initial setup of configuration */
6790 
6791   __kmp_runtime_initialize();
6792 
6793 #if KMP_MIC_SUPPORTED
6794   __kmp_check_mic_type();
6795 #endif
6796 
6797 // Some global variable initialization moved here from kmp_env_initialize()
6798 #ifdef KMP_DEBUG
6799   kmp_diag = 0;
6800 #endif
6801   __kmp_abort_delay = 0;
6802 
6803   // From __kmp_init_dflt_team_nth()
6804   /* assume the entire machine will be used */
6805   __kmp_dflt_team_nth_ub = __kmp_xproc;
6806   if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6807     __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6808   }
6809   if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6810     __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6811   }
6812   __kmp_max_nth = __kmp_sys_max_nth;
6813   __kmp_cg_max_nth = __kmp_sys_max_nth;
6814   __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6815   if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6816     __kmp_teams_max_nth = __kmp_sys_max_nth;
6817   }
6818 
6819   // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6820   // part
6821   __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6822 #if KMP_USE_MONITOR
6823   __kmp_monitor_wakeups =
6824       KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6825   __kmp_bt_intervals =
6826       KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6827 #endif
6828   // From "KMP_LIBRARY" part of __kmp_env_initialize()
6829   __kmp_library = library_throughput;
6830   // From KMP_SCHEDULE initialization
6831   __kmp_static = kmp_sch_static_balanced;
// AC: do not use analytical here, because it is non-monotonic
6833 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6834 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6835 // need to repeat assignment
6836 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6837 // bit control and barrier method control parts
6838 #if KMP_FAST_REDUCTION_BARRIER
6839 #define kmp_reduction_barrier_gather_bb ((int)1)
6840 #define kmp_reduction_barrier_release_bb ((int)1)
6841 #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6842 #define kmp_reduction_barrier_release_pat bp_hyper_bar
6843 #endif // KMP_FAST_REDUCTION_BARRIER
6844   for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6845     __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6846     __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6847     __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6848     __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6849 #if KMP_FAST_REDUCTION_BARRIER
    if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only
      // (lin_64): hyper,1
6852       __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6853       __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6854       __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6855       __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6856     }
6857 #endif // KMP_FAST_REDUCTION_BARRIER
6858   }
6859 #if KMP_FAST_REDUCTION_BARRIER
6860 #undef kmp_reduction_barrier_release_pat
6861 #undef kmp_reduction_barrier_gather_pat
6862 #undef kmp_reduction_barrier_release_bb
6863 #undef kmp_reduction_barrier_gather_bb
6864 #endif // KMP_FAST_REDUCTION_BARRIER
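// Note on the defaults set above (purely explanatory, no behavior change):
// a branch-bits value of b gives the corresponding barrier tree a fanout of
// 2^b, so e.g. branch bits of 2 mean 4-way trees, while the reduction-barrier
// override of 1 with the hyper pattern means 2-way hypercube steps. If I
// recall correctly, these defaults can still be overridden later by the
// KMP_*_BARRIER / KMP_*_BARRIER_PATTERN environment variables parsed in
// __kmp_env_initialize().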
6865 #if KMP_MIC_SUPPORTED
6866   if (__kmp_mic_type == mic2) { // KNC
6867     // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6868     __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6869     __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6870         1; // forkjoin release
6871     __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6872     __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6873   }
6874 #if KMP_FAST_REDUCTION_BARRIER
6875   if (__kmp_mic_type == mic2) { // KNC
6876     __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6877     __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6878   }
6879 #endif // KMP_FAST_REDUCTION_BARRIER
6880 #endif // KMP_MIC_SUPPORTED
6881 
6882 // From KMP_CHECKS initialization
6883 #ifdef KMP_DEBUG
6884   __kmp_env_checks = TRUE; /* development versions have the extra checks */
6885 #else
  __kmp_env_checks = FALSE; /* release builds do not have the extra checks */
6887 #endif
6888 
6889   // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6890   __kmp_foreign_tp = TRUE;
6891 
6892   __kmp_global.g.g_dynamic = FALSE;
6893   __kmp_global.g.g_dynamic_mode = dynamic_default;
6894 
6895   __kmp_init_nesting_mode();
6896 
6897   __kmp_env_initialize(NULL);
6898 
6899 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
6900   __kmp_user_level_mwait_init();
6901 #endif
6902 // Print all messages in message catalog for testing purposes.
6903 #ifdef KMP_DEBUG
6904   char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6905   if (__kmp_str_match_true(val)) {
6906     kmp_str_buf_t buffer;
6907     __kmp_str_buf_init(&buffer);
6908     __kmp_i18n_dump_catalog(&buffer);
6909     __kmp_printf("%s", buffer.str);
6910     __kmp_str_buf_free(&buffer);
6911   }
6912   __kmp_env_free(&val);
6913 #endif
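// Illustrative usage of the debug-only block above (the application name is
// just a placeholder; the output is whatever the catalog contains):
//   KMP_DUMP_CATALOG=1 ./my_omp_app
// prints every message from the i18n catalog, which is handy for verifying
// that a translated catalog loads and formats correctly.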
6914 
6915   __kmp_threads_capacity =
6916       __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6917   // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6918   __kmp_tp_capacity = __kmp_default_tp_capacity(
6919       __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6920 
6921   // If the library is shut down properly, both pools must be NULL. Just in
6922   // case, set them to NULL -- some memory may leak, but subsequent code will
6923   // work even if pools are not freed.
6924   KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6925   KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6926   KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6927   __kmp_thread_pool = NULL;
6928   __kmp_thread_pool_insert_pt = NULL;
6929   __kmp_team_pool = NULL;
6930 
6931   /* Allocate all of the variable sized records */
6932   /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6933    * expandable */
6934   /* Since allocation is cache-aligned, just add extra padding at the end */
6935   size =
6936       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6937       CACHE_LINE;
6938   __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6939   __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6940                                sizeof(kmp_info_t *) * __kmp_threads_capacity);
6941 
6942   /* init thread counts */
6943   KMP_DEBUG_ASSERT(__kmp_all_nth ==
6944                    0); // Asserts fail if the library is reinitializing and
6945   KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
6946   __kmp_all_nth = 0;
6947   __kmp_nth = 0;
6948 
6949   /* setup the uber master thread and hierarchy */
6950   gtid = __kmp_register_root(TRUE);
6951   KA_TRACE(10, ("__kmp_do_serial_initialize  T#%d\n", gtid));
6952   KMP_ASSERT(KMP_UBER_GTID(gtid));
6953   KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6954 
6955   KMP_MB(); /* Flush all pending memory write invalidates.  */
6956 
6957   __kmp_common_initialize();
6958 
6959 #if KMP_OS_UNIX
6960   /* invoke the child fork handler */
6961   __kmp_register_atfork();
6962 #endif
6963 
6964 #if !KMP_DYNAMIC_LIB
6965   {
6966     /* Invoke the exit handler when the program finishes, only for static
6967        library. For dynamic library, we already have _fini and DllMain. */
6968     int rc = atexit(__kmp_internal_end_atexit);
6969     if (rc != 0) {
6970       __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6971                   __kmp_msg_null);
6972     }
6973   }
6974 #endif
6975 
6976 #if KMP_HANDLE_SIGNALS
6977 #if KMP_OS_UNIX
6978   /* NOTE: make sure that this is called before the user installs their own
6979      signal handlers so that the user handlers are called first. this way they
6980      can return false, not call our handler, avoid terminating the library, and
6981      continue execution where they left off. */
6982   __kmp_install_signals(FALSE);
6983 #endif /* KMP_OS_UNIX */
6984 #if KMP_OS_WINDOWS
6985   __kmp_install_signals(TRUE);
6986 #endif /* KMP_OS_WINDOWS */
6987 #endif
6988 
6989   /* we have finished the serial initialization */
6990   __kmp_init_counter++;
6991 
6992   __kmp_init_serial = TRUE;
6993 
6994   if (__kmp_settings) {
6995     __kmp_env_print();
6996   }
6997 
6998   if (__kmp_display_env || __kmp_display_env_verbose) {
6999     __kmp_env_print_2();
7000   }
7001 
7002 #if OMPT_SUPPORT
7003   ompt_post_init();
7004 #endif
7005 
7006   KMP_MB();
7007 
7008   KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
7009 }
7010 
7011 void __kmp_serial_initialize(void) {
7012   if (__kmp_init_serial) {
7013     return;
7014   }
7015   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7016   if (__kmp_init_serial) {
7017     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7018     return;
7019   }
7020   __kmp_do_serial_initialize();
7021   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7022 }
7023 
7024 static void __kmp_do_middle_initialize(void) {
7025   int i, j;
7026   int prev_dflt_team_nth;
7027 
7028   if (!__kmp_init_serial) {
7029     __kmp_do_serial_initialize();
7030   }
7031 
7032   KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
7033 
7034   // Save the previous value for the __kmp_dflt_team_nth so that
7035   // we can avoid some reinitialization if it hasn't changed.
7036   prev_dflt_team_nth = __kmp_dflt_team_nth;
7037 
7038 #if KMP_AFFINITY_SUPPORTED
7039   // __kmp_affinity_initialize() will try to set __kmp_ncores to the
7040   // number of cores on the machine.
7041   __kmp_affinity_initialize();
7042 
7043 #endif /* KMP_AFFINITY_SUPPORTED */
7044 
7045   KMP_ASSERT(__kmp_xproc > 0);
7046   if (__kmp_avail_proc == 0) {
7047     __kmp_avail_proc = __kmp_xproc;
7048   }
7049 
7050   // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
7051   // correct them now
7052   j = 0;
7053   while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
7054     __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7055         __kmp_avail_proc;
7056     j++;
7057   }
7058 
7059   if (__kmp_dflt_team_nth == 0) {
7060 #ifdef KMP_DFLT_NTH_CORES
7061     // Default #threads = #cores
7062     __kmp_dflt_team_nth = __kmp_ncores;
7063     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7064                   "__kmp_ncores (%d)\n",
7065                   __kmp_dflt_team_nth));
7066 #else
7067     // Default #threads = #available OS procs
7068     __kmp_dflt_team_nth = __kmp_avail_proc;
7069     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7070                   "__kmp_avail_proc(%d)\n",
7071                   __kmp_dflt_team_nth));
7072 #endif /* KMP_DFLT_NTH_CORES */
7073   }
7074 
7075   if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7076     __kmp_dflt_team_nth = KMP_MIN_NTH;
7077   }
7078   if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7079     __kmp_dflt_team_nth = __kmp_sys_max_nth;
7080   }
7081 
7082   if (__kmp_nesting_mode > 0)
7083     __kmp_set_nesting_mode_threads();
7084 
7085   // There's no harm in continuing if the following check fails,
7086   // but it indicates an error in the previous logic.
7087   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7088 
7089   if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7090     // Run through the __kmp_threads array and set the num threads icv for each
7091     // root thread that is currently registered with the RTL (which has not
7092     // already explicitly set its nthreads-var with a call to
7093     // omp_set_num_threads()).
7094     for (i = 0; i < __kmp_threads_capacity; i++) {
7095       kmp_info_t *thread = __kmp_threads[i];
7096       if (thread == NULL)
7097         continue;
7098       if (thread->th.th_current_task->td_icvs.nproc != 0)
7099         continue;
7100 
7101       set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7102     }
7103   }
7104   KA_TRACE(
7105       20,
7106       ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7107        __kmp_dflt_team_nth));
7108 
7109 #ifdef KMP_ADJUST_BLOCKTIME
  /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
7111   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7112     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7113     if (__kmp_nth > __kmp_avail_proc) {
7114       __kmp_zero_bt = TRUE;
7115     }
7116   }
7117 #endif /* KMP_ADJUST_BLOCKTIME */
7118 
7119   /* we have finished middle initialization */
7120   TCW_SYNC_4(__kmp_init_middle, TRUE);
7121 
7122   KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7123 }
7124 
7125 void __kmp_middle_initialize(void) {
7126   if (__kmp_init_middle) {
7127     return;
7128   }
7129   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7130   if (__kmp_init_middle) {
7131     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7132     return;
7133   }
7134   __kmp_do_middle_initialize();
7135   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7136 }
7137 
7138 void __kmp_parallel_initialize(void) {
7139   int gtid = __kmp_entry_gtid(); // this might be a new root
7140 
7141   /* synchronize parallel initialization (for sibling) */
7142   if (TCR_4(__kmp_init_parallel))
7143     return;
7144   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7145   if (TCR_4(__kmp_init_parallel)) {
7146     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7147     return;
7148   }
7149 
7150   /* TODO reinitialization after we have already shut down */
7151   if (TCR_4(__kmp_global.g.g_done)) {
7152     KA_TRACE(
7153         10,
7154         ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7155     __kmp_infinite_loop();
7156   }
7157 
  /* jc: The lock __kmp_initz_lock is already held, so calling
     __kmp_serial_initialize or __kmp_middle_initialize here would deadlock.
     Instead, call __kmp_do_middle_initialize directly; it performs serial
     initialization itself if that has not been done yet. */
7161   if (!__kmp_init_middle) {
7162     __kmp_do_middle_initialize();
7163   }
7164   __kmp_assign_root_init_mask();
7165   __kmp_resume_if_hard_paused();
7166 
7167   /* begin initialization */
7168   KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7169   KMP_ASSERT(KMP_UBER_GTID(gtid));
7170 
7171 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7172   // Save the FP control regs.
7173   // Worker threads will set theirs to these values at thread startup.
7174   __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7175   __kmp_store_mxcsr(&__kmp_init_mxcsr);
7176   __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7177 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7178 
7179 #if KMP_OS_UNIX
7180 #if KMP_HANDLE_SIGNALS
7181   /*  must be after __kmp_serial_initialize  */
7182   __kmp_install_signals(TRUE);
7183 #endif
7184 #endif
7185 
7186   __kmp_suspend_initialize();
7187 
7188 #if defined(USE_LOAD_BALANCE)
7189   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7190     __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7191   }
7192 #else
7193   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7194     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7195   }
7196 #endif
7197 
7198   if (__kmp_version) {
7199     __kmp_print_version_2();
7200   }
7201 
7202   /* we have finished parallel initialization */
7203   TCW_SYNC_4(__kmp_init_parallel, TRUE);
7204 
7205   KMP_MB();
7206   KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7207 
7208   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7209 }
7210 
7211 void __kmp_hidden_helper_initialize() {
7212   if (TCR_4(__kmp_init_hidden_helper))
7213     return;
7214 
7215   // __kmp_parallel_initialize is required before we initialize hidden helper
7216   if (!TCR_4(__kmp_init_parallel))
7217     __kmp_parallel_initialize();
7218 
  // Double check. Note that this double check should not be placed before
  // __kmp_parallel_initialize, as that would cause a deadlock.
7221   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7222   if (TCR_4(__kmp_init_hidden_helper)) {
7223     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7224     return;
7225   }
7226 
7227   // Set the count of hidden helper tasks to be executed to zero
7228   KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7229 
7230   // Set the global variable indicating that we're initializing hidden helper
7231   // team/threads
7232   TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7233 
7234   // Platform independent initialization
7235   __kmp_do_initialize_hidden_helper_threads();
7236 
7237   // Wait here for the finish of initialization of hidden helper teams
7238   __kmp_hidden_helper_threads_initz_wait();
7239 
7240   // We have finished hidden helper initialization
7241   TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7242 
7243   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7244 }
7245 
7246 /* ------------------------------------------------------------------------ */
7247 
7248 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7249                                    kmp_team_t *team) {
7250   kmp_disp_t *dispatch;
7251 
7252   KMP_MB();
7253 
7254   /* none of the threads have encountered any constructs, yet. */
7255   this_thr->th.th_local.this_construct = 0;
7256 #if KMP_CACHE_MANAGE
7257   KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7258 #endif /* KMP_CACHE_MANAGE */
7259   dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7260   KMP_DEBUG_ASSERT(dispatch);
7261   KMP_DEBUG_ASSERT(team->t.t_dispatch);
7262   // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7263   // this_thr->th.th_info.ds.ds_tid ] );
7264 
7265   dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7266   dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7267   if (__kmp_env_consistency_check)
7268     __kmp_push_parallel(gtid, team->t.t_ident);
7269 
7270   KMP_MB(); /* Flush all pending memory write invalidates.  */
7271 }
7272 
7273 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7274                                   kmp_team_t *team) {
7275   if (__kmp_env_consistency_check)
7276     __kmp_pop_parallel(gtid, team->t.t_ident);
7277 
7278   __kmp_finish_implicit_task(this_thr);
7279 }
7280 
7281 int __kmp_invoke_task_func(int gtid) {
7282   int rc;
7283   int tid = __kmp_tid_from_gtid(gtid);
7284   kmp_info_t *this_thr = __kmp_threads[gtid];
7285   kmp_team_t *team = this_thr->th.th_team;
7286 
7287   __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7288 #if USE_ITT_BUILD
7289   if (__itt_stack_caller_create_ptr) {
7290     // inform ittnotify about entering user's code
7291     if (team->t.t_stack_id != NULL) {
7292       __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id);
7293     } else {
7294       KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7295       __kmp_itt_stack_callee_enter(
7296           (__itt_caller)team->t.t_parent->t.t_stack_id);
7297     }
7298   }
7299 #endif /* USE_ITT_BUILD */
7300 #if INCLUDE_SSC_MARKS
7301   SSC_MARK_INVOKING();
7302 #endif
7303 
7304 #if OMPT_SUPPORT
7305   void *dummy;
7306   void **exit_frame_p;
7307   ompt_data_t *my_task_data;
7308   ompt_data_t *my_parallel_data;
7309   int ompt_team_size;
7310 
7311   if (ompt_enabled.enabled) {
7312     exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
7313                          .ompt_task_info.frame.exit_frame.ptr);
7314   } else {
7315     exit_frame_p = &dummy;
7316   }
7317 
7318   my_task_data =
7319       &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7320   my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7321   if (ompt_enabled.ompt_callback_implicit_task) {
7322     ompt_team_size = team->t.t_nproc;
7323     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7324         ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7325         __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7326     OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7327   }
7328 #endif
7329 
7330 #if KMP_STATS_ENABLED
7331   stats_state_e previous_state = KMP_GET_THREAD_STATE();
7332   if (previous_state == stats_state_e::TEAMS_REGION) {
7333     KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7334   } else {
7335     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7336   }
7337   KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7338 #endif
7339 
7340   rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7341                               tid, (int)team->t.t_argc, (void **)team->t.t_argv
7342 #if OMPT_SUPPORT
7343                               ,
7344                               exit_frame_p
7345 #endif
7346   );
7347 #if OMPT_SUPPORT
7348   *exit_frame_p = NULL;
7349   this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7350 #endif
7351 
7352 #if KMP_STATS_ENABLED
7353   if (previous_state == stats_state_e::TEAMS_REGION) {
7354     KMP_SET_THREAD_STATE(previous_state);
7355   }
7356   KMP_POP_PARTITIONED_TIMER();
7357 #endif
7358 
7359 #if USE_ITT_BUILD
7360   if (__itt_stack_caller_create_ptr) {
7361     // inform ittnotify about leaving user's code
7362     if (team->t.t_stack_id != NULL) {
7363       __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id);
7364     } else {
7365       KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7366       __kmp_itt_stack_callee_leave(
7367           (__itt_caller)team->t.t_parent->t.t_stack_id);
7368     }
7369   }
7370 #endif /* USE_ITT_BUILD */
7371   __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7372 
7373   return rc;
7374 }
7375 
7376 void __kmp_teams_master(int gtid) {
7377   // This routine is called by all primary threads in teams construct
7378   kmp_info_t *thr = __kmp_threads[gtid];
7379   kmp_team_t *team = thr->th.th_team;
7380   ident_t *loc = team->t.t_ident;
7381   thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7382   KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7383   KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7384   KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7385                 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7386 
7387   // This thread is a new CG root.  Set up the proper variables.
7388   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7389   tmp->cg_root = thr; // Make thr the CG root
7390   // Init to thread limit stored when league primary threads were forked
7391   tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7392   tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7393   KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7394                  " cg_nthreads to 1\n",
7395                  thr, tmp));
7396   tmp->up = thr->th.th_cg_roots;
7397   thr->th.th_cg_roots = tmp;
7398 
// Launch the league of teams now, but do not let workers execute
// (they hang on the fork barrier until the next parallel region)
7401 #if INCLUDE_SSC_MARKS
7402   SSC_MARK_FORKING();
7403 #endif
7404   __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7405                   (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7406                   VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7407 #if INCLUDE_SSC_MARKS
7408   SSC_MARK_JOINING();
7409 #endif
7410   // If the team size was reduced from the limit, set it to the new size
7411   if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7412     thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7413   // AC: last parameter "1" eliminates join barrier which won't work because
7414   // worker threads are in a fork barrier waiting for more parallel regions
7415   __kmp_join_call(loc, gtid
7416 #if OMPT_SUPPORT
7417                   ,
7418                   fork_context_intel
7419 #endif
7420                   ,
7421                   1);
7422 }
7423 
7424 int __kmp_invoke_teams_master(int gtid) {
7425   kmp_info_t *this_thr = __kmp_threads[gtid];
7426   kmp_team_t *team = this_thr->th.th_team;
7427 #if KMP_DEBUG
7428   if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7429     KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7430                      (void *)__kmp_teams_master);
7431 #endif
7432   __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7433 #if OMPT_SUPPORT
7434   int tid = __kmp_tid_from_gtid(gtid);
7435   ompt_data_t *task_data =
7436       &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7437   ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7438   if (ompt_enabled.ompt_callback_implicit_task) {
7439     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7440         ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7441         ompt_task_initial);
7442     OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7443   }
7444 #endif
7445   __kmp_teams_master(gtid);
7446 #if OMPT_SUPPORT
7447   this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7448 #endif
7449   __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7450   return 1;
7451 }
7452 
/* This sets the requested number of threads for the next parallel region
   encountered by this team. Since this should be enclosed in the fork/join
   critical section, it avoids race conditions with asymmetrical nested
   parallelism. */
7457 
7458 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7459   kmp_info_t *thr = __kmp_threads[gtid];
7460 
7461   if (num_threads > 0)
7462     thr->th.th_set_nproc = num_threads;
7463 }
7464 
7465 static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
7466                                     int num_threads) {
7467   KMP_DEBUG_ASSERT(thr);
7468   // Remember the number of threads for inner parallel regions
7469   if (!TCR_4(__kmp_init_middle))
7470     __kmp_middle_initialize(); // get internal globals calculated
7471   __kmp_assign_root_init_mask();
7472   KMP_DEBUG_ASSERT(__kmp_avail_proc);
7473   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7474 
7475   if (num_threads == 0) {
7476     if (__kmp_teams_thread_limit > 0) {
7477       num_threads = __kmp_teams_thread_limit;
7478     } else {
7479       num_threads = __kmp_avail_proc / num_teams;
7480     }
    // Adjust num_threads without a warning, as it is not a user setting:
    // num_threads = min(num_threads, nthreads-var, thread-limit-var).
    // No thread_limit clause specified - do not change thread-limit-var ICV
7484     if (num_threads > __kmp_dflt_team_nth) {
7485       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7486     }
7487     if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7488       num_threads = thr->th.th_current_task->td_icvs.thread_limit;
    } // prevent the team size from exceeding thread-limit-var
7490     if (num_teams * num_threads > __kmp_teams_max_nth) {
7491       num_threads = __kmp_teams_max_nth / num_teams;
7492     }
7493     if (num_threads == 0) {
7494       num_threads = 1;
7495     }
7496   } else {
    // This thread will be the primary thread of the league of primary threads.
7498     // Store new thread limit; old limit is saved in th_cg_roots list
7499     thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7500     // num_threads = min(num_threads, nthreads-var)
7501     if (num_threads > __kmp_dflt_team_nth) {
7502       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7503     }
7504     if (num_teams * num_threads > __kmp_teams_max_nth) {
7505       int new_threads = __kmp_teams_max_nth / num_teams;
7506       if (new_threads == 0) {
7507         new_threads = 1;
7508       }
7509       if (new_threads != num_threads) {
7510         if (!__kmp_reserve_warn) { // user asked for too many threads
7511           __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7512           __kmp_msg(kmp_ms_warning,
7513                     KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7514                     KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7515         }
7516       }
7517       num_threads = new_threads;
7518     }
7519   }
7520   thr->th.th_teams_size.nth = num_threads;
7521 }
7522 
7523 /* this sets the requested number of teams for the teams region and/or
7524    the number of threads for the next parallel region encountered  */
7525 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7526                           int num_threads) {
7527   kmp_info_t *thr = __kmp_threads[gtid];
7528   KMP_DEBUG_ASSERT(num_teams >= 0);
7529   KMP_DEBUG_ASSERT(num_threads >= 0);
7530 
7531   if (num_teams == 0) {
7532     if (__kmp_nteams > 0) {
7533       num_teams = __kmp_nteams;
7534     } else {
7535       num_teams = 1; // default number of teams is 1.
7536     }
7537   }
  if (num_teams > __kmp_teams_max_nth) { // were too many teams requested?
7539     if (!__kmp_reserve_warn) {
7540       __kmp_reserve_warn = 1;
7541       __kmp_msg(kmp_ms_warning,
7542                 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7543                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7544     }
7545     num_teams = __kmp_teams_max_nth;
7546   }
7547   // Set number of teams (number of threads in the outer "parallel" of the
7548   // teams)
7549   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7550 
7551   __kmp_push_thread_limit(thr, num_teams, num_threads);
7552 }
7553 
7554 /* This sets the requested number of teams for the teams region and/or
7555    the number of threads for the next parallel region encountered  */
7556 void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
7557                              int num_teams_ub, int num_threads) {
7558   kmp_info_t *thr = __kmp_threads[gtid];
7559   KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
7560   KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
7561   KMP_DEBUG_ASSERT(num_threads >= 0);
7562 
7563   if (num_teams_lb > num_teams_ub) {
7564     __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
7565                 KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
7566   }
7567 
  int num_teams = 1; // default number of teams is 1.
7569 
7570   if (num_teams_lb == 0 && num_teams_ub > 0)
7571     num_teams_lb = num_teams_ub;
7572 
7573   if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
7574     num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
7575     if (num_teams > __kmp_teams_max_nth) {
7576       if (!__kmp_reserve_warn) {
7577         __kmp_reserve_warn = 1;
7578         __kmp_msg(kmp_ms_warning,
7579                   KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7580                   KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7581       }
7582       num_teams = __kmp_teams_max_nth;
7583     }
7584   } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
7585     num_teams = num_teams_ub;
7586   } else { // num_teams_lb <= num_teams <= num_teams_ub
7587     if (num_threads == 0) {
7588       if (num_teams_ub > __kmp_teams_max_nth) {
7589         num_teams = num_teams_lb;
7590       } else {
7591         num_teams = num_teams_ub;
7592       }
7593     } else {
7594       num_teams = (num_threads > __kmp_teams_max_nth)
7595                       ? num_teams
7596                       : __kmp_teams_max_nth / num_threads;
7597       if (num_teams < num_teams_lb) {
7598         num_teams = num_teams_lb;
7599       } else if (num_teams > num_teams_ub) {
7600         num_teams = num_teams_ub;
7601       }
7602     }
7603   }
7604   // Set number of teams (number of threads in the outer "parallel" of the
7605   // teams)
7606   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7607 
7608   __kmp_push_thread_limit(thr, num_teams, num_threads);
7609 }
7610 
7611 // Set the proc_bind var to use in the following parallel region.
7612 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7613   kmp_info_t *thr = __kmp_threads[gtid];
7614   thr->th.th_set_proc_bind = proc_bind;
7615 }
7616 
7617 /* Launch the worker threads into the microtask. */
7618 
7619 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7620   kmp_info_t *this_thr = __kmp_threads[gtid];
7621 
7622 #ifdef KMP_DEBUG
7623   int f;
7624 #endif /* KMP_DEBUG */
7625 
7626   KMP_DEBUG_ASSERT(team);
7627   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7628   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7629   KMP_MB(); /* Flush all pending memory write invalidates.  */
7630 
7631   team->t.t_construct = 0; /* no single directives seen yet */
7632   team->t.t_ordered.dt.t_value =
7633       0; /* thread 0 enters the ordered section first */
7634 
7635   /* Reset the identifiers on the dispatch buffer */
7636   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7637   if (team->t.t_max_nproc > 1) {
7638     int i;
7639     for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7640       team->t.t_disp_buffer[i].buffer_index = i;
7641       team->t.t_disp_buffer[i].doacross_buf_idx = i;
7642     }
7643   } else {
7644     team->t.t_disp_buffer[0].buffer_index = 0;
7645     team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7646   }
7647 
7648   KMP_MB(); /* Flush all pending memory write invalidates.  */
7649   KMP_ASSERT(this_thr->th.th_team == team);
7650 
7651 #ifdef KMP_DEBUG
7652   for (f = 0; f < team->t.t_nproc; f++) {
7653     KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7654                      team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7655   }
7656 #endif /* KMP_DEBUG */
7657 
7658   /* release the worker threads so they may begin working */
7659   __kmp_fork_barrier(gtid, 0);
7660 }
7661 
7662 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7663   kmp_info_t *this_thr = __kmp_threads[gtid];
7664 
7665   KMP_DEBUG_ASSERT(team);
7666   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7667   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7668   KMP_MB(); /* Flush all pending memory write invalidates.  */
7669 
7670   /* Join barrier after fork */
7671 
7672 #ifdef KMP_DEBUG
7673   if (__kmp_threads[gtid] &&
7674       __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7675     __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7676                  __kmp_threads[gtid]);
7677     __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7678                  "team->t.t_nproc=%d\n",
7679                  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7680                  team->t.t_nproc);
7681     __kmp_print_structure();
7682   }
7683   KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7684                    __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7685 #endif /* KMP_DEBUG */
7686 
7687   __kmp_join_barrier(gtid); /* wait for everyone */
7688 #if OMPT_SUPPORT
7689   if (ompt_enabled.enabled &&
7690       this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7691     int ds_tid = this_thr->th.th_info.ds.ds_tid;
7692     ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7693     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7694 #if OMPT_OPTIONAL
7695     void *codeptr = NULL;
7696     if (KMP_MASTER_TID(ds_tid) &&
7697         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7698          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7699       codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7700 
7701     if (ompt_enabled.ompt_callback_sync_region_wait) {
7702       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7703           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7704           codeptr);
7705     }
7706     if (ompt_enabled.ompt_callback_sync_region) {
7707       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7708           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7709           codeptr);
7710     }
7711 #endif
7712     if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7713       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7714           ompt_scope_end, NULL, task_data, 0, ds_tid,
7715           ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7716     }
7717   }
7718 #endif
7719 
7720   KMP_MB(); /* Flush all pending memory write invalidates.  */
7721   KMP_ASSERT(this_thr->th.th_team == team);
7722 }
7723 
7724 /* ------------------------------------------------------------------------ */
7725 
7726 #ifdef USE_LOAD_BALANCE
7727 
7728 // Return the worker threads actively spinning in the hot team, if we
7729 // are at the outermost level of parallelism.  Otherwise, return 0.
7730 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7731   int i;
7732   int retval;
7733   kmp_team_t *hot_team;
7734 
7735   if (root->r.r_active) {
7736     return 0;
7737   }
7738   hot_team = root->r.r_hot_team;
7739   if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7740     return hot_team->t.t_nproc - 1; // Don't count primary thread
7741   }
7742 
7743   // Skip the primary thread - it is accounted for elsewhere.
7744   retval = 0;
7745   for (i = 1; i < hot_team->t.t_nproc; i++) {
7746     if (hot_team->t.t_threads[i]->th.th_active) {
7747       retval++;
7748     }
7749   }
7750   return retval;
7751 }
7752 
7753 // Perform an automatic adjustment to the number of
7754 // threads used by the next parallel region.
7755 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7756   int retval;
7757   int pool_active;
7758   int hot_team_active;
7759   int team_curr_active;
7760   int system_active;
7761 
7762   KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7763                 set_nproc));
7764   KMP_DEBUG_ASSERT(root);
7765   KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7766                        ->th.th_current_task->td_icvs.dynamic == TRUE);
7767   KMP_DEBUG_ASSERT(set_nproc > 1);
7768 
7769   if (set_nproc == 1) {
7770     KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7771     return 1;
7772   }
7773 
7774   // Threads that are active in the thread pool, active in the hot team for this
7775   // particular root (if we are at the outer par level), and the currently
7776   // executing thread (to become the primary thread) are available to add to the
7777   // new team, but are currently contributing to the system load, and must be
7778   // accounted for.
7779   pool_active = __kmp_thread_pool_active_nth;
7780   hot_team_active = __kmp_active_hot_team_nproc(root);
7781   team_curr_active = pool_active + hot_team_active + 1;
7782 
7783   // Check the system load.
7784   system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7785   KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7786                 "hot team active = %d\n",
7787                 system_active, pool_active, hot_team_active));
7788 
7789   if (system_active < 0) {
7790     // There was an error reading the necessary info from /proc, so use the
7791     // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7792     // = dynamic_thread_limit, we shouldn't wind up getting back here.
7793     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7794     KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7795 
7796     // Make this call behave like the thread limit algorithm.
7797     retval = __kmp_avail_proc - __kmp_nth +
7798              (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7799     if (retval > set_nproc) {
7800       retval = set_nproc;
7801     }
7802     if (retval < KMP_MIN_NTH) {
7803       retval = KMP_MIN_NTH;
7804     }
7805 
7806     KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7807                   retval));
7808     return retval;
7809   }
7810 
7811   // There is a slight delay in the load balance algorithm in detecting new
7812   // running procs. The real system load at this instant should be at least as
  // large as the number of active OMP threads available to add to the team.
7814   if (system_active < team_curr_active) {
7815     system_active = team_curr_active;
7816   }
7817   retval = __kmp_avail_proc - system_active + team_curr_active;
7818   if (retval > set_nproc) {
7819     retval = set_nproc;
7820   }
7821   if (retval < KMP_MIN_NTH) {
7822     retval = KMP_MIN_NTH;
7823   }
7824 
7825   KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7826   return retval;
7827 } // __kmp_load_balance_nproc()
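// Worked example of the estimate above (numbers hypothetical): with
// __kmp_avail_proc = 8, team_curr_active = 1 (just the forking thread) and a
// measured system_active of 3, the routine proposes
// retval = 8 - 3 + 1 = 6 threads, which is then clipped to set_nproc from
// above and to KMP_MIN_NTH from below.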
7828 
7829 #endif /* USE_LOAD_BALANCE */
7830 
7831 /* ------------------------------------------------------------------------ */
7832 
7833 /* NOTE: this is called with the __kmp_init_lock held */
7834 void __kmp_cleanup(void) {
7835   int f;
7836 
7837   KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7838 
7839   if (TCR_4(__kmp_init_parallel)) {
7840 #if KMP_HANDLE_SIGNALS
7841     __kmp_remove_signals();
7842 #endif
7843     TCW_4(__kmp_init_parallel, FALSE);
7844   }
7845 
7846   if (TCR_4(__kmp_init_middle)) {
7847 #if KMP_AFFINITY_SUPPORTED
7848     __kmp_affinity_uninitialize();
7849 #endif /* KMP_AFFINITY_SUPPORTED */
7850     __kmp_cleanup_hierarchy();
7851     TCW_4(__kmp_init_middle, FALSE);
7852   }
7853 
7854   KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7855 
7856   if (__kmp_init_serial) {
7857     __kmp_runtime_destroy();
7858     __kmp_init_serial = FALSE;
7859   }
7860 
7861   __kmp_cleanup_threadprivate_caches();
7862 
7863   for (f = 0; f < __kmp_threads_capacity; f++) {
7864     if (__kmp_root[f] != NULL) {
7865       __kmp_free(__kmp_root[f]);
7866       __kmp_root[f] = NULL;
7867     }
7868   }
7869   __kmp_free(__kmp_threads);
  // __kmp_threads and __kmp_root were allocated at once, as a single block,
  // so there is no need to free __kmp_root separately.
7872   __kmp_threads = NULL;
7873   __kmp_root = NULL;
7874   __kmp_threads_capacity = 0;
7875 
7876 #if KMP_USE_DYNAMIC_LOCK
7877   __kmp_cleanup_indirect_user_locks();
7878 #else
7879   __kmp_cleanup_user_locks();
7880 #endif
7881 #if OMPD_SUPPORT
7882   if (ompd_state) {
7883     __kmp_free(ompd_env_block);
7884     ompd_env_block = NULL;
7885     ompd_env_block_size = 0;
7886   }
7887 #endif
7888 
7889 #if KMP_AFFINITY_SUPPORTED
7890   KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7891   __kmp_cpuinfo_file = NULL;
7892 #endif /* KMP_AFFINITY_SUPPORTED */
7893 
7894 #if KMP_USE_ADAPTIVE_LOCKS
7895 #if KMP_DEBUG_ADAPTIVE_LOCKS
7896   __kmp_print_speculative_stats();
7897 #endif
7898 #endif
7899   KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7900   __kmp_nested_nth.nth = NULL;
7901   __kmp_nested_nth.size = 0;
7902   __kmp_nested_nth.used = 0;
7903   KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7904   __kmp_nested_proc_bind.bind_types = NULL;
7905   __kmp_nested_proc_bind.size = 0;
7906   __kmp_nested_proc_bind.used = 0;
7907   if (__kmp_affinity_format) {
7908     KMP_INTERNAL_FREE(__kmp_affinity_format);
7909     __kmp_affinity_format = NULL;
7910   }
7911 
7912   __kmp_i18n_catclose();
7913 
7914 #if KMP_USE_HIER_SCHED
7915   __kmp_hier_scheds.deallocate();
7916 #endif
7917 
7918 #if KMP_STATS_ENABLED
7919   __kmp_stats_fini();
7920 #endif
7921 
7922   KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7923 }
7924 
7925 /* ------------------------------------------------------------------------ */
7926 
7927 int __kmp_ignore_mppbeg(void) {
7928   char *env;
7929 
7930   if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7931     if (__kmp_str_match_false(env))
7932       return FALSE;
7933   }
7934   // By default __kmpc_begin() is no-op.
7935   return TRUE;
7936 }
7937 
7938 int __kmp_ignore_mppend(void) {
7939   char *env;
7940 
7941   if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7942     if (__kmp_str_match_false(env))
7943       return FALSE;
7944   }
7945   // By default __kmpc_end() is no-op.
7946   return TRUE;
7947 }
7948 
7949 void __kmp_internal_begin(void) {
7950   int gtid;
7951   kmp_root_t *root;
7952 
  /* This is a very important step, as it registers new sibling threads
     and assigns these new uber threads a new gtid. */
7955   gtid = __kmp_entry_gtid();
7956   root = __kmp_threads[gtid]->th.th_root;
7957   KMP_ASSERT(KMP_UBER_GTID(gtid));
7958 
7959   if (root->r.r_begin)
7960     return;
7961   __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7962   if (root->r.r_begin) {
7963     __kmp_release_lock(&root->r.r_begin_lock, gtid);
7964     return;
7965   }
7966 
7967   root->r.r_begin = TRUE;
7968 
7969   __kmp_release_lock(&root->r.r_begin_lock, gtid);
7970 }
7971 
7972 /* ------------------------------------------------------------------------ */
7973 
7974 void __kmp_user_set_library(enum library_type arg) {
7975   int gtid;
7976   kmp_root_t *root;
7977   kmp_info_t *thread;
7978 
7979   /* first, make sure we are initialized so we can get our gtid */
7980 
7981   gtid = __kmp_entry_gtid();
7982   thread = __kmp_threads[gtid];
7983 
7984   root = thread->th.th_root;
7985 
7986   KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7987                 library_serial));
7988   if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7989                                   thread */
7990     KMP_WARNING(SetLibraryIncorrectCall);
7991     return;
7992   }
7993 
7994   switch (arg) {
7995   case library_serial:
7996     thread->th.th_set_nproc = 0;
7997     set__nproc(thread, 1);
7998     break;
7999   case library_turnaround:
8000     thread->th.th_set_nproc = 0;
8001     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8002                                            : __kmp_dflt_team_nth_ub);
8003     break;
8004   case library_throughput:
8005     thread->th.th_set_nproc = 0;
8006     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8007                                            : __kmp_dflt_team_nth_ub);
8008     break;
8009   default:
8010     KMP_FATAL(UnknownLibraryType, arg);
8011   }
8012 
8013   __kmp_aux_set_library(arg);
8014 }
8015 
8016 void __kmp_aux_set_stacksize(size_t arg) {
8017   if (!__kmp_init_serial)
8018     __kmp_serial_initialize();
8019 
8020 #if KMP_OS_DARWIN
8021   if (arg & (0x1000 - 1)) {
8022     arg &= ~(0x1000 - 1);
8023     if (arg + 0x1000) /* check for overflow if we round up */
8024       arg += 0x1000;
8025   }
8026 #endif
8027   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8028 
8029   /* only change the default stacksize before the first parallel region */
8030   if (!TCR_4(__kmp_init_parallel)) {
8031     size_t value = arg; /* argument is in bytes */
8032 
8033     if (value < __kmp_sys_min_stksize)
8034       value = __kmp_sys_min_stksize;
8035     else if (value > KMP_MAX_STKSIZE)
8036       value = KMP_MAX_STKSIZE;
8037 
8038     __kmp_stksize = value;
8039 
8040     __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
8041   }
8042 
8043   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8044 }
8045 
8046 /* set the behaviour of the runtime library */
8047 /* TODO this can cause some odd behaviour with sibling parallelism... */
8048 void __kmp_aux_set_library(enum library_type arg) {
8049   __kmp_library = arg;
8050 
8051   switch (__kmp_library) {
8052   case library_serial: {
8053     KMP_INFORM(LibraryIsSerial);
8054   } break;
8055   case library_turnaround:
8056     if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
8057       __kmp_use_yield = 2; // only yield when oversubscribed
8058     break;
8059   case library_throughput:
8060     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
8061       __kmp_dflt_blocktime = 200;
8062     break;
8063   default:
8064     KMP_FATAL(UnknownLibraryType, arg);
8065   }
8066 }
8067 
8068 /* Getting team information common for all team API */
8069 // Returns NULL if not in teams construct
8070 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
8071   kmp_info_t *thr = __kmp_entry_thread();
8072   teams_serialized = 0;
8073   if (thr->th.th_teams_microtask) {
8074     kmp_team_t *team = thr->th.th_team;
8075     int tlevel = thr->th.th_teams_level; // the level of the teams construct
8076     int ii = team->t.t_level;
8077     teams_serialized = team->t.t_serialized;
8078     int level = tlevel + 1;
8079     KMP_DEBUG_ASSERT(ii >= tlevel);
8080     while (ii > level) {
8081       for (teams_serialized = team->t.t_serialized;
8082            (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
8083       }
8084       if (team->t.t_serialized && (!teams_serialized)) {
8085         team = team->t.t_parent;
8086         continue;
8087       }
8088       if (ii > level) {
8089         team = team->t.t_parent;
8090         ii--;
8091       }
8092     }
8093     return team;
8094   }
8095   return NULL;
8096 }
8097 
8098 int __kmp_aux_get_team_num() {
8099   int serialized;
8100   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8101   if (team) {
8102     if (serialized > 1) {
8103       return 0; // teams region is serialized ( 1 team of 1 thread ).
8104     } else {
8105       return team->t.t_master_tid;
8106     }
8107   }
8108   return 0;
8109 }
8110 
8111 int __kmp_aux_get_num_teams() {
8112   int serialized;
8113   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8114   if (team) {
8115     if (serialized > 1) {
8116       return 1;
8117     } else {
8118       return team->t.t_parent->t.t_nproc;
8119     }
8120   }
8121   return 1;
8122 }
8123 
8124 /* ------------------------------------------------------------------------ */
8125 
8126 /*
8127  * Affinity Format Parser
8128  *
8129  * Field is in form of: %[[[0].]size]type
8130  * % and type are required (%% means print a literal '%')
8131  * type is either single char or long name surrounded by {},
8132  * e.g., N or {num_threads}
8133  * 0 => leading zeros
8134  * . => right justified when size is specified
8135  * by default output is left justified
8136  * size is the *minimum* field length
8137  * All other characters are printed as is
8138  *
8139  * Available field types:
 * t {team_num}          - omp_get_team_num()
 * T {num_teams}         - omp_get_num_teams()
 * L {nesting_level}     - omp_get_level()
 * n {thread_num}        - omp_get_thread_num()
 * N {num_threads}       - omp_get_num_threads()
 * a {ancestor_tnum}     - omp_get_ancestor_thread_num(omp_get_level()-1)
 * H {host}              - name of host machine
 * P {process_id}        - process id (integer)
 * i {native_thread_id}  - native thread identifier (integer)
 * A {thread_affinity}   - comma separated list of integers or integer ranges
 *                         (values of affinity mask)
8149  *
8150  * Implementation-specific field types can be added
8151  * If a type is unknown, print "undefined"
8152  */
8153 
8154 // Structure holding the short name, long name, and corresponding data type
8155 // for snprintf.  A table of these will represent the entire valid keyword
8156 // field types.
8157 typedef struct kmp_affinity_format_field_t {
8158   char short_name; // from spec e.g., L -> thread level
8159   const char *long_name; // from spec thread_level -> thread level
8160   char field_format; // data type for snprintf (typically 'd' or 's'
8161   // for integer or string)
8162 } kmp_affinity_format_field_t;
8163 
8164 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
8165 #if KMP_AFFINITY_SUPPORTED
8166     {'A', "thread_affinity", 's'},
8167 #endif
8168     {'t', "team_num", 'd'},
8169     {'T', "num_teams", 'd'},
8170     {'L', "nesting_level", 'd'},
8171     {'n', "thread_num", 'd'},
8172     {'N', "num_threads", 'd'},
8173     {'a', "ancestor_tnum", 'd'},
8174     {'H', "host", 's'},
8175     {'P', "process_id", 'd'},
8176     {'i', "native_thread_id", 'd'}};
8177 
8178 // Return the number of characters it takes to hold field
8179 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8180                                             const char **ptr,
8181                                             kmp_str_buf_t *field_buffer) {
8182   int rc, format_index, field_value;
8183   const char *width_left, *width_right;
8184   bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8185   static const int FORMAT_SIZE = 20;
8186   char format[FORMAT_SIZE] = {0};
8187   char absolute_short_name = 0;
8188 
8189   KMP_DEBUG_ASSERT(gtid >= 0);
8190   KMP_DEBUG_ASSERT(th);
8191   KMP_DEBUG_ASSERT(**ptr == '%');
8192   KMP_DEBUG_ASSERT(field_buffer);
8193 
8194   __kmp_str_buf_clear(field_buffer);
8195 
8196   // Skip the initial %
8197   (*ptr)++;
8198 
8199   // Check for %% first
8200   if (**ptr == '%') {
8201     __kmp_str_buf_cat(field_buffer, "%", 1);
8202     (*ptr)++; // skip over the second %
8203     return 1;
8204   }
8205 
8206   // Parse field modifiers if they are present
8207   pad_zeros = false;
8208   if (**ptr == '0') {
8209     pad_zeros = true;
8210     (*ptr)++; // skip over 0
8211   }
8212   right_justify = false;
8213   if (**ptr == '.') {
8214     right_justify = true;
8215     (*ptr)++; // skip over .
8216   }
8217   // Parse width of field: [width_left, width_right)
8218   width_left = width_right = NULL;
8219   if (**ptr >= '0' && **ptr <= '9') {
8220     width_left = *ptr;
8221     SKIP_DIGITS(*ptr);
8222     width_right = *ptr;
8223   }
8224 
8225   // Create the format for KMP_SNPRINTF based on flags parsed above
8226   format_index = 0;
8227   format[format_index++] = '%';
8228   if (!right_justify)
8229     format[format_index++] = '-';
8230   if (pad_zeros)
8231     format[format_index++] = '0';
8232   if (width_left && width_right) {
8233     int i = 0;
8234     // Only allow 8 digit number widths.
8235     // This also prevents overflowing format variable
8236     while (i < 8 && width_left < width_right) {
8237       format[format_index++] = *width_left;
8238       width_left++;
8239       i++;
8240     }
8241   }
8242 
8243   // Parse a name (long or short)
8244   // Canonicalize the name into absolute_short_name
8245   found_valid_name = false;
8246   parse_long_name = (**ptr == '{');
8247   if (parse_long_name)
8248     (*ptr)++; // skip initial left brace
8249   for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8250                              sizeof(__kmp_affinity_format_table[0]);
8251        ++i) {
8252     char short_name = __kmp_affinity_format_table[i].short_name;
8253     const char *long_name = __kmp_affinity_format_table[i].long_name;
8254     char field_format = __kmp_affinity_format_table[i].field_format;
8255     if (parse_long_name) {
8256       size_t length = KMP_STRLEN(long_name);
8257       if (strncmp(*ptr, long_name, length) == 0) {
8258         found_valid_name = true;
8259         (*ptr) += length; // skip the long name
8260       }
8261     } else if (**ptr == short_name) {
8262       found_valid_name = true;
8263       (*ptr)++; // skip the short name
8264     }
8265     if (found_valid_name) {
8266       format[format_index++] = field_format;
8267       format[format_index++] = '\0';
8268       absolute_short_name = short_name;
8269       break;
8270     }
8271   }
8272   if (parse_long_name) {
8273     if (**ptr != '}') {
8274       absolute_short_name = 0;
8275     } else {
8276       (*ptr)++; // skip over the right brace
8277     }
8278   }
8279 
8280   // Attempt to fill the buffer with the requested
8281   // value using snprintf within __kmp_str_buf_print()
8282   switch (absolute_short_name) {
8283   case 't':
8284     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8285     break;
8286   case 'T':
8287     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8288     break;
8289   case 'L':
8290     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8291     break;
8292   case 'n':
8293     rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8294     break;
8295   case 'H': {
8296     static const int BUFFER_SIZE = 256;
8297     char buf[BUFFER_SIZE];
8298     __kmp_expand_host_name(buf, BUFFER_SIZE);
8299     rc = __kmp_str_buf_print(field_buffer, format, buf);
8300   } break;
8301   case 'P':
8302     rc = __kmp_str_buf_print(field_buffer, format, getpid());
8303     break;
8304   case 'i':
8305     rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8306     break;
8307   case 'N':
8308     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8309     break;
8310   case 'a':
8311     field_value =
8312         __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8313     rc = __kmp_str_buf_print(field_buffer, format, field_value);
8314     break;
8315 #if KMP_AFFINITY_SUPPORTED
8316   case 'A': {
8317     kmp_str_buf_t buf;
8318     __kmp_str_buf_init(&buf);
8319     __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8320     rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8321     __kmp_str_buf_free(&buf);
8322   } break;
8323 #endif
8324   default:
    // According to the spec, if an implementation does not have info for a
    // field type, then "undefined" is printed
8327     rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8328     // Skip the field
8329     if (parse_long_name) {
8330       SKIP_TOKEN(*ptr);
8331       if (**ptr == '}')
8332         (*ptr)++;
8333     } else {
8334       (*ptr)++;
8335     }
8336   }
8337 
8338   KMP_ASSERT(format_index <= FORMAT_SIZE);
8339   return rc;
8340 }
8341 
8342 /*
8343  * Return number of characters needed to hold the affinity string
8344  * (not including null byte character)
8345  * The resultant string is printed to buffer, which the caller can then
8346  * handle afterwards
8347  */
8348 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8349                                   kmp_str_buf_t *buffer) {
8350   const char *parse_ptr;
8351   size_t retval;
8352   const kmp_info_t *th;
8353   kmp_str_buf_t field;
8354 
8355   KMP_DEBUG_ASSERT(buffer);
8356   KMP_DEBUG_ASSERT(gtid >= 0);
8357 
8358   __kmp_str_buf_init(&field);
8359   __kmp_str_buf_clear(buffer);
8360 
8361   th = __kmp_threads[gtid];
8362   retval = 0;
8363 
  // If format is NULL or a zero-length string, then we use the
  // affinity-format-var ICV
8366   parse_ptr = format;
8367   if (parse_ptr == NULL || *parse_ptr == '\0') {
8368     parse_ptr = __kmp_affinity_format;
8369   }
8370   KMP_DEBUG_ASSERT(parse_ptr);
8371 
8372   while (*parse_ptr != '\0') {
8373     // Parse a field
8374     if (*parse_ptr == '%') {
8375       // Put field in the buffer
8376       int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8377       __kmp_str_buf_catbuf(buffer, &field);
8378       retval += rc;
8379     } else {
8380       // Put literal character in buffer
8381       __kmp_str_buf_cat(buffer, parse_ptr, 1);
8382       retval++;
8383       parse_ptr++;
8384     }
8385   }
8386   __kmp_str_buf_free(&field);
8387   return retval;
8388 }
8389 
8390 // Displays the affinity string to stdout
8391 void __kmp_aux_display_affinity(int gtid, const char *format) {
8392   kmp_str_buf_t buf;
8393   __kmp_str_buf_init(&buf);
8394   __kmp_aux_capture_affinity(gtid, format, &buf);
8395   __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8396   __kmp_str_buf_free(&buf);
8397 }
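
// Illustrative usage (not part of the runtime): the OpenMP 5.0 entry points
// omp_display_affinity() and omp_capture_affinity() funnel into the two
// routines above. A minimal sketch of a user program, assuming the standard
// field names from the spec ('n' thread_num, 'N' num_threads, 'H' host,
// 'A' thread_affinity):
//
//   #include <omp.h>
//   #include <stdio.h>
//
//   int main() {
//   #pragma omp parallel num_threads(4)
//     {
//       char buf[256];
//       // e.g. "thread 2 of 4 on myhost, affinity {0,1,2,3}"
//       size_t needed = omp_capture_affinity(
//           buf, sizeof(buf), "thread %n of %N on %H, affinity {%A}");
//       if (needed < sizeof(buf))
//         printf("%s\n", buf);
//       omp_display_affinity(NULL); // NULL/empty => affinity-format-var ICV
//     }
//     return 0;
//   }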
8398 
8399 /* ------------------------------------------------------------------------ */
8400 
8401 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8402   int blocktime = arg; /* argument is in milliseconds */
8403 #if KMP_USE_MONITOR
8404   int bt_intervals;
8405 #endif
8406   kmp_int8 bt_set;
8407 
8408   __kmp_save_internal_controls(thread);
8409 
8410   /* Normalize and set blocktime for the teams */
8411   if (blocktime < KMP_MIN_BLOCKTIME)
8412     blocktime = KMP_MIN_BLOCKTIME;
8413   else if (blocktime > KMP_MAX_BLOCKTIME)
8414     blocktime = KMP_MAX_BLOCKTIME;
8415 
8416   set__blocktime_team(thread->th.th_team, tid, blocktime);
8417   set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8418 
8419 #if KMP_USE_MONITOR
8420   /* Calculate and set blocktime intervals for the teams */
8421   bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8422 
8423   set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8424   set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8425 #endif
8426 
  /* Record that blocktime has been explicitly set (bt_set = TRUE) */
8428   bt_set = TRUE;
8429 
8430   set__bt_set_team(thread->th.th_team, tid, bt_set);
8431   set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8432 #if KMP_USE_MONITOR
8433   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8434                 "bt_intervals=%d, monitor_updates=%d\n",
8435                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8436                 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8437                 __kmp_monitor_wakeups));
8438 #else
8439   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8440                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8441                 thread->th.th_team->t.t_id, tid, blocktime));
8442 #endif
8443 }
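
// Illustrative usage (not part of the runtime): __kmp_aux_set_blocktime() is
// normally reached via the KMP_BLOCKTIME environment variable or the
// kmp_set_blocktime() extension declared in omp.h. A minimal sketch of a
// user program:
//
//   #include <omp.h>
//
//   int main() {
//     kmp_set_blocktime(0); // idle threads sleep right away instead of
//                           // spin-waiting after finishing parallel work
//   #pragma omp parallel
//     {
//       // ... parallel work ...
//     }
//     return 0;
//   }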
8444 
8445 void __kmp_aux_set_defaults(char const *str, size_t len) {
8446   if (!__kmp_init_serial) {
8447     __kmp_serial_initialize();
8448   }
8449   __kmp_env_initialize(str);
8450 
8451   if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8452     __kmp_env_print();
8453   }
8454 } // __kmp_aux_set_defaults
8455 
8456 /* ------------------------------------------------------------------------ */
8457 /* internal fast reduction routines */
8458 
8459 PACKED_REDUCTION_METHOD_T
8460 __kmp_determine_reduction_method(
8461     ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8462     void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8463     kmp_critical_name *lck) {
8464 
  // Default reduction method: the critical construct ( lck != NULL, as in
  // current PAROPT )
  // If ( reduce_data != NULL && reduce_func != NULL ): the tree-reduction
  // method can be selected by the RTL
  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE: the atomic reduce method
  // can be selected by the RTL
  // Finally, it is up to the OpenMP RTL to decide which of the methods
  // generated by PAROPT to select.
8473 
8474   PACKED_REDUCTION_METHOD_T retval;
8475 
8476   int team_size;
8477 
8478   KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8479   KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8480 
8481 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
8482   ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
8483 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8484 
8485   retval = critical_reduce_block;
8486 
  // an alternative way to get the team size (1 dynamic dereference) is slower
8488   team_size = __kmp_get_team_num_threads(global_tid);
8489   if (team_size == 1) {
8490 
8491     retval = empty_reduce_block;
8492 
8493   } else {
8494 
8495     int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8496 
8497 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 ||                   \
8498     KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64
8499 
8500 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
8501     KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8502 
8503     int teamsize_cutoff = 4;
8504 
8505 #if KMP_MIC_SUPPORTED
8506     if (__kmp_mic_type != non_mic) {
8507       teamsize_cutoff = 8;
8508     }
8509 #endif
8510     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8511     if (tree_available) {
8512       if (team_size <= teamsize_cutoff) {
8513         if (atomic_available) {
8514           retval = atomic_reduce_block;
8515         }
8516       } else {
8517         retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8518       }
8519     } else if (atomic_available) {
8520       retval = atomic_reduce_block;
8521     }
8522 #else
8523 #error "Unknown or unsupported OS"
8524 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8525        // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8526 
8527 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8528 
8529 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD
8530 
8531     // basic tuning
8532 
8533     if (atomic_available) {
8534       if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8535         retval = atomic_reduce_block;
8536       }
8537     } // otherwise: use critical section
8538 
8539 #elif KMP_OS_DARWIN
8540 
8541     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8542     if (atomic_available && (num_vars <= 3)) {
8543       retval = atomic_reduce_block;
8544     } else if (tree_available) {
8545       if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8546           (reduce_size < (2000 * sizeof(kmp_real64)))) {
8547         retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8548       }
8549     } // otherwise: use critical section
8550 
8551 #else
8552 #error "Unknown or unsupported OS"
8553 #endif
8554 
8555 #else
8556 #error "Unknown or unsupported architecture"
8557 #endif
8558   }
8559 
8560   // KMP_FORCE_REDUCTION
8561 
8562   // If the team is serialized (team_size == 1), ignore the forced reduction
8563   // method and stay with the unsynchronized method (empty_reduce_block)
8564   if (__kmp_force_reduction_method != reduction_method_not_defined &&
8565       team_size != 1) {
8566 
8567     PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8568 
8569     int atomic_available, tree_available;
8570 
8571     switch ((forced_retval = __kmp_force_reduction_method)) {
8572     case critical_reduce_block:
8573       KMP_ASSERT(lck); // lck should be != 0
8574       break;
8575 
8576     case atomic_reduce_block:
8577       atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8578       if (!atomic_available) {
8579         KMP_WARNING(RedMethodNotSupported, "atomic");
8580         forced_retval = critical_reduce_block;
8581       }
8582       break;
8583 
8584     case tree_reduce_block:
8585       tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8586       if (!tree_available) {
8587         KMP_WARNING(RedMethodNotSupported, "tree");
8588         forced_retval = critical_reduce_block;
8589       } else {
8590 #if KMP_FAST_REDUCTION_BARRIER
8591         forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8592 #endif
8593       }
8594       break;
8595 
8596     default:
8597       KMP_ASSERT(0); // "unsupported method specified"
8598     }
8599 
8600     retval = forced_retval;
8601   }
8602 
8603   KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8604 
8605 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
8606 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8607 
8608   return (retval);
8609 }
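
// Worked example of the selection above (illustrative): on x86_64 Linux with
// teamsize_cutoff == 4, a reduction for which the compiler emitted both the
// atomic and the tree code paths uses atomic_reduce_block for teams of 4 or
// fewer threads, TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER for larger teams,
// and empty_reduce_block whenever the team is serialized (team_size == 1).
// KMP_FORCE_REDUCTION can override the choice for non-serialized teams.
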
// This function is for testing the set/get/determine reduce method support.
8611 kmp_int32 __kmp_get_reduce_method(void) {
8612   return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8613 }
8614 
8615 // Soft pause sets up threads to ignore blocktime and just go to sleep.
8616 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
8617 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8618 
8619 // Hard pause shuts down the runtime completely.  Resume happens naturally when
8620 // OpenMP is used subsequently.
8621 void __kmp_hard_pause() {
8622   __kmp_pause_status = kmp_hard_paused;
8623   __kmp_internal_end_thread(-1);
8624 }
8625 
// Soft resume resets __kmp_pause_status to kmp_not_paused and wakes up all
// sleeping threads.
8627 void __kmp_resume_if_soft_paused() {
8628   if (__kmp_pause_status == kmp_soft_paused) {
8629     __kmp_pause_status = kmp_not_paused;
8630 
8631     for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8632       kmp_info_t *thread = __kmp_threads[gtid];
8633       if (thread) { // Wake it if sleeping
8634         kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
8635                          thread);
8636         if (fl.is_sleeping())
8637           fl.resume(gtid);
8638         else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8639           __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8640         } else { // thread holds the lock and may sleep soon
8641           do { // until either the thread sleeps, or we can get the lock
8642             if (fl.is_sleeping()) {
8643               fl.resume(gtid);
8644               break;
8645             } else if (__kmp_try_suspend_mx(thread)) {
8646               __kmp_unlock_suspend_mx(thread);
8647               break;
8648             }
8649           } while (1);
8650         }
8651       }
8652     }
8653   }
8654 }
8655 
8656 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
8657 // TODO: add warning messages
8658 int __kmp_pause_resource(kmp_pause_status_t level) {
8659   if (level == kmp_not_paused) { // requesting resume
8660     if (__kmp_pause_status == kmp_not_paused) {
8661       // error message about runtime not being paused, so can't resume
8662       return 1;
8663     } else {
8664       KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
8665                        __kmp_pause_status == kmp_hard_paused);
8666       __kmp_pause_status = kmp_not_paused;
8667       return 0;
8668     }
8669   } else if (level == kmp_soft_paused) { // requesting soft pause
8670     if (__kmp_pause_status != kmp_not_paused) {
8671       // error message about already being paused
8672       return 1;
8673     } else {
8674       __kmp_soft_pause();
8675       return 0;
8676     }
8677   } else if (level == kmp_hard_paused) { // requesting hard pause
8678     if (__kmp_pause_status != kmp_not_paused) {
8679       // error message about already being paused
8680       return 1;
8681     } else {
8682       __kmp_hard_pause();
8683       return 0;
8684     }
8685   } else {
8686     // error message about invalid level
8687     return 1;
8688   }
8689 }
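
// Illustrative usage (not part of the runtime): __kmp_pause_resource() backs
// the OpenMP 5.0 entry points omp_pause_resource() and
// omp_pause_resource_all(), which are reached via __kmpc_pause_resource().
// A minimal sketch of a user program:
//
//   #include <omp.h>
//
//   int main() {
//   #pragma omp parallel
//     { /* warm up the runtime */ }
//     // Ask worker threads to go to sleep instead of spin-waiting.
//     if (omp_pause_resource_all(omp_pause_soft) != 0) {
//       // nonzero return: request rejected (e.g. already paused)
//     }
//   #pragma omp parallel
//     { /* threads resume automatically on the next parallel region */ }
//     return 0;
//   }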
8690 
8691 void __kmp_omp_display_env(int verbose) {
8692   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8693   if (__kmp_init_serial == 0)
8694     __kmp_do_serial_initialize();
8695   __kmp_display_env_impl(!verbose, verbose);
8696   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8697 }
8698 
// Globals and functions for hidden helper tasks
8700 kmp_info_t **__kmp_hidden_helper_threads;
8701 kmp_info_t *__kmp_hidden_helper_main_thread;
8702 kmp_int32 __kmp_hidden_helper_threads_num = 8;
8703 std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
8704 #if KMP_OS_LINUX
8705 kmp_int32 __kmp_enable_hidden_helper = TRUE;
8706 #else
8707 kmp_int32 __kmp_enable_hidden_helper = FALSE;
8708 #endif
8709 
8710 namespace {
8711 std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
8712 
8713 void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
  // This is an explicit synchronization of all hidden helper threads. It is
  // needed because a regular thread may push a hidden helper task to a hidden
  // helper thread that has not been woken up even once since the main thread
  // released these threads after creating the team.
8718   KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
8719   while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
8720          __kmp_hidden_helper_threads_num)
8721     ;
8722 
  // If this is the master thread of the team, wait for the signal
8724   if (__kmpc_master(nullptr, *gtid)) {
8725     // First, unset the initial state and release the initial thread
8726     TCW_4(__kmp_init_hidden_helper_threads, FALSE);
8727     __kmp_hidden_helper_initz_release();
8728     __kmp_hidden_helper_main_thread_wait();
8729     // Now wake up all worker threads
8730     for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
8731       __kmp_hidden_helper_worker_thread_signal();
8732     }
8733   }
8734 }
8735 } // namespace
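
// The wrapper above relies on a simple counting spin barrier so that every
// hidden helper thread gets scheduled at least once before any of them starts
// waiting for work. A standalone sketch of the same pattern (illustrative,
// with hypothetical names; the runtime uses its own KMP_ATOMIC_* wrappers):
//
//   #include <atomic>
//
//   static std::atomic<int> arrived{0};
//
//   void spin_barrier(int num_threads) {
//     arrived.fetch_add(1, std::memory_order_acq_rel); // check in
//     while (arrived.load(std::memory_order_acquire) != num_threads)
//       ; // busy-wait until all num_threads threads have checked in
//   }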
8736 
8737 void __kmp_hidden_helper_threads_initz_routine() {
8738   // Create a new root for hidden helper team/threads
8739   const int gtid = __kmp_register_root(TRUE);
8740   __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
8741   __kmp_hidden_helper_threads = &__kmp_threads[gtid];
8742   __kmp_hidden_helper_main_thread->th.th_set_nproc =
8743       __kmp_hidden_helper_threads_num;
8744 
8745   KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
8746 
8747   __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
8748 
8749   // Set the initialization flag to FALSE
8750   TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
8751 
8752   __kmp_hidden_helper_threads_deinitz_release();
8753 }
8754 
8755 /* Nesting Mode:
8756    Set via KMP_NESTING_MODE, which takes an integer.
8757    Note: we skip duplicate topology levels, and skip levels with only
8758       one entity.
8759    KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
8760    KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
8761       in the topology, and initializes the number of threads at each of those
8762       levels to the number of entities at each level, respectively, below the
8763       entity at the parent level.
8764    KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
8765       but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
      the user to turn nesting on explicitly. This is an even more
      experimental variant of an already experimental feature, and may change
      or go away in the future.
8769 */
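/* Worked example (illustrative): on a machine whose detected topology is
   2 sockets x 8 cores x 2 hardware threads, KMP_NESTING_MODE=1 yields three
   nesting levels with nthreads-var set to {2, 8, 2}: one thread per socket
   at the outer level, 8 threads per socket at the next level, and 2 threads
   per core at the innermost level. A level whose ratio to its parent is 1
   (e.g. the socket level on a single-socket machine) is skipped, as are
   duplicate topology levels. */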
8770 
8771 // Allocate space to store nesting levels
8772 void __kmp_init_nesting_mode() {
8773   int levels = KMP_HW_LAST;
8774   __kmp_nesting_mode_nlevels = levels;
8775   __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
8776   for (int i = 0; i < levels; ++i)
8777     __kmp_nesting_nth_level[i] = 0;
8778   if (__kmp_nested_nth.size < levels) {
8779     __kmp_nested_nth.nth =
8780         (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
8781     __kmp_nested_nth.size = levels;
8782   }
8783 }
8784 
// Set # threads for top nesting levels; must be called after topology is set
8786 void __kmp_set_nesting_mode_threads() {
8787   kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];
8788 
8789   if (__kmp_nesting_mode == 1)
8790     __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
8791   else if (__kmp_nesting_mode > 1)
8792     __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
8793 
8794   if (__kmp_topology) { // use topology info
8795     int loc, hw_level;
8796     for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
8797                                 loc < __kmp_nesting_mode_nlevels;
8798          loc++, hw_level++) {
8799       __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
8800       if (__kmp_nesting_nth_level[loc] == 1)
8801         loc--;
8802     }
8803     // Make sure all cores are used
8804     if (__kmp_nesting_mode > 1 && loc > 1) {
8805       int core_level = __kmp_topology->get_level(KMP_HW_CORE);
8806       int num_cores = __kmp_topology->get_count(core_level);
8807       int upper_levels = 1;
8808       for (int level = 0; level < loc - 1; ++level)
8809         upper_levels *= __kmp_nesting_nth_level[level];
8810       if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
8811         __kmp_nesting_nth_level[loc - 1] =
8812             num_cores / __kmp_nesting_nth_level[loc - 2];
8813     }
8814     __kmp_nesting_mode_nlevels = loc;
8815     __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
8816   } else { // no topology info available; provide a reasonable guesstimation
8817     if (__kmp_avail_proc >= 4) {
8818       __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
8819       __kmp_nesting_nth_level[1] = 2;
8820       __kmp_nesting_mode_nlevels = 2;
8821     } else {
8822       __kmp_nesting_nth_level[0] = __kmp_avail_proc;
8823       __kmp_nesting_mode_nlevels = 1;
8824     }
8825     __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
8826   }
8827   for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
8828     __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
8829   }
8830   set__nproc(thread, __kmp_nesting_nth_level[0]);
8831   if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
8832     __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
8833   if (get__max_active_levels(thread) > 1) {
8834     // if max levels was set, set nesting mode levels to same
8835     __kmp_nesting_mode_nlevels = get__max_active_levels(thread);
8836   }
8837   if (__kmp_nesting_mode == 1) // turn on nesting for this case only
8838     set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
8839 }
8840