1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
29 #endif
30 
31 #if OMPT_SUPPORT
32 #include "ompt-specific.h"
33 #endif
34 #if OMPD_SUPPORT
35 #include "ompd-specific.h"
36 #endif
37 
38 #if OMP_PROFILING_SUPPORT
39 #include "llvm/Support/TimeProfiler.h"
40 static char *ProfileTraceFile = nullptr;
41 #endif
42 
43 /* these are temporary issues to be dealt with */
44 #define KMP_USE_PRCTL 0
45 
46 #if KMP_OS_WINDOWS
47 #include <process.h>
48 #endif
49 
50 #include "tsan_annotations.h"
51 
52 #if KMP_OS_WINDOWS
// Windows does not need these include files as it doesn't use shared memory.
54 #else
55 #include <sys/mman.h>
56 #include <sys/stat.h>
57 #include <fcntl.h>
58 #define SHM_SIZE 1024
59 #endif
60 
61 #if defined(KMP_GOMP_COMPAT)
62 char const __kmp_version_alt_comp[] =
63     KMP_VERSION_PREFIX "alternative compiler support: yes";
64 #endif /* defined(KMP_GOMP_COMPAT) */
65 
66 char const __kmp_version_omp_api[] =
67     KMP_VERSION_PREFIX "API version: 5.0 (201611)";
68 
69 #ifdef KMP_DEBUG
70 char const __kmp_version_lock[] =
71     KMP_VERSION_PREFIX "lock type: run time selectable";
72 #endif /* KMP_DEBUG */
73 
74 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
75 
76 /* ------------------------------------------------------------------------ */
77 
78 #if KMP_USE_MONITOR
79 kmp_info_t __kmp_monitor;
80 #endif
81 
82 /* Forward declarations */
83 
84 void __kmp_cleanup(void);
85 
86 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
87                                   int gtid);
88 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
89                                   kmp_internal_control_t *new_icvs,
90                                   ident_t *loc);
91 #if KMP_AFFINITY_SUPPORTED
92 static void __kmp_partition_places(kmp_team_t *team,
93                                    int update_master_only = 0);
94 #endif
95 static void __kmp_do_serial_initialize(void);
96 void __kmp_fork_barrier(int gtid, int tid);
97 void __kmp_join_barrier(int gtid);
98 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
99                           kmp_internal_control_t *new_icvs, ident_t *loc);
100 
101 #ifdef USE_LOAD_BALANCE
102 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
103 #endif
104 
105 static int __kmp_expand_threads(int nNeed);
106 #if KMP_OS_WINDOWS
107 static int __kmp_unregister_root_other_thread(int gtid);
108 #endif
109 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
110 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
111 
/* Calculate the identifier of the current thread. */
/* Fast (and somewhat portable) way to get a unique identifier for the
   executing thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
115 int __kmp_get_global_thread_id() {
116   int i;
117   kmp_info_t **other_threads;
118   size_t stack_data;
119   char *stack_addr;
120   size_t stack_size;
121   char *stack_base;
122 
123   KA_TRACE(
124       1000,
125       ("*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
126        __kmp_nth, __kmp_all_nth));
127 
  /* JPH - To handle the case where __kmpc_end(0) is called immediately prior
     to a parallel region, this was made to return KMP_GTID_DNE to force
     serial_initialize by the caller. We had to handle KMP_GTID_DNE at all
     call-sites, or else guarantee __kmp_init_gtid for this to work. */
132 
133   if (!TCR_4(__kmp_init_gtid))
134     return KMP_GTID_DNE;
135 
136 #ifdef KMP_TDATA_GTID
137   if (TCR_4(__kmp_gtid_mode) >= 3) {
138     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
139     return __kmp_gtid;
140   }
141 #endif
142   if (TCR_4(__kmp_gtid_mode) >= 2) {
143     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
144     return __kmp_gtid_get_specific();
145   }
146   KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
147 
148   stack_addr = (char *)&stack_data;
149   other_threads = __kmp_threads;
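  /* Fall back to a stack-based lookup: the address of a local variable
     (stack_data) must lie within the stack of the executing thread, so scan
     the recorded stack ranges of all registered threads for it. */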
150 
151   /* ATT: The code below is a source of potential bugs due to unsynchronized
152      access to __kmp_threads array. For example:
153      1. Current thread loads other_threads[i] to thr and checks it, it is
154         non-NULL.
155      2. Current thread is suspended by OS.
156      3. Another thread unregisters and finishes (debug versions of free()
157         may fill memory with something like 0xEF).
158      4. Current thread is resumed.
159      5. Current thread reads junk from *thr.
160      TODO: Fix it.  --ln  */
161 
162   for (i = 0; i < __kmp_threads_capacity; i++) {
163 
164     kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
165     if (!thr)
166       continue;
167 
168     stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
169     stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
170 
171     /* stack grows down -- search through all of the active threads */
172 
173     if (stack_addr <= stack_base) {
174       size_t stack_diff = stack_base - stack_addr;
175 
176       if (stack_diff <= stack_size) {
        /* The only way we can be closer than the allocated stack size is if we
           are running on this thread. */
179         KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
180         return i;
181       }
182     }
183   }
184 
185   /* get specific to try and determine our gtid */
186   KA_TRACE(1000,
187            ("*** __kmp_get_global_thread_id: internal alg. failed to find "
188             "thread, using TLS\n"));
189   i = __kmp_gtid_get_specific();
190 
191   /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
192 
  /* if we haven't been assigned a gtid, then return the error code */
194   if (i < 0)
195     return i;
196 
197   /* dynamically updated stack window for uber threads to avoid get_specific
198      call */
199   if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
200     KMP_FATAL(StackOverflow, i);
201   }
202 
203   stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
204   if (stack_addr > stack_base) {
205     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
206     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
207             other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
208                 stack_base);
209   } else {
210     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
211             stack_base - stack_addr);
212   }
213 
214   /* Reprint stack bounds for ubermaster since they have been refined */
215   if (__kmp_storage_map) {
216     char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
217     char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
218     __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
219                                  other_threads[i]->th.th_info.ds.ds_stacksize,
220                                  "th_%d stack (refinement)", i);
221   }
222   return i;
223 }
224 
225 int __kmp_get_global_thread_id_reg() {
226   int gtid;
227 
228   if (!__kmp_init_serial) {
229     gtid = KMP_GTID_DNE;
230   } else
231 #ifdef KMP_TDATA_GTID
232       if (TCR_4(__kmp_gtid_mode) >= 3) {
233     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
234     gtid = __kmp_gtid;
235   } else
236 #endif
237       if (TCR_4(__kmp_gtid_mode) >= 2) {
238     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
239     gtid = __kmp_gtid_get_specific();
240   } else {
241     KA_TRACE(1000,
242              ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
243     gtid = __kmp_get_global_thread_id();
244   }
245 
246   /* we must be a new uber master sibling thread */
247   if (gtid == KMP_GTID_DNE) {
248     KA_TRACE(10,
249              ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
250               "Registering a new gtid.\n"));
251     __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
252     if (!__kmp_init_serial) {
253       __kmp_do_serial_initialize();
254       gtid = __kmp_gtid_get_specific();
255     } else {
256       gtid = __kmp_register_root(FALSE);
257     }
258     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
259     /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
260   }
261 
262   KMP_DEBUG_ASSERT(gtid >= 0);
263 
264   return gtid;
265 }
266 
267 /* caller must hold forkjoin_lock */
268 void __kmp_check_stack_overlap(kmp_info_t *th) {
269   int f;
270   char *stack_beg = NULL;
271   char *stack_end = NULL;
272   int gtid;
273 
274   KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
275   if (__kmp_storage_map) {
276     stack_end = (char *)th->th.th_info.ds.ds_stackbase;
277     stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
278 
279     gtid = __kmp_gtid_from_thread(th);
280 
281     if (gtid == KMP_GTID_MONITOR) {
282       __kmp_print_storage_map_gtid(
283           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
284           "th_%s stack (%s)", "mon",
285           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
286     } else {
287       __kmp_print_storage_map_gtid(
288           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
289           "th_%d stack (%s)", gtid,
290           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
291     }
292   }
293 
294   /* No point in checking ubermaster threads since they use refinement and
295    * cannot overlap */
296   gtid = __kmp_gtid_from_thread(th);
297   if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
298     KA_TRACE(10,
299              ("__kmp_check_stack_overlap: performing extensive checking\n"));
300     if (stack_beg == NULL) {
301       stack_end = (char *)th->th.th_info.ds.ds_stackbase;
302       stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
303     }
304 
305     for (f = 0; f < __kmp_threads_capacity; f++) {
306       kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
307 
308       if (f_th && f_th != th) {
309         char *other_stack_end =
310             (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
311         char *other_stack_beg =
312             other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
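        // Flag an overlap if either endpoint of this thread's stack lies
        // strictly inside the other thread's stack range.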
313         if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
314             (stack_end > other_stack_beg && stack_end < other_stack_end)) {
315 
316           /* Print the other stack values before the abort */
317           if (__kmp_storage_map)
318             __kmp_print_storage_map_gtid(
319                 -1, other_stack_beg, other_stack_end,
320                 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
321                 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
322 
323           __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
324                       __kmp_msg_null);
325         }
326       }
327     }
328   }
329   KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
330 }
331 
332 /* ------------------------------------------------------------------------ */
333 
334 void __kmp_infinite_loop(void) {
335   static int done = FALSE;
336 
337   while (!done) {
338     KMP_YIELD(TRUE);
339   }
340 }
341 
342 #define MAX_MESSAGE 512
343 
344 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
345                                   char const *format, ...) {
346   char buffer[MAX_MESSAGE];
347   va_list ap;
348 
349   va_start(ap, format);
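  /* Embed the caller's format string into the storage-map prefix; the
     resulting buffer then serves as the format for __kmp_vprintf, which
     expands the caller's arguments under the stdio bootstrap lock. */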
350   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
351                p2, (unsigned long)size, format);
352   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
353   __kmp_vprintf(kmp_err, buffer, ap);
354 #if KMP_PRINT_DATA_PLACEMENT
355   int node;
356   if (gtid >= 0) {
357     if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
358       if (__kmp_storage_map_verbose) {
359         node = __kmp_get_host_node(p1);
360         if (node < 0) /* doesn't work, so don't try this next time */
361           __kmp_storage_map_verbose = FALSE;
362         else {
363           char *last;
364           int lastNode;
365           int localProc = __kmp_get_cpu_from_gtid(gtid);
366 
367           const int page_size = KMP_GET_PAGE_SIZE();
368 
369           p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
370           p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
371           if (localProc >= 0)
372             __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
373                                  localProc >> 1);
374           else
375             __kmp_printf_no_lock("  GTID %d\n", gtid);
376 #if KMP_USE_PRCTL
377           /* The more elaborate format is disabled for now because of the prctl
378            * hanging bug. */
379           do {
380             last = p1;
381             lastNode = node;
382             /* This loop collates adjacent pages with the same host node. */
383             do {
384               (char *)p1 += page_size;
385             } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
386             __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
387                                  lastNode);
388           } while (p1 <= p2);
389 #else
390           __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
391                                (char *)p1 + (page_size - 1),
392                                __kmp_get_host_node(p1));
393           if (p1 < p2) {
394             __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
395                                  (char *)p2 + (page_size - 1),
396                                  __kmp_get_host_node(p2));
397           }
398 #endif
399         }
400       }
401     } else
402       __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
403   }
404 #endif /* KMP_PRINT_DATA_PLACEMENT */
405   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
406 }
407 
408 void __kmp_warn(char const *format, ...) {
409   char buffer[MAX_MESSAGE];
410   va_list ap;
411 
412   if (__kmp_generate_warnings == kmp_warnings_off) {
413     return;
414   }
415 
416   va_start(ap, format);
417 
418   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
419   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
420   __kmp_vprintf(kmp_err, buffer, ap);
421   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
422 
423   va_end(ap);
424 }
425 
426 void __kmp_abort_process() {
427   // Later threads may stall here, but that's ok because abort() will kill them.
428   __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
429 
430   if (__kmp_debug_buf) {
431     __kmp_dump_debug_buffer();
432   }
433 
434   if (KMP_OS_WINDOWS) {
435     // Let other threads know of abnormal termination and prevent deadlock
436     // if abort happened during library initialization or shutdown
437     __kmp_global.g.g_abort = SIGABRT;
438 
    /* On Windows* OS, by default abort() causes a pop-up error box, which
       stalls nightly testing. Unfortunately, we cannot reliably suppress the
       pop-up error boxes. _set_abort_behavior() works well, but this function
       is not available in VS7 (this is not a problem for a DLL, but it is a
       problem for a static OpenMP RTL). SetErrorMode (and so, the timelimit
       utility) does not help, at least in some versions of MS C RTL.

       It seems the following sequence is the only way to simulate abort() and
       avoid the pop-up error box. */
448     raise(SIGABRT);
449     _exit(3); // Just in case, if signal ignored, exit anyway.
450   } else {
451     __kmp_unregister_library();
452     abort();
453   }
454 
455   __kmp_infinite_loop();
456   __kmp_release_bootstrap_lock(&__kmp_exit_lock);
457 
458 } // __kmp_abort_process
459 
460 void __kmp_abort_thread(void) {
461   // TODO: Eliminate g_abort global variable and this function.
462   // In case of abort just call abort(), it will kill all the threads.
463   __kmp_infinite_loop();
464 } // __kmp_abort_thread
465 
466 /* Print out the storage map for the major kmp_info_t thread data structures
467    that are allocated together. */
468 
469 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
470   __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
471                                gtid);
472 
473   __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
474                                sizeof(kmp_desc_t), "th_%d.th_info", gtid);
475 
476   __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
477                                sizeof(kmp_local_t), "th_%d.th_local", gtid);
478 
479   __kmp_print_storage_map_gtid(
480       gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
481       sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
482 
483   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
484                                &thr->th.th_bar[bs_plain_barrier + 1],
485                                sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
486                                gtid);
487 
488   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
489                                &thr->th.th_bar[bs_forkjoin_barrier + 1],
490                                sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
491                                gtid);
492 
493 #if KMP_FAST_REDUCTION_BARRIER
494   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
495                                &thr->th.th_bar[bs_reduction_barrier + 1],
496                                sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
497                                gtid);
498 #endif // KMP_FAST_REDUCTION_BARRIER
499 }
500 
501 /* Print out the storage map for the major kmp_team_t team data structures
502    that are allocated together. */
503 
504 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
505                                          int team_id, int num_thr) {
506   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
507   __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
508                                header, team_id);
509 
510   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
511                                &team->t.t_bar[bs_last_barrier],
512                                sizeof(kmp_balign_team_t) * bs_last_barrier,
513                                "%s_%d.t_bar", header, team_id);
514 
515   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
516                                &team->t.t_bar[bs_plain_barrier + 1],
517                                sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
518                                header, team_id);
519 
520   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
521                                &team->t.t_bar[bs_forkjoin_barrier + 1],
522                                sizeof(kmp_balign_team_t),
523                                "%s_%d.t_bar[forkjoin]", header, team_id);
524 
525 #if KMP_FAST_REDUCTION_BARRIER
526   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
527                                &team->t.t_bar[bs_reduction_barrier + 1],
528                                sizeof(kmp_balign_team_t),
529                                "%s_%d.t_bar[reduction]", header, team_id);
530 #endif // KMP_FAST_REDUCTION_BARRIER
531 
532   __kmp_print_storage_map_gtid(
533       -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
534       sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
535 
536   __kmp_print_storage_map_gtid(
537       -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
538       sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
539 
540   __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
541                                &team->t.t_disp_buffer[num_disp_buff],
542                                sizeof(dispatch_shared_info_t) * num_disp_buff,
543                                "%s_%d.t_disp_buffer", header, team_id);
544 }
545 
546 static void __kmp_init_allocator() {
547   __kmp_init_memkind();
548   __kmp_init_target_mem();
549 }
550 static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
551 
552 /* ------------------------------------------------------------------------ */
553 
554 #if KMP_DYNAMIC_LIB
555 #if KMP_OS_WINDOWS
556 
557 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
558   //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
559 
560   switch (fdwReason) {
561 
562   case DLL_PROCESS_ATTACH:
563     KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
564 
565     return TRUE;
566 
567   case DLL_PROCESS_DETACH:
568     KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
569 
570     // According to Windows* documentation for DllMain entry point:
571     // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
572     //   lpReserved == NULL when FreeLibrary() is called,
573     //   lpReserved != NULL when the process is terminated.
574     // When FreeLibrary() is called, worker threads remain alive. So the
575     // runtime's state is consistent and executing proper shutdown is OK.
576     // When the process is terminated, worker threads have exited or been
577     // forcefully terminated by the OS and only the shutdown thread remains.
578     // This can leave the runtime in an inconsistent state.
579     // Hence, only attempt proper cleanup when FreeLibrary() is called.
580     // Otherwise, rely on OS to reclaim resources.
581     if (lpReserved == NULL)
582       __kmp_internal_end_library(__kmp_gtid_get_specific());
583 
584     return TRUE;
585 
586   case DLL_THREAD_ATTACH:
587     KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
588 
    /* If we wanted to register new sibling threads every time, we would call
     * __kmp_get_gtid() here. */
591     return TRUE;
592 
593   case DLL_THREAD_DETACH:
594     KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
595 
596     __kmp_internal_end_thread(__kmp_gtid_get_specific());
597     return TRUE;
598   }
599 
600   return TRUE;
601 }
602 
603 #endif /* KMP_OS_WINDOWS */
604 #endif /* KMP_DYNAMIC_LIB */
605 
606 /* __kmp_parallel_deo -- Wait until it's our turn. */
607 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
608   int gtid = *gtid_ref;
609 #ifdef BUILD_PARALLEL_ORDERED
610   kmp_team_t *team = __kmp_team_from_gtid(gtid);
611 #endif /* BUILD_PARALLEL_ORDERED */
612 
613   if (__kmp_env_consistency_check) {
614     if (__kmp_threads[gtid]->th.th_root->r.r_active)
615 #if KMP_USE_DYNAMIC_LOCK
616       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
617 #else
618       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
619 #endif
620   }
621 #ifdef BUILD_PARALLEL_ORDERED
622   if (!team->t.t_serialized) {
623     KMP_MB();
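    /* Spin until the team's ordered ticket value equals this thread's tid;
       __kmp_parallel_dxo releases threads in tid order. */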
624     KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
625              NULL);
626     KMP_MB();
627   }
628 #endif /* BUILD_PARALLEL_ORDERED */
629 }
630 
631 /* __kmp_parallel_dxo -- Signal the next task. */
632 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
633   int gtid = *gtid_ref;
634 #ifdef BUILD_PARALLEL_ORDERED
635   int tid = __kmp_tid_from_gtid(gtid);
636   kmp_team_t *team = __kmp_team_from_gtid(gtid);
637 #endif /* BUILD_PARALLEL_ORDERED */
638 
639   if (__kmp_env_consistency_check) {
640     if (__kmp_threads[gtid]->th.th_root->r.r_active)
641       __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
642   }
643 #ifdef BUILD_PARALLEL_ORDERED
644   if (!team->t.t_serialized) {
645     KMP_MB(); /* Flush all pending memory write invalidates.  */
646 
647     /* use the tid of the next thread in this team */
648     /* TODO replace with general release procedure */
649     team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
650 
651     KMP_MB(); /* Flush all pending memory write invalidates.  */
652   }
653 #endif /* BUILD_PARALLEL_ORDERED */
654 }
655 
656 /* ------------------------------------------------------------------------ */
657 /* The BARRIER for a SINGLE process section is always explicit   */
658 
659 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
660   int status;
661   kmp_info_t *th;
662   kmp_team_t *team;
663 
664   if (!TCR_4(__kmp_init_parallel))
665     __kmp_parallel_initialize();
666   __kmp_resume_if_soft_paused();
667 
668   th = __kmp_threads[gtid];
669   team = th->th.th_team;
670   status = 0;
671 
672   th->th.th_ident = id_ref;
673 
674   if (team->t.t_serialized) {
675     status = 1;
676   } else {
677     kmp_int32 old_this = th->th.th_local.this_construct;
678 
679     ++th->th.th_local.this_construct;
    /* try to atomically advance the team's construct count to this thread's
       construct count -- success means this thread claimed the single block */
682     /* TODO: Should this be acquire or release? */
683     if (team->t.t_construct == old_this) {
684       status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
685                                               th->th.th_local.this_construct);
686     }
687 #if USE_ITT_BUILD
688     if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
689         KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
690         team->t.t_active_level == 1) {
691       // Only report metadata by primary thread of active team at level 1
692       __kmp_itt_metadata_single(id_ref);
693     }
694 #endif /* USE_ITT_BUILD */
695   }
696 
697   if (__kmp_env_consistency_check) {
698     if (status && push_ws) {
699       __kmp_push_workshare(gtid, ct_psingle, id_ref);
700     } else {
701       __kmp_check_workshare(gtid, ct_psingle, id_ref);
702     }
703   }
704 #if USE_ITT_BUILD
705   if (status) {
706     __kmp_itt_single_start(gtid);
707   }
708 #endif /* USE_ITT_BUILD */
709   return status;
710 }
711 
712 void __kmp_exit_single(int gtid) {
713 #if USE_ITT_BUILD
714   __kmp_itt_single_end(gtid);
715 #endif /* USE_ITT_BUILD */
716   if (__kmp_env_consistency_check)
717     __kmp_pop_workshare(gtid, ct_psingle, NULL);
718 }
719 
/* Determine whether we can go parallel or must use a serialized parallel
 * region, and how many threads we can use.
 * set_nthreads is the number of threads requested for the team.
 * Returns 1 if we should serialize or only use one thread,
 * otherwise the number of threads to use.
 * The forkjoin lock is held by the caller. */
726 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
727                                  int master_tid, int set_nthreads,
728                                  int enter_teams) {
729   int capacity;
730   int new_nthreads;
731   KMP_DEBUG_ASSERT(__kmp_init_serial);
732   KMP_DEBUG_ASSERT(root && parent_team);
733   kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
734 
735   // If dyn-var is set, dynamically adjust the number of desired threads,
736   // according to the method specified by dynamic_mode.
737   new_nthreads = set_nthreads;
738   if (!get__dynamic_2(parent_team, master_tid)) {
739     ;
740   }
741 #ifdef USE_LOAD_BALANCE
742   else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
743     new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
744     if (new_nthreads == 1) {
745       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
746                     "reservation to 1 thread\n",
747                     master_tid));
748       return 1;
749     }
750     if (new_nthreads < set_nthreads) {
751       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
752                     "reservation to %d threads\n",
753                     master_tid, new_nthreads));
754     }
755   }
756 #endif /* USE_LOAD_BALANCE */
757   else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
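    // Add back the threads that __kmp_nth already counts but that this team
    // will reuse: just the master when the root is active, otherwise the
    // root's whole hot team.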
758     new_nthreads = __kmp_avail_proc - __kmp_nth +
759                    (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
760     if (new_nthreads <= 1) {
761       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
762                     "reservation to 1 thread\n",
763                     master_tid));
764       return 1;
765     }
766     if (new_nthreads < set_nthreads) {
767       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
768                     "reservation to %d threads\n",
769                     master_tid, new_nthreads));
770     } else {
771       new_nthreads = set_nthreads;
772     }
773   } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
774     if (set_nthreads > 2) {
775       new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
776       new_nthreads = (new_nthreads % set_nthreads) + 1;
777       if (new_nthreads == 1) {
778         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
779                       "reservation to 1 thread\n",
780                       master_tid));
781         return 1;
782       }
783       if (new_nthreads < set_nthreads) {
784         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
785                       "reservation to %d threads\n",
786                       master_tid, new_nthreads));
787       }
788     }
789   } else {
790     KMP_ASSERT(0);
791   }
792 
793   // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
794   if (__kmp_nth + new_nthreads -
795           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
796       __kmp_max_nth) {
797     int tl_nthreads = __kmp_max_nth - __kmp_nth +
798                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
799     if (tl_nthreads <= 0) {
800       tl_nthreads = 1;
801     }
802 
803     // If dyn-var is false, emit a 1-time warning.
804     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
805       __kmp_reserve_warn = 1;
806       __kmp_msg(kmp_ms_warning,
807                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
808                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
809     }
810     if (tl_nthreads == 1) {
811       KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
812                     "reduced reservation to 1 thread\n",
813                     master_tid));
814       return 1;
815     }
816     KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
817                   "reservation to %d threads\n",
818                   master_tid, tl_nthreads));
819     new_nthreads = tl_nthreads;
820   }
821 
822   // Respect OMP_THREAD_LIMIT
823   int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
824   int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
825   if (cg_nthreads + new_nthreads -
826           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
827       max_cg_threads) {
828     int tl_nthreads = max_cg_threads - cg_nthreads +
829                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
830     if (tl_nthreads <= 0) {
831       tl_nthreads = 1;
832     }
833 
834     // If dyn-var is false, emit a 1-time warning.
835     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
836       __kmp_reserve_warn = 1;
837       __kmp_msg(kmp_ms_warning,
838                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
839                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
840     }
841     if (tl_nthreads == 1) {
842       KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
843                     "reduced reservation to 1 thread\n",
844                     master_tid));
845       return 1;
846     }
847     KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
848                   "reservation to %d threads\n",
849                   master_tid, tl_nthreads));
850     new_nthreads = tl_nthreads;
851   }
852 
853   // Check if the threads array is large enough, or needs expanding.
854   // See comment in __kmp_register_root() about the adjustment if
855   // __kmp_threads[0] == NULL.
856   capacity = __kmp_threads_capacity;
857   if (TCR_PTR(__kmp_threads[0]) == NULL) {
858     --capacity;
859   }
860   // If it is not for initializing the hidden helper team, we need to take
861   // __kmp_hidden_helper_threads_num out of the capacity because it is included
862   // in __kmp_threads_capacity.
863   if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
864     capacity -= __kmp_hidden_helper_threads_num;
865   }
866   if (__kmp_nth + new_nthreads -
867           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
868       capacity) {
869     // Expand the threads array.
870     int slotsRequired = __kmp_nth + new_nthreads -
871                         (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
872                         capacity;
873     int slotsAdded = __kmp_expand_threads(slotsRequired);
874     if (slotsAdded < slotsRequired) {
875       // The threads array was not expanded enough.
876       new_nthreads -= (slotsRequired - slotsAdded);
877       KMP_ASSERT(new_nthreads >= 1);
878 
879       // If dyn-var is false, emit a 1-time warning.
880       if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
881         __kmp_reserve_warn = 1;
882         if (__kmp_tp_cached) {
883           __kmp_msg(kmp_ms_warning,
884                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
885                     KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
886                     KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
887         } else {
888           __kmp_msg(kmp_ms_warning,
889                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
890                     KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
891         }
892       }
893     }
894   }
895 
896 #ifdef KMP_DEBUG
897   if (new_nthreads == 1) {
898     KC_TRACE(10,
899              ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
900               "dead roots and rechecking; requested %d threads\n",
901               __kmp_get_gtid(), set_nthreads));
902   } else {
903     KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
904                   " %d threads\n",
905                   __kmp_get_gtid(), new_nthreads, set_nthreads));
906   }
907 #endif // KMP_DEBUG
908   return new_nthreads;
909 }
910 
/* Allocate threads from the thread pool and assign them to the new team. We are
   assured that there are enough threads available, because we checked for that
   earlier while holding the forkjoin lock. */
914 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
915                                     kmp_info_t *master_th, int master_gtid) {
916   int i;
917   int use_hot_team;
918 
919   KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
920   KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
921   KMP_MB();
922 
923   /* first, let's setup the primary thread */
924   master_th->th.th_info.ds.ds_tid = 0;
925   master_th->th.th_team = team;
926   master_th->th.th_team_nproc = team->t.t_nproc;
927   master_th->th.th_team_master = master_th;
928   master_th->th.th_team_serialized = FALSE;
929   master_th->th.th_dispatch = &team->t.t_dispatch[0];
930 
931 /* make sure we are not the optimized hot team */
932 #if KMP_NESTED_HOT_TEAMS
933   use_hot_team = 0;
934   kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
935   if (hot_teams) { // hot teams array is not allocated if
936     // KMP_HOT_TEAMS_MAX_LEVEL=0
937     int level = team->t.t_active_level - 1; // index in array of hot teams
938     if (master_th->th.th_teams_microtask) { // are we inside the teams?
939       if (master_th->th.th_teams_size.nteams > 1) {
940         ++level; // level was not increased in teams construct for
941         // team_of_masters
942       }
943       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
944           master_th->th.th_teams_level == team->t.t_level) {
945         ++level; // level was not increased in teams construct for
946         // team_of_workers before the parallel
947       } // team->t.t_level will be increased inside parallel
948     }
949     if (level < __kmp_hot_teams_max_level) {
950       if (hot_teams[level].hot_team) {
951         // hot team has already been allocated for given level
952         KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
953         use_hot_team = 1; // the team is ready to use
954       } else {
955         use_hot_team = 0; // AC: threads are not allocated yet
956         hot_teams[level].hot_team = team; // remember new hot team
957         hot_teams[level].hot_team_nth = team->t.t_nproc;
958       }
959     } else {
960       use_hot_team = 0;
961     }
962   }
963 #else
964   use_hot_team = team == root->r.r_hot_team;
965 #endif
966   if (!use_hot_team) {
967 
968     /* install the primary thread */
969     team->t.t_threads[0] = master_th;
970     __kmp_initialize_info(master_th, team, 0, master_gtid);
971 
972     /* now, install the worker threads */
973     for (i = 1; i < team->t.t_nproc; i++) {
974 
975       /* fork or reallocate a new thread and install it in team */
976       kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
977       team->t.t_threads[i] = thr;
978       KMP_DEBUG_ASSERT(thr);
979       KMP_DEBUG_ASSERT(thr->th.th_team == team);
980       /* align team and thread arrived states */
981       KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
982                     "T#%d(%d:%d) join =%llu, plain=%llu\n",
983                     __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
984                     __kmp_gtid_from_tid(i, team), team->t.t_id, i,
985                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
986                     team->t.t_bar[bs_plain_barrier].b_arrived));
987       thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
988       thr->th.th_teams_level = master_th->th.th_teams_level;
989       thr->th.th_teams_size = master_th->th.th_teams_size;
990       { // Initialize threads' barrier data.
991         int b;
992         kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
993         for (b = 0; b < bs_last_barrier; ++b) {
994           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
995           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
996 #if USE_DEBUGGER
997           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
998 #endif
999         }
1000       }
1001     }
1002 
1003 #if KMP_AFFINITY_SUPPORTED
1004     __kmp_partition_places(team);
1005 #endif
1006   }
1007 
1008   if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1009     for (i = 0; i < team->t.t_nproc; i++) {
1010       kmp_info_t *thr = team->t.t_threads[i];
1011       if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1012           thr->th.th_prev_level != team->t.t_level) {
1013         team->t.t_display_affinity = 1;
1014         break;
1015       }
1016     }
1017   }
1018 
1019   KMP_MB();
1020 }
1021 
1022 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1023 // Propagate any changes to the floating point control registers out to the team
1024 // We try to avoid unnecessary writes to the relevant cache line in the team
1025 // structure, so we don't make changes unless they are needed.
1026 inline static void propagateFPControl(kmp_team_t *team) {
1027   if (__kmp_inherit_fp_control) {
1028     kmp_int16 x87_fpu_control_word;
1029     kmp_uint32 mxcsr;
1030 
1031     // Get primary thread's values of FPU control flags (both X87 and vector)
1032     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1033     __kmp_store_mxcsr(&mxcsr);
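    // Mask off the MXCSR status flags; only the control bits are compared
    // against and propagated to the team.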
1034     mxcsr &= KMP_X86_MXCSR_MASK;
1035 
1036     // There is no point looking at t_fp_control_saved here.
1037     // If it is TRUE, we still have to update the values if they are different
1038     // from those we now have. If it is FALSE we didn't save anything yet, but
1039     // our objective is the same. We have to ensure that the values in the team
1040     // are the same as those we have.
1041     // So, this code achieves what we need whether or not t_fp_control_saved is
1042     // true. By checking whether the value needs updating we avoid unnecessary
1043     // writes that would put the cache-line into a written state, causing all
1044     // threads in the team to have to read it again.
1045     KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1046     KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1047     // Although we don't use this value, other code in the runtime wants to know
1048     // whether it should restore them. So we must ensure it is correct.
1049     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1050   } else {
1051     // Similarly here. Don't write to this cache-line in the team structure
1052     // unless we have to.
1053     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1054   }
1055 }
1056 
1057 // Do the opposite, setting the hardware registers to the updated values from
1058 // the team.
1059 inline static void updateHWFPControl(kmp_team_t *team) {
1060   if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
    // Only reset the fp control regs if they have been changed in the team
    // during the parallel region that we are exiting.
1063     kmp_int16 x87_fpu_control_word;
1064     kmp_uint32 mxcsr;
1065     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1066     __kmp_store_mxcsr(&mxcsr);
1067     mxcsr &= KMP_X86_MXCSR_MASK;
1068 
1069     if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1070       __kmp_clear_x87_fpu_status_word();
1071       __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1072     }
1073 
1074     if (team->t.t_mxcsr != mxcsr) {
1075       __kmp_load_mxcsr(&team->t.t_mxcsr);
1076     }
1077   }
1078 }
1079 #else
1080 #define propagateFPControl(x) ((void)0)
1081 #define updateHWFPControl(x) ((void)0)
1082 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1083 
1084 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1085                                      int realloc); // forward declaration
1086 
/* Run a parallel region that has been serialized, so it runs only in a team of
   the single primary thread. */
1089 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1090   kmp_info_t *this_thr;
1091   kmp_team_t *serial_team;
1092 
1093   KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1094 
1095   /* Skip all this code for autopar serialized loops since it results in
1096      unacceptable overhead */
1097   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1098     return;
1099 
1100   if (!TCR_4(__kmp_init_parallel))
1101     __kmp_parallel_initialize();
1102   __kmp_resume_if_soft_paused();
1103 
1104   this_thr = __kmp_threads[global_tid];
1105   serial_team = this_thr->th.th_serial_team;
1106 
1107   /* utilize the serialized team held by this thread */
1108   KMP_DEBUG_ASSERT(serial_team);
1109   KMP_MB();
1110 
1111   if (__kmp_tasking_mode != tskm_immediate_exec) {
1112     KMP_DEBUG_ASSERT(
1113         this_thr->th.th_task_team ==
1114         this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1115     KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1116                      NULL);
1117     KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1118                   "team %p, new task_team = NULL\n",
1119                   global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1120     this_thr->th.th_task_team = NULL;
1121   }
1122 
1123   kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1124   if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1125     proc_bind = proc_bind_false;
1126   } else if (proc_bind == proc_bind_default) {
1127     // No proc_bind clause was specified, so use the current value
1128     // of proc-bind-var for this parallel region.
1129     proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1130   }
1131   // Reset for next parallel region
1132   this_thr->th.th_set_proc_bind = proc_bind_default;
1133 
1134 #if OMPT_SUPPORT
1135   ompt_data_t ompt_parallel_data = ompt_data_none;
1136   ompt_data_t *implicit_task_data;
1137   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1138   if (ompt_enabled.enabled &&
1139       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1140 
1141     ompt_task_info_t *parent_task_info;
1142     parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1143 
1144     parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1145     if (ompt_enabled.ompt_callback_parallel_begin) {
1146       int team_size = 1;
1147 
1148       ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1149           &(parent_task_info->task_data), &(parent_task_info->frame),
1150           &ompt_parallel_data, team_size,
1151           ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1152     }
1153   }
1154 #endif // OMPT_SUPPORT
1155 
1156   if (this_thr->th.th_team != serial_team) {
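    // The thread is not yet running on its serial team: this is the first
    // level of serialization, so install the serial team (allocating a fresh
    // one if the cached serial team is already in use).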
1157     // Nested level will be an index in the nested nthreads array
1158     int level = this_thr->th.th_team->t.t_level;
1159 
1160     if (serial_team->t.t_serialized) {
      /* this serial team was already used
         TODO: increase performance by making these locks more specific */
1163       kmp_team_t *new_team;
1164 
1165       __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1166 
1167       new_team =
1168           __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1169 #if OMPT_SUPPORT
1170                               ompt_parallel_data,
1171 #endif
1172                               proc_bind, &this_thr->th.th_current_task->td_icvs,
1173                               0 USE_NESTED_HOT_ARG(NULL));
1174       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1175       KMP_ASSERT(new_team);
1176 
1177       /* setup new serialized team and install it */
1178       new_team->t.t_threads[0] = this_thr;
1179       new_team->t.t_parent = this_thr->th.th_team;
1180       serial_team = new_team;
1181       this_thr->th.th_serial_team = serial_team;
1182 
1183       KF_TRACE(
1184           10,
1185           ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1186            global_tid, serial_team));
1187 
1188       /* TODO the above breaks the requirement that if we run out of resources,
1189          then we can still guarantee that serialized teams are ok, since we may
1190          need to allocate a new one */
1191     } else {
1192       KF_TRACE(
1193           10,
1194           ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1195            global_tid, serial_team));
1196     }
1197 
1198     /* we have to initialize this serial team */
1199     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1200     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1201     KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1202     serial_team->t.t_ident = loc;
1203     serial_team->t.t_serialized = 1;
1204     serial_team->t.t_nproc = 1;
1205     serial_team->t.t_parent = this_thr->th.th_team;
1206     serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1207     this_thr->th.th_team = serial_team;
1208     serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1209 
1210     KF_TRACE(10, ("__kmpc_serialized_parallel: T#d curtask=%p\n", global_tid,
1211                   this_thr->th.th_current_task));
1212     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1213     this_thr->th.th_current_task->td_flags.executing = 0;
1214 
1215     __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1216 
1217     /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1218        implicit task for each serialized task represented by
1219        team->t.t_serialized? */
1220     copy_icvs(&this_thr->th.th_current_task->td_icvs,
1221               &this_thr->th.th_current_task->td_parent->td_icvs);
1222 
1223     // Thread value exists in the nested nthreads array for the next nested
1224     // level
1225     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1226       this_thr->th.th_current_task->td_icvs.nproc =
1227           __kmp_nested_nth.nth[level + 1];
1228     }
1229 
1230     if (__kmp_nested_proc_bind.used &&
1231         (level + 1 < __kmp_nested_proc_bind.used)) {
1232       this_thr->th.th_current_task->td_icvs.proc_bind =
1233           __kmp_nested_proc_bind.bind_types[level + 1];
1234     }
1235 
1236 #if USE_DEBUGGER
1237     serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1238 #endif
1239     this_thr->th.th_info.ds.ds_tid = 0;
1240 
1241     /* set thread cache values */
1242     this_thr->th.th_team_nproc = 1;
1243     this_thr->th.th_team_master = this_thr;
1244     this_thr->th.th_team_serialized = 1;
1245 
1246     serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1247     serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1248     serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1249 
1250     propagateFPControl(serial_team);
1251 
1252     /* check if we need to allocate dispatch buffers stack */
1253     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1254     if (!serial_team->t.t_dispatch->th_disp_buffer) {
1255       serial_team->t.t_dispatch->th_disp_buffer =
1256           (dispatch_private_info_t *)__kmp_allocate(
1257               sizeof(dispatch_private_info_t));
1258     }
1259     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1260 
1261     KMP_MB();
1262 
1263   } else {
1264     /* this serialized team is already being used,
1265      * that's fine, just add another nested level */
1266     KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1267     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1268     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1269     ++serial_team->t.t_serialized;
1270     this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1271 
1272     // Nested level will be an index in the nested nthreads array
1273     int level = this_thr->th.th_team->t.t_level;
1274     // Thread value exists in the nested nthreads array for the next nested
1275     // level
1276     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1277       this_thr->th.th_current_task->td_icvs.nproc =
1278           __kmp_nested_nth.nth[level + 1];
1279     }
1280     serial_team->t.t_level++;
1281     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1282                   "of serial team %p to %d\n",
1283                   global_tid, serial_team, serial_team->t.t_level));
1284 
1285     /* allocate/push dispatch buffers stack */
1286     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
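    /* Each nested serialized level pushes a fresh dispatch buffer onto this
       singly linked stack so that inner loop-scheduling state does not clobber
       the enclosing level's buffer; it is popped again when this level ends. */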
1287     {
1288       dispatch_private_info_t *disp_buffer =
1289           (dispatch_private_info_t *)__kmp_allocate(
1290               sizeof(dispatch_private_info_t));
1291       disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1292       serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1293     }
1294     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1295 
1296     KMP_MB();
1297   }
1298   KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1299 
1300   // Perform the display affinity functionality for
1301   // serialized parallel regions
1302   if (__kmp_display_affinity) {
1303     if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1304         this_thr->th.th_prev_num_threads != 1) {
1305       // NULL means use the affinity-format-var ICV
1306       __kmp_aux_display_affinity(global_tid, NULL);
1307       this_thr->th.th_prev_level = serial_team->t.t_level;
1308       this_thr->th.th_prev_num_threads = 1;
1309     }
1310   }
1311 
1312   if (__kmp_env_consistency_check)
1313     __kmp_push_parallel(global_tid, NULL);
1314 #if OMPT_SUPPORT
1315   serial_team->t.ompt_team_info.master_return_address = codeptr;
1316   if (ompt_enabled.enabled &&
1317       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1318     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1319         OMPT_GET_FRAME_ADDRESS(0);
1320 
1321     ompt_lw_taskteam_t lw_taskteam;
1322     __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1323                             &ompt_parallel_data, codeptr);
1324 
1325     __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
    // don't use lw_taskteam after linking; its content was swapped
1327 
1328     /* OMPT implicit task begin */
1329     implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
1330     if (ompt_enabled.ompt_callback_implicit_task) {
1331       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1332           ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1333           OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
1334           ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1335       OMPT_CUR_TASK_INFO(this_thr)->thread_num =
1336           __kmp_tid_from_gtid(global_tid);
1337     }
1338 
1339     /* OMPT state */
1340     this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1341     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1342         OMPT_GET_FRAME_ADDRESS(0);
1343   }
1344 #endif
1345 }
1346 
1347 /* most of the work for a fork */
1348 /* return true if we really went parallel, false if serialized */
1349 int __kmp_fork_call(ident_t *loc, int gtid,
1350                     enum fork_context_e call_context, // Intel, GNU, ...
1351                     kmp_int32 argc, microtask_t microtask, launch_t invoker,
1352                     kmp_va_list ap) {
1353   void **argv;
1354   int i;
1355   int master_tid;
1356   int master_this_cons;
1357   kmp_team_t *team;
1358   kmp_team_t *parent_team;
1359   kmp_info_t *master_th;
1360   kmp_root_t *root;
1361   int nthreads;
1362   int master_active;
1363   int master_set_numthreads;
1364   int level;
1365   int active_level;
1366   int teams_level;
1367 #if KMP_NESTED_HOT_TEAMS
1368   kmp_hot_team_ptr_t **p_hot_teams;
1369 #endif
1370   { // KMP_TIME_BLOCK
1371     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1372     KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1373 
1374     KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1375     if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1376       /* Some systems prefer the stack for the root thread(s) to start with */
1377       /* some gap from the parent stack to prevent false sharing. */
1378       void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1379       /* These 2 lines below are so this does not get optimized out */
1380       if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1381         __kmp_stkpadding += (short)((kmp_int64)dummy);
1382     }
1383 
1384     /* initialize if needed */
1385     KMP_DEBUG_ASSERT(
1386         __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1387     if (!TCR_4(__kmp_init_parallel))
1388       __kmp_parallel_initialize();
1389     __kmp_resume_if_soft_paused();
1390 
1391     /* setup current data */
1392     master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1393     // shutdown
1394     parent_team = master_th->th.th_team;
1395     master_tid = master_th->th.th_info.ds.ds_tid;
1396     master_this_cons = master_th->th.th_local.this_construct;
1397     root = master_th->th.th_root;
1398     master_active = root->r.r_active;
1399     master_set_numthreads = master_th->th.th_set_nproc;
1400 
1401 #if OMPT_SUPPORT
1402     ompt_data_t ompt_parallel_data = ompt_data_none;
1403     ompt_data_t *parent_task_data;
1404     ompt_frame_t *ompt_frame;
1405     ompt_data_t *implicit_task_data;
1406     void *return_address = NULL;
1407 
1408     if (ompt_enabled.enabled) {
1409       __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1410                                     NULL, NULL);
1411       return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1412     }
1413 #endif
1414 
1415     // Nested level will be an index in the nested nthreads array
1416     level = parent_team->t.t_level;
1417     // used to launch non-serial teams even if nesting is not allowed
1418     active_level = parent_team->t.t_active_level;
1419     // needed to check nesting inside the teams
1420     teams_level = master_th->th.th_teams_level;
1421 #if KMP_NESTED_HOT_TEAMS
1422     p_hot_teams = &master_th->th.th_hot_teams;
1423     if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1424       *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1425           sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1426       (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1427       // it is either the actual hot team or not needed (when active_level > 0)
1428       (*p_hot_teams)[0].hot_team_nth = 1;
1429     }
1430 #endif
1431 
1432 #if OMPT_SUPPORT
1433     if (ompt_enabled.enabled) {
1434       if (ompt_enabled.ompt_callback_parallel_begin) {
1435         int team_size = master_set_numthreads
1436                             ? master_set_numthreads
1437                             : get__nproc_2(parent_team, master_tid);
1438         int flags = OMPT_INVOKER(call_context) |
1439                     ((microtask == (microtask_t)__kmp_teams_master)
1440                          ? ompt_parallel_league
1441                          : ompt_parallel_team);
1442         ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1443             parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1444             return_address);
1445       }
1446       master_th->th.ompt_thread_info.state = ompt_state_overhead;
1447     }
1448 #endif
1449 
1450     master_th->th.th_ident = loc;
1451 
1452     if (master_th->th.th_teams_microtask && ap &&
1453         microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1454       // AC: Start of a parallel region nested inside a teams construct. The
1455       // team is actual (hot); all workers are ready at the fork barrier. No lock
1456       // is needed to initialize the team a bit and then release the workers.
1457       parent_team->t.t_ident = loc;
1458       __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1459       parent_team->t.t_argc = argc;
1460       argv = (void **)parent_team->t.t_argv;
1461       for (i = argc - 1; i >= 0; --i)
1462         *argv++ = va_arg(kmp_va_deref(ap), void *);
1463       // Increment our nested depth level but do not increase serialization
1464       if (parent_team == master_th->th.th_serial_team) {
1465         // AC: we are in serialized parallel
1466         __kmpc_serialized_parallel(loc, gtid);
1467         KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1468 
1469         if (call_context == fork_context_gnu) {
1470           // AC: need to decrement t_serialized for enquiry functions to work
1471           // correctly, will restore at join time
1472           parent_team->t.t_serialized--;
1473           return TRUE;
1474         }
1475 
1476 #if OMPD_SUPPORT
1477         parent_team->t.t_pkfn = microtask;
1478 #endif
1479 
1480 #if OMPT_SUPPORT
1481         void *dummy;
1482         void **exit_frame_p;
1483 
1484         ompt_lw_taskteam_t lw_taskteam;
1485 
1486         if (ompt_enabled.enabled) {
1487           __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1488                                   &ompt_parallel_data, return_address);
1489           exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1490 
1491           __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1492           // don't use lw_taskteam after linking. content was swapped
1493 
1494           /* OMPT implicit task begin */
1495           implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1496           if (ompt_enabled.ompt_callback_implicit_task) {
1497             OMPT_CUR_TASK_INFO(master_th)->thread_num =
1498                 __kmp_tid_from_gtid(gtid);
1499             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1500                 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1501                 implicit_task_data, 1,
1502                 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1503           }
1504 
1505           /* OMPT state */
1506           master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1507         } else {
1508           exit_frame_p = &dummy;
1509         }
1510 #endif
1511         // AC: need to decrement t_serialized for enquiry functions to work
1512         // correctly, will restore at join time
1513         parent_team->t.t_serialized--;
1514 
1515         {
1516           KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1517           KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1518           __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1519 #if OMPT_SUPPORT
1520                                  ,
1521                                  exit_frame_p
1522 #endif
1523           );
1524         }
1525 
1526 #if OMPT_SUPPORT
1527         if (ompt_enabled.enabled) {
1528           *exit_frame_p = NULL;
1529           OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1530           if (ompt_enabled.ompt_callback_implicit_task) {
1531             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1532                 ompt_scope_end, NULL, implicit_task_data, 1,
1533                 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1534           }
1535           ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1536           __ompt_lw_taskteam_unlink(master_th);
1537           if (ompt_enabled.ompt_callback_parallel_end) {
1538             ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1539                 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1540                 OMPT_INVOKER(call_context) | ompt_parallel_team,
1541                 return_address);
1542           }
1543           master_th->th.ompt_thread_info.state = ompt_state_overhead;
1544         }
1545 #endif
1546         return TRUE;
1547       }
1548 
1549       parent_team->t.t_pkfn = microtask;
1550       parent_team->t.t_invoke = invoker;
1551       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1552       parent_team->t.t_active_level++;
1553       parent_team->t.t_level++;
1554       parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1555 
1556 #if OMPT_SUPPORT
1557       if (ompt_enabled.enabled) {
1558         ompt_lw_taskteam_t lw_taskteam;
1559         __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1560                                 &ompt_parallel_data, return_address);
1561         __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1562       }
1563 #endif
1564 
1565       /* Change number of threads in the team if requested */
1566       if (master_set_numthreads) { // The parallel has num_threads clause
1567         if (master_set_numthreads < master_th->th.th_teams_size.nth) {
1568           // AC: can only reduce the number of threads dynamically, not increase
1569           kmp_info_t **other_threads = parent_team->t.t_threads;
1570           parent_team->t.t_nproc = master_set_numthreads;
1571           for (i = 0; i < master_set_numthreads; ++i) {
1572             other_threads[i]->th.th_team_nproc = master_set_numthreads;
1573           }
1574           // Keep extra threads hot in the team for possible next parallels
1575         }
1576         master_th->th.th_set_nproc = 0;
1577       }
1578 
1579 #if USE_DEBUGGER
1580       if (__kmp_debugging) { // Let debugger override number of threads.
1581         int nth = __kmp_omp_num_threads(loc);
1582         if (nth > 0) { // 0 means debugger doesn't want to change num threads
1583           master_set_numthreads = nth;
1584         }
1585       }
1586 #endif
1587 
1588 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1589       if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1590            KMP_ITT_DEBUG) &&
1591           __kmp_forkjoin_frames_mode == 3 &&
1592           parent_team->t.t_active_level == 1 // only report frames at level 1
1593           && master_th->th.th_teams_size.nteams == 1) {
1594         kmp_uint64 tmp_time = __itt_get_timestamp();
1595         master_th->th.th_frame_time = tmp_time;
1596         parent_team->t.t_region_time = tmp_time;
1597       }
1598       if (__itt_stack_caller_create_ptr) {
1599         KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
1600         // create new stack stitching id before entering fork barrier
1601         parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1602       }
1603 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1604 
1605       KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1606                     "master_th=%p, gtid=%d\n",
1607                     root, parent_team, master_th, gtid));
1608       __kmp_internal_fork(loc, gtid, parent_team);
1609       KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1610                     "master_th=%p, gtid=%d\n",
1611                     root, parent_team, master_th, gtid));
1612 
1613       if (call_context == fork_context_gnu)
1614         return TRUE;
1615 
1616       /* Invoke microtask for PRIMARY thread */
1617       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1618                     parent_team->t.t_id, parent_team->t.t_pkfn));
1619 
1620       if (!parent_team->t.t_invoke(gtid)) {
1621         KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
1622       }
1623       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1624                     parent_team->t.t_id, parent_team->t.t_pkfn));
1625       KMP_MB(); /* Flush all pending memory write invalidates.  */
1626 
1627       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1628 
1629       return TRUE;
1630     } // Parallel closely nested in teams construct
1631 
1632 #if KMP_DEBUG
1633     if (__kmp_tasking_mode != tskm_immediate_exec) {
1634       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1635                        parent_team->t.t_task_team[master_th->th.th_task_state]);
1636     }
1637 #endif
1638 
1639     int enter_teams = 0;
1640     if (parent_team->t.t_active_level >=
1641         master_th->th.th_current_task->td_icvs.max_active_levels) {
1642       nthreads = 1;
1643     } else {
1644       enter_teams = ((ap == NULL && active_level == 0) ||
1645                      (ap && teams_level > 0 && teams_level == level));
1646       nthreads =
1647           master_set_numthreads
1648               ? master_set_numthreads
1649               : get__nproc_2(
1650                     parent_team,
1651                     master_tid); // TODO: get nproc directly from current task
1652 
1653       // Check whether to take the forkjoin lock (no need for a serialized
1654       // parallel outside of a teams construct). This code was moved here from
1655       // __kmp_reserve_threads() to speed up nested serialized parallels.
1656       if (nthreads > 1) {
1657         if ((get__max_active_levels(master_th) == 1 &&
1658              (root->r.r_in_parallel && !enter_teams)) ||
1659             (__kmp_library == library_serial)) {
1660           KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1661                         " threads\n",
1662                         gtid, nthreads));
1663           nthreads = 1;
1664         }
1665       }
1666       if (nthreads > 1) {
1667         /* determine how many new threads we can use */
1668         __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1669         /* AC: If we execute teams from parallel region (on host), then teams
1670            should be created but each can only have 1 thread if nesting is
1671            disabled. If teams called from serial region, then teams and their
1672            threads should be created regardless of the nesting setting. */
1673         nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
1674                                          nthreads, enter_teams);
1675         if (nthreads == 1) {
1676           // Release the lock for single-threaded execution here; for
1677           // multi-threaded execution it will be released later, after the
1678           // team of threads has been created and initialized.
1679           __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1680         }
1681       }
1682     }
1683     KMP_DEBUG_ASSERT(nthreads > 0);
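    /* Worked example (illustrative): with OMP_NUM_THREADS=8 and a
       num_threads(4) clause on the pragma, master_set_numthreads is 4, so
       nthreads starts at 4. If the region is nested inside an already active
       parallel region while max-active-levels-var is 1, it is serialized and
       nthreads becomes 1; otherwise __kmp_reserve_threads() may still lower
       nthreads, e.g. when a thread limit or the capacity of the thread pool
       is reached. */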
1684 
1685     // If we temporarily changed the set number of threads then restore it now
1686     master_th->th.th_set_nproc = 0;
1687 
1688     /* create a serialized parallel region? */
1689     if (nthreads == 1) {
1690 /* josh todo: hypothetical question: what do we do for OS X*? */
1691 #if KMP_OS_LINUX &&                                                            \
1692     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1693       void *args[argc];
1694 #else
1695       void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1696 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1697           KMP_ARCH_AARCH64) */
1698 
1699       KA_TRACE(20,
1700                ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1701 
1702       __kmpc_serialized_parallel(loc, gtid);
1703 
1704 #if OMPD_SUPPORT
1705       master_th->th.th_serial_team->t.t_pkfn = microtask;
1706 #endif
1707 
1708       if (call_context == fork_context_intel) {
1709         /* TODO this sucks, use the compiler itself to pass args! :) */
1710         master_th->th.th_serial_team->t.t_ident = loc;
1711         if (!ap) {
1712           // revert change made in __kmpc_serialized_parallel()
1713           master_th->th.th_serial_team->t.t_level--;
1714           // Get args from parent team for teams construct
1715 
1716 #if OMPT_SUPPORT
1717           void *dummy;
1718           void **exit_frame_p;
1719           ompt_task_info_t *task_info;
1720 
1721           ompt_lw_taskteam_t lw_taskteam;
1722 
1723           if (ompt_enabled.enabled) {
1724             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1725                                     &ompt_parallel_data, return_address);
1726 
1727             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1728             // don't use lw_taskteam after linking. content was swapped
1729 
1730             task_info = OMPT_CUR_TASK_INFO(master_th);
1731             exit_frame_p = &(task_info->frame.exit_frame.ptr);
1732             if (ompt_enabled.ompt_callback_implicit_task) {
1733               OMPT_CUR_TASK_INFO(master_th)->thread_num =
1734                   __kmp_tid_from_gtid(gtid);
1735               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1736                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1737                   &(task_info->task_data), 1,
1738                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1739                   ompt_task_implicit);
1740             }
1741 
1742             /* OMPT state */
1743             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1744           } else {
1745             exit_frame_p = &dummy;
1746           }
1747 #endif
1748 
1749           {
1750             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1751             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1752             __kmp_invoke_microtask(microtask, gtid, 0, argc,
1753                                    parent_team->t.t_argv
1754 #if OMPT_SUPPORT
1755                                    ,
1756                                    exit_frame_p
1757 #endif
1758             );
1759           }
1760 
1761 #if OMPT_SUPPORT
1762           if (ompt_enabled.enabled) {
1763             *exit_frame_p = NULL;
1764             if (ompt_enabled.ompt_callback_implicit_task) {
1765               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1766                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1767                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1768                   ompt_task_implicit);
1769             }
1770             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1771             __ompt_lw_taskteam_unlink(master_th);
1772             if (ompt_enabled.ompt_callback_parallel_end) {
1773               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1774                   &ompt_parallel_data, parent_task_data,
1775                   OMPT_INVOKER(call_context) | ompt_parallel_team,
1776                   return_address);
1777             }
1778             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1779           }
1780 #endif
1781         } else if (microtask == (microtask_t)__kmp_teams_master) {
1782           KMP_DEBUG_ASSERT(master_th->th.th_team ==
1783                            master_th->th.th_serial_team);
1784           team = master_th->th.th_team;
1785           // team->t.t_pkfn = microtask;
1786           team->t.t_invoke = invoker;
1787           __kmp_alloc_argv_entries(argc, team, TRUE);
1788           team->t.t_argc = argc;
1789           argv = (void **)team->t.t_argv;
1790           if (ap) {
1791             for (i = argc - 1; i >= 0; --i)
1792               *argv++ = va_arg(kmp_va_deref(ap), void *);
1793           } else {
1794             for (i = 0; i < argc; ++i)
1795               // Get args from parent team for teams construct
1796               argv[i] = parent_team->t.t_argv[i];
1797           }
1798           // AC: revert change made in __kmpc_serialized_parallel()
1799           //     because initial code in teams should have level=0
1800           team->t.t_level--;
1801           // AC: call special invoker for outer "parallel" of teams construct
1802           invoker(gtid);
1803 #if OMPT_SUPPORT
1804           if (ompt_enabled.enabled) {
1805             ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1806             if (ompt_enabled.ompt_callback_implicit_task) {
1807               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1808                   ompt_scope_end, NULL, &(task_info->task_data), 0,
1809                   OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1810             }
1811             if (ompt_enabled.ompt_callback_parallel_end) {
1812               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1813                   &ompt_parallel_data, parent_task_data,
1814                   OMPT_INVOKER(call_context) | ompt_parallel_league,
1815                   return_address);
1816             }
1817             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1818           }
1819 #endif
1820         } else {
1821           argv = args;
1822           for (i = argc - 1; i >= 0; --i)
1823             *argv++ = va_arg(kmp_va_deref(ap), void *);
1824           KMP_MB();
1825 
1826 #if OMPT_SUPPORT
1827           void *dummy;
1828           void **exit_frame_p;
1829           ompt_task_info_t *task_info;
1830 
1831           ompt_lw_taskteam_t lw_taskteam;
1832 
1833           if (ompt_enabled.enabled) {
1834             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1835                                     &ompt_parallel_data, return_address);
1836             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1837             // don't use lw_taskteam after linking. content was swapped
1838             task_info = OMPT_CUR_TASK_INFO(master_th);
1839             exit_frame_p = &(task_info->frame.exit_frame.ptr);
1840 
1841             /* OMPT implicit task begin */
1842             implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1843             if (ompt_enabled.ompt_callback_implicit_task) {
1844               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1845                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1846                   implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1847                   ompt_task_implicit);
1848               OMPT_CUR_TASK_INFO(master_th)->thread_num =
1849                   __kmp_tid_from_gtid(gtid);
1850             }
1851 
1852             /* OMPT state */
1853             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1854           } else {
1855             exit_frame_p = &dummy;
1856           }
1857 #endif
1858 
1859           {
1860             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1861             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1862             __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1863 #if OMPT_SUPPORT
1864                                    ,
1865                                    exit_frame_p
1866 #endif
1867             );
1868           }
1869 
1870 #if OMPT_SUPPORT
1871           if (ompt_enabled.enabled) {
1872             *exit_frame_p = NULL;
1873             if (ompt_enabled.ompt_callback_implicit_task) {
1874               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1875                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1876                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1877                   ompt_task_implicit);
1878             }
1879 
1880             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1881             __ompt_lw_taskteam_unlink(master_th);
1882             if (ompt_enabled.ompt_callback_parallel_end) {
1883               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1884                   &ompt_parallel_data, parent_task_data,
1885                   OMPT_INVOKER(call_context) | ompt_parallel_team,
1886                   return_address);
1887             }
1888             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1889           }
1890 #endif
1891         }
1892       } else if (call_context == fork_context_gnu) {
1893 #if OMPT_SUPPORT
1894         ompt_lw_taskteam_t lwt;
1895         __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1896                                 return_address);
1897 
1898         lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1899         __ompt_lw_taskteam_link(&lwt, master_th, 1);
1900 // don't use lw_taskteam after linking. content was swapped
1901 #endif
1902 
1903         // we were called from GNU native code
1904         KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1905         return FALSE;
1906       } else {
1907         KMP_ASSERT2(call_context < fork_context_last,
1908                     "__kmp_fork_call: unknown fork_context parameter");
1909       }
1910 
1911       KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1912       KMP_MB();
1913       return FALSE;
1914     } // if (nthreads == 1)
1915 
1916     // GEH: only modify the executing flag in the case when not serialized;
1917     //      the serialized case is handled in __kmpc_serialized_parallel
1918     KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1919                   "curtask=%p, curtask_max_aclevel=%d\n",
1920                   parent_team->t.t_active_level, master_th,
1921                   master_th->th.th_current_task,
1922                   master_th->th.th_current_task->td_icvs.max_active_levels));
1923     // TODO: GEH - cannot do this assertion because root thread not set up as
1924     // executing
1925     // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1926     master_th->th.th_current_task->td_flags.executing = 0;
1927 
1928     if (!master_th->th.th_teams_microtask || level > teams_level) {
1929       /* Increment our nested depth level */
1930       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1931     }
1932 
1933     // See if we need to make a copy of the ICVs.
1934     int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1935     if ((level + 1 < __kmp_nested_nth.used) &&
1936         (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1937       nthreads_icv = __kmp_nested_nth.nth[level + 1];
1938     } else {
1939       nthreads_icv = 0; // don't update
1940     }
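    /* Illustrative example: with OMP_NUM_THREADS=8,2 the nested nthreads list
       is {8, 2}, so a parallel region forked from level 0 picks up
       nthreads_icv = 2 here and the new team's nproc ICV becomes 2 (the size
       of the current team is still governed by nthreads computed above). If
       the list has no entry for the next level, or the entry equals the
       current ICV, nthreads_icv stays 0 and the ICV is left unchanged. */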
1941 
1942     // Figure out the proc_bind_policy for the new team.
1943     kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1944     kmp_proc_bind_t proc_bind_icv =
1945         proc_bind_default; // proc_bind_default means don't update
1946     if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1947       proc_bind = proc_bind_false;
1948     } else {
1949       if (proc_bind == proc_bind_default) {
1950         // No proc_bind clause specified; use current proc-bind-var for this
1951         // parallel region
1952         proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1953       }
1954       /* else: The proc_bind policy was specified explicitly on parallel clause.
1955          This overrides proc-bind-var for this parallel region, but does not
1956          change proc-bind-var. */
1957       // Figure the value of proc-bind-var for the child threads.
1958       if ((level + 1 < __kmp_nested_proc_bind.used) &&
1959           (__kmp_nested_proc_bind.bind_types[level + 1] !=
1960            master_th->th.th_current_task->td_icvs.proc_bind)) {
1961         proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1962       }
1963     }
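    /* Illustrative example: with OMP_PROC_BIND=spread,close and no proc_bind
       clause, a top-level parallel region is bound with proc_bind = spread,
       while proc_bind_icv becomes close so that the workers inherit close as
       their proc-bind-var. A proc_bind clause on the pragma overrides the
       binding used for this region only, as noted above. */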
1964 
1965     // Reset for next parallel region
1966     master_th->th.th_set_proc_bind = proc_bind_default;
1967 
1968     if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
1969       kmp_internal_control_t new_icvs;
1970       copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
1971       new_icvs.next = NULL;
1972       if (nthreads_icv > 0) {
1973         new_icvs.nproc = nthreads_icv;
1974       }
1975       if (proc_bind_icv != proc_bind_default) {
1976         new_icvs.proc_bind = proc_bind_icv;
1977       }
1978 
1979       /* allocate a new parallel team */
1980       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
1981       team = __kmp_allocate_team(root, nthreads, nthreads,
1982 #if OMPT_SUPPORT
1983                                  ompt_parallel_data,
1984 #endif
1985                                  proc_bind, &new_icvs,
1986                                  argc USE_NESTED_HOT_ARG(master_th));
1987     } else {
1988       /* allocate a new parallel team */
1989       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
1990       team = __kmp_allocate_team(root, nthreads, nthreads,
1991 #if OMPT_SUPPORT
1992                                  ompt_parallel_data,
1993 #endif
1994                                  proc_bind,
1995                                  &master_th->th.th_current_task->td_icvs,
1996                                  argc USE_NESTED_HOT_ARG(master_th));
1997     }
1998     KF_TRACE(
1999         10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2000 
2001     /* setup the new team */
2002     KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2003     KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2004     KMP_CHECK_UPDATE(team->t.t_ident, loc);
2005     KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2006     KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2007 #if OMPT_SUPPORT
2008     KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2009                           return_address);
2010 #endif
2011     KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2012     // TODO: parent_team->t.t_level == INT_MAX ???
2013     if (!master_th->th.th_teams_microtask || level > teams_level) {
2014       int new_level = parent_team->t.t_level + 1;
2015       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2016       new_level = parent_team->t.t_active_level + 1;
2017       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2018     } else {
2019       // AC: Do not increase parallel level at start of the teams construct
2020       int new_level = parent_team->t.t_level;
2021       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2022       new_level = parent_team->t.t_active_level;
2023       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2024     }
2025     kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2026     // set primary thread's schedule as new run-time schedule
2027     KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2028 
2029     KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2030     KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2031 
2032     // Update the floating point rounding in the team if required.
2033     propagateFPControl(team);
2034 #if OMPD_SUPPORT
2035     if (ompd_state & OMPD_ENABLE_BP)
2036       ompd_bp_parallel_begin();
2037 #endif
2038 
2039     if (__kmp_tasking_mode != tskm_immediate_exec) {
2040       // Set the primary thread's task team to the team's task team. Unless
2041       // this is a hot team, it should be NULL.
2042       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2043                        parent_team->t.t_task_team[master_th->th.th_task_state]);
2044       KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team "
2045                     "%p, new task_team %p / team %p\n",
2046                     __kmp_gtid_from_thread(master_th),
2047                     master_th->th.th_task_team, parent_team,
2048                     team->t.t_task_team[master_th->th.th_task_state], team));
2049 
2050       if (active_level || master_th->th.th_task_team) {
2051         // Save the primary thread's task_state on the memo stack
2052         KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2053         if (master_th->th.th_task_state_top >=
2054             master_th->th.th_task_state_stack_sz) { // increase size
2055           kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2056           kmp_uint8 *old_stack, *new_stack;
2057           kmp_uint32 i;
2058           new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2059           for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2060             new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2061           }
2062           for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2063                ++i) { // zero-init rest of stack
2064             new_stack[i] = 0;
2065           }
2066           old_stack = master_th->th.th_task_state_memo_stack;
2067           master_th->th.th_task_state_memo_stack = new_stack;
2068           master_th->th.th_task_state_stack_sz = new_size;
2069           __kmp_free(old_stack);
2070         }
2071         // Store primary thread's task_state on stack
2072         master_th->th
2073             .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2074             master_th->th.th_task_state;
2075         master_th->th.th_task_state_top++;
2076 #if KMP_NESTED_HOT_TEAMS
2077         if (master_th->th.th_hot_teams &&
2078             active_level < __kmp_hot_teams_max_level &&
2079             team == master_th->th.th_hot_teams[active_level].hot_team) {
2080           // Restore primary thread's nested state if nested hot team
2081           master_th->th.th_task_state =
2082               master_th->th
2083                   .th_task_state_memo_stack[master_th->th.th_task_state_top];
2084         } else {
2085 #endif
2086           master_th->th.th_task_state = 0;
2087 #if KMP_NESTED_HOT_TEAMS
2088         }
2089 #endif
2090       }
2091 #if !KMP_NESTED_HOT_TEAMS
2092       KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2093                        (team == root->r.r_hot_team));
2094 #endif
2095     }
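    /* Illustrative summary: th_task_state_memo_stack acts as a per-thread
       stack of task_state values, one entry per nested parallel level. The
       current task_state is pushed above before the primary thread switches
       to the new team (the stack grows by doubling when full) and is popped
       again in __kmp_join_call() when the region ends. */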
2096 
2097     KA_TRACE(
2098         20,
2099         ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2100          gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2101          team->t.t_nproc));
2102     KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2103                      (team->t.t_master_tid == 0 &&
2104                       (team->t.t_parent == root->r.r_root_team ||
2105                        team->t.t_parent->t.t_serialized)));
2106     KMP_MB();
2107 
2108     /* now, setup the arguments */
2109     argv = (void **)team->t.t_argv;
2110     if (ap) {
2111       for (i = argc - 1; i >= 0; --i) {
2112         void *new_argv = va_arg(kmp_va_deref(ap), void *);
2113         KMP_CHECK_UPDATE(*argv, new_argv);
2114         argv++;
2115       }
2116     } else {
2117       for (i = 0; i < argc; ++i) {
2118         // Get args from parent team for teams construct
2119         KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2120       }
2121     }
2122 
2123     /* now actually fork the threads */
2124     KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2125     if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2126       root->r.r_active = TRUE;
2127 
2128     __kmp_fork_team_threads(root, team, master_th, gtid);
2129     __kmp_setup_icv_copy(team, nthreads,
2130                          &master_th->th.th_current_task->td_icvs, loc);
2131 
2132 #if OMPT_SUPPORT
2133     master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2134 #endif
2135 
2136     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2137 
2138 #if USE_ITT_BUILD
2139     if (team->t.t_active_level == 1 // only report frames at level 1
2140         && !master_th->th.th_teams_microtask) { // not in teams construct
2141 #if USE_ITT_NOTIFY
2142       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2143           (__kmp_forkjoin_frames_mode == 3 ||
2144            __kmp_forkjoin_frames_mode == 1)) {
2145         kmp_uint64 tmp_time = 0;
2146         if (__itt_get_timestamp_ptr)
2147           tmp_time = __itt_get_timestamp();
2148         // Internal fork - report frame begin
2149         master_th->th.th_frame_time = tmp_time;
2150         if (__kmp_forkjoin_frames_mode == 3)
2151           team->t.t_region_time = tmp_time;
2152       } else
2153 // only one notification scheme (either "submit" or "forking/joined", not both)
2154 #endif /* USE_ITT_NOTIFY */
2155           if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2156               __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2157         // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2158         __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2159       }
2160     }
2161 #endif /* USE_ITT_BUILD */
2162 
2163     /* now go on and do the work */
2164     KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2165     KMP_MB();
2166     KF_TRACE(10,
2167              ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2168               root, team, master_th, gtid));
2169 
2170 #if USE_ITT_BUILD
2171     if (__itt_stack_caller_create_ptr) {
2172       // create new stack stitching id before entering fork barrier
2173       if (!enter_teams) {
2174         KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
2175         team->t.t_stack_id = __kmp_itt_stack_caller_create();
2176       } else if (parent_team->t.t_serialized) {
2177         // keep stack stitching id in the serialized parent_team;
2178         // current team will be used for parallel inside the teams;
2179         // if parent_team is active, then it already keeps stack stitching id
2180         // for the league of teams
2181         KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
2182         parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
2183       }
2184     }
2185 #endif /* USE_ITT_BUILD */
2186 
2187     // AC: skip __kmp_internal_fork for the teams construct; let only the
2188     // primary threads execute
2189     if (ap) {
2190       __kmp_internal_fork(loc, gtid, team);
2191       KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2192                     "master_th=%p, gtid=%d\n",
2193                     root, team, master_th, gtid));
2194     }
2195 
2196     if (call_context == fork_context_gnu) {
2197       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2198       return TRUE;
2199     }
2200 
2201     /* Invoke microtask for PRIMARY thread */
2202     KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2203                   team->t.t_id, team->t.t_pkfn));
2204   } // END of timer KMP_fork_call block
2205 
2206 #if KMP_STATS_ENABLED
2207   // If beginning a teams construct, then change thread state
2208   stats_state_e previous_state = KMP_GET_THREAD_STATE();
2209   if (!ap) {
2210     KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2211   }
2212 #endif
2213 
2214   if (!team->t.t_invoke(gtid)) {
2215     KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
2216   }
2217 
2218 #if KMP_STATS_ENABLED
2219   // If this was the beginning of a teams construct, then reset thread state
2220   if (!ap) {
2221     KMP_SET_THREAD_STATE(previous_state);
2222   }
2223 #endif
2224 
2225   KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2226                 team->t.t_id, team->t.t_pkfn));
2227   KMP_MB(); /* Flush all pending memory write invalidates.  */
2228 
2229   KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2230 #if OMPT_SUPPORT
2231   if (ompt_enabled.enabled) {
2232     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2233   }
2234 #endif
2235 
2236   return TRUE;
2237 }
2238 
2239 #if OMPT_SUPPORT
2240 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2241                                             kmp_team_t *team) {
2242   // restore state outside the region
2243   thread->th.ompt_thread_info.state =
2244       ((team->t.t_serialized) ? ompt_state_work_serial
2245                               : ompt_state_work_parallel);
2246 }
2247 
2248 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2249                                    kmp_team_t *team, ompt_data_t *parallel_data,
2250                                    int flags, void *codeptr) {
2251   ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2252   if (ompt_enabled.ompt_callback_parallel_end) {
2253     ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2254         parallel_data, &(task_info->task_data), flags, codeptr);
2255   }
2256 
2257   task_info->frame.enter_frame = ompt_data_none;
2258   __kmp_join_restore_state(thread, team);
2259 }
2260 #endif
2261 
2262 void __kmp_join_call(ident_t *loc, int gtid
2263 #if OMPT_SUPPORT
2264                      ,
2265                      enum fork_context_e fork_context
2266 #endif
2267                      ,
2268                      int exit_teams) {
2269   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2270   kmp_team_t *team;
2271   kmp_team_t *parent_team;
2272   kmp_info_t *master_th;
2273   kmp_root_t *root;
2274   int master_active;
2275 
2276   KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2277 
2278   /* setup current data */
2279   master_th = __kmp_threads[gtid];
2280   root = master_th->th.th_root;
2281   team = master_th->th.th_team;
2282   parent_team = team->t.t_parent;
2283 
2284   master_th->th.th_ident = loc;
2285 
2286 #if OMPT_SUPPORT
2287   void *team_microtask = (void *)team->t.t_pkfn;
2288   // For the GOMP interface with a serialized parallel region, we need
2289   // __kmpc_end_serialized_parallel to call the hooks for the OMPT
2290   // end-implicit-task and end-parallel events.
2291   if (ompt_enabled.enabled &&
2292       !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2293     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2294   }
2295 #endif
2296 
2297 #if KMP_DEBUG
2298   if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2299     KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2300                   "th_task_team = %p\n",
2301                   __kmp_gtid_from_thread(master_th), team,
2302                   team->t.t_task_team[master_th->th.th_task_state],
2303                   master_th->th.th_task_team));
2304     KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2305                      team->t.t_task_team[master_th->th.th_task_state]);
2306   }
2307 #endif
2308 
2309   if (team->t.t_serialized) {
2310     if (master_th->th.th_teams_microtask) {
2311       // We are in teams construct
2312       int level = team->t.t_level;
2313       int tlevel = master_th->th.th_teams_level;
2314       if (level == tlevel) {
2315         // AC: we haven't incremented it earlier at start of teams construct,
2316         //     so do it here - at the end of teams construct
2317         team->t.t_level++;
2318       } else if (level == tlevel + 1) {
2319         // AC: we are exiting parallel inside teams, need to increment
2320         // serialization in order to restore it in the next call to
2321         // __kmpc_end_serialized_parallel
2322         team->t.t_serialized++;
2323       }
2324     }
2325     __kmpc_end_serialized_parallel(loc, gtid);
2326 
2327 #if OMPT_SUPPORT
2328     if (ompt_enabled.enabled) {
2329       __kmp_join_restore_state(master_th, parent_team);
2330     }
2331 #endif
2332 
2333     return;
2334   }
2335 
2336   master_active = team->t.t_master_active;
2337 
2338   if (!exit_teams) {
2339     // AC: No barrier for the internal teams at exit from the teams construct,
2340     //     but there is a barrier for the external team (the league).
2341     __kmp_internal_join(loc, gtid, team);
2342 #if USE_ITT_BUILD
2343     if (__itt_stack_caller_create_ptr) {
2344       KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
2345       // destroy the stack stitching id after join barrier
2346       __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2347       team->t.t_stack_id = NULL;
2348     }
2349 #endif
2350   } else {
2351     master_th->th.th_task_state =
2352         0; // AC: no tasking in teams (out of any parallel)
2353 #if USE_ITT_BUILD
2354     if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
2355       KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
2356       // destroy the stack stitching id on exit from the teams construct
2357       // if parent_team is active, then the id will be destroyed later on
2358       // by the master of the league of teams
2359       __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
2360       parent_team->t.t_stack_id = NULL;
2361     }
2362 #endif
2363   }
2364 
2365   KMP_MB();
2366 
2367 #if OMPT_SUPPORT
2368   ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2369   void *codeptr = team->t.ompt_team_info.master_return_address;
2370 #endif
2371 
2372 #if USE_ITT_BUILD
2373   // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2374   if (team->t.t_active_level == 1 &&
2375       (!master_th->th.th_teams_microtask || /* not in teams construct */
2376        master_th->th.th_teams_size.nteams == 1)) {
2377     master_th->th.th_ident = loc;
2378     // only one notification scheme (either "submit" or "forking/joined", not
2379     // both)
2380     if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2381         __kmp_forkjoin_frames_mode == 3)
2382       __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2383                              master_th->th.th_frame_time, 0, loc,
2384                              master_th->th.th_team_nproc, 1);
2385     else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2386              !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2387       __kmp_itt_region_joined(gtid);
2388   } // active_level == 1
2389 #endif /* USE_ITT_BUILD */
2390 
2391   if (master_th->th.th_teams_microtask && !exit_teams &&
2392       team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2393       team->t.t_level == master_th->th.th_teams_level + 1) {
2394 // AC: We need to leave the team structure intact at the end of a parallel
2395 // region inside the teams construct, so that the next parallel region reuses
2396 // the same (hot) team; only adjust the nesting levels.
2397 #if OMPT_SUPPORT
2398     ompt_data_t ompt_parallel_data = ompt_data_none;
2399     if (ompt_enabled.enabled) {
2400       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2401       if (ompt_enabled.ompt_callback_implicit_task) {
2402         int ompt_team_size = team->t.t_nproc;
2403         ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2404             ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2405             OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2406       }
2407       task_info->frame.exit_frame = ompt_data_none;
2408       task_info->task_data = ompt_data_none;
2409       ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2410       __ompt_lw_taskteam_unlink(master_th);
2411     }
2412 #endif
2413     /* Decrement our nested depth level */
2414     team->t.t_level--;
2415     team->t.t_active_level--;
2416     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2417 
2418     // Restore number of threads in the team if needed. This code relies on
2419     // the proper adjustment of th_teams_size.nth after the fork in
2420     // __kmp_teams_master on each teams primary thread in the case that
2421     // __kmp_reserve_threads reduced it.
2422     if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2423       int old_num = master_th->th.th_team_nproc;
2424       int new_num = master_th->th.th_teams_size.nth;
2425       kmp_info_t **other_threads = team->t.t_threads;
2426       team->t.t_nproc = new_num;
2427       for (int i = 0; i < old_num; ++i) {
2428         other_threads[i]->th.th_team_nproc = new_num;
2429       }
2430       // Adjust the state of the unused threads of the team
2431       for (int i = old_num; i < new_num; ++i) {
2432         // Re-initialize thread's barrier data.
2433         KMP_DEBUG_ASSERT(other_threads[i]);
2434         kmp_balign_t *balign = other_threads[i]->th.th_bar;
2435         for (int b = 0; b < bs_last_barrier; ++b) {
2436           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2437           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2438 #if USE_DEBUGGER
2439           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2440 #endif
2441         }
2442         if (__kmp_tasking_mode != tskm_immediate_exec) {
2443           // Synchronize thread's task state
2444           other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2445         }
2446       }
2447     }
2448 
2449 #if OMPT_SUPPORT
2450     if (ompt_enabled.enabled) {
2451       __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2452                       OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2453     }
2454 #endif
2455 
2456     return;
2457   }
2458 
2459   /* do cleanup and restore the parent team */
2460   master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2461   master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2462 
2463   master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2464 
2465   /* jc: The following lock has instructions with REL and ACQ semantics,
2466      separating the parallel user code called in this parallel region
2467      from the serial user code called after this function returns. */
2468   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2469 
2470   if (!master_th->th.th_teams_microtask ||
2471       team->t.t_level > master_th->th.th_teams_level) {
2472     /* Decrement our nested depth level */
2473     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2474   }
2475   KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2476 
2477 #if OMPT_SUPPORT
2478   if (ompt_enabled.enabled) {
2479     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2480     if (ompt_enabled.ompt_callback_implicit_task) {
2481       int flags = (team_microtask == (void *)__kmp_teams_master)
2482                       ? ompt_task_initial
2483                       : ompt_task_implicit;
2484       int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2485       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2486           ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2487           OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2488     }
2489     task_info->frame.exit_frame = ompt_data_none;
2490     task_info->task_data = ompt_data_none;
2491   }
2492 #endif
2493 
2494   KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2495                 master_th, team));
2496   __kmp_pop_current_task_from_thread(master_th);
2497 
2498 #if KMP_AFFINITY_SUPPORTED
2499   // Restore master thread's partition.
2500   master_th->th.th_first_place = team->t.t_first_place;
2501   master_th->th.th_last_place = team->t.t_last_place;
2502 #endif // KMP_AFFINITY_SUPPORTED
2503   master_th->th.th_def_allocator = team->t.t_def_allocator;
2504 
2505 #if OMPD_SUPPORT
2506   if (ompd_state & OMPD_ENABLE_BP)
2507     ompd_bp_parallel_end();
2508 #endif
2509   updateHWFPControl(team);
2510 
2511   if (root->r.r_active != master_active)
2512     root->r.r_active = master_active;
2513 
2514   __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2515                             master_th)); // this will free worker threads
2516 
2517   /* This race was fun to find. Make sure the following is in the critical
2518      region; otherwise assertions may fail occasionally, since the old team
2519      may be reallocated and the hierarchy appears inconsistent. It is actually
2520      safe to run and won't cause any bugs, but will cause those assertion
2521      failures. It's only one deref&assign, so keep it in the critical region. */
2522   master_th->th.th_team = parent_team;
2523   master_th->th.th_team_nproc = parent_team->t.t_nproc;
2524   master_th->th.th_team_master = parent_team->t.t_threads[0];
2525   master_th->th.th_team_serialized = parent_team->t.t_serialized;
2526 
2527   /* restore serialized team, if need be */
2528   if (parent_team->t.t_serialized &&
2529       parent_team != master_th->th.th_serial_team &&
2530       parent_team != root->r.r_root_team) {
2531     __kmp_free_team(root,
2532                     master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2533     master_th->th.th_serial_team = parent_team;
2534   }
2535 
2536   if (__kmp_tasking_mode != tskm_immediate_exec) {
2537     if (master_th->th.th_task_state_top >
2538         0) { // Restore task state from memo stack
2539       KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2540       // Remember primary thread's state if we re-use this nested hot team
2541       master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2542           master_th->th.th_task_state;
2543       --master_th->th.th_task_state_top; // pop
2544       // Now restore state at this level
2545       master_th->th.th_task_state =
2546           master_th->th
2547               .th_task_state_memo_stack[master_th->th.th_task_state_top];
2548     }
2549     // Copy the task team from the parent team to the primary thread
2550     master_th->th.th_task_team =
2551         parent_team->t.t_task_team[master_th->th.th_task_state];
2552     KA_TRACE(20,
2553              ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
2554               __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2555               parent_team));
2556   }
2557 
2558   // TODO: GEH - cannot do this assertion because root thread not set up as
2559   // executing
2560   // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2561   master_th->th.th_current_task->td_flags.executing = 1;
2562 
2563   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2564 
2565 #if OMPT_SUPPORT
2566   int flags =
2567       OMPT_INVOKER(fork_context) |
2568       ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2569                                                       : ompt_parallel_team);
2570   if (ompt_enabled.enabled) {
2571     __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2572                     codeptr);
2573   }
2574 #endif
2575 
2576   KMP_MB();
2577   KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2578 }
2579 
2580 /* Check whether we should push an internal control record onto the
2581    serial team stack.  If so, do it.  */
2582 void __kmp_save_internal_controls(kmp_info_t *thread) {
2583 
2584   if (thread->th.th_team != thread->th.th_serial_team) {
2585     return;
2586   }
2587   if (thread->th.th_team->t.t_serialized > 1) {
2588     int push = 0;
2589 
2590     if (thread->th.th_team->t.t_control_stack_top == NULL) {
2591       push = 1;
2592     } else {
2593       if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2594           thread->th.th_team->t.t_serialized) {
2595         push = 1;
2596       }
2597     }
2598     if (push) { /* push a record on the serial team's stack */
2599       kmp_internal_control_t *control =
2600           (kmp_internal_control_t *)__kmp_allocate(
2601               sizeof(kmp_internal_control_t));
2602 
2603       copy_icvs(control, &thread->th.th_current_task->td_icvs);
2604 
2605       control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2606 
2607       control->next = thread->th.th_team->t.t_control_stack_top;
2608       thread->th.th_team->t.t_control_stack_top = control;
2609     }
2610   }
2611 }
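/* Illustrative example (user side, comment only): inside a nested, serialized
   parallel region a call such as

     omp_set_num_threads(3);

   reaches __kmp_set_num_threads() below, which first calls
   __kmp_save_internal_controls() so that the current ICVs are pushed onto the
   serial team's control stack and can be restored when the serialized region
   ends. */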
2612 
2613 /* Changes set_nproc */
2614 void __kmp_set_num_threads(int new_nth, int gtid) {
2615   kmp_info_t *thread;
2616   kmp_root_t *root;
2617 
2618   KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2619   KMP_DEBUG_ASSERT(__kmp_init_serial);
2620 
2621   if (new_nth < 1)
2622     new_nth = 1;
2623   else if (new_nth > __kmp_max_nth)
2624     new_nth = __kmp_max_nth;
2625 
2626   KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2627   thread = __kmp_threads[gtid];
2628   if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2629     return; // nothing to do
2630 
2631   __kmp_save_internal_controls(thread);
2632 
2633   set__nproc(thread, new_nth);
2634 
2635   // If this omp_set_num_threads() call will cause the hot team size to be
2636   // reduced (in the absence of a num_threads clause), then reduce it now,
2637   // rather than waiting for the next parallel region.
2638   root = thread->th.th_root;
2639   if (__kmp_init_parallel && (!root->r.r_active) &&
2640       (root->r.r_hot_team->t.t_nproc > new_nth)
2641 #if KMP_NESTED_HOT_TEAMS
2642       && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2643 #endif
2644   ) {
2645     kmp_team_t *hot_team = root->r.r_hot_team;
2646     int f;
2647 
2648     __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2649 
2650     // Release the extra threads we don't need any more.
2651     for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2652       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2653       if (__kmp_tasking_mode != tskm_immediate_exec) {
2654         // When decreasing the team size, threads no longer in the team should
2655         // unreference the task team.
2656         hot_team->t.t_threads[f]->th.th_task_team = NULL;
2657       }
2658       __kmp_free_thread(hot_team->t.t_threads[f]);
2659       hot_team->t.t_threads[f] = NULL;
2660     }
2661     hot_team->t.t_nproc = new_nth;
2662 #if KMP_NESTED_HOT_TEAMS
2663     if (thread->th.th_hot_teams) {
2664       KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2665       thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2666     }
2667 #endif
2668 
2669     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2670 
2671     // Update the t_nproc field in the threads that are still active.
2672     for (f = 0; f < new_nth; f++) {
2673       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2674       hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2675     }
2676     // Special flag: the team was resized by an omp_set_num_threads() call
2677     hot_team->t.t_size_changed = -1;
2678   }
2679 }
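/* Illustrative example: after a parallel region has run with 8 threads, the
   hot team keeps 8 workers. A later call from serial code such as

     omp_set_num_threads(2);

   reaches this function and, because the root is no longer active, frees the
   six surplus hot-team threads right away instead of waiting for the next
   parallel region (subject to the nested-hot-teams conditions guarded
   above). */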
2680 
2681 /* Changes max_active_levels */
2682 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2683   kmp_info_t *thread;
2684 
2685   KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2686                 "%d = (%d)\n",
2687                 gtid, max_active_levels));
2688   KMP_DEBUG_ASSERT(__kmp_init_serial);
2689 
2690   // validate max_active_levels
2691   if (max_active_levels < 0) {
2692     KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2693     // We ignore this call if the user has specified a negative value.
2694     // The current setting won't be changed. The last valid setting will be
2695     // used. A warning will be issued (if warnings are allowed as controlled by
2696     // the KMP_WARNINGS env var).
2697     KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2698                   "max_active_levels for thread %d = (%d)\n",
2699                   gtid, max_active_levels));
2700     return;
2701   }
2702   if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
    // OK: max_active_levels is within the valid range
    // [0; KMP_MAX_ACTIVE_LEVELS_LIMIT].
    // A zero value is allowed (implementation-defined behavior).
2706   } else {
2707     KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2708                 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2709     max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
    // The current upper limit is MAX_INT (implementation-defined behavior).
    // If the input exceeds the upper limit, it is corrected to the upper
    // limit (implementation-defined behavior).
    // Note: as long as the limit stays at MAX_INT, the flow can never
    // actually reach this branch.
2714   }
2715   KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2716                 "max_active_levels for thread %d = (%d)\n",
2717                 gtid, max_active_levels));
2718 
2719   thread = __kmp_threads[gtid];
2720 
2721   __kmp_save_internal_controls(thread);
2722 
2723   set__max_active_levels(thread, max_active_levels);
2724 }
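
/* Illustrative sketch (user level, reached via the API wrappers): negative
   values are ignored with a warning and values above
   KMP_MAX_ACTIVE_LEVELS_LIMIT are clamped, matching the validation above.

     #include <omp.h>
     void example(void) {
       omp_set_max_active_levels(-3); // ignored; the previous setting is kept
       omp_set_max_active_levels(2);  // max-active-levels-var ICV becomes 2
       int lvl = omp_get_max_active_levels(); // lvl == 2
       (void)lvl;
     }
*/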
2725 
2726 /* Gets max_active_levels */
2727 int __kmp_get_max_active_levels(int gtid) {
2728   kmp_info_t *thread;
2729 
2730   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2731   KMP_DEBUG_ASSERT(__kmp_init_serial);
2732 
2733   thread = __kmp_threads[gtid];
2734   KMP_DEBUG_ASSERT(thread->th.th_current_task);
2735   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2736                 "curtask_maxaclevel=%d\n",
2737                 gtid, thread->th.th_current_task,
2738                 thread->th.th_current_task->td_icvs.max_active_levels));
2739   return thread->th.th_current_task->td_icvs.max_active_levels;
2740 }
2741 
2742 // nteams-var per-device ICV
2743 void __kmp_set_num_teams(int num_teams) {
2744   if (num_teams > 0)
2745     __kmp_nteams = num_teams;
2746 }
2747 int __kmp_get_max_teams(void) { return __kmp_nteams; }
2748 // teams-thread-limit-var per-device ICV
2749 void __kmp_set_teams_thread_limit(int limit) {
2750   if (limit > 0)
2751     __kmp_teams_thread_limit = limit;
2752 }
2753 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
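
/* Illustrative sketch: these per-device ICVs back the omp_set_num_teams() /
   omp_get_max_teams() and omp_set_teams_thread_limit() /
   omp_get_teams_thread_limit() entry points (wired up in the API wrapper
   layer, not shown). Non-positive arguments leave the stored values
   untouched, per the checks above.

     omp_set_num_teams(4);          // __kmp_nteams becomes 4
     omp_set_num_teams(0);          // ignored; __kmp_nteams stays 4
     omp_set_teams_thread_limit(8); // __kmp_teams_thread_limit becomes 8
*/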
2754 
2755 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2756 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2757 
2758 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2759 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2760   kmp_info_t *thread;
2761   kmp_sched_t orig_kind;
2762   //    kmp_team_t *team;
2763 
2764   KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2765                 gtid, (int)kind, chunk));
2766   KMP_DEBUG_ASSERT(__kmp_init_serial);
2767 
2768   // Check if the kind parameter is valid, correct if needed.
2769   // Valid parameters should fit in one of two intervals - standard or extended:
2770   //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2771   // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
2772   orig_kind = kind;
2773   kind = __kmp_sched_without_mods(kind);
2774 
2775   if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2776       (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2777     // TODO: Hint needs attention in case we change the default schedule.
2778     __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2779               KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2780               __kmp_msg_null);
2781     kind = kmp_sched_default;
2782     chunk = 0; // ignore chunk value in case of bad kind
2783   }
2784 
2785   thread = __kmp_threads[gtid];
2786 
2787   __kmp_save_internal_controls(thread);
2788 
2789   if (kind < kmp_sched_upper_std) {
2790     if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
      // differentiate static chunked vs. unchunked: the chunk should be
      // invalid to indicate an unchunked schedule (which is the default)
2793       thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2794     } else {
2795       thread->th.th_current_task->td_icvs.sched.r_sched_type =
2796           __kmp_sch_map[kind - kmp_sched_lower - 1];
2797     }
2798   } else {
2799     //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2800     //    kmp_sched_lower - 2 ];
2801     thread->th.th_current_task->td_icvs.sched.r_sched_type =
2802         __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2803                       kmp_sched_lower - 2];
2804   }
2805   __kmp_sched_apply_mods_intkind(
2806       orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2807   if (kind == kmp_sched_auto || chunk < 1) {
2808     // ignore parameter chunk for schedule auto
2809     thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2810   } else {
2811     thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2812   }
2813 }
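
/* Illustrative sketch (user level, reached via the API wrappers): chunk
   handling follows the rules above.

     #include <omp.h>
     void example(void) {
       omp_set_schedule(omp_sched_dynamic, 4); // run-sched-var = dynamic,4
       omp_set_schedule(omp_sched_static, 0);  // chunk below KMP_DEFAULT_CHUNK
                                               // selects plain (unchunked)
                                               // static
       omp_set_schedule(omp_sched_auto, 100);  // chunk is ignored for auto
     }
*/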
2814 
2815 /* Gets def_sched_var ICV values */
2816 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2817   kmp_info_t *thread;
2818   enum sched_type th_type;
2819 
2820   KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2821   KMP_DEBUG_ASSERT(__kmp_init_serial);
2822 
2823   thread = __kmp_threads[gtid];
2824 
2825   th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2826   switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2827   case kmp_sch_static:
2828   case kmp_sch_static_greedy:
2829   case kmp_sch_static_balanced:
2830     *kind = kmp_sched_static;
2831     __kmp_sched_apply_mods_stdkind(kind, th_type);
2832     *chunk = 0; // chunk was not set, try to show this fact via zero value
2833     return;
2834   case kmp_sch_static_chunked:
2835     *kind = kmp_sched_static;
2836     break;
2837   case kmp_sch_dynamic_chunked:
2838     *kind = kmp_sched_dynamic;
2839     break;
2840   case kmp_sch_guided_chunked:
2841   case kmp_sch_guided_iterative_chunked:
2842   case kmp_sch_guided_analytical_chunked:
2843     *kind = kmp_sched_guided;
2844     break;
2845   case kmp_sch_auto:
2846     *kind = kmp_sched_auto;
2847     break;
2848   case kmp_sch_trapezoidal:
2849     *kind = kmp_sched_trapezoidal;
2850     break;
2851 #if KMP_STATIC_STEAL_ENABLED
2852   case kmp_sch_static_steal:
2853     *kind = kmp_sched_static_steal;
2854     break;
2855 #endif
2856   default:
2857     KMP_FATAL(UnknownSchedulingType, th_type);
2858   }
2859 
2860   __kmp_sched_apply_mods_stdkind(kind, th_type);
2861   *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2862 }
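
/* Illustrative sketch (user level): the reverse mapping, reading back what
   __kmp_set_schedule stored.

     #include <omp.h>
     void example(void) {
       omp_sched_t kind;
       int chunk;
       omp_set_schedule(omp_sched_guided, 7);
       omp_get_schedule(&kind, &chunk); // kind == omp_sched_guided, chunk == 7
     }
*/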
2863 
2864 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2865 
2866   int ii, dd;
2867   kmp_team_t *team;
2868   kmp_info_t *thr;
2869 
2870   KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2871   KMP_DEBUG_ASSERT(__kmp_init_serial);
2872 
2873   // validate level
2874   if (level == 0)
2875     return 0;
2876   if (level < 0)
2877     return -1;
2878   thr = __kmp_threads[gtid];
2879   team = thr->th.th_team;
2880   ii = team->t.t_level;
2881   if (level > ii)
2882     return -1;
2883 
2884   if (thr->th.th_teams_microtask) {
    // AC: we are in a teams region where multiple nested teams have the same
    // level
2886     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2887     if (level <=
2888         tlevel) { // otherwise usual algorithm works (will not touch the teams)
2889       KMP_DEBUG_ASSERT(ii >= tlevel);
2890       // AC: As we need to pass by the teams league, we need to artificially
2891       // increase ii
2892       if (ii == tlevel) {
2893         ii += 2; // three teams have same level
2894       } else {
2895         ii++; // two teams have same level
2896       }
2897     }
2898   }
2899 
2900   if (ii == level)
2901     return __kmp_tid_from_gtid(gtid);
2902 
2903   dd = team->t.t_serialized;
2904   level++;
2905   while (ii > level) {
2906     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2907     }
2908     if ((team->t.t_serialized) && (!dd)) {
2909       team = team->t.t_parent;
2910       continue;
2911     }
2912     if (ii > level) {
2913       team = team->t.t_parent;
2914       dd = team->t.t_serialized;
2915       ii--;
2916     }
2917   }
2918 
2919   return (dd > 1) ? (0) : (team->t.t_master_tid);
2920 }
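
/* Illustrative sketch (user level, assuming nested parallelism is enabled):
   with two active levels the walk above recovers an ancestor's tid through
   the chain of serialized/parent teams.

     #include <omp.h>
     void example(void) {
     #pragma omp parallel num_threads(2) // level 1
     #pragma omp parallel num_threads(3) // level 2
       {
         int me = omp_get_ancestor_thread_num(2);   // == omp_get_thread_num()
         int up = omp_get_ancestor_thread_num(1);   // tid in the outer team
         int init = omp_get_ancestor_thread_num(0); // always 0
         int bad = omp_get_ancestor_thread_num(9);  // level > current -> -1
         (void)me; (void)up; (void)init; (void)bad;
       }
     }
*/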
2921 
2922 int __kmp_get_team_size(int gtid, int level) {
2923 
2924   int ii, dd;
2925   kmp_team_t *team;
2926   kmp_info_t *thr;
2927 
2928   KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2929   KMP_DEBUG_ASSERT(__kmp_init_serial);
2930 
2931   // validate level
2932   if (level == 0)
2933     return 1;
2934   if (level < 0)
2935     return -1;
2936   thr = __kmp_threads[gtid];
2937   team = thr->th.th_team;
2938   ii = team->t.t_level;
2939   if (level > ii)
2940     return -1;
2941 
2942   if (thr->th.th_teams_microtask) {
    // AC: we are in a teams region where multiple nested teams have the same
    // level
2944     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2945     if (level <=
2946         tlevel) { // otherwise usual algorithm works (will not touch the teams)
2947       KMP_DEBUG_ASSERT(ii >= tlevel);
2948       // AC: As we need to pass by the teams league, we need to artificially
2949       // increase ii
2950       if (ii == tlevel) {
2951         ii += 2; // three teams have same level
2952       } else {
2953         ii++; // two teams have same level
2954       }
2955     }
2956   }
2957 
2958   while (ii > level) {
2959     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2960     }
2961     if (team->t.t_serialized && (!dd)) {
2962       team = team->t.t_parent;
2963       continue;
2964     }
2965     if (ii > level) {
2966       team = team->t.t_parent;
2967       ii--;
2968     }
2969   }
2970 
2971   return team->t.t_nproc;
2972 }
2973 
2974 kmp_r_sched_t __kmp_get_schedule_global() {
  // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
  // independently, so the updated schedule can be obtained here.
2978 
2979   kmp_r_sched_t r_sched;
2980 
  // Create the schedule from 4 globals: __kmp_sched, __kmp_chunk,
  // __kmp_static, __kmp_guided. __kmp_sched should keep its original value, so
  // that the user can set KMP_SCHEDULE multiple times and thus have different
  // run-time schedules in different roots (even in OMP 2.5).
2985   enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
2986   enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
2987   if (s == kmp_sch_static) {
2988     // replace STATIC with more detailed schedule (balanced or greedy)
2989     r_sched.r_sched_type = __kmp_static;
2990   } else if (s == kmp_sch_guided_chunked) {
2991     // replace GUIDED with more detailed schedule (iterative or analytical)
2992     r_sched.r_sched_type = __kmp_guided;
2993   } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2994     r_sched.r_sched_type = __kmp_sched;
2995   }
2996   SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
2997 
2998   if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
    // __kmp_chunk may be wrong here (if it was never set)
3000     r_sched.chunk = KMP_DEFAULT_CHUNK;
3001   } else {
3002     r_sched.chunk = __kmp_chunk;
3003   }
3004 
3005   return r_sched;
3006 }
3007 
/* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
   at least argc *t_argv entries for the requested team. */
3010 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3011 
3012   KMP_DEBUG_ASSERT(team);
3013   if (!realloc || argc > team->t.t_max_argc) {
3014 
3015     KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3016                    "current entries=%d\n",
3017                    team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3018     /* if previously allocated heap space for args, free them */
3019     if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3020       __kmp_free((void *)team->t.t_argv);
3021 
3022     if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3023       /* use unused space in the cache line for arguments */
3024       team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3025       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3026                      "argv entries\n",
3027                      team->t.t_id, team->t.t_max_argc));
3028       team->t.t_argv = &team->t.t_inline_argv[0];
3029       if (__kmp_storage_map) {
3030         __kmp_print_storage_map_gtid(
3031             -1, &team->t.t_inline_argv[0],
3032             &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3033             (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3034             team->t.t_id);
3035       }
3036     } else {
3037       /* allocate space for arguments in the heap */
3038       team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3039                                ? KMP_MIN_MALLOC_ARGV_ENTRIES
3040                                : 2 * argc;
3041       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3042                      "argv entries\n",
3043                      team->t.t_id, team->t.t_max_argc));
3044       team->t.t_argv =
3045           (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3046       if (__kmp_storage_map) {
3047         __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3048                                      &team->t.t_argv[team->t.t_max_argc],
3049                                      sizeof(void *) * team->t.t_max_argc,
3050                                      "team_%d.t_argv", team->t.t_id);
3051       }
3052     }
3053   }
3054 }
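
/* Sizing sketch: requests with argc <= KMP_INLINE_ARGV_ENTRIES reuse the space
   in t_inline_argv; larger requests heap-allocate
   max(KMP_MIN_MALLOC_ARGV_ENTRIES, 2 * argc) entries so that modest growth on
   later forks does not reallocate. For example, if KMP_MIN_MALLOC_ARGV_ENTRIES
   were 100, argc == 70 would allocate 140 entries, and a later request with
   argc == 120 would be served from the same array (argc <= t_max_argc). */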
3055 
3056 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3057   int i;
3058   int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3059   team->t.t_threads =
3060       (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3061   team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3062       sizeof(dispatch_shared_info_t) * num_disp_buff);
3063   team->t.t_dispatch =
3064       (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3065   team->t.t_implicit_task_taskdata =
3066       (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3067   team->t.t_max_nproc = max_nth;
3068 
3069   /* setup dispatch buffers */
3070   for (i = 0; i < num_disp_buff; ++i) {
3071     team->t.t_disp_buffer[i].buffer_index = i;
3072     team->t.t_disp_buffer[i].doacross_buf_idx = i;
3073   }
3074 }
3075 
3076 static void __kmp_free_team_arrays(kmp_team_t *team) {
3077   /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3078   int i;
3079   for (i = 0; i < team->t.t_max_nproc; ++i) {
3080     if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3081       __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3082       team->t.t_dispatch[i].th_disp_buffer = NULL;
3083     }
3084   }
3085 #if KMP_USE_HIER_SCHED
3086   __kmp_dispatch_free_hierarchies(team);
3087 #endif
3088   __kmp_free(team->t.t_threads);
3089   __kmp_free(team->t.t_disp_buffer);
3090   __kmp_free(team->t.t_dispatch);
3091   __kmp_free(team->t.t_implicit_task_taskdata);
3092   team->t.t_threads = NULL;
3093   team->t.t_disp_buffer = NULL;
3094   team->t.t_dispatch = NULL;
3095   team->t.t_implicit_task_taskdata = 0;
3096 }
3097 
3098 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3099   kmp_info_t **oldThreads = team->t.t_threads;
3100 
3101   __kmp_free(team->t.t_disp_buffer);
3102   __kmp_free(team->t.t_dispatch);
3103   __kmp_free(team->t.t_implicit_task_taskdata);
3104   __kmp_allocate_team_arrays(team, max_nth);
3105 
3106   KMP_MEMCPY(team->t.t_threads, oldThreads,
3107              team->t.t_nproc * sizeof(kmp_info_t *));
3108 
3109   __kmp_free(oldThreads);
3110 }
3111 
3112 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3113 
3114   kmp_r_sched_t r_sched =
3115       __kmp_get_schedule_global(); // get current state of scheduling globals
3116 
3117   KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3118 
3119   kmp_internal_control_t g_icvs = {
3120     0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3121     (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3122     // adjustment of threads (per thread)
3123     (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3124     // whether blocktime is explicitly set
3125     __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3126 #if KMP_USE_MONITOR
3127     __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3128 // intervals
3129 #endif
3130     __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3131     // next parallel region (per thread)
3132     // (use a max ub on value if __kmp_parallel_initialize not called yet)
3133     __kmp_cg_max_nth, // int thread_limit;
3134     __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3135     // for max_active_levels
3136     r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3137     // {sched,chunk} pair
3138     __kmp_nested_proc_bind.bind_types[0],
3139     __kmp_default_device,
3140     NULL // struct kmp_internal_control *next;
3141   };
3142 
3143   return g_icvs;
3144 }
3145 
3146 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3147 
3148   kmp_internal_control_t gx_icvs;
3149   gx_icvs.serial_nesting_level =
3150       0; // probably =team->t.t_serial like in save_inter_controls
3151   copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3152   gx_icvs.next = NULL;
3153 
3154   return gx_icvs;
3155 }
3156 
3157 static void __kmp_initialize_root(kmp_root_t *root) {
3158   int f;
3159   kmp_team_t *root_team;
3160   kmp_team_t *hot_team;
3161   int hot_team_max_nth;
3162   kmp_r_sched_t r_sched =
3163       __kmp_get_schedule_global(); // get current state of scheduling globals
3164   kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3165   KMP_DEBUG_ASSERT(root);
3166   KMP_ASSERT(!root->r.r_begin);
3167 
3168   /* setup the root state structure */
3169   __kmp_init_lock(&root->r.r_begin_lock);
3170   root->r.r_begin = FALSE;
3171   root->r.r_active = FALSE;
3172   root->r.r_in_parallel = 0;
3173   root->r.r_blocktime = __kmp_dflt_blocktime;
3174 
3175   /* setup the root team for this task */
3176   /* allocate the root team structure */
3177   KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3178 
3179   root_team =
3180       __kmp_allocate_team(root,
3181                           1, // new_nproc
3182                           1, // max_nproc
3183 #if OMPT_SUPPORT
3184                           ompt_data_none, // root parallel id
3185 #endif
3186                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3187                           0 // argc
3188                           USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3189                           );
3190 #if USE_DEBUGGER
3191   // Non-NULL value should be assigned to make the debugger display the root
3192   // team.
3193   TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3194 #endif
3195 
3196   KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3197 
3198   root->r.r_root_team = root_team;
3199   root_team->t.t_control_stack_top = NULL;
3200 
3201   /* initialize root team */
3202   root_team->t.t_threads[0] = NULL;
3203   root_team->t.t_nproc = 1;
3204   root_team->t.t_serialized = 1;
3205   // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3206   root_team->t.t_sched.sched = r_sched.sched;
3207   KA_TRACE(
3208       20,
3209       ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3210        root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3211 
3212   /* setup the  hot team for this task */
3213   /* allocate the hot team structure */
3214   KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3215 
3216   hot_team =
3217       __kmp_allocate_team(root,
3218                           1, // new_nproc
3219                           __kmp_dflt_team_nth_ub * 2, // max_nproc
3220 #if OMPT_SUPPORT
3221                           ompt_data_none, // root parallel id
3222 #endif
3223                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3224                           0 // argc
3225                           USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3226                           );
3227   KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3228 
3229   root->r.r_hot_team = hot_team;
3230   root_team->t.t_control_stack_top = NULL;
3231 
3232   /* first-time initialization */
3233   hot_team->t.t_parent = root_team;
3234 
3235   /* initialize hot team */
3236   hot_team_max_nth = hot_team->t.t_max_nproc;
3237   for (f = 0; f < hot_team_max_nth; ++f) {
3238     hot_team->t.t_threads[f] = NULL;
3239   }
3240   hot_team->t.t_nproc = 1;
3241   // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3242   hot_team->t.t_sched.sched = r_sched.sched;
3243   hot_team->t.t_size_changed = 0;
3244 }
3245 
3246 #ifdef KMP_DEBUG
3247 
3248 typedef struct kmp_team_list_item {
3249   kmp_team_p const *entry;
3250   struct kmp_team_list_item *next;
3251 } kmp_team_list_item_t;
3252 typedef kmp_team_list_item_t *kmp_team_list_t;
3253 
3254 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3255     kmp_team_list_t list, // List of teams.
3256     kmp_team_p const *team // Team to add.
3257 ) {
3258 
3259   // List must terminate with item where both entry and next are NULL.
3260   // Team is added to the list only once.
3261   // List is sorted in ascending order by team id.
3262   // Team id is *not* a key.
3263 
3264   kmp_team_list_t l;
3265 
3266   KMP_DEBUG_ASSERT(list != NULL);
3267   if (team == NULL) {
3268     return;
3269   }
3270 
3271   __kmp_print_structure_team_accum(list, team->t.t_parent);
3272   __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3273 
3274   // Search list for the team.
3275   l = list;
3276   while (l->next != NULL && l->entry != team) {
3277     l = l->next;
3278   }
3279   if (l->next != NULL) {
3280     return; // Team has been added before, exit.
3281   }
3282 
3283   // Team is not found. Search list again for insertion point.
3284   l = list;
3285   while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3286     l = l->next;
3287   }
3288 
3289   // Insert team.
3290   {
3291     kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3292         sizeof(kmp_team_list_item_t));
3293     *item = *l;
3294     l->entry = team;
3295     l->next = item;
3296   }
3297 }
3298 
static void __kmp_print_structure_team(char const *title,
                                       kmp_team_p const *team) {
3302   __kmp_printf("%s", title);
3303   if (team != NULL) {
3304     __kmp_printf("%2x %p\n", team->t.t_id, team);
3305   } else {
3306     __kmp_printf(" - (nil)\n");
3307   }
3308 }
3309 
3310 static void __kmp_print_structure_thread(char const *title,
3311                                          kmp_info_p const *thread) {
3312   __kmp_printf("%s", title);
3313   if (thread != NULL) {
3314     __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3315   } else {
3316     __kmp_printf(" - (nil)\n");
3317   }
3318 }
3319 
3320 void __kmp_print_structure(void) {
3321 
3322   kmp_team_list_t list;
3323 
3324   // Initialize list of teams.
3325   list =
3326       (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3327   list->entry = NULL;
3328   list->next = NULL;
3329 
3330   __kmp_printf("\n------------------------------\nGlobal Thread "
3331                "Table\n------------------------------\n");
3332   {
3333     int gtid;
3334     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3335       __kmp_printf("%2d", gtid);
3336       if (__kmp_threads != NULL) {
3337         __kmp_printf(" %p", __kmp_threads[gtid]);
3338       }
3339       if (__kmp_root != NULL) {
3340         __kmp_printf(" %p", __kmp_root[gtid]);
3341       }
3342       __kmp_printf("\n");
3343     }
3344   }
3345 
3346   // Print out __kmp_threads array.
3347   __kmp_printf("\n------------------------------\nThreads\n--------------------"
3348                "----------\n");
3349   if (__kmp_threads != NULL) {
3350     int gtid;
3351     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3352       kmp_info_t const *thread = __kmp_threads[gtid];
3353       if (thread != NULL) {
3354         __kmp_printf("GTID %2d %p:\n", gtid, thread);
3355         __kmp_printf("    Our Root:        %p\n", thread->th.th_root);
3356         __kmp_print_structure_team("    Our Team:     ", thread->th.th_team);
3357         __kmp_print_structure_team("    Serial Team:  ",
3358                                    thread->th.th_serial_team);
3359         __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
3360         __kmp_print_structure_thread("    Primary:      ",
3361                                      thread->th.th_team_master);
3362         __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
3363         __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
3364         __kmp_printf("    Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3365         __kmp_print_structure_thread("    Next in pool: ",
3366                                      thread->th.th_next_pool);
3367         __kmp_printf("\n");
3368         __kmp_print_structure_team_accum(list, thread->th.th_team);
3369         __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3370       }
3371     }
3372   } else {
3373     __kmp_printf("Threads array is not allocated.\n");
3374   }
3375 
3376   // Print out __kmp_root array.
3377   __kmp_printf("\n------------------------------\nUbers\n----------------------"
3378                "--------\n");
3379   if (__kmp_root != NULL) {
3380     int gtid;
3381     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3382       kmp_root_t const *root = __kmp_root[gtid];
3383       if (root != NULL) {
3384         __kmp_printf("GTID %2d %p:\n", gtid, root);
3385         __kmp_print_structure_team("    Root Team:    ", root->r.r_root_team);
3386         __kmp_print_structure_team("    Hot Team:     ", root->r.r_hot_team);
3387         __kmp_print_structure_thread("    Uber Thread:  ",
3388                                      root->r.r_uber_thread);
3389         __kmp_printf("    Active?:      %2d\n", root->r.r_active);
3390         __kmp_printf("    In Parallel:  %2d\n",
3391                      KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3392         __kmp_printf("\n");
3393         __kmp_print_structure_team_accum(list, root->r.r_root_team);
3394         __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3395       }
3396     }
3397   } else {
3398     __kmp_printf("Ubers array is not allocated.\n");
3399   }
3400 
3401   __kmp_printf("\n------------------------------\nTeams\n----------------------"
3402                "--------\n");
3403   while (list->next != NULL) {
3404     kmp_team_p const *team = list->entry;
3405     int i;
3406     __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3407     __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
3408     __kmp_printf("    Primary TID:      %2d\n", team->t.t_master_tid);
3409     __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
3410     __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
3411     __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
3412     for (i = 0; i < team->t.t_nproc; ++i) {
3413       __kmp_printf("    Thread %2d:      ", i);
3414       __kmp_print_structure_thread("", team->t.t_threads[i]);
3415     }
3416     __kmp_print_structure_team("    Next in pool:     ", team->t.t_next_pool);
3417     __kmp_printf("\n");
3418     list = list->next;
3419   }
3420 
3421   // Print out __kmp_thread_pool and __kmp_team_pool.
3422   __kmp_printf("\n------------------------------\nPools\n----------------------"
3423                "--------\n");
3424   __kmp_print_structure_thread("Thread pool:          ",
3425                                CCAST(kmp_info_t *, __kmp_thread_pool));
3426   __kmp_print_structure_team("Team pool:            ",
3427                              CCAST(kmp_team_t *, __kmp_team_pool));
3428   __kmp_printf("\n");
3429 
3430   // Free team list.
3431   while (list != NULL) {
3432     kmp_team_list_item_t *item = list;
3433     list = list->next;
3434     KMP_INTERNAL_FREE(item);
3435   }
3436 }
3437 
3438 #endif
3439 
3440 //---------------------------------------------------------------------------
3441 //  Stuff for per-thread fast random number generator
3442 //  Table of primes
3443 static const unsigned __kmp_primes[] = {
3444     0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3445     0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3446     0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3447     0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3448     0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3449     0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3450     0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3451     0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3452     0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3453     0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3454     0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3455 
3456 //---------------------------------------------------------------------------
3457 //  __kmp_get_random: Get a random number using a linear congruential method.
3458 unsigned short __kmp_get_random(kmp_info_t *thread) {
3459   unsigned x = thread->th.th_x;
3460   unsigned short r = (unsigned short)(x >> 16);
3461 
3462   thread->th.th_x = x * thread->th.th_a + 1;
3463 
3464   KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3465                 thread->th.th_info.ds.ds_tid, r));
3466 
3467   return r;
3468 }
3469 //--------------------------------------------------------
3470 // __kmp_init_random: Initialize a random number generator
3471 void __kmp_init_random(kmp_info_t *thread) {
3472   unsigned seed = thread->th.th_info.ds.ds_tid;
3473 
3474   thread->th.th_a =
3475       __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3476   thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3477   KA_TRACE(30,
3478            ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3479 }
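
/* The generator above is a 32-bit linear congruential sequence
   x_{n+1} = a * x_n + 1 (mod 2^32), with the per-thread multiplier 'a' drawn
   from __kmp_primes and only the high 16 bits of x returned (the strongest
   bits of an LCG). A minimal stand-alone model of the same recurrence
   (illustrative only, not the runtime's code path):

     static unsigned x = 1;               // per-thread state, like th_x
     static unsigned a = 0x9e3779b1u;     // any entry of __kmp_primes (th_a)
     static unsigned short next_random(void) {
       unsigned short r = (unsigned short)(x >> 16); // return high 16 bits
       x = x * a + 1u;                    // advance; wraps mod 2^32
       return r;
     }
*/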
3480 
3481 #if KMP_OS_WINDOWS
3482 /* reclaim array entries for root threads that are already dead, returns number
3483  * reclaimed */
3484 static int __kmp_reclaim_dead_roots(void) {
3485   int i, r = 0;
3486 
3487   for (i = 0; i < __kmp_threads_capacity; ++i) {
3488     if (KMP_UBER_GTID(i) &&
3489         !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3490         !__kmp_root[i]
3491              ->r.r_active) { // AC: reclaim only roots died in non-active state
3492       r += __kmp_unregister_root_other_thread(i);
3493     }
3494   }
3495   return r;
3496 }
3497 #endif
3498 
3499 /* This function attempts to create free entries in __kmp_threads and
3500    __kmp_root, and returns the number of free entries generated.
3501 
3502    For Windows* OS static library, the first mechanism used is to reclaim array
3503    entries for root threads that are already dead.
3504 
   On all platforms, expansion is attempted on the arrays __kmp_threads and
3506    __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3507    capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3508    threadprivate cache array has been created. Synchronization with
3509    __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3510 
3511    After any dead root reclamation, if the clipping value allows array expansion
3512    to result in the generation of a total of nNeed free slots, the function does
3513    that expansion. If not, nothing is done beyond the possible initial root
3514    thread reclamation.
3515 
3516    If any argument is negative, the behavior is undefined. */
3517 static int __kmp_expand_threads(int nNeed) {
3518   int added = 0;
3519   int minimumRequiredCapacity;
3520   int newCapacity;
3521   kmp_info_t **newThreads;
3522   kmp_root_t **newRoot;
3523 
3524   // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3525   // resizing __kmp_threads does not need additional protection if foreign
3526   // threads are present
3527 
3528 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3529   /* only for Windows static library */
3530   /* reclaim array entries for root threads that are already dead */
3531   added = __kmp_reclaim_dead_roots();
3532 
3533   if (nNeed) {
3534     nNeed -= added;
3535     if (nNeed < 0)
3536       nNeed = 0;
3537   }
3538 #endif
3539   if (nNeed <= 0)
3540     return added;
3541 
3542   // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3543   // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3544   // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3545   // > __kmp_max_nth in one of two ways:
3546   //
3547   // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
3548   //    may not be reused by another thread, so we may need to increase
3549   //    __kmp_threads_capacity to __kmp_max_nth + 1.
3550   //
3551   // 2) New foreign root(s) are encountered.  We always register new foreign
3552   //    roots. This may cause a smaller # of threads to be allocated at
3553   //    subsequent parallel regions, but the worker threads hang around (and
3554   //    eventually go to sleep) and need slots in the __kmp_threads[] array.
3555   //
3556   // Anyway, that is the reason for moving the check to see if
3557   // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3558   // instead of having it performed here. -BB
3559 
3560   KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3561 
3562   /* compute expansion headroom to check if we can expand */
3563   if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3564     /* possible expansion too small -- give up */
3565     return added;
3566   }
3567   minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3568 
3569   newCapacity = __kmp_threads_capacity;
3570   do {
3571     newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3572                                                           : __kmp_sys_max_nth;
3573   } while (newCapacity < minimumRequiredCapacity);
3574   newThreads = (kmp_info_t **)__kmp_allocate(
3575       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3576   newRoot =
3577       (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3578   KMP_MEMCPY(newThreads, __kmp_threads,
3579              __kmp_threads_capacity * sizeof(kmp_info_t *));
3580   KMP_MEMCPY(newRoot, __kmp_root,
3581              __kmp_threads_capacity * sizeof(kmp_root_t *));
3582 
3583   kmp_info_t **temp_threads = __kmp_threads;
3584   *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3585   *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3586   __kmp_free(temp_threads);
3587   added += newCapacity - __kmp_threads_capacity;
3588   *(volatile int *)&__kmp_threads_capacity = newCapacity;
3589 
3590   if (newCapacity > __kmp_tp_capacity) {
3591     __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3592     if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3593       __kmp_threadprivate_resize_cache(newCapacity);
3594     } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3595       *(volatile int *)&__kmp_tp_capacity = newCapacity;
3596     }
3597     __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3598   }
3599 
3600   return added;
3601 }
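
/* Growth sketch: the capacity doubles until it covers the request and is
   clipped at __kmp_sys_max_nth. For example, with a current capacity of 32 and
   nNeed == 5, minimumRequiredCapacity is 37 and the loop above settles on
   newCapacity == 64; __kmp_threads and __kmp_root are then copied into one
   combined allocation and swapped into place (callers already hold
   __kmp_forkjoin_lock, per the note at the top of this function). */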
3602 
/* Register the current thread as a root thread and obtain our gtid. We must
   have the __kmp_initz_lock held at this point. The argument is TRUE only if
   we are the thread that calls from __kmp_do_serial_initialize() */
3606 int __kmp_register_root(int initial_thread) {
3607   kmp_info_t *root_thread;
3608   kmp_root_t *root;
3609   int gtid;
3610   int capacity;
3611   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3612   KA_TRACE(20, ("__kmp_register_root: entered\n"));
3613   KMP_MB();
3614 
  /* 2007-03-02:
     If the initial thread did not invoke the OpenMP RTL yet, and this thread
     is not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity"
     condition does not work as expected -- it may return false (meaning there
     is at least one empty slot in the __kmp_threads array), but it is possible
     that the only free slot is #0, which is reserved for the initial thread
     and so cannot be used for this one. The following code works around this
     bug.

     However, the right solution seems to be not reserving slot #0 for the
     initial thread because:
     (1) there is no magic in slot #0,
     (2) we cannot detect the initial thread reliably (the first thread that
        performs serial initialization may not be the real initial thread).
  */
3629   capacity = __kmp_threads_capacity;
3630   if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3631     --capacity;
3632   }
3633 
3634   // If it is not for initializing the hidden helper team, we need to take
3635   // __kmp_hidden_helper_threads_num out of the capacity because it is included
3636   // in __kmp_threads_capacity.
3637   if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
3638     capacity -= __kmp_hidden_helper_threads_num;
3639   }
3640 
3641   /* see if there are too many threads */
3642   if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3643     if (__kmp_tp_cached) {
3644       __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3645                   KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3646                   KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3647     } else {
3648       __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3649                   __kmp_msg_null);
3650     }
3651   }
3652 
3653   // When hidden helper task is enabled, __kmp_threads is organized as follows:
3654   // 0: initial thread, also a regular OpenMP thread.
3655   // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3656   // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3657   // regular OpenMP threads.
3658   if (TCR_4(__kmp_init_hidden_helper_threads)) {
3659     // Find an available thread slot for hidden helper thread. Slots for hidden
3660     // helper threads start from 1 to __kmp_hidden_helper_threads_num.
3661     for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3662                    gtid <= __kmp_hidden_helper_threads_num;
3663          gtid++)
3664       ;
3665     KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3666     KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3667                  "hidden helper thread: T#%d\n",
3668                  gtid));
3669   } else {
3670     /* find an available thread slot */
3671     // Don't reassign the zero slot since we need that to only be used by
3672     // initial thread. Slots for hidden helper threads should also be skipped.
3673     if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3674       gtid = 0;
3675     } else {
3676       for (gtid = __kmp_hidden_helper_threads_num + 1;
3677            TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3678         ;
3679     }
3680     KA_TRACE(
3681         1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3682     KMP_ASSERT(gtid < __kmp_threads_capacity);
3683   }
3684 
3685   /* update global accounting */
3686   __kmp_all_nth++;
3687   TCW_4(__kmp_nth, __kmp_nth + 1);
3688 
3689   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3690   // numbers of procs, and method #2 (keyed API call) for higher numbers.
3691   if (__kmp_adjust_gtid_mode) {
3692     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3693       if (TCR_4(__kmp_gtid_mode) != 2) {
3694         TCW_4(__kmp_gtid_mode, 2);
3695       }
3696     } else {
3697       if (TCR_4(__kmp_gtid_mode) != 1) {
3698         TCW_4(__kmp_gtid_mode, 1);
3699       }
3700     }
3701   }
3702 
3703 #ifdef KMP_ADJUST_BLOCKTIME
3704   /* Adjust blocktime to zero if necessary            */
3705   /* Middle initialization might not have occurred yet */
3706   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3707     if (__kmp_nth > __kmp_avail_proc) {
3708       __kmp_zero_bt = TRUE;
3709     }
3710   }
3711 #endif /* KMP_ADJUST_BLOCKTIME */
3712 
3713   /* setup this new hierarchy */
3714   if (!(root = __kmp_root[gtid])) {
3715     root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3716     KMP_DEBUG_ASSERT(!root->r.r_root_team);
3717   }
3718 
3719 #if KMP_STATS_ENABLED
3720   // Initialize stats as soon as possible (right after gtid assignment).
3721   __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3722   __kmp_stats_thread_ptr->startLife();
3723   KMP_SET_THREAD_STATE(SERIAL_REGION);
3724   KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3725 #endif
3726   __kmp_initialize_root(root);
3727 
3728   /* setup new root thread structure */
3729   if (root->r.r_uber_thread) {
3730     root_thread = root->r.r_uber_thread;
3731   } else {
3732     root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3733     if (__kmp_storage_map) {
3734       __kmp_print_thread_storage_map(root_thread, gtid);
3735     }
3736     root_thread->th.th_info.ds.ds_gtid = gtid;
3737 #if OMPT_SUPPORT
3738     root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3739 #endif
3740     root_thread->th.th_root = root;
3741     if (__kmp_env_consistency_check) {
3742       root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3743     }
3744 #if USE_FAST_MEMORY
3745     __kmp_initialize_fast_memory(root_thread);
3746 #endif /* USE_FAST_MEMORY */
3747 
3748 #if KMP_USE_BGET
3749     KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3750     __kmp_initialize_bget(root_thread);
3751 #endif
3752     __kmp_init_random(root_thread); // Initialize random number generator
3753   }
3754 
3755   /* setup the serial team held in reserve by the root thread */
3756   if (!root_thread->th.th_serial_team) {
3757     kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3758     KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3759     root_thread->th.th_serial_team = __kmp_allocate_team(
3760         root, 1, 1,
3761 #if OMPT_SUPPORT
3762         ompt_data_none, // root parallel id
3763 #endif
3764         proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3765   }
3766   KMP_ASSERT(root_thread->th.th_serial_team);
3767   KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3768                 root_thread->th.th_serial_team));
3769 
3770   /* drop root_thread into place */
3771   TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3772 
3773   root->r.r_root_team->t.t_threads[0] = root_thread;
3774   root->r.r_hot_team->t.t_threads[0] = root_thread;
3775   root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
  // AC: the team is created in reserve, not for execution (it is unused for
  // now).
3777   root_thread->th.th_serial_team->t.t_serialized = 0;
3778   root->r.r_uber_thread = root_thread;
3779 
3780   /* initialize the thread, get it ready to go */
3781   __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3782   TCW_4(__kmp_init_gtid, TRUE);
3783 
3784   /* prepare the primary thread for get_gtid() */
3785   __kmp_gtid_set_specific(gtid);
3786 
3787 #if USE_ITT_BUILD
3788   __kmp_itt_thread_name(gtid);
3789 #endif /* USE_ITT_BUILD */
3790 
3791 #ifdef KMP_TDATA_GTID
3792   __kmp_gtid = gtid;
3793 #endif
3794   __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3795   KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3796 
3797   KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3798                 "plain=%u\n",
3799                 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3800                 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3801                 KMP_INIT_BARRIER_STATE));
3802   { // Initialize barrier data.
3803     int b;
3804     for (b = 0; b < bs_last_barrier; ++b) {
3805       root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3806 #if USE_DEBUGGER
3807       root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3808 #endif
3809     }
3810   }
3811   KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3812                    KMP_INIT_BARRIER_STATE);
3813 
3814 #if KMP_AFFINITY_SUPPORTED
3815   root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3816   root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3817   root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3818   root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3819   if (TCR_4(__kmp_init_middle)) {
3820     __kmp_affinity_set_init_mask(gtid, TRUE);
3821   }
3822 #endif /* KMP_AFFINITY_SUPPORTED */
3823   root_thread->th.th_def_allocator = __kmp_def_allocator;
3824   root_thread->th.th_prev_level = 0;
3825   root_thread->th.th_prev_num_threads = 1;
3826 
3827   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3828   tmp->cg_root = root_thread;
3829   tmp->cg_thread_limit = __kmp_cg_max_nth;
3830   tmp->cg_nthreads = 1;
3831   KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3832                  " cg_nthreads init to 1\n",
3833                  root_thread, tmp));
3834   tmp->up = NULL;
3835   root_thread->th.th_cg_roots = tmp;
3836 
3837   __kmp_root_counter++;
3838 
3839 #if OMPT_SUPPORT
3840   if (!initial_thread && ompt_enabled.enabled) {
3841 
3842     kmp_info_t *root_thread = ompt_get_thread();
3843 
3844     ompt_set_thread_state(root_thread, ompt_state_overhead);
3845 
3846     if (ompt_enabled.ompt_callback_thread_begin) {
3847       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3848           ompt_thread_initial, __ompt_get_thread_data_internal());
3849     }
3850     ompt_data_t *task_data;
3851     ompt_data_t *parallel_data;
3852     __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
3853                                   NULL);
3854     if (ompt_enabled.ompt_callback_implicit_task) {
3855       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3856           ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3857     }
3858 
3859     ompt_set_thread_state(root_thread, ompt_state_work_serial);
3860   }
3861 #endif
3862 #if OMPD_SUPPORT
3863   if (ompd_state & OMPD_ENABLE_BP)
3864     ompd_bp_thread_begin();
3865 #endif
3866 
3867   KMP_MB();
3868   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3869 
3870   return gtid;
3871 }
3872 
3873 #if KMP_NESTED_HOT_TEAMS
3874 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3875                                 const int max_level) {
3876   int i, n, nth;
3877   kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3878   if (!hot_teams || !hot_teams[level].hot_team) {
3879     return 0;
3880   }
3881   KMP_DEBUG_ASSERT(level < max_level);
3882   kmp_team_t *team = hot_teams[level].hot_team;
3883   nth = hot_teams[level].hot_team_nth;
3884   n = nth - 1; // primary thread is not freed
3885   if (level < max_level - 1) {
3886     for (i = 0; i < nth; ++i) {
3887       kmp_info_t *th = team->t.t_threads[i];
3888       n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3889       if (i > 0 && th->th.th_hot_teams) {
3890         __kmp_free(th->th.th_hot_teams);
3891         th->th.th_hot_teams = NULL;
3892       }
3893     }
3894   }
3895   __kmp_free_team(root, team, NULL);
3896   return n;
3897 }
3898 #endif
3899 
// Resets a root thread and clears its root and hot teams.
3901 // Returns the number of __kmp_threads entries directly and indirectly freed.
3902 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3903   kmp_team_t *root_team = root->r.r_root_team;
3904   kmp_team_t *hot_team = root->r.r_hot_team;
3905   int n = hot_team->t.t_nproc;
3906   int i;
3907 
3908   KMP_DEBUG_ASSERT(!root->r.r_active);
3909 
3910   root->r.r_root_team = NULL;
3911   root->r.r_hot_team = NULL;
3912   // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3913   // before call to __kmp_free_team().
3914   __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3915 #if KMP_NESTED_HOT_TEAMS
3916   if (__kmp_hot_teams_max_level >
3917       0) { // need to free nested hot teams and their threads if any
3918     for (i = 0; i < hot_team->t.t_nproc; ++i) {
3919       kmp_info_t *th = hot_team->t.t_threads[i];
3920       if (__kmp_hot_teams_max_level > 1) {
3921         n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3922       }
3923       if (th->th.th_hot_teams) {
3924         __kmp_free(th->th.th_hot_teams);
3925         th->th.th_hot_teams = NULL;
3926       }
3927     }
3928   }
3929 #endif
3930   __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3931 
3932   // Before we can reap the thread, we need to make certain that all other
3933   // threads in the teams that had this root as ancestor have stopped trying to
3934   // steal tasks.
3935   if (__kmp_tasking_mode != tskm_immediate_exec) {
3936     __kmp_wait_to_unref_task_teams();
3937   }
3938 
3939 #if KMP_OS_WINDOWS
3940   /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3941   KA_TRACE(
3942       10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3943            "\n",
3944            (LPVOID) & (root->r.r_uber_thread->th),
3945            root->r.r_uber_thread->th.th_info.ds.ds_thread));
3946   __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3947 #endif /* KMP_OS_WINDOWS */
3948 
3949 #if OMPD_SUPPORT
3950   if (ompd_state & OMPD_ENABLE_BP)
3951     ompd_bp_thread_end();
3952 #endif
3953 
3954 #if OMPT_SUPPORT
3955   ompt_data_t *task_data;
3956   ompt_data_t *parallel_data;
3957   __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
3958                                 NULL);
3959   if (ompt_enabled.ompt_callback_implicit_task) {
3960     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3961         ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
3962   }
3963   if (ompt_enabled.ompt_callback_thread_end) {
3964     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3965         &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3966   }
3967 #endif
3968 
3969   TCW_4(__kmp_nth,
3970         __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3971   i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
3972   KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
3973                  " to %d\n",
3974                  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
3975                  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
3976   if (i == 1) {
3977     // need to free contention group structure
3978     KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
3979                      root->r.r_uber_thread->th.th_cg_roots->cg_root);
3980     KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
3981     __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
3982     root->r.r_uber_thread->th.th_cg_roots = NULL;
3983   }
3984   __kmp_reap_thread(root->r.r_uber_thread, 1);
3985 
  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
  // it instead of freeing it.
3988   root->r.r_uber_thread = NULL;
3989   /* mark root as no longer in use */
3990   root->r.r_begin = FALSE;
3991 
3992   return n;
3993 }
3994 
3995 void __kmp_unregister_root_current_thread(int gtid) {
3996   KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
3997   /* this lock should be ok, since unregister_root_current_thread is never
3998      called during an abort, only during a normal close. furthermore, if you
3999      have the forkjoin lock, you should never try to get the initz lock */
4000   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
4001   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
4002     KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
4003                   "exiting T#%d\n",
4004                   gtid));
4005     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4006     return;
4007   }
4008   kmp_root_t *root = __kmp_root[gtid];
4009 
4010   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4011   KMP_ASSERT(KMP_UBER_GTID(gtid));
4012   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4013   KMP_ASSERT(root->r.r_active == FALSE);
4014 
4015   KMP_MB();
4016 
4017   kmp_info_t *thread = __kmp_threads[gtid];
4018   kmp_team_t *team = thread->th.th_team;
4019   kmp_task_team_t *task_team = thread->th.th_task_team;
4020 
4021   // we need to wait for the proxy tasks before finishing the thread
4022   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
4023 #if OMPT_SUPPORT
4024     // the runtime is shutting down so we won't report any events
4025     thread->th.ompt_thread_info.state = ompt_state_undefined;
4026 #endif
4027     __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4028   }
4029 
4030   __kmp_reset_root(gtid, root);
4031 
4032   KMP_MB();
4033   KC_TRACE(10,
4034            ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4035 
4036   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4037 }
4038 
4039 #if KMP_OS_WINDOWS
4040 /* __kmp_forkjoin_lock must be already held
4041    Unregisters a root thread that is not the current thread.  Returns the number
4042    of __kmp_threads entries freed as a result. */
4043 static int __kmp_unregister_root_other_thread(int gtid) {
4044   kmp_root_t *root = __kmp_root[gtid];
4045   int r;
4046 
4047   KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4048   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4049   KMP_ASSERT(KMP_UBER_GTID(gtid));
4050   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4051   KMP_ASSERT(root->r.r_active == FALSE);
4052 
4053   r = __kmp_reset_root(gtid, root);
4054   KC_TRACE(10,
4055            ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4056   return r;
4057 }
4058 #endif
4059 
4060 #if KMP_DEBUG
4061 void __kmp_task_info() {
4062 
4063   kmp_int32 gtid = __kmp_entry_gtid();
4064   kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4065   kmp_info_t *this_thr = __kmp_threads[gtid];
4066   kmp_team_t *steam = this_thr->th.th_serial_team;
4067   kmp_team_t *team = this_thr->th.th_team;
4068 
4069   __kmp_printf(
4070       "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4071       "ptask=%p\n",
4072       gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4073       team->t.t_implicit_task_taskdata[tid].td_parent);
4074 }
4075 #endif // KMP_DEBUG
4076 
4077 /* TODO optimize with one big memclr, take out what isn't needed, split
4078    responsibility to workers as much as possible, and delay initialization of
4079    features as much as possible  */
4080 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4081                                   int tid, int gtid) {
4082   /* this_thr->th.th_info.ds.ds_gtid is setup in
4083      kmp_allocate_thread/create_worker.
4084      this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4085   KMP_DEBUG_ASSERT(this_thr != NULL);
4086   KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4087   KMP_DEBUG_ASSERT(team);
4088   KMP_DEBUG_ASSERT(team->t.t_threads);
4089   KMP_DEBUG_ASSERT(team->t.t_dispatch);
4090   kmp_info_t *master = team->t.t_threads[0];
4091   KMP_DEBUG_ASSERT(master);
4092   KMP_DEBUG_ASSERT(master->th.th_root);
4093 
4094   KMP_MB();
4095 
4096   TCW_SYNC_PTR(this_thr->th.th_team, team);
4097 
4098   this_thr->th.th_info.ds.ds_tid = tid;
4099   this_thr->th.th_set_nproc = 0;
4100   if (__kmp_tasking_mode != tskm_immediate_exec)
4101     // When tasking is possible, threads are not safe to reap until they are
4102     // done tasking; this will be set when tasking code is exited in wait
4103     this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4104   else // no tasking --> always safe to reap
4105     this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4106   this_thr->th.th_set_proc_bind = proc_bind_default;
4107 #if KMP_AFFINITY_SUPPORTED
4108   this_thr->th.th_new_place = this_thr->th.th_current_place;
4109 #endif
4110   this_thr->th.th_root = master->th.th_root;
4111 
4112   /* setup the thread's cache of the team structure */
4113   this_thr->th.th_team_nproc = team->t.t_nproc;
4114   this_thr->th.th_team_master = master;
4115   this_thr->th.th_team_serialized = team->t.t_serialized;
4116   TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4117 
4118   KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4119 
4120   KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4121                 tid, gtid, this_thr, this_thr->th.th_current_task));
4122 
4123   __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4124                            team, tid, TRUE);
4125 
4126   KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4127                 tid, gtid, this_thr, this_thr->th.th_current_task));
4128   // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4129   // __kmp_initialize_team()?
4130 
4131   /* TODO no worksharing in speculative threads */
4132   this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4133 
4134   this_thr->th.th_local.this_construct = 0;
4135 
4136   if (!this_thr->th.th_pri_common) {
4137     this_thr->th.th_pri_common =
4138         (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4139     if (__kmp_storage_map) {
4140       __kmp_print_storage_map_gtid(
4141           gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4142           sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4143     }
4144     this_thr->th.th_pri_head = NULL;
4145   }
4146 
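  // Note on contention-group (CG) bookkeeping (a sketch of the logic below):
  // each CG root node carries a reference count (cg_nthreads) and the
  // thread-limit ICV for its group (cg_thread_limit). A worker joining this
  // team adopts the primary thread's CG root, releases its old one (freeing
  // it when the count drops to zero), and refreshes its thread_limit ICV.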
4147   if (this_thr != master && // Primary thread's CG root is initialized elsewhere
4148       this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4149     // Make new thread's CG root same as primary thread's
4150     KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4151     kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4152     if (tmp) {
4153       // worker changes CG, need to check if old CG should be freed
4154       int i = tmp->cg_nthreads--;
4155       KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4156                      " on node %p of thread %p to %d\n",
4157                      this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4158       if (i == 1) {
4159         __kmp_free(tmp); // last thread left CG --> free it
4160       }
4161     }
4162     this_thr->th.th_cg_roots = master->th.th_cg_roots;
4163     // Increment new thread's CG root's counter to add the new thread
4164     this_thr->th.th_cg_roots->cg_nthreads++;
4165     KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4166                    " node %p of thread %p to %d\n",
4167                    this_thr, this_thr->th.th_cg_roots,
4168                    this_thr->th.th_cg_roots->cg_root,
4169                    this_thr->th.th_cg_roots->cg_nthreads));
4170     this_thr->th.th_current_task->td_icvs.thread_limit =
4171         this_thr->th.th_cg_roots->cg_thread_limit;
4172   }
4173 
4174   /* Initialize dynamic dispatch */
4175   {
4176     volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4177     // Use team max_nproc since this will never change for the team.
4178     size_t disp_size =
4179         sizeof(dispatch_private_info_t) *
4180         (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
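    // A team that can never have more than one thread needs only a single
    // dispatch buffer; otherwise __kmp_dispatch_num_buffers buffers are kept
    // so that consecutive dynamically scheduled (nowait) loops can be in
    // flight without stepping on each other's bookkeeping.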
4181     KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4182                   team->t.t_max_nproc));
4183     KMP_ASSERT(dispatch);
4184     KMP_DEBUG_ASSERT(team->t.t_dispatch);
4185     KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4186 
4187     dispatch->th_disp_index = 0;
4188     dispatch->th_doacross_buf_idx = 0;
4189     if (!dispatch->th_disp_buffer) {
4190       dispatch->th_disp_buffer =
4191           (dispatch_private_info_t *)__kmp_allocate(disp_size);
4192 
4193       if (__kmp_storage_map) {
4194         __kmp_print_storage_map_gtid(
4195             gtid, &dispatch->th_disp_buffer[0],
4196             &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4197                                           ? 1
4198                                           : __kmp_dispatch_num_buffers],
4199             disp_size,
4200             "th_%d.th_dispatch.th_disp_buffer "
4201             "(team_%d.t_dispatch[%d].th_disp_buffer)",
4202             gtid, team->t.t_id, gtid);
4203       }
4204     } else {
4205       memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4206     }
4207 
4208     dispatch->th_dispatch_pr_current = 0;
4209     dispatch->th_dispatch_sh_current = 0;
4210 
4211     dispatch->th_deo_fcn = 0; /* ORDERED     */
4212     dispatch->th_dxo_fcn = 0; /* END ORDERED */
4213   }
4214 
4215   this_thr->th.th_next_pool = NULL;
4216 
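  // The task-state memo stack saves th_task_state across nested (hot-team)
  // parallel regions. It starts at a small fixed size (4 entries) and may be
  // grown later if deeper nesting requires it.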
4217   if (!this_thr->th.th_task_state_memo_stack) {
4218     size_t i;
4219     this_thr->th.th_task_state_memo_stack =
4220         (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4221     this_thr->th.th_task_state_top = 0;
4222     this_thr->th.th_task_state_stack_sz = 4;
4223     for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4224          ++i) // zero init the stack
4225       this_thr->th.th_task_state_memo_stack[i] = 0;
4226   }
4227 
4228   KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4229   KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4230 
4231   KMP_MB();
4232 }
4233 
4234 /* Allocate a new thread for the requesting team. This is only called from
4235    within a forkjoin critical section. We first try to get an available
4236    thread from the thread pool; if none is available, we fork a new one,
4237    assuming we are able to create one. This should be assured, as the
4238    caller is expected to check capacity first. */
4239 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4240                                   int new_tid) {
4241   kmp_team_t *serial_team;
4242   kmp_info_t *new_thr;
4243   int new_gtid;
4244 
4245   KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4246   KMP_DEBUG_ASSERT(root && team);
4247 #if !KMP_NESTED_HOT_TEAMS
4248   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4249 #endif
4250   KMP_MB();
4251 
4252   /* first, try to get one from the thread pool */
4253   if (__kmp_thread_pool) {
4254     new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4255     __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4256     if (new_thr == __kmp_thread_pool_insert_pt) {
4257       __kmp_thread_pool_insert_pt = NULL;
4258     }
4259     TCW_4(new_thr->th.th_in_pool, FALSE);
4260     __kmp_suspend_initialize_thread(new_thr);
4261     __kmp_lock_suspend_mx(new_thr);
4262     if (new_thr->th.th_active_in_pool == TRUE) {
4263       KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4264       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4265       new_thr->th.th_active_in_pool = FALSE;
4266     }
4267     __kmp_unlock_suspend_mx(new_thr);
4268 
4269     KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4270                   __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4271     KMP_ASSERT(!new_thr->th.th_team);
4272     KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4273 
4274     /* setup the thread structure */
4275     __kmp_initialize_info(new_thr, team, new_tid,
4276                           new_thr->th.th_info.ds.ds_gtid);
4277     KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4278 
4279     TCW_4(__kmp_nth, __kmp_nth + 1);
4280 
4281     new_thr->th.th_task_state = 0;
4282     new_thr->th.th_task_state_top = 0;
4283     new_thr->th.th_task_state_stack_sz = 4;
4284 
4285 #ifdef KMP_ADJUST_BLOCKTIME
4286     /* Adjust blocktime back to zero if necessary */
4287     /* Middle initialization might not have occurred yet */
4288     if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4289       if (__kmp_nth > __kmp_avail_proc) {
4290         __kmp_zero_bt = TRUE;
4291       }
4292     }
4293 #endif /* KMP_ADJUST_BLOCKTIME */
4294 
4295 #if KMP_DEBUG
4296     // If thread entered pool via __kmp_free_thread, wait_flag should !=
4297     // KMP_BARRIER_PARENT_FLAG.
4298     int b;
4299     kmp_balign_t *balign = new_thr->th.th_bar;
4300     for (b = 0; b < bs_last_barrier; ++b)
4301       KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4302 #endif
4303 
4304     KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4305                   __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4306 
4307     KMP_MB();
4308     return new_thr;
4309   }
4310 
4311   /* no thread available in the pool, so we'll fork a new one */
4312   KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4313   KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4314 
4315 #if KMP_USE_MONITOR
4316   // If this is the first worker thread the RTL is creating, then also
4317   // launch the monitor thread.  We try to do this as early as possible.
4318   if (!TCR_4(__kmp_init_monitor)) {
4319     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4320     if (!TCR_4(__kmp_init_monitor)) {
4321       KF_TRACE(10, ("before __kmp_create_monitor\n"));
4322       TCW_4(__kmp_init_monitor, 1);
4323       __kmp_create_monitor(&__kmp_monitor);
4324       KF_TRACE(10, ("after __kmp_create_monitor\n"));
4325 #if KMP_OS_WINDOWS
4326       // AC: wait until monitor has started. This is a fix for CQ232808.
4327       // The reason is that if the library is loaded/unloaded in a loop with
4328       // small (parallel) work in between, then there is high probability that
4329       // monitor thread started after the library shutdown. At shutdown it is
4330       // too late to cope with the problem, because when the primary thread is
4331       // in DllMain (process detach) the monitor has no chances to start (it is
4332       // blocked), and primary thread has no means to inform the monitor that
4333       // the library has gone, because all the memory which the monitor can
4334       // access is going to be released/reset.
4335       while (TCR_4(__kmp_init_monitor) < 2) {
4336         KMP_YIELD(TRUE);
4337       }
4338       KF_TRACE(10, ("after monitor thread has started\n"));
4339 #endif
4340     }
4341     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4342   }
4343 #endif
4344 
4345   KMP_MB();
4346 
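  /* Pick a gtid for the new thread. Hidden helper threads occupy the low
     gtids (1 .. __kmp_hidden_helper_threads_num) right after the initial
     thread, so while the hidden helpers are being initialized the search
     starts at gtid 1; afterwards, regular workers get gtids above that
     reserved range. */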
4347   {
4348     int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4349                              ? 1
4350                              : __kmp_hidden_helper_threads_num + 1;
4351 
4352     for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4353          ++new_gtid) {
4354       KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4355     }
4356 
4357     if (TCR_4(__kmp_init_hidden_helper_threads)) {
4358       KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4359     }
4360   }
4361 
4362   /* allocate space for it. */
4363   new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4364 
4365   TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4366 
4367 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4368   // suppress race condition detection on synchronization flags in debug mode;
4369   // this helps to analyze library internals by eliminating false positives
4370   __itt_suppress_mark_range(
4371       __itt_suppress_range, __itt_suppress_threading_errors,
4372       &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4373   __itt_suppress_mark_range(
4374       __itt_suppress_range, __itt_suppress_threading_errors,
4375       &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4376 #if KMP_OS_WINDOWS
4377   __itt_suppress_mark_range(
4378       __itt_suppress_range, __itt_suppress_threading_errors,
4379       &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4380 #else
4381   __itt_suppress_mark_range(__itt_suppress_range,
4382                             __itt_suppress_threading_errors,
4383                             &new_thr->th.th_suspend_init_count,
4384                             sizeof(new_thr->th.th_suspend_init_count));
4385 #endif
4386   // TODO: check if we need to also suppress b_arrived flags
4387   __itt_suppress_mark_range(__itt_suppress_range,
4388                             __itt_suppress_threading_errors,
4389                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4390                             sizeof(new_thr->th.th_bar[0].bb.b_go));
4391   __itt_suppress_mark_range(__itt_suppress_range,
4392                             __itt_suppress_threading_errors,
4393                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4394                             sizeof(new_thr->th.th_bar[1].bb.b_go));
4395   __itt_suppress_mark_range(__itt_suppress_range,
4396                             __itt_suppress_threading_errors,
4397                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4398                             sizeof(new_thr->th.th_bar[2].bb.b_go));
4399 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4400   if (__kmp_storage_map) {
4401     __kmp_print_thread_storage_map(new_thr, new_gtid);
4402   }
4403 
4404   // add the reserve serialized team, initialized from the team's primary thread
4405   {
4406     kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4407     KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4408     new_thr->th.th_serial_team = serial_team =
4409         (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4410 #if OMPT_SUPPORT
4411                                           ompt_data_none, // root parallel id
4412 #endif
4413                                           proc_bind_default, &r_icvs,
4414                                           0 USE_NESTED_HOT_ARG(NULL));
4415   }
4416   KMP_ASSERT(serial_team);
4417   serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not
4418   // for execution (it is unused for now).
4419   serial_team->t.t_threads[0] = new_thr;
4420   KF_TRACE(10,
4421            ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4422             new_thr));
4423 
4424   /* setup the thread structures */
4425   __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4426 
4427 #if USE_FAST_MEMORY
4428   __kmp_initialize_fast_memory(new_thr);
4429 #endif /* USE_FAST_MEMORY */
4430 
4431 #if KMP_USE_BGET
4432   KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4433   __kmp_initialize_bget(new_thr);
4434 #endif
4435 
4436   __kmp_init_random(new_thr); // Initialize random number generator
4437 
4438   /* Initialize these only once when thread is grabbed for a team allocation */
4439   KA_TRACE(20,
4440            ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4441             __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4442 
4443   int b;
4444   kmp_balign_t *balign = new_thr->th.th_bar;
4445   for (b = 0; b < bs_last_barrier; ++b) {
4446     balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4447     balign[b].bb.team = NULL;
4448     balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4449     balign[b].bb.use_oncore_barrier = 0;
4450   }
4451 
4452   new_thr->th.th_spin_here = FALSE;
4453   new_thr->th.th_next_waiting = 0;
4454 #if KMP_OS_UNIX
4455   new_thr->th.th_blocking = false;
4456 #endif
4457 
4458 #if KMP_AFFINITY_SUPPORTED
4459   new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4460   new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4461   new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4462   new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4463 #endif
4464   new_thr->th.th_def_allocator = __kmp_def_allocator;
4465   new_thr->th.th_prev_level = 0;
4466   new_thr->th.th_prev_num_threads = 1;
4467 
4468   TCW_4(new_thr->th.th_in_pool, FALSE);
4469   new_thr->th.th_active_in_pool = FALSE;
4470   TCW_4(new_thr->th.th_active, TRUE);
4471 
4472   /* adjust the global counters */
4473   __kmp_all_nth++;
4474   __kmp_nth++;
4475 
4476   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4477   // numbers of procs, and method #2 (keyed API call) for higher numbers.
4478   if (__kmp_adjust_gtid_mode) {
4479     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4480       if (TCR_4(__kmp_gtid_mode) != 2) {
4481         TCW_4(__kmp_gtid_mode, 2);
4482       }
4483     } else {
4484       if (TCR_4(__kmp_gtid_mode) != 1) {
4485         TCW_4(__kmp_gtid_mode, 1);
4486       }
4487     }
4488   }
4489 
4490 #ifdef KMP_ADJUST_BLOCKTIME
4491   /* Adjust blocktime back to zero if necessary       */
4492   /* Middle initialization might not have occurred yet */
4493   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4494     if (__kmp_nth > __kmp_avail_proc) {
4495       __kmp_zero_bt = TRUE;
4496     }
4497   }
4498 #endif /* KMP_ADJUST_BLOCKTIME */
4499 
4500   /* actually fork it and create the new worker thread */
4501   KF_TRACE(
4502       10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4503   __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4504   KF_TRACE(10,
4505            ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4506 
4507   KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4508                 new_gtid));
4509   KMP_MB();
4510   return new_thr;
4511 }
4512 
4513 /* Reinitialize team for reuse.
4514    The hot team code calls this routine at every fork barrier, so the EPCC
4515    barrier tests are extremely sensitive to changes in it, esp. writes to the
4516    team struct, which cause a cache invalidation in all threads.
4517    IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4518 static void __kmp_reinitialize_team(kmp_team_t *team,
4519                                     kmp_internal_control_t *new_icvs,
4520                                     ident_t *loc) {
4521   KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4522                 team->t.t_threads[0], team));
4523   KMP_DEBUG_ASSERT(team && new_icvs);
4524   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4525   KMP_CHECK_UPDATE(team->t.t_ident, loc);
4526 
4527   KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4528   // Copy ICVs to the primary thread's implicit taskdata
4529   __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4530   copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4531 
4532   KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4533                 team->t.t_threads[0], team));
4534 }
4535 
4536 /* Initialize the team data structure.
4537    This assumes the t_threads and t_max_nproc are already set.
4538    Also, we don't touch the arguments */
4539 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4540                                   kmp_internal_control_t *new_icvs,
4541                                   ident_t *loc) {
4542   KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4543 
4544   /* verify */
4545   KMP_DEBUG_ASSERT(team);
4546   KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4547   KMP_DEBUG_ASSERT(team->t.t_threads);
4548   KMP_MB();
4549 
4550   team->t.t_master_tid = 0; /* not needed */
4551   /* team->t.t_master_bar;        not needed */
4552   team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4553   team->t.t_nproc = new_nproc;
4554 
4555   /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
4556   team->t.t_next_pool = NULL;
4557   /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4558    * up hot team */
4559 
4560   TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4561   team->t.t_invoke = NULL; /* not needed */
4562 
4563   // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
4564   team->t.t_sched.sched = new_icvs->sched.sched;
4565 
4566 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4567   team->t.t_fp_control_saved = FALSE; /* not needed */
4568   team->t.t_x87_fpu_control_word = 0; /* not needed */
4569   team->t.t_mxcsr = 0; /* not needed */
4570 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4571 
4572   team->t.t_construct = 0;
4573 
4574   team->t.t_ordered.dt.t_value = 0;
4575   team->t.t_master_active = FALSE;
4576 
4577 #ifdef KMP_DEBUG
4578   team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4579 #endif
4580 #if KMP_OS_WINDOWS
4581   team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4582 #endif
4583 
4584   team->t.t_control_stack_top = NULL;
4585 
4586   __kmp_reinitialize_team(team, new_icvs, loc);
4587 
4588   KMP_MB();
4589   KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4590 }
4591 
4592 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
4593 /* Sets full mask for thread and returns old mask, no changes to structures. */
4594 static void
4595 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4596   if (KMP_AFFINITY_CAPABLE()) {
4597     int status;
4598     if (old_mask != NULL) {
4599       status = __kmp_get_system_affinity(old_mask, TRUE);
4600       int error = errno;
4601       if (status != 0) {
4602         __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4603                     __kmp_msg_null);
4604       }
4605     }
4606     __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4607   }
4608 }
4609 #endif
4610 
4611 #if KMP_AFFINITY_SUPPORTED
4612 
4613 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4614 // It calculates the worker + primary thread's partition based upon the parent
4615 // thread's partition, and binds each worker to a thread in their partition.
4616 // The primary thread's partition should already include its current binding.
4617 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4618   // Do not partition places for the hidden helper team
4619   if (KMP_HIDDEN_HELPER_TEAM(team))
4620     return;
4621   // Copy the primary thread's place partition to the team struct
4622   kmp_info_t *master_th = team->t.t_threads[0];
4623   KMP_DEBUG_ASSERT(master_th != NULL);
4624   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4625   int first_place = master_th->th.th_first_place;
4626   int last_place = master_th->th.th_last_place;
4627   int masters_place = master_th->th.th_current_place;
4628   team->t.t_first_place = first_place;
4629   team->t.t_last_place = last_place;
4630 
4631   KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4632                 "bound to place %d partition = [%d,%d]\n",
4633                 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4634                 team->t.t_id, masters_place, first_place, last_place));
4635 
4636   switch (proc_bind) {
4637 
4638   case proc_bind_default:
4639     // Serial teams might have the proc_bind policy set to proc_bind_default.
4640     // Not an issue -- we don't rebind primary thread for any proc_bind policy.
4641     KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4642     break;
4643 
4644   case proc_bind_primary: {
4645     int f;
4646     int n_th = team->t.t_nproc;
4647     for (f = 1; f < n_th; f++) {
4648       kmp_info_t *th = team->t.t_threads[f];
4649       KMP_DEBUG_ASSERT(th != NULL);
4650       th->th.th_first_place = first_place;
4651       th->th.th_last_place = last_place;
4652       th->th.th_new_place = masters_place;
4653       if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4654           team->t.t_display_affinity != 1) {
4655         team->t.t_display_affinity = 1;
4656       }
4657 
4658       KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
4659                      "partition = [%d,%d]\n",
4660                      __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4661                      f, masters_place, first_place, last_place));
4662     }
4663   } break;
4664 
4665   case proc_bind_close: {
4666     int f;
4667     int n_th = team->t.t_nproc;
4668     int n_places;
4669     if (first_place <= last_place) {
4670       n_places = last_place - first_place + 1;
4671     } else {
4672       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4673     }
4674     if (n_th <= n_places) {
4675       int place = masters_place;
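      // e.g. with partition [0,7], the primary thread on place 2 and n_th=4,
      // the three workers land on places 3, 4 and 5 (wrapping around to
      // first_place once last_place is passed).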
4676       for (f = 1; f < n_th; f++) {
4677         kmp_info_t *th = team->t.t_threads[f];
4678         KMP_DEBUG_ASSERT(th != NULL);
4679 
4680         if (place == last_place) {
4681           place = first_place;
4682         } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4683           place = 0;
4684         } else {
4685           place++;
4686         }
4687         th->th.th_first_place = first_place;
4688         th->th.th_last_place = last_place;
4689         th->th.th_new_place = place;
4690         if (__kmp_display_affinity && place != th->th.th_current_place &&
4691             team->t.t_display_affinity != 1) {
4692           team->t.t_display_affinity = 1;
4693         }
4694 
4695         KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4696                        "partition = [%d,%d]\n",
4697                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4698                        team->t.t_id, f, place, first_place, last_place));
4699       }
4700     } else {
4701       int S, rem, gap, s_count;
4702       S = n_th / n_places;
4703       s_count = 0;
4704       rem = n_th - (S * n_places);
4705       gap = rem > 0 ? n_places / rem : n_places;
4706       int place = masters_place;
4707       int gap_ct = gap;
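      // Oversubscribed case: each place gets S threads and `rem` of the
      // places, spaced `gap` apart, get one extra. Illustrative example:
      // n_th=10, n_places=4 gives S=2, rem=2, gap=2 and per-place counts of
      // 3,2,3,2 starting from the primary thread's place.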
4708       for (f = 0; f < n_th; f++) {
4709         kmp_info_t *th = team->t.t_threads[f];
4710         KMP_DEBUG_ASSERT(th != NULL);
4711 
4712         th->th.th_first_place = first_place;
4713         th->th.th_last_place = last_place;
4714         th->th.th_new_place = place;
4715         if (__kmp_display_affinity && place != th->th.th_current_place &&
4716             team->t.t_display_affinity != 1) {
4717           team->t.t_display_affinity = 1;
4718         }
4719         s_count++;
4720 
4721         if ((s_count == S) && rem && (gap_ct == gap)) {
4722           // do nothing, add an extra thread to place on next iteration
4723         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4724           // we added an extra thread to this place; move to next place
4725           if (place == last_place) {
4726             place = first_place;
4727           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4728             place = 0;
4729           } else {
4730             place++;
4731           }
4732           s_count = 0;
4733           gap_ct = 1;
4734           rem--;
4735         } else if (s_count == S) { // place full; don't add extra
4736           if (place == last_place) {
4737             place = first_place;
4738           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4739             place = 0;
4740           } else {
4741             place++;
4742           }
4743           gap_ct++;
4744           s_count = 0;
4745         }
4746 
4747         KA_TRACE(100,
4748                  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4749                   "partition = [%d,%d]\n",
4750                   __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4751                   th->th.th_new_place, first_place, last_place));
4752       }
4753       KMP_DEBUG_ASSERT(place == masters_place);
4754     }
4755   } break;
4756 
4757   case proc_bind_spread: {
4758     int f;
4759     int n_th = team->t.t_nproc;
4760     int n_places;
4761     int thidx;
4762     if (first_place <= last_place) {
4763       n_places = last_place - first_place + 1;
4764     } else {
4765       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4766     }
4767     if (n_th <= n_places) {
4768       int place = -1;
4769 
4770       if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4771         int S = n_places / n_th;
4772         int s_count, rem, gap, gap_ct;
4773 
4774         place = masters_place;
4775         rem = n_places - n_th * S;
4776         gap = rem ? n_th / rem : 1;
4777         gap_ct = gap;
4778         thidx = n_th;
4779         if (update_master_only == 1)
4780           thidx = 1;
4781         for (f = 0; f < thidx; f++) {
4782           kmp_info_t *th = team->t.t_threads[f];
4783           KMP_DEBUG_ASSERT(th != NULL);
4784 
4785           th->th.th_first_place = place;
4786           th->th.th_new_place = place;
4787           if (__kmp_display_affinity && place != th->th.th_current_place &&
4788               team->t.t_display_affinity != 1) {
4789             team->t.t_display_affinity = 1;
4790           }
4791           s_count = 1;
4792           while (s_count < S) {
4793             if (place == last_place) {
4794               place = first_place;
4795             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4796               place = 0;
4797             } else {
4798               place++;
4799             }
4800             s_count++;
4801           }
4802           if (rem && (gap_ct == gap)) {
4803             if (place == last_place) {
4804               place = first_place;
4805             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4806               place = 0;
4807             } else {
4808               place++;
4809             }
4810             rem--;
4811             gap_ct = 0;
4812           }
4813           th->th.th_last_place = place;
4814           gap_ct++;
4815 
4816           if (place == last_place) {
4817             place = first_place;
4818           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4819             place = 0;
4820           } else {
4821             place++;
4822           }
4823 
4824           KA_TRACE(100,
4825                    ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4826                     "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4827                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4828                     f, th->th.th_new_place, th->th.th_first_place,
4829                     th->th.th_last_place, __kmp_affinity_num_masks));
4830         }
4831       } else {
4832         /* Given a uniform space of available computation places, we can
4833            create T partitions of roughly P/T places each and put each
4834            thread into the first place of its partition. */
4835         double current = static_cast<double>(masters_place);
4836         double spacing =
4837             (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
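        // Illustrative example: n_places=8, n_th=3 and the primary thread on
        // place 0 give spacing = 9/3 = 3.0, i.e. sub-partitions [0,2], [3,5]
        // and [6,7] (the last one clamped to the end of the place list), with
        // each thread bound to the first place of its sub-partition.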
4838         int first, last;
4839         kmp_info_t *th;
4840 
4841         thidx = n_th + 1;
4842         if (update_master_only == 1)
4843           thidx = 1;
4844         for (f = 0; f < thidx; f++) {
4845           first = static_cast<int>(current);
4846           last = static_cast<int>(current + spacing) - 1;
4847           KMP_DEBUG_ASSERT(last >= first);
4848           if (first >= n_places) {
4849             if (masters_place) {
4850               first -= n_places;
4851               last -= n_places;
4852               if (first == (masters_place + 1)) {
4853                 KMP_DEBUG_ASSERT(f == n_th);
4854                 first--;
4855               }
4856               if (last == masters_place) {
4857                 KMP_DEBUG_ASSERT(f == (n_th - 1));
4858                 last--;
4859               }
4860             } else {
4861               KMP_DEBUG_ASSERT(f == n_th);
4862               first = 0;
4863               last = 0;
4864             }
4865           }
4866           if (last >= n_places) {
4867             last = (n_places - 1);
4868           }
4869           place = first;
4870           current += spacing;
4871           if (f < n_th) {
4872             KMP_DEBUG_ASSERT(0 <= first);
4873             KMP_DEBUG_ASSERT(n_places > first);
4874             KMP_DEBUG_ASSERT(0 <= last);
4875             KMP_DEBUG_ASSERT(n_places > last);
4876             KMP_DEBUG_ASSERT(last_place >= first_place);
4877             th = team->t.t_threads[f];
4878             KMP_DEBUG_ASSERT(th);
4879             th->th.th_first_place = first;
4880             th->th.th_new_place = place;
4881             th->th.th_last_place = last;
4882             if (__kmp_display_affinity && place != th->th.th_current_place &&
4883                 team->t.t_display_affinity != 1) {
4884               team->t.t_display_affinity = 1;
4885             }
4886             KA_TRACE(100,
4887                      ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4888                       "partition = [%d,%d], spacing = %.4f\n",
4889                       __kmp_gtid_from_thread(team->t.t_threads[f]),
4890                       team->t.t_id, f, th->th.th_new_place,
4891                       th->th.th_first_place, th->th.th_last_place, spacing));
4892           }
4893         }
4894       }
4895       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4896     } else {
4897       int S, rem, gap, s_count;
4898       S = n_th / n_places;
4899       s_count = 0;
4900       rem = n_th - (S * n_places);
4901       gap = rem > 0 ? n_places / rem : n_places;
4902       int place = masters_place;
4903       int gap_ct = gap;
4904       thidx = n_th;
4905       if (update_master_only == 1)
4906         thidx = 1;
4907       for (f = 0; f < thidx; f++) {
4908         kmp_info_t *th = team->t.t_threads[f];
4909         KMP_DEBUG_ASSERT(th != NULL);
4910 
4911         th->th.th_first_place = place;
4912         th->th.th_last_place = place;
4913         th->th.th_new_place = place;
4914         if (__kmp_display_affinity && place != th->th.th_current_place &&
4915             team->t.t_display_affinity != 1) {
4916           team->t.t_display_affinity = 1;
4917         }
4918         s_count++;
4919 
4920         if ((s_count == S) && rem && (gap_ct == gap)) {
4921           // do nothing, add an extra thread to place on next iteration
4922         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4923           // we added an extra thread to this place; move on to next place
4924           if (place == last_place) {
4925             place = first_place;
4926           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4927             place = 0;
4928           } else {
4929             place++;
4930           }
4931           s_count = 0;
4932           gap_ct = 1;
4933           rem--;
4934         } else if (s_count == S) { // place is full; don't add extra thread
4935           if (place == last_place) {
4936             place = first_place;
4937           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4938             place = 0;
4939           } else {
4940             place++;
4941           }
4942           gap_ct++;
4943           s_count = 0;
4944         }
4945 
4946         KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4947                        "partition = [%d,%d]\n",
4948                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4949                        team->t.t_id, f, th->th.th_new_place,
4950                        th->th.th_first_place, th->th.th_last_place));
4951       }
4952       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4953     }
4954   } break;
4955 
4956   default:
4957     break;
4958   }
4959 
4960   KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4961 }
4962 
4963 #endif // KMP_AFFINITY_SUPPORTED
4964 
4965 /* Allocate a new team data structure to use. Take one off of the free pool
4966    if available. */
4967 kmp_team_t *
4968 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4969 #if OMPT_SUPPORT
4970                     ompt_data_t ompt_parallel_data,
4971 #endif
4972                     kmp_proc_bind_t new_proc_bind,
4973                     kmp_internal_control_t *new_icvs,
4974                     int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4975   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4976   int f;
4977   kmp_team_t *team;
4978   int use_hot_team = !root->r.r_active;
4979   int level = 0;
4980 
4981   KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4982   KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4983   KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4984   KMP_MB();
4985 
4986 #if KMP_NESTED_HOT_TEAMS
4987   kmp_hot_team_ptr_t *hot_teams;
4988   if (master) {
4989     team = master->th.th_team;
4990     level = team->t.t_active_level;
4991     if (master->th.th_teams_microtask) { // in teams construct?
4992       if (master->th.th_teams_size.nteams > 1 &&
4993           ( // #teams > 1
4994               team->t.t_pkfn ==
4995                   (microtask_t)__kmp_teams_master || // inner fork of the teams
4996               master->th.th_teams_level <
4997                   team->t.t_level)) { // or nested parallel inside the teams
4998         ++level; // do not increment if #teams==1 or for the outer fork of
4999         // the teams; increment otherwise
5000       }
5001     }
5002     hot_teams = master->th.th_hot_teams;
5003     if (level < __kmp_hot_teams_max_level && hot_teams &&
5004         hot_teams[level].hot_team) {
5005       // hot team has already been allocated for given level
5006       use_hot_team = 1;
5007     } else {
5008       use_hot_team = 0;
5009     }
5010   } else {
5011     // check we won't access uninitialized hot_teams, just in case
5012     KMP_DEBUG_ASSERT(new_nproc == 1);
5013   }
5014 #endif
5015   // Optimization to use a "hot" team
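  // A "hot" team is kept alive across parallel regions (the root's hot team,
  // plus nested levels up to __kmp_hot_teams_max_level when
  // KMP_NESTED_HOT_TEAMS is enabled) so that consecutive regions can reuse
  // threads and team storage. The branches below only patch up what changed
  // since the last use: thread count, ICVs, and proc-bind/place partitioning.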
5016   if (use_hot_team && new_nproc > 1) {
5017     KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
5018 #if KMP_NESTED_HOT_TEAMS
5019     team = hot_teams[level].hot_team;
5020 #else
5021     team = root->r.r_hot_team;
5022 #endif
5023 #if KMP_DEBUG
5024     if (__kmp_tasking_mode != tskm_immediate_exec) {
5025       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5026                     "task_team[1] = %p before reinit\n",
5027                     team->t.t_task_team[0], team->t.t_task_team[1]));
5028     }
5029 #endif
5030 
5031     // Has the number of threads changed?
5032     /* Let's assume the most common case is that the number of threads is
5033        unchanged, and put that case first. */
5034     if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5035       KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5036       // This case can mean that omp_set_num_threads() was called and the hot
5037       // team size was already reduced, so we check the special flag
5038       if (team->t.t_size_changed == -1) {
5039         team->t.t_size_changed = 1;
5040       } else {
5041         KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5042       }
5043 
5044       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5045       kmp_r_sched_t new_sched = new_icvs->sched;
5046       // set primary thread's schedule as new run-time schedule
5047       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5048 
5049       __kmp_reinitialize_team(team, new_icvs,
5050                               root->r.r_uber_thread->th.th_ident);
5051 
5052       KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5053                     team->t.t_threads[0], team));
5054       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5055 
5056 #if KMP_AFFINITY_SUPPORTED
5057       if ((team->t.t_size_changed == 0) &&
5058           (team->t.t_proc_bind == new_proc_bind)) {
5059         if (new_proc_bind == proc_bind_spread) {
5060           __kmp_partition_places(
5061               team, 1); // add flag to update only master for spread
5062         }
5063         KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5064                        "proc_bind = %d, partition = [%d,%d]\n",
5065                        team->t.t_id, new_proc_bind, team->t.t_first_place,
5066                        team->t.t_last_place));
5067       } else {
5068         KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5069         __kmp_partition_places(team);
5070       }
5071 #else
5072       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5073 #endif /* KMP_AFFINITY_SUPPORTED */
5074     } else if (team->t.t_nproc > new_nproc) {
5075       KA_TRACE(20,
5076                ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5077                 new_nproc));
5078 
5079       team->t.t_size_changed = 1;
5080 #if KMP_NESTED_HOT_TEAMS
5081       if (__kmp_hot_teams_mode == 0) {
5082         // AC: the saved number of threads should correspond to the team's
5083         // value in this mode; it can be bigger in mode 1 (threads in reserve)
5084         KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5085         hot_teams[level].hot_team_nth = new_nproc;
5086 #endif // KMP_NESTED_HOT_TEAMS
5087         /* release the extra threads we don't need any more */
5088         for (f = new_nproc; f < team->t.t_nproc; f++) {
5089           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5090           if (__kmp_tasking_mode != tskm_immediate_exec) {
5091             // When decreasing team size, threads no longer in the team should
5092             // unref task team.
5093             team->t.t_threads[f]->th.th_task_team = NULL;
5094           }
5095           __kmp_free_thread(team->t.t_threads[f]);
5096           team->t.t_threads[f] = NULL;
5097         }
5098 #if KMP_NESTED_HOT_TEAMS
5099       } // (__kmp_hot_teams_mode == 0)
5100       else {
5101         // When keeping extra threads in team, switch threads to wait on own
5102         // b_go flag
5103         for (f = new_nproc; f < team->t.t_nproc; ++f) {
5104           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5105           kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5106           for (int b = 0; b < bs_last_barrier; ++b) {
5107             if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5108               balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5109             }
5110             KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5111           }
5112         }
5113       }
5114 #endif // KMP_NESTED_HOT_TEAMS
5115       team->t.t_nproc = new_nproc;
5116       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5117       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5118       __kmp_reinitialize_team(team, new_icvs,
5119                               root->r.r_uber_thread->th.th_ident);
5120 
5121       // Update remaining threads
5122       for (f = 0; f < new_nproc; ++f) {
5123         team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5124       }
5125 
5126       // restore the current task state of the primary thread: should be the
5127       // implicit task
5128       KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5129                     team->t.t_threads[0], team));
5130 
5131       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5132 
5133 #ifdef KMP_DEBUG
5134       for (f = 0; f < team->t.t_nproc; f++) {
5135         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5136                          team->t.t_threads[f]->th.th_team_nproc ==
5137                              team->t.t_nproc);
5138       }
5139 #endif
5140 
5141       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5142 #if KMP_AFFINITY_SUPPORTED
5143       __kmp_partition_places(team);
5144 #endif
5145     } else { // team->t.t_nproc < new_nproc
5146 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5147       kmp_affin_mask_t *old_mask;
5148       if (KMP_AFFINITY_CAPABLE()) {
5149         KMP_CPU_ALLOC(old_mask);
5150       }
5151 #endif
5152 
5153       KA_TRACE(20,
5154                ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5155                 new_nproc));
5156 
5157       team->t.t_size_changed = 1;
5158 
5159 #if KMP_NESTED_HOT_TEAMS
5160       int avail_threads = hot_teams[level].hot_team_nth;
5161       if (new_nproc < avail_threads)
5162         avail_threads = new_nproc;
5163       kmp_info_t **other_threads = team->t.t_threads;
5164       for (f = team->t.t_nproc; f < avail_threads; ++f) {
5165         // Adjust barrier data of reserved threads (if any) of the team
5166         // Other data will be set in __kmp_initialize_info() below.
5167         int b;
5168         kmp_balign_t *balign = other_threads[f]->th.th_bar;
5169         for (b = 0; b < bs_last_barrier; ++b) {
5170           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5171           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5172 #if USE_DEBUGGER
5173           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5174 #endif
5175         }
5176       }
5177       if (hot_teams[level].hot_team_nth >= new_nproc) {
5178         // we have all needed threads in reserve, no need to allocate any;
5179         // this is only possible in mode 1; mode 0 cannot have reserved threads
5180         KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5181         team->t.t_nproc = new_nproc; // just get reserved threads involved
5182       } else {
5183         // we may have some threads in reserve, but not enough
5184         team->t.t_nproc =
5185             hot_teams[level]
5186                 .hot_team_nth; // get reserved threads involved if any
5187         hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5188 #endif // KMP_NESTED_HOT_TEAMS
5189         if (team->t.t_max_nproc < new_nproc) {
5190           /* reallocate larger arrays */
5191           __kmp_reallocate_team_arrays(team, new_nproc);
5192           __kmp_reinitialize_team(team, new_icvs, NULL);
5193         }
5194 
5195 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5196         /* Temporarily set full mask for primary thread before creation of
5197            workers. The reason is that workers inherit the affinity from the
5198            primary thread, so if a lot of workers are created on the single
5199            core quickly, they don't get a chance to set their own affinity for
5200            a long time. */
5201         __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5202 #endif
5203 
5204         /* allocate new threads for the hot team */
5205         for (f = team->t.t_nproc; f < new_nproc; f++) {
5206           kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5207           KMP_DEBUG_ASSERT(new_worker);
5208           team->t.t_threads[f] = new_worker;
5209 
5210           KA_TRACE(20,
5211                    ("__kmp_allocate_team: team %d init T#%d arrived: "
5212                     "join=%llu, plain=%llu\n",
5213                     team->t.t_id, __kmp_gtid_from_tid(f, team),
5214                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5215                     team->t.t_bar[bs_plain_barrier].b_arrived));
5216 
5217           { // Initialize barrier data for new threads.
5218             int b;
5219             kmp_balign_t *balign = new_worker->th.th_bar;
5220             for (b = 0; b < bs_last_barrier; ++b) {
5221               balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5222               KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5223                                KMP_BARRIER_PARENT_FLAG);
5224 #if USE_DEBUGGER
5225               balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5226 #endif
5227             }
5228           }
5229         }
5230 
5231 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5232         if (KMP_AFFINITY_CAPABLE()) {
5233           /* Restore initial primary thread's affinity mask */
5234           __kmp_set_system_affinity(old_mask, TRUE);
5235           KMP_CPU_FREE(old_mask);
5236         }
5237 #endif
5238 #if KMP_NESTED_HOT_TEAMS
5239       } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5240 #endif // KMP_NESTED_HOT_TEAMS
5241       /* make sure everyone is synchronized */
5242       int old_nproc = team->t.t_nproc; // save old value and use to update only
5243       // new threads below
5244       __kmp_initialize_team(team, new_nproc, new_icvs,
5245                             root->r.r_uber_thread->th.th_ident);
5246 
5247       /* reinitialize the threads */
5248       KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5249       for (f = 0; f < team->t.t_nproc; ++f)
5250         __kmp_initialize_info(team->t.t_threads[f], team, f,
5251                               __kmp_gtid_from_tid(f, team));
5252 
5253       if (level) { // set th_task_state for new threads in nested hot team
5254         // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5255         // only need to set the th_task_state for the new threads. th_task_state
5256         // for primary thread will not be accurate until after this in
5257         // __kmp_fork_call(), so we look to the primary thread's memo_stack to
5258         // get the correct value.
5259         for (f = old_nproc; f < team->t.t_nproc; ++f)
5260           team->t.t_threads[f]->th.th_task_state =
5261               team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5262       } else { // set th_task_state for new threads in non-nested hot team
5263         // copy primary thread's state
5264         kmp_uint8 old_state = team->t.t_threads[0]->th.th_task_state;
5265         for (f = old_nproc; f < team->t.t_nproc; ++f)
5266           team->t.t_threads[f]->th.th_task_state = old_state;
5267       }
5268 
5269 #ifdef KMP_DEBUG
5270       for (f = 0; f < team->t.t_nproc; ++f) {
5271         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5272                          team->t.t_threads[f]->th.th_team_nproc ==
5273                              team->t.t_nproc);
5274       }
5275 #endif
5276 
5277       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5278 #if KMP_AFFINITY_SUPPORTED
5279       __kmp_partition_places(team);
5280 #endif
5281     } // Check changes in number of threads
5282 
5283     kmp_info_t *master = team->t.t_threads[0];
5284     if (master->th.th_teams_microtask) {
5285       for (f = 1; f < new_nproc; ++f) {
5286         // propagate teams construct specific info to workers
5287         kmp_info_t *thr = team->t.t_threads[f];
5288         thr->th.th_teams_microtask = master->th.th_teams_microtask;
5289         thr->th.th_teams_level = master->th.th_teams_level;
5290         thr->th.th_teams_size = master->th.th_teams_size;
5291       }
5292     }
5293 #if KMP_NESTED_HOT_TEAMS
5294     if (level) {
5295       // Sync barrier state for nested hot teams, not needed for outermost hot
5296       // team.
5297       for (f = 1; f < new_nproc; ++f) {
5298         kmp_info_t *thr = team->t.t_threads[f];
5299         int b;
5300         kmp_balign_t *balign = thr->th.th_bar;
5301         for (b = 0; b < bs_last_barrier; ++b) {
5302           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5303           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5304 #if USE_DEBUGGER
5305           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5306 #endif
5307         }
5308       }
5309     }
5310 #endif // KMP_NESTED_HOT_TEAMS
5311 
5312     /* reallocate space for arguments if necessary */
5313     __kmp_alloc_argv_entries(argc, team, TRUE);
5314     KMP_CHECK_UPDATE(team->t.t_argc, argc);
5315     // The hot team re-uses the previous task team,
5316     // if untouched during the previous release->gather phase.
5317 
5318     KF_TRACE(10, (" hot_team = %p\n", team));
5319 
5320 #if KMP_DEBUG
5321     if (__kmp_tasking_mode != tskm_immediate_exec) {
5322       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5323                     "task_team[1] = %p after reinit\n",
5324                     team->t.t_task_team[0], team->t.t_task_team[1]));
5325     }
5326 #endif
5327 
5328 #if OMPT_SUPPORT
5329     __ompt_team_assign_id(team, ompt_parallel_data);
5330 #endif
5331 
5332     KMP_MB();
5333 
5334     return team;
5335   }
5336 
5337   /* next, let's try to take one from the team pool */
5338   KMP_MB();
5339   for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5340     /* TODO: consider resizing undersized teams instead of reaping them, now
5341        that we have a resizing mechanism */
5342     if (team->t.t_max_nproc >= max_nproc) {
5343       /* take this team from the team pool */
5344       __kmp_team_pool = team->t.t_next_pool;
5345 
5346       /* setup the team for fresh use */
5347       __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5348 
5349       KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5350                     "task_team[1] %p to NULL\n",
5351                     &team->t.t_task_team[0], &team->t.t_task_team[1]));
5352       team->t.t_task_team[0] = NULL;
5353       team->t.t_task_team[1] = NULL;
5354 
5355       /* reallocate space for arguments if necessary */
5356       __kmp_alloc_argv_entries(argc, team, TRUE);
5357       KMP_CHECK_UPDATE(team->t.t_argc, argc);
5358 
5359       KA_TRACE(
5360           20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5361                team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5362       { // Initialize barrier data.
5363         int b;
5364         for (b = 0; b < bs_last_barrier; ++b) {
5365           team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5366 #if USE_DEBUGGER
5367           team->t.t_bar[b].b_master_arrived = 0;
5368           team->t.t_bar[b].b_team_arrived = 0;
5369 #endif
5370         }
5371       }
5372 
5373       team->t.t_proc_bind = new_proc_bind;
5374 
5375       KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5376                     team->t.t_id));
5377 
5378 #if OMPT_SUPPORT
5379       __ompt_team_assign_id(team, ompt_parallel_data);
5380 #endif
5381 
5382       KMP_MB();
5383 
5384       return team;
5385     }
5386 
5387     /* reap team if it is too small, then loop back and check the next one */
5388     // not sure if this is wise, but it will be redone during the hot-teams
5389     // rewrite.
5390     /* TODO: Use technique to find the right size hot-team, don't reap them */
5391     team = __kmp_reap_team(team);
5392     __kmp_team_pool = team;
5393   }
5394 
5395   /* nothing available in the pool, no matter, make a new team! */
5396   KMP_MB();
5397   team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5398 
5399   /* and set it up */
5400   team->t.t_max_nproc = max_nproc;
5401   /* NOTE well: for some reason, allocating one big buffer and dividing it up
5402      seems to really hurt performance on the P4, so let's not use this */
5403   __kmp_allocate_team_arrays(team, max_nproc);
5404 
5405   KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5406   __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5407 
5408   KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5409                 "%p to NULL\n",
5410                 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5411   team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5412   // memory, no need to duplicate
5413   team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5414   // memory, no need to duplicate
5415 
5416   if (__kmp_storage_map) {
5417     __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5418   }
5419 
5420   /* allocate space for arguments */
5421   __kmp_alloc_argv_entries(argc, team, FALSE);
5422   team->t.t_argc = argc;
5423 
5424   KA_TRACE(20,
5425            ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5426             team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5427   { // Initialize barrier data.
5428     int b;
5429     for (b = 0; b < bs_last_barrier; ++b) {
5430       team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5431 #if USE_DEBUGGER
5432       team->t.t_bar[b].b_master_arrived = 0;
5433       team->t.t_bar[b].b_team_arrived = 0;
5434 #endif
5435     }
5436   }
5437 
5438   team->t.t_proc_bind = new_proc_bind;
5439 
5440 #if OMPT_SUPPORT
5441   __ompt_team_assign_id(team, ompt_parallel_data);
5442   team->t.ompt_serialized_team_info = NULL;
5443 #endif
5444 
5445   KMP_MB();
5446 
5447   KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5448                 team->t.t_id));
5449 
5450   return team;
5451 }
5452 
5453 /* TODO implement hot-teams at all levels */
5454 /* TODO implement lazy thread release on demand (disband request) */
5455 
5456 /* free the team.  return it to the team pool.  release all the threads
5457  * associated with it */
5458 void __kmp_free_team(kmp_root_t *root,
5459                      kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5460   int f;
5461   KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5462                 team->t.t_id));
5463 
5464   /* verify state */
5465   KMP_DEBUG_ASSERT(root);
5466   KMP_DEBUG_ASSERT(team);
5467   KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5468   KMP_DEBUG_ASSERT(team->t.t_threads);
5469 
5470   int use_hot_team = team == root->r.r_hot_team;
5471 #if KMP_NESTED_HOT_TEAMS
5472   int level;
5473   kmp_hot_team_ptr_t *hot_teams;
5474   if (master) {
5475     level = team->t.t_active_level - 1;
5476     if (master->th.th_teams_microtask) { // in teams construct?
5477       if (master->th.th_teams_size.nteams > 1) {
5478         ++level; // level was not increased in teams construct for
5479         // team_of_masters
5480       }
5481       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5482           master->th.th_teams_level == team->t.t_level) {
5483         ++level; // level was not increased in teams construct for
5484         // team_of_workers before the parallel
5485       } // team->t.t_level will be increased inside parallel
5486     }
5487     hot_teams = master->th.th_hot_teams;
5488     if (level < __kmp_hot_teams_max_level) {
5489       KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5490       use_hot_team = 1;
5491     }
5492   }
5493 #endif // KMP_NESTED_HOT_TEAMS
5494 
5495   /* team is done working */
5496   TCW_SYNC_PTR(team->t.t_pkfn,
5497                NULL); // Important for Debugging Support Library.
5498 #if KMP_OS_WINDOWS
5499   team->t.t_copyin_counter = 0; // init counter for possible reuse
5500 #endif
5501   // Do not reset pointer to parent team to NULL for hot teams.
5502 
5503   /* if we are a non-hot team, release our threads */
5504   if (!use_hot_team) {
5505     if (__kmp_tasking_mode != tskm_immediate_exec) {
5506       // Wait for threads to reach reapable state
5507       for (f = 1; f < team->t.t_nproc; ++f) {
5508         KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5509         kmp_info_t *th = team->t.t_threads[f];
5510         volatile kmp_uint32 *state = &th->th.th_reap_state;
5511         while (*state != KMP_SAFE_TO_REAP) {
5512 #if KMP_OS_WINDOWS
5513           // On Windows a thread can be killed at any time, check this
5514           DWORD ecode;
5515           if (!__kmp_is_thread_alive(th, &ecode)) {
5516             *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5517             break;
5518           }
5519 #endif
5520           // first check if thread is sleeping
5521           kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5522           if (fl.is_sleeping())
5523             fl.resume(__kmp_gtid_from_thread(th));
5524           KMP_CPU_PAUSE();
5525         }
5526       }
5527 
5528       // Delete task teams
5529       int tt_idx;
5530       for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5531         kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5532         if (task_team != NULL) {
5533           for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5534             KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5535             team->t.t_threads[f]->th.th_task_team = NULL;
5536           }
5537           KA_TRACE(
5538               20,
5539               ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5540                __kmp_get_gtid(), task_team, team->t.t_id));
5541 #if KMP_NESTED_HOT_TEAMS
5542           __kmp_free_task_team(master, task_team);
5543 #endif
5544           team->t.t_task_team[tt_idx] = NULL;
5545         }
5546       }
5547     }
5548 
5549     // Reset pointer to parent team only for non-hot teams.
5550     team->t.t_parent = NULL;
5551     team->t.t_level = 0;
5552     team->t.t_active_level = 0;
5553 
5554     /* free the worker threads */
5555     for (f = 1; f < team->t.t_nproc; ++f) {
5556       KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5557       __kmp_free_thread(team->t.t_threads[f]);
5558       team->t.t_threads[f] = NULL;
5559     }
5560 
5561     /* put the team back in the team pool */
5562     /* TODO limit size of team pool, call reap_team if pool too large */
5563     team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5564     __kmp_team_pool = (volatile kmp_team_t *)team;
5565   } else { // Check if team was created for primary threads in teams construct
5566     // See if first worker is a CG root
5567     KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5568                      team->t.t_threads[1]->th.th_cg_roots);
5569     if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5570       // Clean up the CG root nodes on workers so that this team can be re-used
5571       for (f = 1; f < team->t.t_nproc; ++f) {
5572         kmp_info_t *thr = team->t.t_threads[f];
5573         KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5574                          thr->th.th_cg_roots->cg_root == thr);
5575         // Pop current CG root off list
5576         kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5577         thr->th.th_cg_roots = tmp->up;
5578         KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5579                        " up to node %p. cg_nthreads was %d\n",
5580                        thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5581         int i = tmp->cg_nthreads--;
5582         if (i == 1) {
5583           __kmp_free(tmp); // free CG if we are the last thread in it
5584         }
5585         // Restore current task's thread_limit from CG root
5586         if (thr->th.th_cg_roots)
5587           thr->th.th_current_task->td_icvs.thread_limit =
5588               thr->th.th_cg_roots->cg_thread_limit;
5589       }
5590     }
5591   }
5592 
5593   KMP_MB();
5594 }
5595 
5596 /* reap the team.  destroy it, reclaim all its resources and free its memory */
5597 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5598   kmp_team_t *next_pool = team->t.t_next_pool;
5599 
5600   KMP_DEBUG_ASSERT(team);
5601   KMP_DEBUG_ASSERT(team->t.t_dispatch);
5602   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5603   KMP_DEBUG_ASSERT(team->t.t_threads);
5604   KMP_DEBUG_ASSERT(team->t.t_argv);
5605 
5606   /* TODO clean the threads that are a part of this? */
5607 
5608   /* free stuff */
5609   __kmp_free_team_arrays(team);
5610   if (team->t.t_argv != &team->t.t_inline_argv[0])
5611     __kmp_free((void *)team->t.t_argv);
5612   __kmp_free(team);
5613 
5614   KMP_MB();
5615   return next_pool;
5616 }
5617 
5618 // Free the thread.  Don't reap it, just place it on the pool of available
5619 // threads.
5620 //
5621 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5622 // binding for the affinity mechanism to be useful.
5623 //
5624 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5625 // However, we want to avoid a potential performance problem by always
5626 // scanning through the list to find the correct point at which to insert
5627 // the thread (potential N**2 behavior).  To do this we keep track of the
5628 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5629 // With single-level parallelism, threads will always be added to the tail
5630 // of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
5631 // parallelism, all bets are off and we may need to scan through the entire
5632 // free list.
5633 //
5634 // This change also has a potentially large performance benefit, for some
5635 // applications.  Previously, as threads were freed from the hot team, they
5636 // would be placed back on the free list in inverse order.  If the hot team
5637 // grew back to its original size, then the freed threads would be placed
5638 // back on the hot team in reverse order.  This could cause bad cache
5639 // locality problems on programs where the size of the hot team regularly
5640 // grew and shrunk.
5641 //
5642 // Now, for single-level parallelism, the OMP tid is always == gtid.
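//
// Illustrative sketch (not part of the runtime logic): if the pool currently
// holds threads with gtids {2, 5, 9} and __kmp_thread_pool_insert_pt points at
// gtid 5, then freeing gtid 7 scans forward from 5 and links 7 between 5 and
// 9, moving the insert point to 7; freeing gtid 3 instead first resets the
// insert point (because 5 > 3) and rescans from the head of the list.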
5643 void __kmp_free_thread(kmp_info_t *this_th) {
5644   int gtid;
5645   kmp_info_t **scan;
5646 
5647   KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5648                 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5649 
5650   KMP_DEBUG_ASSERT(this_th);
5651 
5652   // When moving thread to pool, switch thread to wait on own b_go flag, and
5653   // uninitialized (NULL team).
5654   int b;
5655   kmp_balign_t *balign = this_th->th.th_bar;
5656   for (b = 0; b < bs_last_barrier; ++b) {
5657     if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5658       balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5659     balign[b].bb.team = NULL;
5660     balign[b].bb.leaf_kids = 0;
5661   }
5662   this_th->th.th_task_state = 0;
5663   this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5664 
5665   /* put thread back on the free pool */
5666   TCW_PTR(this_th->th.th_team, NULL);
5667   TCW_PTR(this_th->th.th_root, NULL);
5668   TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5669 
5670   while (this_th->th.th_cg_roots) {
5671     this_th->th.th_cg_roots->cg_nthreads--;
5672     KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5673                    " %p of thread  %p to %d\n",
5674                    this_th, this_th->th.th_cg_roots,
5675                    this_th->th.th_cg_roots->cg_root,
5676                    this_th->th.th_cg_roots->cg_nthreads));
5677     kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5678     if (tmp->cg_root == this_th) { // Thread is a cg_root
5679       KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5680       KA_TRACE(
5681           5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5682       this_th->th.th_cg_roots = tmp->up;
5683       __kmp_free(tmp);
5684     } else { // Worker thread
5685       if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5686         __kmp_free(tmp);
5687       }
5688       this_th->th.th_cg_roots = NULL;
5689       break;
5690     }
5691   }
5692 
5693   /* If the implicit task assigned to this thread can be used by other threads,
5694    * multiple threads can share the data and try to free the task at
5695    * __kmp_reap_thread at exit. This duplicate use of the task data can happen
5696    * with higher probability when the hot team is disabled, but it can occur
5697    * even when the hot team is enabled */
5698   __kmp_free_implicit_task(this_th);
5699   this_th->th.th_current_task = NULL;
5700 
5701   // If the __kmp_thread_pool_insert_pt is already past the new insert
5702   // point, then we need to re-scan the entire list.
5703   gtid = this_th->th.th_info.ds.ds_gtid;
5704   if (__kmp_thread_pool_insert_pt != NULL) {
5705     KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5706     if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5707       __kmp_thread_pool_insert_pt = NULL;
5708     }
5709   }
5710 
5711   // Scan down the list to find the place to insert the thread.
5712   // scan is the address of a link in the list, possibly the address of
5713   // __kmp_thread_pool itself.
5714   //
5715   // In the absence of nested parallelism, the for loop will have 0 iterations.
5716   if (__kmp_thread_pool_insert_pt != NULL) {
5717     scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5718   } else {
5719     scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5720   }
5721   for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5722        scan = &((*scan)->th.th_next_pool))
5723     ;
5724 
5725   // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5726   // to its address.
5727   TCW_PTR(this_th->th.th_next_pool, *scan);
5728   __kmp_thread_pool_insert_pt = *scan = this_th;
5729   KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5730                    (this_th->th.th_info.ds.ds_gtid <
5731                     this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5732   TCW_4(this_th->th.th_in_pool, TRUE);
5733   __kmp_suspend_initialize_thread(this_th);
5734   __kmp_lock_suspend_mx(this_th);
5735   if (this_th->th.th_active == TRUE) {
5736     KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5737     this_th->th.th_active_in_pool = TRUE;
5738   }
5739 #if KMP_DEBUG
5740   else {
5741     KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5742   }
5743 #endif
5744   __kmp_unlock_suspend_mx(this_th);
5745 
5746   TCW_4(__kmp_nth, __kmp_nth - 1);
5747 
5748 #ifdef KMP_ADJUST_BLOCKTIME
5749   /* Adjust blocktime back to user setting or default if necessary */
5750   /* Middle initialization might never have occurred                */
5751   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5752     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5753     if (__kmp_nth <= __kmp_avail_proc) {
5754       __kmp_zero_bt = FALSE;
5755     }
5756   }
5757 #endif /* KMP_ADJUST_BLOCKTIME */
5758 
5759   KMP_MB();
5760 }
5761 
5762 /* ------------------------------------------------------------------------ */
5763 
5764 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5765 #if OMP_PROFILING_SUPPORT
5766   ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
5767   // TODO: add a configuration option for time granularity
5768   if (ProfileTraceFile)
5769     llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
5770 #endif
5771 
5772   int gtid = this_thr->th.th_info.ds.ds_gtid;
5773   /*    void                 *stack_data;*/
5774   kmp_team_t **volatile pteam;
5775 
5776   KMP_MB();
5777   KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5778 
5779   if (__kmp_env_consistency_check) {
5780     this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5781   }
5782 
5783 #if OMPD_SUPPORT
5784   if (ompd_state & OMPD_ENABLE_BP)
5785     ompd_bp_thread_begin();
5786 #endif
5787 
5788 #if OMPT_SUPPORT
5789   ompt_data_t *thread_data = nullptr;
5790   if (ompt_enabled.enabled) {
5791     thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5792     *thread_data = ompt_data_none;
5793 
5794     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5795     this_thr->th.ompt_thread_info.wait_id = 0;
5796     this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5797     this_thr->th.ompt_thread_info.parallel_flags = 0;
5798     if (ompt_enabled.ompt_callback_thread_begin) {
5799       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5800           ompt_thread_worker, thread_data);
5801     }
5802     this_thr->th.ompt_thread_info.state = ompt_state_idle;
5803   }
5804 #endif
5805 
5806   /* This is the place where threads wait for work */
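  // In outline (as implemented below): a worker parks in the fork barrier
  // until the primary thread releases its team, invokes the team's microtask
  // via t_invoke, passes the join barrier, and loops until global shutdown
  // (__kmp_global.g.g_done) is signaled.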
5807   while (!TCR_4(__kmp_global.g.g_done)) {
5808     KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5809     KMP_MB();
5810 
5811     /* wait for work to do */
5812     KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5813 
5814     /* No tid yet since not part of a team */
5815     __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5816 
5817 #if OMPT_SUPPORT
5818     if (ompt_enabled.enabled) {
5819       this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5820     }
5821 #endif
5822 
5823     pteam = &this_thr->th.th_team;
5824 
5825     /* have we been allocated? */
5826     if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5827       /* we were just woken up, so run our new task */
5828       if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5829         int rc;
5830         KA_TRACE(20,
5831                  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5832                   gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5833                   (*pteam)->t.t_pkfn));
5834 
5835         updateHWFPControl(*pteam);
5836 
5837 #if OMPT_SUPPORT
5838         if (ompt_enabled.enabled) {
5839           this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5840         }
5841 #endif
5842 
5843         rc = (*pteam)->t.t_invoke(gtid);
5844         KMP_ASSERT(rc);
5845 
5846         KMP_MB();
5847         KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5848                       gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5849                       (*pteam)->t.t_pkfn));
5850       }
5851 #if OMPT_SUPPORT
5852       if (ompt_enabled.enabled) {
5853         /* no frame set while outside task */
5854         __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
5855 
5856         this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5857       }
5858 #endif
5859       /* join barrier after parallel region */
5860       __kmp_join_barrier(gtid);
5861     }
5862   }
5863   TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5864 
5865 #if OMPD_SUPPORT
5866   if (ompd_state & OMPD_ENABLE_BP)
5867     ompd_bp_thread_end();
5868 #endif
5869 
5870 #if OMPT_SUPPORT
5871   if (ompt_enabled.ompt_callback_thread_end) {
5872     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5873   }
5874 #endif
5875 
5876   this_thr->th.th_task_team = NULL;
5877   /* run the destructors for the threadprivate data for this thread */
5878   __kmp_common_destroy_gtid(gtid);
5879 
5880   KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5881   KMP_MB();
5882 
5883 #if OMP_PROFILING_SUPPORT
5884   llvm::timeTraceProfilerFinishThread();
5885 #endif
5886   return this_thr;
5887 }
5888 
5889 /* ------------------------------------------------------------------------ */
5890 
5891 void __kmp_internal_end_dest(void *specific_gtid) {
5892   // Make sure no significant bits are lost
5893   int gtid;
5894   __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
5895 
5896   KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5897   /* NOTE: the gtid is stored as gtid+1 in thread-local storage because 0 is
5898    * reserved for the nothing-stored case */
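  /* For example: gtid 0 is stored as 1, so a retrieved value of 0 means
   * "nothing stored"; the conversion above subtracts 1 to recover the gtid. */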
5899 
5900   __kmp_internal_end_thread(gtid);
5901 }
5902 
5903 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5904 
5905 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5906   __kmp_internal_end_atexit();
5907 }
5908 
5909 #endif
5910 
5911 /* [Windows] josh: when the atexit handler is called, there may still be more
5912    than one thread alive */
5913 void __kmp_internal_end_atexit(void) {
5914   KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5915   /* [Windows]
5916      josh: ideally, we want to completely shut down the library in this atexit
5917      handler, but stat code that depends on thread specific data for gtid fails
5918      because that data becomes unavailable at some point during the shutdown, so
5919      we call __kmp_internal_end_thread instead. We should eventually remove the
5920      dependency on __kmp_get_specific_gtid in the stat code and use
5921      __kmp_internal_end_library to cleanly shut down the library.
5922 
5923      // TODO: Can some of this comment about GVS be removed?
5924      I suspect that the offending stat code is executed when the calling thread
5925      tries to clean up a dead root thread's data structures, resulting in GVS
5926      code trying to close the GVS structures for that thread, but since the stat
5927      code uses __kmp_get_specific_gtid to get the gtid with the assumption that
5928      the calling thread is cleaning up itself instead of another thread, it gets
5929      confused. This happens because allowing a thread to unregister and clean up
5930      another thread is a recent modification for addressing an issue.
5931      Based on the current design (20050722), a thread may end up
5932      trying to unregister another thread only if thread death does not trigger
5933      the calling of __kmp_internal_end_thread.  For Linux* OS, there is the
5934      thread specific data destructor function to detect thread death. For
5935      Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5936      is nothing.  Thus, the workaround is applicable only for Windows static
5937      stat library. */
5938   __kmp_internal_end_library(-1);
5939 #if KMP_OS_WINDOWS
5940   __kmp_close_console();
5941 #endif
5942 }
5943 
5944 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5945   // It is assumed __kmp_forkjoin_lock is acquired.
5946 
5947   int gtid;
5948 
5949   KMP_DEBUG_ASSERT(thread != NULL);
5950 
5951   gtid = thread->th.th_info.ds.ds_gtid;
5952 
5953   if (!is_root) {
5954     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5955       /* Assume the threads are at the fork barrier here */
5956       KA_TRACE(
5957           20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5958                gtid));
5959       /* Need release fence here to prevent seg faults for tree forkjoin barrier
5960        * (GEH) */
5961       ANNOTATE_HAPPENS_BEFORE(thread);
5962       kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
5963                          thread);
5964       __kmp_release_64(&flag);
5965     }
5966 
5967     // Terminate OS thread.
5968     __kmp_reap_worker(thread);
5969 
5970     // The thread was killed asynchronously.  If it was actively
5971     // spinning in the thread pool, decrement the global count.
5972     //
5973     // There is a small timing hole here - if the worker thread was just waking
5974     // up after sleeping in the pool, had reset its th_active_in_pool flag but
5975     // not decremented the global counter __kmp_thread_pool_active_nth yet, then
5976     // the global counter might not get updated.
5977     //
5978     // Currently, this can only happen as the library is unloaded,
5979     // so there are no harmful side effects.
5980     if (thread->th.th_active_in_pool) {
5981       thread->th.th_active_in_pool = FALSE;
5982       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
5983       KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
5984     }
5985   }
5986 
5987   __kmp_free_implicit_task(thread);
5988 
5989 // Free the fast memory for tasking
5990 #if USE_FAST_MEMORY
5991   __kmp_free_fast_memory(thread);
5992 #endif /* USE_FAST_MEMORY */
5993 
5994   __kmp_suspend_uninitialize_thread(thread);
5995 
5996   KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5997   TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5998 
5999   --__kmp_all_nth;
6000   // __kmp_nth was decremented when thread is added to the pool.
6001 
6002 #ifdef KMP_ADJUST_BLOCKTIME
6003   /* Adjust blocktime back to user setting or default if necessary */
6004   /* Middle initialization might never have occurred                */
6005   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6006     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6007     if (__kmp_nth <= __kmp_avail_proc) {
6008       __kmp_zero_bt = FALSE;
6009     }
6010   }
6011 #endif /* KMP_ADJUST_BLOCKTIME */
6012 
6013   /* free the memory being used */
6014   if (__kmp_env_consistency_check) {
6015     if (thread->th.th_cons) {
6016       __kmp_free_cons_stack(thread->th.th_cons);
6017       thread->th.th_cons = NULL;
6018     }
6019   }
6020 
6021   if (thread->th.th_pri_common != NULL) {
6022     __kmp_free(thread->th.th_pri_common);
6023     thread->th.th_pri_common = NULL;
6024   }
6025 
6026   if (thread->th.th_task_state_memo_stack != NULL) {
6027     __kmp_free(thread->th.th_task_state_memo_stack);
6028     thread->th.th_task_state_memo_stack = NULL;
6029   }
6030 
6031 #if KMP_USE_BGET
6032   if (thread->th.th_local.bget_data != NULL) {
6033     __kmp_finalize_bget(thread);
6034   }
6035 #endif
6036 
6037 #if KMP_AFFINITY_SUPPORTED
6038   if (thread->th.th_affin_mask != NULL) {
6039     KMP_CPU_FREE(thread->th.th_affin_mask);
6040     thread->th.th_affin_mask = NULL;
6041   }
6042 #endif /* KMP_AFFINITY_SUPPORTED */
6043 
6044 #if KMP_USE_HIER_SCHED
6045   if (thread->th.th_hier_bar_data != NULL) {
6046     __kmp_free(thread->th.th_hier_bar_data);
6047     thread->th.th_hier_bar_data = NULL;
6048   }
6049 #endif
6050 
6051   __kmp_reap_team(thread->th.th_serial_team);
6052   thread->th.th_serial_team = NULL;
6053   __kmp_free(thread);
6054 
6055   KMP_MB();
6056 
6057 } // __kmp_reap_thread
6058 
6059 static void __kmp_internal_end(void) {
6060   int i;
6061 
6062   /* First, unregister the library */
6063   __kmp_unregister_library();
6064 
6065 #if KMP_OS_WINDOWS
6066   /* In Win static library, we can't tell when a root actually dies, so we
6067      reclaim the data structures for any root threads that have died but not
6068      unregistered themselves, in order to shut down cleanly.
6069      In Win dynamic library we also can't tell when a thread dies.  */
6070   __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6071   // dead roots
6072 #endif
6073 
6074   for (i = 0; i < __kmp_threads_capacity; i++)
6075     if (__kmp_root[i])
6076       if (__kmp_root[i]->r.r_active)
6077         break;
6078   KMP_MB(); /* Flush all pending memory write invalidates.  */
6079   TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6080 
6081   if (i < __kmp_threads_capacity) {
6082 #if KMP_USE_MONITOR
6083     // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6084     KMP_MB(); /* Flush all pending memory write invalidates.  */
6085 
6086     // Need to check that monitor was initialized before reaping it. If we are
6087     // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6088     // __kmp_monitor will appear to contain valid data, but it is only valid in
6089     // the parent process, not the child.
6090     // New behavior (201008): instead of keying off of the flag
6091     // __kmp_init_parallel, the monitor thread creation is keyed off
6092     // of the new flag __kmp_init_monitor.
6093     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6094     if (TCR_4(__kmp_init_monitor)) {
6095       __kmp_reap_monitor(&__kmp_monitor);
6096       TCW_4(__kmp_init_monitor, 0);
6097     }
6098     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6099     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6100 #endif // KMP_USE_MONITOR
6101   } else {
6102 /* TODO move this to cleanup code */
6103 #ifdef KMP_DEBUG
6104     /* make sure that everything has properly ended */
6105     for (i = 0; i < __kmp_threads_capacity; i++) {
6106       if (__kmp_root[i]) {
6107         // KMP_ASSERT(!KMP_UBER_GTID(i)); // AC: there can be uber threads
6108         // alive here
6109         KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6110       }
6111     }
6112 #endif
6113 
6114     KMP_MB();
6115 
6116     // Reap the worker threads.
6117     // This is valid for now, but be careful if threads are reaped sooner.
6118     while (__kmp_thread_pool != NULL) { // Loop through all threads in the pool.
6119       // Get the next thread from the pool.
6120       kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6121       __kmp_thread_pool = thread->th.th_next_pool;
6122       // Reap it.
6123       KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6124       thread->th.th_next_pool = NULL;
6125       thread->th.th_in_pool = FALSE;
6126       __kmp_reap_thread(thread, 0);
6127     }
6128     __kmp_thread_pool_insert_pt = NULL;
6129 
6130     // Reap teams.
6131     while (__kmp_team_pool != NULL) { // Loop through all teams in the pool.
6132       // Get the next team from the pool.
6133       kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6134       __kmp_team_pool = team->t.t_next_pool;
6135       // Reap it.
6136       team->t.t_next_pool = NULL;
6137       __kmp_reap_team(team);
6138     }
6139 
6140     __kmp_reap_task_teams();
6141 
6142 #if KMP_OS_UNIX
6143     // Threads that are not reaped should not access any resources since they
6144     // are going to be deallocated soon, so the shutdown sequence should wait
6145     // until all threads either exit the final spin-waiting loop or begin
6146     // sleeping after the given blocktime.
6147     for (i = 0; i < __kmp_threads_capacity; i++) {
6148       kmp_info_t *thr = __kmp_threads[i];
6149       while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6150         KMP_CPU_PAUSE();
6151     }
6152 #endif
6153 
6154     for (i = 0; i < __kmp_threads_capacity; ++i) {
6155       // TBD: Add some checking...
6156       // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6157     }
6158 
6159     /* Make sure all threadprivate destructors get run by joining with all
6160        worker threads before resetting this flag */
6161     TCW_SYNC_4(__kmp_init_common, FALSE);
6162 
6163     KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6164     KMP_MB();
6165 
6166 #if KMP_USE_MONITOR
6167     // See note above: One of the possible fixes for CQ138434 / CQ140126
6168     //
6169     // FIXME: push both code fragments down and CSE them?
6170     // push them into __kmp_cleanup() ?
6171     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6172     if (TCR_4(__kmp_init_monitor)) {
6173       __kmp_reap_monitor(&__kmp_monitor);
6174       TCW_4(__kmp_init_monitor, 0);
6175     }
6176     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6177     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6178 #endif
6179   } /* else !__kmp_global.t_active */
6180   TCW_4(__kmp_init_gtid, FALSE);
6181   KMP_MB(); /* Flush all pending memory write invalidates.  */
6182 
6183   __kmp_cleanup();
6184 #if OMPT_SUPPORT
6185   ompt_fini();
6186 #endif
6187 }
6188 
6189 void __kmp_internal_end_library(int gtid_req) {
6190   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6191   /* this shouldn't be a race condition because __kmp_internal_end() is the
6192      only place to clear __kmp_serial_init */
6193   /* we'll check this later too, after we get the lock */
6194   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6195   // redundant, because the next check will work in any case.
6196   if (__kmp_global.g.g_abort) {
6197     KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6198     /* TODO abort? */
6199     return;
6200   }
6201   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6202     KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6203     return;
6204   }
6205 
6206   KMP_MB(); /* Flush all pending memory write invalidates.  */
6207   /* find out who we are and what we should do */
6208   {
6209     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6210     KA_TRACE(
6211         10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req));
6212     if (gtid == KMP_GTID_SHUTDOWN) {
6213       KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6214                     "already shutdown\n"));
6215       return;
6216     } else if (gtid == KMP_GTID_MONITOR) {
6217       KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6218                     "registered, or system shutdown\n"));
6219       return;
6220     } else if (gtid == KMP_GTID_DNE) {
6221       KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6222                     "shutdown\n"));
6223       /* we don't know who we are, but we may still shut down the library */
6224     } else if (KMP_UBER_GTID(gtid)) {
6225       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6226       if (__kmp_root[gtid]->r.r_active) {
6227         __kmp_global.g.g_abort = -1;
6228         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6229         __kmp_unregister_library();
6230         KA_TRACE(10,
6231                  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6232                   gtid));
6233         return;
6234       } else {
6235         KA_TRACE(
6236             10,
6237             ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6238         __kmp_unregister_root_current_thread(gtid);
6239       }
6240     } else {
6241 /* worker threads may call this function through the atexit handler, if they
6242  * call exit() */
6243 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6244    TODO: do a thorough shutdown instead */
6245 #ifdef DUMP_DEBUG_ON_EXIT
6246       if (__kmp_debug_buf)
6247         __kmp_dump_debug_buffer();
6248 #endif
6249       // The unregister-library call was added here for the switch to shared
6250       // memory on Linux; without it, stale files would be left in /dev/shm.
6251       // Clean up the shared memory file before exiting.
6252       __kmp_unregister_library();
6253       return;
6254     }
6255   }
6256   /* synchronize the termination process */
6257   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6258 
6259   /* have we already finished */
6260   if (__kmp_global.g.g_abort) {
6261     KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6262     /* TODO abort? */
6263     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6264     return;
6265   }
6266   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6267     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6268     return;
6269   }
6270 
6271   /* We need this lock to enforce mutex between this reading of
6272      __kmp_threads_capacity and the writing by __kmp_register_root.
6273      Alternatively, we can use a counter of roots that is atomically updated by
6274      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6275      __kmp_internal_end_*.  */
6276   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6277 
6278   /* now we can safely conduct the actual termination */
6279   __kmp_internal_end();
6280 
6281   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6282   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6283 
6284   KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6285 
6286 #ifdef DUMP_DEBUG_ON_EXIT
6287   if (__kmp_debug_buf)
6288     __kmp_dump_debug_buffer();
6289 #endif
6290 
6291 #if KMP_OS_WINDOWS
6292   __kmp_close_console();
6293 #endif
6294 
6295   __kmp_fini_allocator();
6296 
6297 } // __kmp_internal_end_library
6298 
6299 void __kmp_internal_end_thread(int gtid_req) {
6300   int i;
6301 
6302   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6303   /* this shouldn't be a race condition because __kmp_internal_end() is the
6304    * only place to clear __kmp_serial_init */
6305   /* we'll check this later too, after we get the lock */
6306   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6307   // redundant, because the next check will work in any case.
6308   if (__kmp_global.g.g_abort) {
6309     KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6310     /* TODO abort? */
6311     return;
6312   }
6313   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6314     KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6315     return;
6316   }
6317 
6318   // If hidden helper team has been initialized, we need to deinit it
6319   if (TCR_4(__kmp_init_hidden_helper)) {
6320     TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6321     // First release the main thread to let it continue its work
6322     __kmp_hidden_helper_main_thread_release();
6323     // Wait until the hidden helper team has been destroyed
6324     __kmp_hidden_helper_threads_deinitz_wait();
6325   }
6326 
6327   KMP_MB(); /* Flush all pending memory write invalidates.  */
6328 
6329   /* find out who we are and what we should do */
6330   {
6331     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6332     KA_TRACE(10,
6333              ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req));
6334     if (gtid == KMP_GTID_SHUTDOWN) {
6335       KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6336                     "already shutdown\n"));
6337       return;
6338     } else if (gtid == KMP_GTID_MONITOR) {
6339       KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6340                     "registered, or system shutdown\n"));
6341       return;
6342     } else if (gtid == KMP_GTID_DNE) {
6343       KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6344                     "shutdown\n"));
6345       return;
6346       /* we don't know who we are */
6347     } else if (KMP_UBER_GTID(gtid)) {
6348       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6349       if (__kmp_root[gtid]->r.r_active) {
6350         __kmp_global.g.g_abort = -1;
6351         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6352         KA_TRACE(10,
6353                  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6354                   gtid));
6355         return;
6356       } else {
6357         KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6358                       gtid));
6359         __kmp_unregister_root_current_thread(gtid);
6360       }
6361     } else {
6362       /* just a worker thread, let's leave */
6363       KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6364 
6365       if (gtid >= 0) {
6366         __kmp_threads[gtid]->th.th_task_team = NULL;
6367       }
6368 
6369       KA_TRACE(10,
6370                ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6371                 gtid));
6372       return;
6373     }
6374   }
6375 #if KMP_DYNAMIC_LIB
6376   if (__kmp_pause_status != kmp_hard_paused)
6377   // AC: let's not shut down the dynamic library at the exit of an uber
6378   // thread, because it is better to shut down later in the library destructor.
6379   {
6380     KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6381     return;
6382   }
6383 #endif
6384   /* synchronize the termination process */
6385   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6386 
6387   /* have we already finished */
6388   if (__kmp_global.g.g_abort) {
6389     KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6390     /* TODO abort? */
6391     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6392     return;
6393   }
6394   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6395     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6396     return;
6397   }
6398 
6399   /* We need this lock to enforce mutex between this reading of
6400      __kmp_threads_capacity and the writing by __kmp_register_root.
6401      Alternatively, we can use a counter of roots that is atomically updated by
6402      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6403      __kmp_internal_end_*.  */
6404 
6405   /* should we finish the run-time?  are all siblings done? */
6406   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6407 
6408   for (i = 0; i < __kmp_threads_capacity; ++i) {
6409     if (KMP_UBER_GTID(i)) {
6410       KA_TRACE(
6411           10,
6412           ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6413       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6414       __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6415       return;
6416     }
6417   }
6418 
6419   /* now we can safely conduct the actual termination */
6420 
6421   __kmp_internal_end();
6422 
6423   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6424   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6425 
6426   KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6427 
6428 #ifdef DUMP_DEBUG_ON_EXIT
6429   if (__kmp_debug_buf)
6430     __kmp_dump_debug_buffer();
6431 #endif
6432 } // __kmp_internal_end_thread
6433 
6434 // -----------------------------------------------------------------------------
6435 // Library registration stuff.
6436 
6437 static long __kmp_registration_flag = 0;
6438 // Random value used to indicate library initialization.
6439 static char *__kmp_registration_str = NULL;
6440 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6441 
6442 static inline char *__kmp_reg_status_name() {
6443 /* On RHEL 3u5, if linked statically, getpid() returns different values in
6444    each thread. If registration and unregistration happen in different threads
6445    (omp_misc_other_root_exit.cpp test case), the registered_lib_env env var
6446    cannot be found, because the name will contain a different pid. */
6447 // macOS* complains about name being too long with additional getuid()
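// The resulting name looks like (illustrative) __KMP_REGISTERED_LIB_12345_1000
// on non-Darwin Unix when built as a dynamic library, or
// __KMP_REGISTERED_LIB_12345 otherwise; the pid (and uid) values are examples.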
6448 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6449   return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6450                           (int)getuid());
6451 #else
6452   return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6453 #endif
6454 } // __kmp_reg_status_name
6455 
6456 void __kmp_register_library_startup(void) {
6457 
6458   char *name = __kmp_reg_status_name(); // Name of the environment variable.
6459   int done = 0;
6460   union {
6461     double dtime;
6462     long ltime;
6463   } time;
6464 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6465   __kmp_initialize_system_tick();
6466 #endif
6467   __kmp_read_system_time(&time.dtime);
6468   __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6469   __kmp_registration_str =
6470       __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6471                        __kmp_registration_flag, KMP_LIBRARY_FILE);
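  // The resulting string has the form "<flag address>-<flag value>-<library
  // file>", e.g. (illustrative) "0x7ffd1234-cafe1234-libomp.so"; the neighbor
  // check below parses it back with __kmp_str_split on '-'.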
6472 
6473   KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6474                 __kmp_registration_str));
6475 
6476   while (!done) {
6477 
6478     char *value = NULL; // Actual value of the environment variable.
6479 
6480 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
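    // The registration record lives in a small POSIX shared-memory segment
    // (visible as /dev/shm/<name> on Linux): try to create it exclusively,
    // fall back to opening an existing one, size a fresh segment with
    // ftruncate, then mmap it and write or read the registration string.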
6481     char *shm_name = __kmp_str_format("/%s", name);
6482     int shm_preexist = 0;
6483     char *data1;
6484     int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6485     if ((fd1 == -1) && (errno == EEXIST)) {
6486       // file didn't open because it already exists.
6487       // try opening existing file
6488       fd1 = shm_open(shm_name, O_RDWR, 0666);
6489       if (fd1 == -1) { // file didn't open
6490         // error out here
6491         __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0),
6492                     __kmp_msg_null);
6493       } else {
6494         // able to open existing file
6495         shm_preexist = 1;
6496       }
6497     } else if (fd1 == -1) { // SHM didn't open due to an error other than
6498       // "already exists".
6499       // error out here.
6500       __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno),
6501                   __kmp_msg_null);
6502     }
6503     if (shm_preexist == 0) {
6504       // we created the SHM; now set its size
6505       if (ftruncate(fd1, SHM_SIZE) == -1) {
6506         // error occurred setting size
6507         __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"),
6508                     KMP_ERR(errno), __kmp_msg_null);
6509       }
6510     }
6511     data1 =
6512         (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
6513     if (data1 == MAP_FAILED) {
6514       // failed to map shared memory
6515       __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno),
6516                   __kmp_msg_null);
6517     }
6518     if (shm_preexist == 0) { // set data to SHM, set value
6519       KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6520     }
6521     // Read value from either what we just wrote or existing file.
6522     value = __kmp_str_format("%s", data1); // read value from SHM
6523     munmap(data1, SHM_SIZE);
6524     close(fd1);
6525 #else // Windows and unix with static library
6526     // Set the environment variable, but do not overwrite it if it already exists.
6527     __kmp_env_set(name, __kmp_registration_str, 0);
6528     // read value to see if it got set
6529     value = __kmp_env_get(name);
6530 #endif
6531 
6532     if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6533       done = 1; // Ok, environment variable set successfully, exit the loop.
6534     } else {
6535       // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6536       // Check whether it is alive or dead.
6537       int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6538       char *tail = value;
6539       char *flag_addr_str = NULL;
6540       char *flag_val_str = NULL;
6541       char const *file_name = NULL;
6542       __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6543       __kmp_str_split(tail, '-', &flag_val_str, &tail);
6544       file_name = tail;
6545       if (tail != NULL) {
6546         long *flag_addr = 0;
6547         unsigned long flag_val = 0;
6548         KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6549         KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6550         if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6551           // First, check whether environment-encoded address is mapped into
6552           // addr space.
6553           // If so, dereference it to see if it still has the right value.
6554           if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6555             neighbor = 1;
6556           } else {
6557             // If not, then we know the other copy of the library is no longer
6558             // running.
6559             neighbor = 2;
6560           }
6561         }
6562       }
6563       switch (neighbor) {
6564       case 0: // Cannot parse environment variable -- neighbor status unknown.
6565         // Assume it is the incompatible format of a future version of the
6566         // library. Assume the other library is alive.
6567         // WARN( ... ); // TODO: Issue a warning.
6568         file_name = "unknown library";
6569         KMP_FALLTHROUGH();
6570       // Attention! Falling through to the next case. That's intentional.
6571       case 1: { // Neighbor is alive.
6572         // Check it is allowed.
6573         char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6574         if (!__kmp_str_match_true(duplicate_ok)) {
6575           // That's not allowed. Issue fatal error.
6576           __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6577                       KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6578         }
6579         KMP_INTERNAL_FREE(duplicate_ok);
6580         __kmp_duplicate_library_ok = 1;
6581         done = 1; // Exit the loop.
6582       } break;
6583       case 2: { // Neighbor is dead.
6584 
6585 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6586         // close shared memory.
6587         shm_unlink(shm_name); // this removes file in /dev/shm
6588 #else
6589         // Clear the variable and try to register library again.
6590         __kmp_env_unset(name);
6591 #endif
6592       } break;
6593       default: {
6594         KMP_DEBUG_ASSERT(0);
6595       } break;
6596       }
6597     }
6598     KMP_INTERNAL_FREE((void *)value);
6599 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6600     KMP_INTERNAL_FREE((void *)shm_name);
6601 #endif
6602   } // while
6603   KMP_INTERNAL_FREE((void *)name);
6604 
6605 } // func __kmp_register_library_startup
6606 
6607 void __kmp_unregister_library(void) {
6608 
6609   char *name = __kmp_reg_status_name();
6610   char *value = NULL;
6611 
6612 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6613   char *shm_name = __kmp_str_format("/%s", name);
6614   int fd1 = shm_open(shm_name, O_RDONLY, 0666);
6615   if (fd1 == -1) {
6616     // file did not open. return.
6617     return;
6618   }
6619   char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6620   if (data1 != MAP_FAILED) {
6621     value = __kmp_str_format("%s", data1); // read value from SHM
6622     munmap(data1, SHM_SIZE);
6623   }
6624   close(fd1);
6625 #else
6626   value = __kmp_env_get(name);
6627 #endif
6628 
6629   KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6630   KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6631   if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6632 //  Ok, this is our variable. Delete it.
6633 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6634     shm_unlink(shm_name); // this removes file in /dev/shm
6635 #else
6636     __kmp_env_unset(name);
6637 #endif
6638   }
6639 
6640 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6641   KMP_INTERNAL_FREE(shm_name);
6642 #endif
6643 
6644   KMP_INTERNAL_FREE(__kmp_registration_str);
6645   KMP_INTERNAL_FREE(value);
6646   KMP_INTERNAL_FREE(name);
6647 
6648   __kmp_registration_flag = 0;
6649   __kmp_registration_str = NULL;
6650 
6651 } // __kmp_unregister_library
6652 
6653 // End of Library registration stuff.
6654 // -----------------------------------------------------------------------------
6655 
6656 #if KMP_MIC_SUPPORTED
6657 
6658 static void __kmp_check_mic_type() {
6659   kmp_cpuid_t cpuid_state = {0};
6660   kmp_cpuid_t *cs_p = &cpuid_state;
6661   __kmp_x86_cpuid(1, 0, cs_p);
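  // CPUID leaf 1 EAX holds the family/model/stepping signature; the masks
  // below are intended to match KNC (family 0x0B -> mic2) and KNL
  // (family 6, model 0x57 -> mic3).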
6662   // We don't support mic1 at the moment
6663   if ((cs_p->eax & 0xff0) == 0xB10) {
6664     __kmp_mic_type = mic2;
6665   } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6666     __kmp_mic_type = mic3;
6667   } else {
6668     __kmp_mic_type = non_mic;
6669   }
6670 }
6671 
6672 #endif /* KMP_MIC_SUPPORTED */
6673 
6674 #if KMP_HAVE_UMWAIT
6675 static void __kmp_user_level_mwait_init() {
6676   struct kmp_cpuid buf;
6677   __kmp_x86_cpuid(7, 0, &buf);
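  // CPUID.(EAX=07H,ECX=0):ECX bit 5 is the WAITPKG feature flag
  // (UMONITOR/UMWAIT/TPAUSE support).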
6678   __kmp_umwait_enabled = ((buf.ecx >> 5) & 1) && __kmp_user_level_mwait;
6679   KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
6680                 __kmp_umwait_enabled));
6681 }
6682 #elif KMP_HAVE_MWAIT
6683 #ifndef AT_INTELPHIUSERMWAIT
6684 // Spurious, non-existent value that should always fail to return anything.
6685 // Will be replaced with the correct value once it is known.
6686 #define AT_INTELPHIUSERMWAIT 10000
6687 #endif
6688 // The getauxval() function is available in RHEL7 and SLES12. If a system with
6689 // an earlier OS is used to build the RTL, the following internal function is
6690 // used when the entry is not found.
6691 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
6692 unsigned long getauxval(unsigned long) { return 0; }
6693 
6694 static void __kmp_user_level_mwait_init() {
6695   // When getauxval() and the correct value of AT_INTELPHIUSERMWAIT are
6696   // available, use them to determine whether user-level mwait is enabled.
6697   // Otherwise, forcibly set __kmp_mwait_enabled=TRUE on Intel MIC if the
6698   // environment variable KMP_USER_LEVEL_MWAIT was set to TRUE.
6699   if (__kmp_mic_type == mic3) {
6700     unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
6701     if ((res & 0x1) || __kmp_user_level_mwait) {
6702       __kmp_mwait_enabled = TRUE;
6703       if (__kmp_user_level_mwait) {
6704         KMP_INFORM(EnvMwaitWarn);
6705       }
6706     } else {
6707       __kmp_mwait_enabled = FALSE;
6708     }
6709   }
6710   KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
6711                 "__kmp_mwait_enabled = %d\n",
6712                 __kmp_mic_type, __kmp_mwait_enabled));
6713 }
6714 #endif /* KMP_HAVE_UMWAIT */
6715 
6716 static void __kmp_do_serial_initialize(void) {
6717   int i, gtid;
6718   size_t size;
6719 
6720   KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6721 
6722   KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6723   KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6724   KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6725   KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6726   KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6727 
6728 #if OMPT_SUPPORT
6729   ompt_pre_init();
6730 #endif
6731 #if OMPD_SUPPORT
6732   __kmp_env_dump();
6733   ompd_init();
6734 #endif
6735 
6736   __kmp_validate_locks();
6737 
6738   /* Initialize internal memory allocator */
6739   __kmp_init_allocator();
6740 
6741   /* Register the library startup via an environment variable and check to see
6742      whether another copy of the library is already registered. */
6743 
6744   __kmp_register_library_startup();
6745 
6746   /* TODO reinitialization of library */
6747   if (TCR_4(__kmp_global.g.g_done)) {
6748     KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6749   }
6750 
6751   __kmp_global.g.g_abort = 0;
6752   TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6753 
6754 /* initialize the locks */
6755 #if KMP_USE_ADAPTIVE_LOCKS
6756 #if KMP_DEBUG_ADAPTIVE_LOCKS
6757   __kmp_init_speculative_stats();
6758 #endif
6759 #endif
6760 #if KMP_STATS_ENABLED
6761   __kmp_stats_init();
6762 #endif
6763   __kmp_init_lock(&__kmp_global_lock);
6764   __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6765   __kmp_init_lock(&__kmp_debug_lock);
6766   __kmp_init_atomic_lock(&__kmp_atomic_lock);
6767   __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6768   __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6769   __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6770   __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6771   __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6772   __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6773   __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6774   __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6775   __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6776   __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6777   __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6778   __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6779   __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6780   __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6781 #if KMP_USE_MONITOR
6782   __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6783 #endif
6784   __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6785 
6786   /* conduct initialization and initial setup of configuration */
6787 
6788   __kmp_runtime_initialize();
6789 
6790 #if KMP_MIC_SUPPORTED
6791   __kmp_check_mic_type();
6792 #endif
6793 
6794 // Some global variable initialization moved here from kmp_env_initialize()
6795 #ifdef KMP_DEBUG
6796   kmp_diag = 0;
6797 #endif
6798   __kmp_abort_delay = 0;
6799 
6800   // From __kmp_init_dflt_team_nth()
6801   /* assume the entire machine will be used */
6802   __kmp_dflt_team_nth_ub = __kmp_xproc;
6803   if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6804     __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6805   }
6806   if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6807     __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6808   }
6809   __kmp_max_nth = __kmp_sys_max_nth;
6810   __kmp_cg_max_nth = __kmp_sys_max_nth;
6811   __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6812   if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6813     __kmp_teams_max_nth = __kmp_sys_max_nth;
6814   }
6815 
6816   // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6817   // part
6818   __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6819 #if KMP_USE_MONITOR
6820   __kmp_monitor_wakeups =
6821       KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6822   __kmp_bt_intervals =
6823       KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6824 #endif
6825   // From "KMP_LIBRARY" part of __kmp_env_initialize()
6826   __kmp_library = library_throughput;
6827   // From KMP_SCHEDULE initialization
6828   __kmp_static = kmp_sch_static_balanced;
// AC: do not use analytical here, because it is non-monotonic
//__kmp_guided = kmp_sch_guided_iterative_chunked;
//__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
// need to repeat the assignment
6833 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6834 // bit control and barrier method control parts
6835 #if KMP_FAST_REDUCTION_BARRIER
6836 #define kmp_reduction_barrier_gather_bb ((int)1)
6837 #define kmp_reduction_barrier_release_bb ((int)1)
6838 #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6839 #define kmp_reduction_barrier_release_pat bp_hyper_bar
6840 #endif // KMP_FAST_REDUCTION_BARRIER
6841   for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6842     __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6843     __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6844     __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6845     __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6846 #if KMP_FAST_REDUCTION_BARRIER
    if (i == bs_reduction_barrier) {
      // hyper,1 -- tested and confirmed on ALTIX (lin_64) only
6849       __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6850       __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6851       __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6852       __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6853     }
6854 #endif // KMP_FAST_REDUCTION_BARRIER
6855   }
6856 #if KMP_FAST_REDUCTION_BARRIER
6857 #undef kmp_reduction_barrier_release_pat
6858 #undef kmp_reduction_barrier_gather_pat
6859 #undef kmp_reduction_barrier_release_bb
6860 #undef kmp_reduction_barrier_gather_bb
6861 #endif // KMP_FAST_REDUCTION_BARRIER
6862 #if KMP_MIC_SUPPORTED
6863   if (__kmp_mic_type == mic2) { // KNC
    // AC: plain=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6865     __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6866     __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6867         1; // forkjoin release
6868     __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6869     __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6870   }
6871 #if KMP_FAST_REDUCTION_BARRIER
6872   if (__kmp_mic_type == mic2) { // KNC
6873     __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6874     __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6875   }
6876 #endif // KMP_FAST_REDUCTION_BARRIER
6877 #endif // KMP_MIC_SUPPORTED
6878 
6879 // From KMP_CHECKS initialization
6880 #ifdef KMP_DEBUG
6881   __kmp_env_checks = TRUE; /* development versions have the extra checks */
6882 #else
6883   __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6884 #endif
6885 
6886   // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6887   __kmp_foreign_tp = TRUE;
6888 
6889   __kmp_global.g.g_dynamic = FALSE;
6890   __kmp_global.g.g_dynamic_mode = dynamic_default;
6891 
6892   __kmp_init_nesting_mode();
6893 
6894   __kmp_env_initialize(NULL);
6895 
6896 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
6897   __kmp_user_level_mwait_init();
6898 #endif
6899 // Print all messages in message catalog for testing purposes.
6900 #ifdef KMP_DEBUG
6901   char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6902   if (__kmp_str_match_true(val)) {
6903     kmp_str_buf_t buffer;
6904     __kmp_str_buf_init(&buffer);
6905     __kmp_i18n_dump_catalog(&buffer);
6906     __kmp_printf("%s", buffer.str);
6907     __kmp_str_buf_free(&buffer);
6908   }
6909   __kmp_env_free(&val);
6910 #endif
6911 
6912   __kmp_threads_capacity =
6913       __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6914   // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6915   __kmp_tp_capacity = __kmp_default_tp_capacity(
6916       __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6917 
6918   // If the library is shut down properly, both pools must be NULL. Just in
6919   // case, set them to NULL -- some memory may leak, but subsequent code will
6920   // work even if pools are not freed.
6921   KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6922   KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6923   KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6924   __kmp_thread_pool = NULL;
6925   __kmp_thread_pool_insert_pt = NULL;
6926   __kmp_team_pool = NULL;
6927 
6928   /* Allocate all of the variable sized records */
6929   /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6930    * expandable */
6931   /* Since allocation is cache-aligned, just add extra padding at the end */
6932   size =
6933       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6934       CACHE_LINE;
6935   __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6936   __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6937                                sizeof(kmp_info_t *) * __kmp_threads_capacity);
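  // Layout of the single cache-aligned block allocated above:
  //   [ kmp_info_t * x capacity ][ kmp_root_t * x capacity ][ padding ]
  //   ^ __kmp_threads             ^ __kmp_root
  // Only __kmp_threads is freed in __kmp_cleanup(); __kmp_root points into
  // the same block, so it is never freed separately.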
6938 
6939   /* init thread counts */
6940   KMP_DEBUG_ASSERT(__kmp_all_nth ==
6941                    0); // Asserts fail if the library is reinitializing and
6942   KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
6943   __kmp_all_nth = 0;
6944   __kmp_nth = 0;
6945 
6946   /* setup the uber master thread and hierarchy */
6947   gtid = __kmp_register_root(TRUE);
6948   KA_TRACE(10, ("__kmp_do_serial_initialize  T#%d\n", gtid));
6949   KMP_ASSERT(KMP_UBER_GTID(gtid));
6950   KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6951 
6952   KMP_MB(); /* Flush all pending memory write invalidates.  */
6953 
6954   __kmp_common_initialize();
6955 
6956 #if KMP_OS_UNIX
6957   /* invoke the child fork handler */
6958   __kmp_register_atfork();
6959 #endif
6960 
6961 #if !KMP_DYNAMIC_LIB
6962   {
6963     /* Invoke the exit handler when the program finishes, only for static
6964        library. For dynamic library, we already have _fini and DllMain. */
6965     int rc = atexit(__kmp_internal_end_atexit);
6966     if (rc != 0) {
6967       __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6968                   __kmp_msg_null);
6969     }
6970   }
6971 #endif
6972 
6973 #if KMP_HANDLE_SIGNALS
6974 #if KMP_OS_UNIX
6975   /* NOTE: make sure that this is called before the user installs their own
6976      signal handlers so that the user handlers are called first. this way they
6977      can return false, not call our handler, avoid terminating the library, and
6978      continue execution where they left off. */
6979   __kmp_install_signals(FALSE);
6980 #endif /* KMP_OS_UNIX */
6981 #if KMP_OS_WINDOWS
6982   __kmp_install_signals(TRUE);
6983 #endif /* KMP_OS_WINDOWS */
6984 #endif
6985 
6986   /* we have finished the serial initialization */
6987   __kmp_init_counter++;
6988 
6989   __kmp_init_serial = TRUE;
6990 
6991   if (__kmp_settings) {
6992     __kmp_env_print();
6993   }
6994 
6995   if (__kmp_display_env || __kmp_display_env_verbose) {
6996     __kmp_env_print_2();
6997   }
6998 
6999 #if OMPT_SUPPORT
7000   ompt_post_init();
7001 #endif
7002 
7003   KMP_MB();
7004 
7005   KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
7006 }
7007 
7008 void __kmp_serial_initialize(void) {
7009   if (__kmp_init_serial) {
7010     return;
7011   }
7012   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7013   if (__kmp_init_serial) {
7014     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7015     return;
7016   }
7017   __kmp_do_serial_initialize();
7018   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7019 }
7020 
7021 static void __kmp_do_middle_initialize(void) {
7022   int i, j;
7023   int prev_dflt_team_nth;
7024 
7025   if (!__kmp_init_serial) {
7026     __kmp_do_serial_initialize();
7027   }
7028 
7029   KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
7030 
7031   // Save the previous value for the __kmp_dflt_team_nth so that
7032   // we can avoid some reinitialization if it hasn't changed.
7033   prev_dflt_team_nth = __kmp_dflt_team_nth;
7034 
7035 #if KMP_AFFINITY_SUPPORTED
7036   // __kmp_affinity_initialize() will try to set __kmp_ncores to the
7037   // number of cores on the machine.
7038   __kmp_affinity_initialize();
7039 
7040   // Run through the __kmp_threads array and set the affinity mask
7041   // for each root thread that is currently registered with the RTL.
7042   for (i = 0; i < __kmp_threads_capacity; i++) {
7043     if (TCR_PTR(__kmp_threads[i]) != NULL) {
7044       __kmp_affinity_set_init_mask(i, TRUE);
7045     }
7046   }
7047 #endif /* KMP_AFFINITY_SUPPORTED */
7048 
7049   KMP_ASSERT(__kmp_xproc > 0);
7050   if (__kmp_avail_proc == 0) {
7051     __kmp_avail_proc = __kmp_xproc;
7052   }
7053 
7054   // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
7055   // correct them now
7056   j = 0;
7057   while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
7058     __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7059         __kmp_avail_proc;
7060     j++;
7061   }
7062 
7063   if (__kmp_dflt_team_nth == 0) {
7064 #ifdef KMP_DFLT_NTH_CORES
7065     // Default #threads = #cores
7066     __kmp_dflt_team_nth = __kmp_ncores;
7067     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7068                   "__kmp_ncores (%d)\n",
7069                   __kmp_dflt_team_nth));
7070 #else
7071     // Default #threads = #available OS procs
7072     __kmp_dflt_team_nth = __kmp_avail_proc;
7073     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7074                   "__kmp_avail_proc(%d)\n",
7075                   __kmp_dflt_team_nth));
7076 #endif /* KMP_DFLT_NTH_CORES */
7077   }
7078 
7079   if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7080     __kmp_dflt_team_nth = KMP_MIN_NTH;
7081   }
7082   if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7083     __kmp_dflt_team_nth = __kmp_sys_max_nth;
7084   }
7085 
7086   if (__kmp_nesting_mode > 0)
7087     __kmp_set_nesting_mode_threads();
7088 
7089   // There's no harm in continuing if the following check fails,
7090   // but it indicates an error in the previous logic.
7091   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7092 
7093   if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7094     // Run through the __kmp_threads array and set the num threads icv for each
7095     // root thread that is currently registered with the RTL (which has not
7096     // already explicitly set its nthreads-var with a call to
7097     // omp_set_num_threads()).
7098     for (i = 0; i < __kmp_threads_capacity; i++) {
7099       kmp_info_t *thread = __kmp_threads[i];
7100       if (thread == NULL)
7101         continue;
7102       if (thread->th.th_current_task->td_icvs.nproc != 0)
7103         continue;
7104 
7105       set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7106     }
7107   }
7108   KA_TRACE(
7109       20,
7110       ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7111        __kmp_dflt_team_nth));
7112 
7113 #ifdef KMP_ADJUST_BLOCKTIME
  /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
7115   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7116     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7117     if (__kmp_nth > __kmp_avail_proc) {
7118       __kmp_zero_bt = TRUE;
7119     }
7120   }
7121 #endif /* KMP_ADJUST_BLOCKTIME */
7122 
7123   /* we have finished middle initialization */
7124   TCW_SYNC_4(__kmp_init_middle, TRUE);
7125 
7126   KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7127 }
7128 
7129 void __kmp_middle_initialize(void) {
7130   if (__kmp_init_middle) {
7131     return;
7132   }
7133   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7134   if (__kmp_init_middle) {
7135     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7136     return;
7137   }
7138   __kmp_do_middle_initialize();
7139   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7140 }
7141 
7142 void __kmp_parallel_initialize(void) {
7143   int gtid = __kmp_entry_gtid(); // this might be a new root
7144 
7145   /* synchronize parallel initialization (for sibling) */
7146   if (TCR_4(__kmp_init_parallel))
7147     return;
7148   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7149   if (TCR_4(__kmp_init_parallel)) {
7150     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7151     return;
7152   }
7153 
7154   /* TODO reinitialization after we have already shut down */
7155   if (TCR_4(__kmp_global.g.g_done)) {
7156     KA_TRACE(
7157         10,
7158         ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7159     __kmp_infinite_loop();
7160   }
7161 
7162   /* jc: The lock __kmp_initz_lock is already held, so calling
7163      __kmp_serial_initialize would cause a deadlock.  So we call
7164      __kmp_do_serial_initialize directly. */
7165   if (!__kmp_init_middle) {
7166     __kmp_do_middle_initialize();
7167   }
7168   __kmp_resume_if_hard_paused();
7169 
7170   /* begin initialization */
7171   KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7172   KMP_ASSERT(KMP_UBER_GTID(gtid));
7173 
7174 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7175   // Save the FP control regs.
7176   // Worker threads will set theirs to these values at thread startup.
7177   __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7178   __kmp_store_mxcsr(&__kmp_init_mxcsr);
7179   __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7180 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7181 
7182 #if KMP_OS_UNIX
7183 #if KMP_HANDLE_SIGNALS
  /* must be after __kmp_serial_initialize */
7185   __kmp_install_signals(TRUE);
7186 #endif
7187 #endif
7188 
7189   __kmp_suspend_initialize();
7190 
7191 #if defined(USE_LOAD_BALANCE)
7192   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7193     __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7194   }
7195 #else
7196   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7197     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7198   }
7199 #endif
7200 
7201   if (__kmp_version) {
7202     __kmp_print_version_2();
7203   }
7204 
7205   /* we have finished parallel initialization */
7206   TCW_SYNC_4(__kmp_init_parallel, TRUE);
7207 
7208   KMP_MB();
7209   KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7210 
7211   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7212 }
7213 
7214 void __kmp_hidden_helper_initialize() {
7215   if (TCR_4(__kmp_init_hidden_helper))
7216     return;
7217 
7218   // __kmp_parallel_initialize is required before we initialize hidden helper
7219   if (!TCR_4(__kmp_init_parallel))
7220     __kmp_parallel_initialize();
7221 
  // Double check. Note that this double check should not be placed before
  // __kmp_parallel_initialize, as that would cause a deadlock.
7224   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7225   if (TCR_4(__kmp_init_hidden_helper)) {
7226     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7227     return;
7228   }
7229 
7230   // Set the count of hidden helper tasks to be executed to zero
7231   KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7232 
7233   // Set the global variable indicating that we're initializing hidden helper
7234   // team/threads
7235   TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7236 
7237   // Platform independent initialization
7238   __kmp_do_initialize_hidden_helper_threads();
7239 
7240   // Wait here for the finish of initialization of hidden helper teams
7241   __kmp_hidden_helper_threads_initz_wait();
7242 
7243   // We have finished hidden helper initialization
7244   TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7245 
7246   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7247 }
7248 
7249 /* ------------------------------------------------------------------------ */
7250 
7251 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7252                                    kmp_team_t *team) {
7253   kmp_disp_t *dispatch;
7254 
7255   KMP_MB();
7256 
7257   /* none of the threads have encountered any constructs, yet. */
7258   this_thr->th.th_local.this_construct = 0;
7259 #if KMP_CACHE_MANAGE
7260   KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7261 #endif /* KMP_CACHE_MANAGE */
7262   dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7263   KMP_DEBUG_ASSERT(dispatch);
7264   KMP_DEBUG_ASSERT(team->t.t_dispatch);
7265   // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7266   // this_thr->th.th_info.ds.ds_tid ] );
7267 
7268   dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7269   dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7270   if (__kmp_env_consistency_check)
7271     __kmp_push_parallel(gtid, team->t.t_ident);
7272 
7273   KMP_MB(); /* Flush all pending memory write invalidates.  */
7274 }
7275 
7276 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7277                                   kmp_team_t *team) {
7278   if (__kmp_env_consistency_check)
7279     __kmp_pop_parallel(gtid, team->t.t_ident);
7280 
7281   __kmp_finish_implicit_task(this_thr);
7282 }
7283 
7284 int __kmp_invoke_task_func(int gtid) {
7285   int rc;
7286   int tid = __kmp_tid_from_gtid(gtid);
7287   kmp_info_t *this_thr = __kmp_threads[gtid];
7288   kmp_team_t *team = this_thr->th.th_team;
7289 
7290   __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7291 #if USE_ITT_BUILD
7292   if (__itt_stack_caller_create_ptr) {
7293     // inform ittnotify about entering user's code
7294     if (team->t.t_stack_id != NULL) {
7295       __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id);
7296     } else {
7297       KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7298       __kmp_itt_stack_callee_enter(
7299           (__itt_caller)team->t.t_parent->t.t_stack_id);
7300     }
7301   }
7302 #endif /* USE_ITT_BUILD */
7303 #if INCLUDE_SSC_MARKS
7304   SSC_MARK_INVOKING();
7305 #endif
7306 
7307 #if OMPT_SUPPORT
7308   void *dummy;
7309   void **exit_frame_p;
7310   ompt_data_t *my_task_data;
7311   ompt_data_t *my_parallel_data;
7312   int ompt_team_size;
7313 
7314   if (ompt_enabled.enabled) {
7315     exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
7316                          .ompt_task_info.frame.exit_frame.ptr);
7317   } else {
7318     exit_frame_p = &dummy;
7319   }
7320 
7321   my_task_data =
7322       &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7323   my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7324   if (ompt_enabled.ompt_callback_implicit_task) {
7325     ompt_team_size = team->t.t_nproc;
7326     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7327         ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7328         __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7329     OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7330   }
7331 #endif
7332 
7333 #if KMP_STATS_ENABLED
7334   stats_state_e previous_state = KMP_GET_THREAD_STATE();
7335   if (previous_state == stats_state_e::TEAMS_REGION) {
7336     KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7337   } else {
7338     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7339   }
7340   KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7341 #endif
7342 
7343   rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7344                               tid, (int)team->t.t_argc, (void **)team->t.t_argv
7345 #if OMPT_SUPPORT
7346                               ,
7347                               exit_frame_p
7348 #endif
7349   );
7350 #if OMPT_SUPPORT
7351   *exit_frame_p = NULL;
7352   this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7353 #endif
7354 
7355 #if KMP_STATS_ENABLED
7356   if (previous_state == stats_state_e::TEAMS_REGION) {
7357     KMP_SET_THREAD_STATE(previous_state);
7358   }
7359   KMP_POP_PARTITIONED_TIMER();
7360 #endif
7361 
7362 #if USE_ITT_BUILD
7363   if (__itt_stack_caller_create_ptr) {
7364     // inform ittnotify about leaving user's code
7365     if (team->t.t_stack_id != NULL) {
7366       __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id);
7367     } else {
7368       KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7369       __kmp_itt_stack_callee_leave(
7370           (__itt_caller)team->t.t_parent->t.t_stack_id);
7371     }
7372   }
7373 #endif /* USE_ITT_BUILD */
7374   __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7375 
7376   return rc;
7377 }
7378 
7379 void __kmp_teams_master(int gtid) {
7380   // This routine is called by all primary threads in teams construct
7381   kmp_info_t *thr = __kmp_threads[gtid];
7382   kmp_team_t *team = thr->th.th_team;
7383   ident_t *loc = team->t.t_ident;
7384   thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7385   KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7386   KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7387   KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7388                 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7389 
7390   // This thread is a new CG root.  Set up the proper variables.
7391   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7392   tmp->cg_root = thr; // Make thr the CG root
7393   // Init to thread limit stored when league primary threads were forked
7394   tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7395   tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7396   KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7397                  " cg_nthreads to 1\n",
7398                  thr, tmp));
7399   tmp->up = thr->th.th_cg_roots;
7400   thr->th.th_cg_roots = tmp;
7401 
// Launch the league of teams now, but do not let the workers execute
// (they hang on the fork barrier until the next parallel region)
7404 #if INCLUDE_SSC_MARKS
7405   SSC_MARK_FORKING();
7406 #endif
7407   __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7408                   (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7409                   VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7410 #if INCLUDE_SSC_MARKS
7411   SSC_MARK_JOINING();
7412 #endif
7413   // If the team size was reduced from the limit, set it to the new size
7414   if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7415     thr->th.th_teams_size.nth = thr->th.th_team_nproc;
  // AC: last parameter "1" eliminates the join barrier, which won't work
  // because worker threads are in a fork barrier waiting for more parallel
  // regions
7418   __kmp_join_call(loc, gtid
7419 #if OMPT_SUPPORT
7420                   ,
7421                   fork_context_intel
7422 #endif
7423                   ,
7424                   1);
7425 }
7426 
7427 int __kmp_invoke_teams_master(int gtid) {
7428   kmp_info_t *this_thr = __kmp_threads[gtid];
7429   kmp_team_t *team = this_thr->th.th_team;
7430 #if KMP_DEBUG
7431   if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7432     KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7433                      (void *)__kmp_teams_master);
7434 #endif
7435   __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7436 #if OMPT_SUPPORT
7437   int tid = __kmp_tid_from_gtid(gtid);
7438   ompt_data_t *task_data =
7439       &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7440   ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7441   if (ompt_enabled.ompt_callback_implicit_task) {
7442     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7443         ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7444         ompt_task_initial);
7445     OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7446   }
7447 #endif
7448   __kmp_teams_master(gtid);
7449 #if OMPT_SUPPORT
7450   this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7451 #endif
7452   __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7453   return 1;
7454 }
7455 
/* This sets the requested number of threads for the next parallel region
   encountered by this team. Since this should be enclosed in the forkjoin
   critical section, it should avoid race conditions with asymmetrical nested
   parallelism. */
7460 
7461 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7462   kmp_info_t *thr = __kmp_threads[gtid];
7463 
7464   if (num_threads > 0)
7465     thr->th.th_set_nproc = num_threads;
7466 }
7467 
7468 static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
7469                                     int num_threads) {
7470   KMP_DEBUG_ASSERT(thr);
7471   // Remember the number of threads for inner parallel regions
7472   if (!TCR_4(__kmp_init_middle))
7473     __kmp_middle_initialize(); // get internal globals calculated
7474   KMP_DEBUG_ASSERT(__kmp_avail_proc);
7475   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7476 
7477   if (num_threads == 0) {
7478     if (__kmp_teams_thread_limit > 0) {
7479       num_threads = __kmp_teams_thread_limit;
7480     } else {
7481       num_threads = __kmp_avail_proc / num_teams;
7482     }
    // Adjust num_threads without a warning since it is not a user setting.
    // num_threads = min(num_threads, nthreads-var, thread-limit-var)
    // No thread_limit clause specified - do not change thread-limit-var ICV.
7486     if (num_threads > __kmp_dflt_team_nth) {
7487       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7488     }
7489     if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7490       num_threads = thr->th.th_current_task->td_icvs.thread_limit;
    } // prevent team size from exceeding thread-limit-var
7492     if (num_teams * num_threads > __kmp_teams_max_nth) {
7493       num_threads = __kmp_teams_max_nth / num_teams;
7494     }
7495     if (num_threads == 0) {
7496       num_threads = 1;
7497     }
7498   } else {
    // This thread will be the primary thread of the league's primary threads.
    // Store new thread limit; the old limit is saved in the th_cg_roots list
7501     thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7502     // num_threads = min(num_threads, nthreads-var)
7503     if (num_threads > __kmp_dflt_team_nth) {
7504       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7505     }
7506     if (num_teams * num_threads > __kmp_teams_max_nth) {
7507       int new_threads = __kmp_teams_max_nth / num_teams;
7508       if (new_threads == 0) {
7509         new_threads = 1;
7510       }
7511       if (new_threads != num_threads) {
7512         if (!__kmp_reserve_warn) { // user asked for too many threads
7513           __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7514           __kmp_msg(kmp_ms_warning,
7515                     KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7516                     KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7517         }
7518       }
7519       num_threads = new_threads;
7520     }
7521   }
7522   thr->th.th_teams_size.nth = num_threads;
7523 }
7524 
7525 /* this sets the requested number of teams for the teams region and/or
7526    the number of threads for the next parallel region encountered  */
7527 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7528                           int num_threads) {
7529   kmp_info_t *thr = __kmp_threads[gtid];
7530   KMP_DEBUG_ASSERT(num_teams >= 0);
7531   KMP_DEBUG_ASSERT(num_threads >= 0);
7532 
7533   if (num_teams == 0) {
7534     if (__kmp_nteams > 0) {
7535       num_teams = __kmp_nteams;
7536     } else {
7537       num_teams = 1; // default number of teams is 1.
7538     }
7539   }
  if (num_teams > __kmp_teams_max_nth) { // too many teams requested?
7541     if (!__kmp_reserve_warn) {
7542       __kmp_reserve_warn = 1;
7543       __kmp_msg(kmp_ms_warning,
7544                 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7545                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7546     }
7547     num_teams = __kmp_teams_max_nth;
7548   }
7549   // Set number of teams (number of threads in the outer "parallel" of the
7550   // teams)
7551   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7552 
7553   __kmp_push_thread_limit(thr, num_teams, num_threads);
7554 }
7555 
7556 /* This sets the requested number of teams for the teams region and/or
7557    the number of threads for the next parallel region encountered  */
7558 void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
7559                              int num_teams_ub, int num_threads) {
7560   kmp_info_t *thr = __kmp_threads[gtid];
7561   KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
7562   KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
7563   KMP_DEBUG_ASSERT(num_threads >= 0);
7564 
7565   if (num_teams_lb > num_teams_ub) {
7566     __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
7567                 KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
7568   }
7569 
  int num_teams = 1; // default number of teams is 1.
7571 
7572   if (num_teams_lb == 0 && num_teams_ub > 0)
7573     num_teams_lb = num_teams_ub;
7574 
7575   if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
7576     num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
7577     if (num_teams > __kmp_teams_max_nth) {
7578       if (!__kmp_reserve_warn) {
7579         __kmp_reserve_warn = 1;
7580         __kmp_msg(kmp_ms_warning,
7581                   KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7582                   KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7583       }
7584       num_teams = __kmp_teams_max_nth;
7585     }
7586   } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
7587     num_teams = num_teams_ub;
7588   } else { // num_teams_lb <= num_teams <= num_teams_ub
7589     if (num_threads == 0) {
7590       if (num_teams_ub > __kmp_teams_max_nth) {
7591         num_teams = num_teams_lb;
7592       } else {
7593         num_teams = num_teams_ub;
7594       }
7595     } else {
7596       num_teams = (num_threads > __kmp_teams_max_nth)
7597                       ? num_teams
7598                       : __kmp_teams_max_nth / num_threads;
7599       if (num_teams < num_teams_lb) {
7600         num_teams = num_teams_lb;
7601       } else if (num_teams > num_teams_ub) {
7602         num_teams = num_teams_ub;
7603       }
7604     }
7605   }
7606   // Set number of teams (number of threads in the outer "parallel" of the
7607   // teams)
7608   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7609 
7610   __kmp_push_thread_limit(thr, num_teams, num_threads);
7611 }
7612 
7613 // Set the proc_bind var to use in the following parallel region.
7614 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7615   kmp_info_t *thr = __kmp_threads[gtid];
7616   thr->th.th_set_proc_bind = proc_bind;
7617 }
7618 
7619 /* Launch the worker threads into the microtask. */
7620 
7621 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7622   kmp_info_t *this_thr = __kmp_threads[gtid];
7623 
7624 #ifdef KMP_DEBUG
7625   int f;
7626 #endif /* KMP_DEBUG */
7627 
7628   KMP_DEBUG_ASSERT(team);
7629   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7630   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7631   KMP_MB(); /* Flush all pending memory write invalidates.  */
7632 
7633   team->t.t_construct = 0; /* no single directives seen yet */
7634   team->t.t_ordered.dt.t_value =
7635       0; /* thread 0 enters the ordered section first */
7636 
7637   /* Reset the identifiers on the dispatch buffer */
7638   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7639   if (team->t.t_max_nproc > 1) {
7640     int i;
7641     for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7642       team->t.t_disp_buffer[i].buffer_index = i;
7643       team->t.t_disp_buffer[i].doacross_buf_idx = i;
7644     }
7645   } else {
7646     team->t.t_disp_buffer[0].buffer_index = 0;
7647     team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7648   }
7649 
7650   KMP_MB(); /* Flush all pending memory write invalidates.  */
7651   KMP_ASSERT(this_thr->th.th_team == team);
7652 
7653 #ifdef KMP_DEBUG
7654   for (f = 0; f < team->t.t_nproc; f++) {
7655     KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7656                      team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7657   }
7658 #endif /* KMP_DEBUG */
7659 
7660   /* release the worker threads so they may begin working */
7661   __kmp_fork_barrier(gtid, 0);
7662 }
7663 
7664 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7665   kmp_info_t *this_thr = __kmp_threads[gtid];
7666 
7667   KMP_DEBUG_ASSERT(team);
7668   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7669   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7670   KMP_MB(); /* Flush all pending memory write invalidates.  */
7671 
7672   /* Join barrier after fork */
7673 
7674 #ifdef KMP_DEBUG
7675   if (__kmp_threads[gtid] &&
7676       __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7677     __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7678                  __kmp_threads[gtid]);
7679     __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7680                  "team->t.t_nproc=%d\n",
7681                  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7682                  team->t.t_nproc);
7683     __kmp_print_structure();
7684   }
7685   KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7686                    __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7687 #endif /* KMP_DEBUG */
7688 
7689   __kmp_join_barrier(gtid); /* wait for everyone */
7690 #if OMPT_SUPPORT
7691   if (ompt_enabled.enabled &&
7692       this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7693     int ds_tid = this_thr->th.th_info.ds.ds_tid;
7694     ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7695     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7696 #if OMPT_OPTIONAL
7697     void *codeptr = NULL;
7698     if (KMP_MASTER_TID(ds_tid) &&
7699         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7700          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7701       codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7702 
7703     if (ompt_enabled.ompt_callback_sync_region_wait) {
7704       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7705           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7706           codeptr);
7707     }
7708     if (ompt_enabled.ompt_callback_sync_region) {
7709       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7710           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7711           codeptr);
7712     }
7713 #endif
7714     if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7715       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7716           ompt_scope_end, NULL, task_data, 0, ds_tid,
7717           ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7718     }
7719   }
7720 #endif
7721 
7722   KMP_MB(); /* Flush all pending memory write invalidates.  */
7723   KMP_ASSERT(this_thr->th.th_team == team);
7724 }
7725 
7726 /* ------------------------------------------------------------------------ */
7727 
7728 #ifdef USE_LOAD_BALANCE
7729 
// Return the number of worker threads actively spinning in the hot team, if
// we are at the outermost level of parallelism. Otherwise, return 0.
7732 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7733   int i;
7734   int retval;
7735   kmp_team_t *hot_team;
7736 
7737   if (root->r.r_active) {
7738     return 0;
7739   }
7740   hot_team = root->r.r_hot_team;
7741   if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7742     return hot_team->t.t_nproc - 1; // Don't count primary thread
7743   }
7744 
7745   // Skip the primary thread - it is accounted for elsewhere.
7746   retval = 0;
7747   for (i = 1; i < hot_team->t.t_nproc; i++) {
7748     if (hot_team->t.t_threads[i]->th.th_active) {
7749       retval++;
7750     }
7751   }
7752   return retval;
7753 }
7754 
7755 // Perform an automatic adjustment to the number of
7756 // threads used by the next parallel region.
7757 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7758   int retval;
7759   int pool_active;
7760   int hot_team_active;
7761   int team_curr_active;
7762   int system_active;
7763 
7764   KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7765                 set_nproc));
7766   KMP_DEBUG_ASSERT(root);
7767   KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7768                        ->th.th_current_task->td_icvs.dynamic == TRUE);
7769   KMP_DEBUG_ASSERT(set_nproc > 1);
7770 
7771   if (set_nproc == 1) {
7772     KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7773     return 1;
7774   }
7775 
  // Threads that are active in the thread pool, active in the hot team for
  // this particular root (if we are at the outer par level), and the currently
  // executing thread (to become the primary thread) are available to add to
  // the new team, but are currently contributing to the system load, and must
  // be accounted for.
7781   pool_active = __kmp_thread_pool_active_nth;
7782   hot_team_active = __kmp_active_hot_team_nproc(root);
7783   team_curr_active = pool_active + hot_team_active + 1;
7784 
7785   // Check the system load.
7786   system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7787   KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7788                 "hot team active = %d\n",
7789                 system_active, pool_active, hot_team_active));
7790 
7791   if (system_active < 0) {
7792     // There was an error reading the necessary info from /proc, so use the
7793     // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7794     // = dynamic_thread_limit, we shouldn't wind up getting back here.
7795     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7796     KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7797 
7798     // Make this call behave like the thread limit algorithm.
7799     retval = __kmp_avail_proc - __kmp_nth +
7800              (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7801     if (retval > set_nproc) {
7802       retval = set_nproc;
7803     }
7804     if (retval < KMP_MIN_NTH) {
7805       retval = KMP_MIN_NTH;
7806     }
7807 
7808     KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7809                   retval));
7810     return retval;
7811   }
7812 
  // There is a slight delay in the load balance algorithm in detecting new
  // running procs. The real system load at this instant should be at least as
  // large as the number of active OMP threads available to add to the team.
7816   if (system_active < team_curr_active) {
7817     system_active = team_curr_active;
7818   }
7819   retval = __kmp_avail_proc - system_active + team_curr_active;
7820   if (retval > set_nproc) {
7821     retval = set_nproc;
7822   }
7823   if (retval < KMP_MIN_NTH) {
7824     retval = KMP_MIN_NTH;
7825   }
7826 
7827   KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7828   return retval;
7829 } // __kmp_load_balance_nproc()
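
/* Worked example (hypothetical numbers): with __kmp_avail_proc == 16,
   pool_active == 2 and hot_team_active == 1 (so team_curr_active == 4), and a
   measured system_active of 10, the code computes
     retval = 16 - 10 + 4 = 10
   and then clamps it to the range [KMP_MIN_NTH, set_nproc]. If /proc could
   not be read (system_active < 0), the thread-limit fallback above is used
   instead. */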
7830 
7831 #endif /* USE_LOAD_BALANCE */
7832 
7833 /* ------------------------------------------------------------------------ */
7834 
7835 /* NOTE: this is called with the __kmp_init_lock held */
7836 void __kmp_cleanup(void) {
7837   int f;
7838 
7839   KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7840 
7841   if (TCR_4(__kmp_init_parallel)) {
7842 #if KMP_HANDLE_SIGNALS
7843     __kmp_remove_signals();
7844 #endif
7845     TCW_4(__kmp_init_parallel, FALSE);
7846   }
7847 
7848   if (TCR_4(__kmp_init_middle)) {
7849 #if KMP_AFFINITY_SUPPORTED
7850     __kmp_affinity_uninitialize();
7851 #endif /* KMP_AFFINITY_SUPPORTED */
7852     __kmp_cleanup_hierarchy();
7853     TCW_4(__kmp_init_middle, FALSE);
7854   }
7855 
7856   KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7857 
7858   if (__kmp_init_serial) {
7859     __kmp_runtime_destroy();
7860     __kmp_init_serial = FALSE;
7861   }
7862 
7863   __kmp_cleanup_threadprivate_caches();
7864 
7865   for (f = 0; f < __kmp_threads_capacity; f++) {
7866     if (__kmp_root[f] != NULL) {
7867       __kmp_free(__kmp_root[f]);
7868       __kmp_root[f] = NULL;
7869     }
7870   }
7871   __kmp_free(__kmp_threads);
  // __kmp_threads and __kmp_root were allocated at once, as a single block, so
  // there is no need to free __kmp_root separately.
7874   __kmp_threads = NULL;
7875   __kmp_root = NULL;
7876   __kmp_threads_capacity = 0;
7877 
7878 #if KMP_USE_DYNAMIC_LOCK
7879   __kmp_cleanup_indirect_user_locks();
7880 #else
7881   __kmp_cleanup_user_locks();
7882 #endif
7883 #if OMPD_SUPPORT
7884   if (ompd_state) {
7885     __kmp_free(ompd_env_block);
7886     ompd_env_block = NULL;
7887     ompd_env_block_size = 0;
7888   }
7889 #endif
7890 
7891 #if KMP_AFFINITY_SUPPORTED
7892   KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7893   __kmp_cpuinfo_file = NULL;
7894 #endif /* KMP_AFFINITY_SUPPORTED */
7895 
7896 #if KMP_USE_ADAPTIVE_LOCKS
7897 #if KMP_DEBUG_ADAPTIVE_LOCKS
7898   __kmp_print_speculative_stats();
7899 #endif
7900 #endif
7901   KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7902   __kmp_nested_nth.nth = NULL;
7903   __kmp_nested_nth.size = 0;
7904   __kmp_nested_nth.used = 0;
7905   KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7906   __kmp_nested_proc_bind.bind_types = NULL;
7907   __kmp_nested_proc_bind.size = 0;
7908   __kmp_nested_proc_bind.used = 0;
7909   if (__kmp_affinity_format) {
7910     KMP_INTERNAL_FREE(__kmp_affinity_format);
7911     __kmp_affinity_format = NULL;
7912   }
7913 
7914   __kmp_i18n_catclose();
7915 
7916 #if KMP_USE_HIER_SCHED
7917   __kmp_hier_scheds.deallocate();
7918 #endif
7919 
7920 #if KMP_STATS_ENABLED
7921   __kmp_stats_fini();
7922 #endif
7923 
7924   KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7925 }
7926 
7927 /* ------------------------------------------------------------------------ */
7928 
7929 int __kmp_ignore_mppbeg(void) {
7930   char *env;
7931 
7932   if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7933     if (__kmp_str_match_false(env))
7934       return FALSE;
7935   }
  // By default __kmpc_begin() is a no-op.
7937   return TRUE;
7938 }
7939 
7940 int __kmp_ignore_mppend(void) {
7941   char *env;
7942 
7943   if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7944     if (__kmp_str_match_false(env))
7945       return FALSE;
7946   }
  // By default __kmpc_end() is a no-op.
7948   return TRUE;
7949 }
7950 
7951 void __kmp_internal_begin(void) {
7952   int gtid;
7953   kmp_root_t *root;
7954 
  /* this is a very important step as it will register new sibling threads
     and assign these new uber threads new gtids */
7957   gtid = __kmp_entry_gtid();
7958   root = __kmp_threads[gtid]->th.th_root;
7959   KMP_ASSERT(KMP_UBER_GTID(gtid));
7960 
7961   if (root->r.r_begin)
7962     return;
7963   __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7964   if (root->r.r_begin) {
7965     __kmp_release_lock(&root->r.r_begin_lock, gtid);
7966     return;
7967   }
7968 
7969   root->r.r_begin = TRUE;
7970 
7971   __kmp_release_lock(&root->r.r_begin_lock, gtid);
7972 }
7973 
7974 /* ------------------------------------------------------------------------ */
7975 
7976 void __kmp_user_set_library(enum library_type arg) {
7977   int gtid;
7978   kmp_root_t *root;
7979   kmp_info_t *thread;
7980 
7981   /* first, make sure we are initialized so we can get our gtid */
7982 
7983   gtid = __kmp_entry_gtid();
7984   thread = __kmp_threads[gtid];
7985 
7986   root = thread->th.th_root;
7987 
7988   KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7989                 library_serial));
7990   if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7991                                   thread */
7992     KMP_WARNING(SetLibraryIncorrectCall);
7993     return;
7994   }
7995 
7996   switch (arg) {
7997   case library_serial:
7998     thread->th.th_set_nproc = 0;
7999     set__nproc(thread, 1);
8000     break;
8001   case library_turnaround:
8002     thread->th.th_set_nproc = 0;
8003     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8004                                            : __kmp_dflt_team_nth_ub);
8005     break;
8006   case library_throughput:
8007     thread->th.th_set_nproc = 0;
8008     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8009                                            : __kmp_dflt_team_nth_ub);
8010     break;
8011   default:
8012     KMP_FATAL(UnknownLibraryType, arg);
8013   }
8014 
8015   __kmp_aux_set_library(arg);
8016 }
8017 
8018 void __kmp_aux_set_stacksize(size_t arg) {
8019   if (!__kmp_init_serial)
8020     __kmp_serial_initialize();
8021 
8022 #if KMP_OS_DARWIN
8023   if (arg & (0x1000 - 1)) {
8024     arg &= ~(0x1000 - 1);
8025     if (arg + 0x1000) /* check for overflow if we round up */
8026       arg += 0x1000;
8027   }
8028 #endif
8029   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8030 
8031   /* only change the default stacksize before the first parallel region */
8032   if (!TCR_4(__kmp_init_parallel)) {
8033     size_t value = arg; /* argument is in bytes */
8034 
8035     if (value < __kmp_sys_min_stksize)
8036       value = __kmp_sys_min_stksize;
8037     else if (value > KMP_MAX_STKSIZE)
8038       value = KMP_MAX_STKSIZE;
8039 
8040     __kmp_stksize = value;
8041 
8042     __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
8043   }
8044 
8045   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8046 }
8047 
8048 /* set the behaviour of the runtime library */
8049 /* TODO this can cause some odd behaviour with sibling parallelism... */
8050 void __kmp_aux_set_library(enum library_type arg) {
8051   __kmp_library = arg;
8052 
8053   switch (__kmp_library) {
8054   case library_serial: {
8055     KMP_INFORM(LibraryIsSerial);
8056   } break;
8057   case library_turnaround:
8058     if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
8059       __kmp_use_yield = 2; // only yield when oversubscribed
8060     break;
8061   case library_throughput:
8062     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
8063       __kmp_dflt_blocktime = 200;
8064     break;
8065   default:
8066     KMP_FATAL(UnknownLibraryType, arg);
8067   }
8068 }
8069 
/* Get team information common to all teams-construct API routines */
8071 // Returns NULL if not in teams construct
8072 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
8073   kmp_info_t *thr = __kmp_entry_thread();
8074   teams_serialized = 0;
8075   if (thr->th.th_teams_microtask) {
8076     kmp_team_t *team = thr->th.th_team;
8077     int tlevel = thr->th.th_teams_level; // the level of the teams construct
8078     int ii = team->t.t_level;
8079     teams_serialized = team->t.t_serialized;
8080     int level = tlevel + 1;
8081     KMP_DEBUG_ASSERT(ii >= tlevel);
8082     while (ii > level) {
8083       for (teams_serialized = team->t.t_serialized;
8084            (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
8085       }
8086       if (team->t.t_serialized && (!teams_serialized)) {
8087         team = team->t.t_parent;
8088         continue;
8089       }
8090       if (ii > level) {
8091         team = team->t.t_parent;
8092         ii--;
8093       }
8094     }
8095     return team;
8096   }
8097   return NULL;
8098 }
8099 
8100 int __kmp_aux_get_team_num() {
8101   int serialized;
8102   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8103   if (team) {
8104     if (serialized > 1) {
8105       return 0; // teams region is serialized ( 1 team of 1 thread ).
8106     } else {
8107       return team->t.t_master_tid;
8108     }
8109   }
8110   return 0;
8111 }
8112 
8113 int __kmp_aux_get_num_teams() {
8114   int serialized;
8115   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8116   if (team) {
8117     if (serialized > 1) {
8118       return 1;
8119     } else {
8120       return team->t.t_parent->t.t_nproc;
8121     }
8122   }
8123   return 1;
8124 }
8125 
8126 /* ------------------------------------------------------------------------ */
8127 
8128 /*
8129  * Affinity Format Parser
8130  *
8131  * Field is in form of: %[[[0].]size]type
8132  * % and type are required (%% means print a literal '%')
8133  * type is either single char or long name surrounded by {},
8134  * e.g., N or {num_threads}
8135  * 0 => leading zeros
8136  * . => right justified when size is specified
8137  * by default output is left justified
8138  * size is the *minimum* field length
8139  * All other characters are printed as is
8140  *
8141  * Available field types:
 * t {team_num}          - omp_get_team_num()
 * T {num_teams}         - omp_get_num_teams()
 * L {nesting_level}     - omp_get_level()
 * n {thread_num}        - omp_get_thread_num()
 * N {num_threads}       - omp_get_num_threads()
 * a {ancestor_tnum}     - omp_get_ancestor_thread_num(omp_get_level()-1)
 * H {host}              - name of host machine
 * P {process_id}        - process id (integer)
 * i {native_thread_id}  - native thread identifier (integer)
 * A {thread_affinity}   - comma separated list of integers or integer ranges
 *                         (values of affinity mask)
8151  *
8152  * Implementation-specific field types can be added
8153  * If a type is unknown, print "undefined"
8154  */
8155 
// Structure holding the short name, long name, and corresponding data type
// for snprintf. A table of these represents the entire set of valid keyword
// field types.
8159 typedef struct kmp_affinity_format_field_t {
8160   char short_name; // from spec e.g., L -> thread level
8161   const char *long_name; // from spec thread_level -> thread level
8162   char field_format; // data type for snprintf (typically 'd' or 's'
8163   // for integer or string)
8164 } kmp_affinity_format_field_t;
8165 
8166 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
8167 #if KMP_AFFINITY_SUPPORTED
8168     {'A', "thread_affinity", 's'},
8169 #endif
8170     {'t', "team_num", 'd'},
8171     {'T', "num_teams", 'd'},
8172     {'L', "nesting_level", 'd'},
8173     {'n', "thread_num", 'd'},
8174     {'N', "num_threads", 'd'},
8175     {'a', "ancestor_tnum", 'd'},
8176     {'H', "host", 's'},
8177     {'P', "process_id", 'd'},
8178     {'i', "native_thread_id", 'd'}};
8179 
// Return the number of characters it takes to hold the field
8181 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8182                                             const char **ptr,
8183                                             kmp_str_buf_t *field_buffer) {
8184   int rc, format_index, field_value;
8185   const char *width_left, *width_right;
8186   bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8187   static const int FORMAT_SIZE = 20;
8188   char format[FORMAT_SIZE] = {0};
8189   char absolute_short_name = 0;
8190 
8191   KMP_DEBUG_ASSERT(gtid >= 0);
8192   KMP_DEBUG_ASSERT(th);
8193   KMP_DEBUG_ASSERT(**ptr == '%');
8194   KMP_DEBUG_ASSERT(field_buffer);
8195 
8196   __kmp_str_buf_clear(field_buffer);
8197 
8198   // Skip the initial %
8199   (*ptr)++;
8200 
8201   // Check for %% first
8202   if (**ptr == '%') {
8203     __kmp_str_buf_cat(field_buffer, "%", 1);
8204     (*ptr)++; // skip over the second %
8205     return 1;
8206   }
8207 
8208   // Parse field modifiers if they are present
8209   pad_zeros = false;
8210   if (**ptr == '0') {
8211     pad_zeros = true;
8212     (*ptr)++; // skip over 0
8213   }
8214   right_justify = false;
8215   if (**ptr == '.') {
8216     right_justify = true;
8217     (*ptr)++; // skip over .
8218   }
8219   // Parse width of field: [width_left, width_right)
8220   width_left = width_right = NULL;
8221   if (**ptr >= '0' && **ptr <= '9') {
8222     width_left = *ptr;
8223     SKIP_DIGITS(*ptr);
8224     width_right = *ptr;
8225   }
8226 
8227   // Create the format for KMP_SNPRINTF based on flags parsed above
8228   format_index = 0;
8229   format[format_index++] = '%';
8230   if (!right_justify)
8231     format[format_index++] = '-';
8232   if (pad_zeros)
8233     format[format_index++] = '0';
8234   if (width_left && width_right) {
8235     int i = 0;
    // Only allow 8-digit field widths.
    // This also prevents overflowing the format buffer.
8238     while (i < 8 && width_left < width_right) {
8239       format[format_index++] = *width_left;
8240       width_left++;
8241       i++;
8242     }
8243   }
8244 
8245   // Parse a name (long or short)
8246   // Canonicalize the name into absolute_short_name
8247   found_valid_name = false;
8248   parse_long_name = (**ptr == '{');
8249   if (parse_long_name)
8250     (*ptr)++; // skip initial left brace
8251   for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8252                              sizeof(__kmp_affinity_format_table[0]);
8253        ++i) {
8254     char short_name = __kmp_affinity_format_table[i].short_name;
8255     const char *long_name = __kmp_affinity_format_table[i].long_name;
8256     char field_format = __kmp_affinity_format_table[i].field_format;
8257     if (parse_long_name) {
8258       size_t length = KMP_STRLEN(long_name);
8259       if (strncmp(*ptr, long_name, length) == 0) {
8260         found_valid_name = true;
8261         (*ptr) += length; // skip the long name
8262       }
8263     } else if (**ptr == short_name) {
8264       found_valid_name = true;
8265       (*ptr)++; // skip the short name
8266     }
8267     if (found_valid_name) {
8268       format[format_index++] = field_format;
8269       format[format_index++] = '\0';
8270       absolute_short_name = short_name;
8271       break;
8272     }
8273   }
8274   if (parse_long_name) {
8275     if (**ptr != '}') {
8276       absolute_short_name = 0;
8277     } else {
8278       (*ptr)++; // skip over the right brace
8279     }
8280   }
8281 
8282   // Attempt to fill the buffer with the requested
8283   // value using snprintf within __kmp_str_buf_print()
8284   switch (absolute_short_name) {
8285   case 't':
8286     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8287     break;
8288   case 'T':
8289     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8290     break;
8291   case 'L':
8292     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8293     break;
8294   case 'n':
8295     rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8296     break;
8297   case 'H': {
8298     static const int BUFFER_SIZE = 256;
8299     char buf[BUFFER_SIZE];
8300     __kmp_expand_host_name(buf, BUFFER_SIZE);
8301     rc = __kmp_str_buf_print(field_buffer, format, buf);
8302   } break;
8303   case 'P':
8304     rc = __kmp_str_buf_print(field_buffer, format, getpid());
8305     break;
8306   case 'i':
8307     rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8308     break;
8309   case 'N':
8310     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8311     break;
8312   case 'a':
8313     field_value =
8314         __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8315     rc = __kmp_str_buf_print(field_buffer, format, field_value);
8316     break;
8317 #if KMP_AFFINITY_SUPPORTED
8318   case 'A': {
8319     kmp_str_buf_t buf;
8320     __kmp_str_buf_init(&buf);
8321     __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8322     rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8323     __kmp_str_buf_free(&buf);
8324   } break;
8325 #endif
8326   default:
    // According to the spec, if an implementation does not have info for the
    // field type, then "undefined" is printed.
8329     rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8330     // Skip the field
8331     if (parse_long_name) {
8332       SKIP_TOKEN(*ptr);
8333       if (**ptr == '}')
8334         (*ptr)++;
8335     } else {
8336       (*ptr)++;
8337     }
8338   }
8339 
8340   KMP_ASSERT(format_index <= FORMAT_SIZE);
8341   return rc;
8342 }
8343 
8344 /*
8345  * Return number of characters needed to hold the affinity string
8346  * (not including null byte character)
8347  * The resultant string is printed to buffer, which the caller can then
8348  * handle afterwards
8349  */
8350 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8351                                   kmp_str_buf_t *buffer) {
8352   const char *parse_ptr;
8353   size_t retval;
8354   const kmp_info_t *th;
8355   kmp_str_buf_t field;
8356 
8357   KMP_DEBUG_ASSERT(buffer);
8358   KMP_DEBUG_ASSERT(gtid >= 0);
8359 
8360   __kmp_str_buf_init(&field);
8361   __kmp_str_buf_clear(buffer);
8362 
8363   th = __kmp_threads[gtid];
8364   retval = 0;
8365 
  // If format is NULL or a zero-length string, then we use the
  // affinity-format-var ICV
8368   parse_ptr = format;
8369   if (parse_ptr == NULL || *parse_ptr == '\0') {
8370     parse_ptr = __kmp_affinity_format;
8371   }
8372   KMP_DEBUG_ASSERT(parse_ptr);
8373 
8374   while (*parse_ptr != '\0') {
8375     // Parse a field
8376     if (*parse_ptr == '%') {
8377       // Put field in the buffer
8378       int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8379       __kmp_str_buf_catbuf(buffer, &field);
8380       retval += rc;
8381     } else {
8382       // Put literal character in buffer
8383       __kmp_str_buf_cat(buffer, parse_ptr, 1);
8384       retval++;
8385       parse_ptr++;
8386     }
8387   }
8388   __kmp_str_buf_free(&field);
8389   return retval;
8390 }
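
/* Illustrative usage sketch (not part of the runtime): the OpenMP 5.0 routine
   omp_capture_affinity() ultimately routes through this function. The field
   specifiers in the format string correspond to the cases handled in
   __kmp_aux_capture_affinity_field() above ('H' host, 'P' pid, 'n' thread
   number, 'A' affinity mask, ...). The format string below is only an example.

     #include <omp.h>
     #include <stdio.h>

     void show_affinity(void) {
       const char *fmt = "host=%H pid=%P thread=%n affinity={%A}";
       char buf[256];
       // Returns the number of characters needed, not counting the null byte;
       // a value >= sizeof(buf) indicates the output was truncated.
       size_t needed = omp_capture_affinity(buf, sizeof(buf), fmt);
       if (needed < sizeof(buf))
         printf("%s\n", buf);
     }
*/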
8391 
8392 // Displays the affinity string to stdout
8393 void __kmp_aux_display_affinity(int gtid, const char *format) {
8394   kmp_str_buf_t buf;
8395   __kmp_str_buf_init(&buf);
8396   __kmp_aux_capture_affinity(gtid, format, &buf);
8397   __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8398   __kmp_str_buf_free(&buf);
8399 }
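
/* Illustrative usage sketch (not part of the runtime): the same machinery
   backs the OpenMP 5.0 affinity display controls. For example, assuming a
   typical shell:

     $ OMP_DISPLAY_AFFINITY=TRUE \
       OMP_AFFINITY_FORMAT="thread %n of %N on host %H" ./a.out

   or, from code, omp_display_affinity(NULL) prints one line for the calling
   thread using the affinity-format-var ICV (a non-NULL, non-empty argument
   overrides the ICV for that call). */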
8400 
8401 /* ------------------------------------------------------------------------ */
8402 
8403 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8404   int blocktime = arg; /* argument is in milliseconds */
8405 #if KMP_USE_MONITOR
8406   int bt_intervals;
8407 #endif
8408   kmp_int8 bt_set;
8409 
8410   __kmp_save_internal_controls(thread);
8411 
8412   /* Normalize and set blocktime for the teams */
8413   if (blocktime < KMP_MIN_BLOCKTIME)
8414     blocktime = KMP_MIN_BLOCKTIME;
8415   else if (blocktime > KMP_MAX_BLOCKTIME)
8416     blocktime = KMP_MAX_BLOCKTIME;
8417 
8418   set__blocktime_team(thread->th.th_team, tid, blocktime);
8419   set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8420 
8421 #if KMP_USE_MONITOR
8422   /* Calculate and set blocktime intervals for the teams */
8423   bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8424 
8425   set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8426   set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8427 #endif
8428 
  /* Record that blocktime has been explicitly set */
8430   bt_set = TRUE;
8431 
8432   set__bt_set_team(thread->th.th_team, tid, bt_set);
8433   set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8434 #if KMP_USE_MONITOR
8435   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8436                 "bt_intervals=%d, monitor_updates=%d\n",
8437                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8438                 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8439                 __kmp_monitor_wakeups));
8440 #else
8441   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8442                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8443                 thread->th.th_team->t.t_id, tid, blocktime));
8444 #endif
8445 }
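
/* Illustrative usage sketch, assuming the kmp_* extension API exposed through
   omp.h (kmp_set_blocktime takes the value in milliseconds, matching the
   clamping to [KMP_MIN_BLOCKTIME, KMP_MAX_BLOCKTIME] above):

     #include <omp.h>

     void configure_idle_behavior(void) {
       kmp_set_blocktime(0); // idle threads go to sleep immediately
     #pragma omp parallel
       {
         // worker threads now yield/sleep instead of spin-waiting when idle
       }
     }

   The same control is available via the KMP_BLOCKTIME environment variable. */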
8446 
8447 void __kmp_aux_set_defaults(char const *str, size_t len) {
8448   if (!__kmp_init_serial) {
8449     __kmp_serial_initialize();
8450   }
8451   __kmp_env_initialize(str);
8452 
8453   if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8454     __kmp_env_print();
8455   }
8456 } // __kmp_aux_set_defaults
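
/* Illustrative usage sketch, assuming the kmp_set_defaults() extension exposed
   through omp.h; it feeds a string of KMP_- or OMP_-style settings into
   __kmp_env_initialize() exactly as this wrapper does:

     kmp_set_defaults("KMP_BLOCKTIME=0");

   As with the corresponding environment variables, such a call typically only
   takes effect if it happens before the runtime has fixed the associated
   internal control (e.g. before the first parallel region). */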
8457 
8458 /* ------------------------------------------------------------------------ */
8459 /* internal fast reduction routines */
8460 
8461 PACKED_REDUCTION_METHOD_T
8462 __kmp_determine_reduction_method(
8463     ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8464     void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8465     kmp_critical_name *lck) {
8466 
  // Default reduction method: critical construct ( lck != NULL, like in
  // current PAROPT )
  // If ( reduce_data != NULL && reduce_func != NULL ): the tree-reduction
  // method can be selected by the RTL
  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
  // can be selected by the RTL
  // Finally, it is up to the OpenMP RTL to decide which method to select among
  // those generated by PAROPT.
8475 
8476   PACKED_REDUCTION_METHOD_T retval;
8477 
8478   int team_size;
8479 
8480   KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8481   KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8482 
8483 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
8484   ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
8485 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8486 
8487   retval = critical_reduce_block;
8488 
  // an alternative way of getting the team size (with one dynamic dereference)
  // is slower
8490   team_size = __kmp_get_team_num_threads(global_tid);
8491   if (team_size == 1) {
8492 
8493     retval = empty_reduce_block;
8494 
8495   } else {
8496 
8497     int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8498 
8499 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 ||                   \
8500     KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64
8501 
8502 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
8503     KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8504 
8505     int teamsize_cutoff = 4;
8506 
8507 #if KMP_MIC_SUPPORTED
8508     if (__kmp_mic_type != non_mic) {
8509       teamsize_cutoff = 8;
8510     }
8511 #endif
8512     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8513     if (tree_available) {
8514       if (team_size <= teamsize_cutoff) {
8515         if (atomic_available) {
8516           retval = atomic_reduce_block;
8517         }
8518       } else {
8519         retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8520       }
8521     } else if (atomic_available) {
8522       retval = atomic_reduce_block;
8523     }
8524 #else
8525 #error "Unknown or unsupported OS"
8526 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8527        // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8528 
8529 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8530 
8531 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD
8532 
8533     // basic tuning
8534 
8535     if (atomic_available) {
8536       if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8537         retval = atomic_reduce_block;
8538       }
8539     } // otherwise: use critical section
8540 
8541 #elif KMP_OS_DARWIN
8542 
8543     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8544     if (atomic_available && (num_vars <= 3)) {
8545       retval = atomic_reduce_block;
8546     } else if (tree_available) {
8547       if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8548           (reduce_size < (2000 * sizeof(kmp_real64)))) {
8549         retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8550       }
8551     } // otherwise: use critical section
8552 
8553 #else
8554 #error "Unknown or unsupported OS"
8555 #endif
8556 
8557 #else
8558 #error "Unknown or unsupported architecture"
8559 #endif
8560   }
8561 
8562   // KMP_FORCE_REDUCTION
8563 
8564   // If the team is serialized (team_size == 1), ignore the forced reduction
8565   // method and stay with the unsynchronized method (empty_reduce_block)
8566   if (__kmp_force_reduction_method != reduction_method_not_defined &&
8567       team_size != 1) {
8568 
8569     PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8570 
8571     int atomic_available, tree_available;
8572 
8573     switch ((forced_retval = __kmp_force_reduction_method)) {
8574     case critical_reduce_block:
8575       KMP_ASSERT(lck); // lck should be != 0
8576       break;
8577 
8578     case atomic_reduce_block:
8579       atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8580       if (!atomic_available) {
8581         KMP_WARNING(RedMethodNotSupported, "atomic");
8582         forced_retval = critical_reduce_block;
8583       }
8584       break;
8585 
8586     case tree_reduce_block:
8587       tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8588       if (!tree_available) {
8589         KMP_WARNING(RedMethodNotSupported, "tree");
8590         forced_retval = critical_reduce_block;
8591       } else {
8592 #if KMP_FAST_REDUCTION_BARRIER
8593         forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8594 #endif
8595       }
8596       break;
8597 
8598     default:
8599       KMP_ASSERT(0); // "unsupported method specified"
8600     }
8601 
8602     retval = forced_retval;
8603   }
8604 
8605   KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8606 
8607 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
8608 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8609 
8610   return (retval);
8611 }
// This function is used for testing the set/get/determine reduce method.
8613 kmp_int32 __kmp_get_reduce_method(void) {
8614   return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8615 }
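
/* Illustrative sketch of the caller side (simplified compiler-generated shape;
   the variable names are assumptions, not part of this runtime): for a
   construct such as
       #pragma omp parallel for reduction(+ : sum)
   the compiler typically emits code of roughly this form, and the return value
   reflects the method chosen by __kmp_determine_reduction_method() above:

     switch (__kmpc_reduce_nowait(loc, gtid, 1, sizeof(sum), &reduce_data,
                                  reduce_func, &crit_lock)) {
     case 1: // critical- or tree-based path: combine privates, then finish
       sum += sum_private;
       __kmpc_end_reduce_nowait(loc, gtid, &crit_lock);
       break;
     case 2: // atomic path (atomic_reduce_block was selected)
       __kmpc_atomic_fixed4_add(loc, gtid, &sum, sum_private);
       break;
     default: // 0: nothing to do on this thread
       break;
     }
*/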
8616 
8617 // Soft pause sets up threads to ignore blocktime and just go to sleep.
8618 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
8619 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8620 
8621 // Hard pause shuts down the runtime completely.  Resume happens naturally when
8622 // OpenMP is used subsequently.
8623 void __kmp_hard_pause() {
8624   __kmp_pause_status = kmp_hard_paused;
8625   __kmp_internal_end_thread(-1);
8626 }
8627 
8628 // Soft resume sets __kmp_pause_status, and wakes up all threads.
8629 void __kmp_resume_if_soft_paused() {
8630   if (__kmp_pause_status == kmp_soft_paused) {
8631     __kmp_pause_status = kmp_not_paused;
8632 
8633     for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8634       kmp_info_t *thread = __kmp_threads[gtid];
8635       if (thread) { // Wake it if sleeping
8636         kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
8637                          thread);
8638         if (fl.is_sleeping())
8639           fl.resume(gtid);
8640         else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8641           __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8642         } else { // thread holds the lock and may sleep soon
8643           do { // until either the thread sleeps, or we can get the lock
8644             if (fl.is_sleeping()) {
8645               fl.resume(gtid);
8646               break;
8647             } else if (__kmp_try_suspend_mx(thread)) {
8648               __kmp_unlock_suspend_mx(thread);
8649               break;
8650             }
8651           } while (1);
8652         }
8653       }
8654     }
8655   }
8656 }
8657 
8658 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
8659 // TODO: add warning messages
8660 int __kmp_pause_resource(kmp_pause_status_t level) {
8661   if (level == kmp_not_paused) { // requesting resume
8662     if (__kmp_pause_status == kmp_not_paused) {
8663       // error message about runtime not being paused, so can't resume
8664       return 1;
8665     } else {
8666       KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
8667                        __kmp_pause_status == kmp_hard_paused);
8668       __kmp_pause_status = kmp_not_paused;
8669       return 0;
8670     }
8671   } else if (level == kmp_soft_paused) { // requesting soft pause
8672     if (__kmp_pause_status != kmp_not_paused) {
8673       // error message about already being paused
8674       return 1;
8675     } else {
8676       __kmp_soft_pause();
8677       return 0;
8678     }
8679   } else if (level == kmp_hard_paused) { // requesting hard pause
8680     if (__kmp_pause_status != kmp_not_paused) {
8681       // error message about already being paused
8682       return 1;
8683     } else {
8684       __kmp_hard_pause();
8685       return 0;
8686     }
8687   } else {
8688     // error message about invalid level
8689     return 1;
8690   }
8691 }
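
/* Illustrative usage sketch (not part of the runtime): the OpenMP 5.0 pause
   API reaches this function via __kmpc_pause_resource. A nonzero return means
   the request was rejected (e.g. already paused, or not paused on resume):

     #include <omp.h>

     void quiesce_runtime(void) {
       if (omp_pause_resource_all(omp_pause_soft) != 0) {
         // could not soft-pause (for example, the runtime is already paused)
       }
       // ... later, any OpenMP construct or API call resumes the runtime;
       // omp_pause_resource_all(omp_pause_hard) shuts it down completely.
     }
*/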
8692 
8693 void __kmp_omp_display_env(int verbose) {
8694   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8695   if (__kmp_init_serial == 0)
8696     __kmp_do_serial_initialize();
8697   __kmp_display_env_impl(!verbose, verbose);
8698   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8699 }
8700 
// Globals and functions for hidden helper tasks
8702 kmp_info_t **__kmp_hidden_helper_threads;
8703 kmp_info_t *__kmp_hidden_helper_main_thread;
8704 kmp_int32 __kmp_hidden_helper_threads_num = 8;
8705 std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
8706 #if KMP_OS_LINUX
8707 kmp_int32 __kmp_enable_hidden_helper = TRUE;
8708 #else
8709 kmp_int32 __kmp_enable_hidden_helper = FALSE;
8710 #endif
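
/* Hidden helper threads exist to execute "hidden helper tasks", most notably
   the tasks generated for asynchronous offloading, e.g. (illustrative only --
   whether a particular construct becomes a hidden helper task is a
   compiler/runtime decision):

     #pragma omp target nowait map(to : a[0:n]) map(from : b[0:n])
     {
       // device work proceeds while the encountering thread continues
     }
     #pragma omp taskwait
*/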
8711 
8712 namespace {
8713 std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
8714 
8715 void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
  // This is an explicit synchronization on all hidden helper threads, in case
  // a regular thread pushes a hidden helper task to a hidden helper thread
  // that has not been woken even once since the main thread released the
  // hidden helper threads after creating the team.
8720   KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
8721   while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
8722          __kmp_hidden_helper_threads_num)
8723     ;
8724 
  // If this is the main hidden helper thread, then wait for the signal
8726   if (__kmpc_master(nullptr, *gtid)) {
8727     // First, unset the initial state and release the initial thread
8728     TCW_4(__kmp_init_hidden_helper_threads, FALSE);
8729     __kmp_hidden_helper_initz_release();
8730     __kmp_hidden_helper_main_thread_wait();
8731     // Now wake up all worker threads
8732     for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
8733       __kmp_hidden_helper_worker_thread_signal();
8734     }
8735   }
8736 }
8737 } // namespace
8738 
8739 void __kmp_hidden_helper_threads_initz_routine() {
8740   // Create a new root for hidden helper team/threads
8741   const int gtid = __kmp_register_root(TRUE);
8742   __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
8743   __kmp_hidden_helper_threads = &__kmp_threads[gtid];
8744   __kmp_hidden_helper_main_thread->th.th_set_nproc =
8745       __kmp_hidden_helper_threads_num;
8746 
8747   KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
8748 
8749   __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
8750 
8751   // Set the initialization flag to FALSE
8752   TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
8753 
8754   __kmp_hidden_helper_threads_deinitz_release();
8755 }
8756 
8757 /* Nesting Mode:
8758    Set via KMP_NESTING_MODE, which takes an integer.
8759    Note: we skip duplicate topology levels, and skip levels with only
8760       one entity.
8761    KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
8762    KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
8763       in the topology, and initializes the number of threads at each of those
8764       levels to the number of entities at each level, respectively, below the
8765       entity at the parent level.
   KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
      but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
      the user to turn nesting on explicitly. This is an even more experimental
      option within an already experimental feature, and it may change or go
      away in the future.
8771 */
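
/* Illustrative example (hypothetical machine, assuming the topology is
   detected as 2 sockets x 16 cores x 2 HW threads, with no duplicate or
   single-entity levels): running with
     KMP_NESTING_MODE=1
   would enable nesting and initialize the per-level thread counts to roughly
   2, 16, 2 -- one thread per socket at the outermost level, one per core below
   each socket, and one per HW thread below each core. */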
8772 
8773 // Allocate space to store nesting levels
8774 void __kmp_init_nesting_mode() {
8775   int levels = KMP_HW_LAST;
8776   __kmp_nesting_mode_nlevels = levels;
8777   __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
8778   for (int i = 0; i < levels; ++i)
8779     __kmp_nesting_nth_level[i] = 0;
8780   if (__kmp_nested_nth.size < levels) {
8781     __kmp_nested_nth.nth =
8782         (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
8783     __kmp_nested_nth.size = levels;
8784   }
8785 }
8786 
// Set # threads for the top levels of nesting; must be called after the
// topology has been determined
8788 void __kmp_set_nesting_mode_threads() {
8789   kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];
8790 
8791   if (__kmp_nesting_mode == 1)
8792     __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
8793   else if (__kmp_nesting_mode > 1)
8794     __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
8795 
8796   if (__kmp_topology) { // use topology info
8797     int loc, hw_level;
8798     for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
8799                                 loc < __kmp_nesting_mode_nlevels;
8800          loc++, hw_level++) {
8801       __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
8802       if (__kmp_nesting_nth_level[loc] == 1)
8803         loc--;
8804     }
8805     // Make sure all cores are used
8806     if (__kmp_nesting_mode > 1 && loc > 1) {
8807       int core_level = __kmp_topology->get_level(KMP_HW_CORE);
8808       int num_cores = __kmp_topology->get_count(core_level);
8809       int upper_levels = 1;
8810       for (int level = 0; level < loc - 1; ++level)
8811         upper_levels *= __kmp_nesting_nth_level[level];
8812       if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
8813         __kmp_nesting_nth_level[loc - 1] =
8814             num_cores / __kmp_nesting_nth_level[loc - 2];
8815     }
8816     __kmp_nesting_mode_nlevels = loc;
8817     __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
  } else { // no topology info available; provide a reasonable guesstimate
8819     if (__kmp_avail_proc >= 4) {
8820       __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
8821       __kmp_nesting_nth_level[1] = 2;
8822       __kmp_nesting_mode_nlevels = 2;
8823     } else {
8824       __kmp_nesting_nth_level[0] = __kmp_avail_proc;
8825       __kmp_nesting_mode_nlevels = 1;
8826     }
8827     __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
8828   }
8829   for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
8830     __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
8831   }
8832   set__nproc(thread, __kmp_nesting_nth_level[0]);
8833   if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
8834     __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
8835   if (get__max_active_levels(thread) > 1) {
8836     // if max levels was set, set nesting mode levels to same
8837     __kmp_nesting_mode_nlevels = get__max_active_levels(thread);
8838   }
8839   if (__kmp_nesting_mode == 1) // turn on nesting for this case only
8840     set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
8841 }
8842