1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
29 #endif
30 
31 #if OMPT_SUPPORT
32 #include "ompt-specific.h"
33 #endif
34 #if OMPD_SUPPORT
35 #include "ompd-specific.h"
36 #endif
37 
38 #if OMP_PROFILING_SUPPORT
39 #include "llvm/Support/TimeProfiler.h"
40 static char *ProfileTraceFile = nullptr;
41 #endif
42 
43 /* these are temporary issues to be dealt with */
44 #define KMP_USE_PRCTL 0
45 
46 #if KMP_OS_WINDOWS
47 #include <process.h>
48 #endif
49 
50 #include "tsan_annotations.h"
51 
52 #if KMP_OS_WINDOWS
53 // windows does not need include files as it doesn't use shared memory
54 #else
55 #include <sys/mman.h>
56 #include <sys/stat.h>
57 #include <fcntl.h>
58 #define SHM_SIZE 1024
59 #endif
60 
61 #if defined(KMP_GOMP_COMPAT)
62 char const __kmp_version_alt_comp[] =
63     KMP_VERSION_PREFIX "alternative compiler support: yes";
64 #endif /* defined(KMP_GOMP_COMPAT) */
65 
66 char const __kmp_version_omp_api[] =
67     KMP_VERSION_PREFIX "API version: 5.0 (201611)";
68 
69 #ifdef KMP_DEBUG
70 char const __kmp_version_lock[] =
71     KMP_VERSION_PREFIX "lock type: run time selectable";
72 #endif /* KMP_DEBUG */
73 
74 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
75 
76 /* ------------------------------------------------------------------------ */
77 
78 #if KMP_USE_MONITOR
79 kmp_info_t __kmp_monitor;
80 #endif
81 
82 /* Forward declarations */
83 
84 void __kmp_cleanup(void);
85 
86 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
87                                   int gtid);
88 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
89                                   kmp_internal_control_t *new_icvs,
90                                   ident_t *loc);
91 #if KMP_AFFINITY_SUPPORTED
92 static void __kmp_partition_places(kmp_team_t *team,
93                                    int update_master_only = 0);
94 #endif
95 static void __kmp_do_serial_initialize(void);
96 void __kmp_fork_barrier(int gtid, int tid);
97 void __kmp_join_barrier(int gtid);
98 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
99                           kmp_internal_control_t *new_icvs, ident_t *loc);
100 
101 #ifdef USE_LOAD_BALANCE
102 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
103 #endif
104 
105 static int __kmp_expand_threads(int nNeed);
106 #if KMP_OS_WINDOWS
107 static int __kmp_unregister_root_other_thread(int gtid);
108 #endif
109 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
110 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
111 
112 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
113                                int new_nthreads);
114 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads);
115 
116 /* Calculate the identifier of the current thread */
117 /* fast (and somewhat portable) way to get unique identifier of executing
118    thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
119 int __kmp_get_global_thread_id() {
120   int i;
121   kmp_info_t **other_threads;
122   size_t stack_data;
123   char *stack_addr;
124   size_t stack_size;
125   char *stack_base;
126 
127   KA_TRACE(
128       1000,
129       ("*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
130        __kmp_nth, __kmp_all_nth));
131 
132   /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
133      a parallel region, made it return KMP_GTID_DNE to force serial_initialize
134      by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
135      __kmp_init_gtid for this to work. */
136 
137   if (!TCR_4(__kmp_init_gtid))
138     return KMP_GTID_DNE;
139 
140 #ifdef KMP_TDATA_GTID
141   if (TCR_4(__kmp_gtid_mode) >= 3) {
142     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
143     return __kmp_gtid;
144   }
145 #endif
146   if (TCR_4(__kmp_gtid_mode) >= 2) {
147     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
148     return __kmp_gtid_get_specific();
149   }
150   KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
151 
152   stack_addr = (char *)&stack_data;
153   other_threads = __kmp_threads;
154 
155   /* ATT: The code below is a source of potential bugs due to unsynchronized
156      access to __kmp_threads array. For example:
157      1. Current thread loads other_threads[i] to thr and checks it, it is
158         non-NULL.
159      2. Current thread is suspended by OS.
160      3. Another thread unregisters and finishes (debug versions of free()
161         may fill memory with something like 0xEF).
162      4. Current thread is resumed.
163      5. Current thread reads junk from *thr.
164      TODO: Fix it.  --ln  */
165 
166   for (i = 0; i < __kmp_threads_capacity; i++) {
167 
168     kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
169     if (!thr)
170       continue;
171 
172     stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
173     stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
174 
175     /* stack grows down -- search through all of the active threads */
176 
177     if (stack_addr <= stack_base) {
178       size_t stack_diff = stack_base - stack_addr;
179 
180       if (stack_diff <= stack_size) {
181         /* The only way we can be closer than the allocated */
182         /* stack size is if we are running on this thread. */
183         KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
184         return i;
185       }
186     }
187   }
188 
189   /* get specific to try and determine our gtid */
190   KA_TRACE(1000,
191            ("*** __kmp_get_global_thread_id: internal alg. failed to find "
192             "thread, using TLS\n"));
193   i = __kmp_gtid_get_specific();
194 
195   /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
196 
197   /* if we havn't been assigned a gtid, then return code */
198   if (i < 0)
199     return i;
200 
201   /* dynamically updated stack window for uber threads to avoid get_specific
202      call */
203   if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
204     KMP_FATAL(StackOverflow, i);
205   }
206 
207   stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
208   if (stack_addr > stack_base) {
209     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
210     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
211             other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
212                 stack_base);
213   } else {
214     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
215             stack_base - stack_addr);
216   }
217 
218   /* Reprint stack bounds for ubermaster since they have been refined */
219   if (__kmp_storage_map) {
220     char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
221     char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
222     __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
223                                  other_threads[i]->th.th_info.ds.ds_stacksize,
224                                  "th_%d stack (refinement)", i);
225   }
226   return i;
227 }
228 
229 int __kmp_get_global_thread_id_reg() {
230   int gtid;
231 
232   if (!__kmp_init_serial) {
233     gtid = KMP_GTID_DNE;
234   } else
235 #ifdef KMP_TDATA_GTID
236       if (TCR_4(__kmp_gtid_mode) >= 3) {
237     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
238     gtid = __kmp_gtid;
239   } else
240 #endif
241       if (TCR_4(__kmp_gtid_mode) >= 2) {
242     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
243     gtid = __kmp_gtid_get_specific();
244   } else {
245     KA_TRACE(1000,
246              ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
247     gtid = __kmp_get_global_thread_id();
248   }
249 
250   /* we must be a new uber master sibling thread */
251   if (gtid == KMP_GTID_DNE) {
252     KA_TRACE(10,
253              ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
254               "Registering a new gtid.\n"));
255     __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
256     if (!__kmp_init_serial) {
257       __kmp_do_serial_initialize();
258       gtid = __kmp_gtid_get_specific();
259     } else {
260       gtid = __kmp_register_root(FALSE);
261     }
262     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
263     /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
264   }
265 
266   KMP_DEBUG_ASSERT(gtid >= 0);
267 
268   return gtid;
269 }
270 
271 /* caller must hold forkjoin_lock */
272 void __kmp_check_stack_overlap(kmp_info_t *th) {
273   int f;
274   char *stack_beg = NULL;
275   char *stack_end = NULL;
276   int gtid;
277 
278   KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
279   if (__kmp_storage_map) {
280     stack_end = (char *)th->th.th_info.ds.ds_stackbase;
281     stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
282 
283     gtid = __kmp_gtid_from_thread(th);
284 
285     if (gtid == KMP_GTID_MONITOR) {
286       __kmp_print_storage_map_gtid(
287           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
288           "th_%s stack (%s)", "mon",
289           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
290     } else {
291       __kmp_print_storage_map_gtid(
292           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
293           "th_%d stack (%s)", gtid,
294           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
295     }
296   }
297 
298   /* No point in checking ubermaster threads since they use refinement and
299    * cannot overlap */
300   gtid = __kmp_gtid_from_thread(th);
301   if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
302     KA_TRACE(10,
303              ("__kmp_check_stack_overlap: performing extensive checking\n"));
304     if (stack_beg == NULL) {
305       stack_end = (char *)th->th.th_info.ds.ds_stackbase;
306       stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
307     }
308 
309     for (f = 0; f < __kmp_threads_capacity; f++) {
310       kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
311 
312       if (f_th && f_th != th) {
313         char *other_stack_end =
314             (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
315         char *other_stack_beg =
316             other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
317         if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
318             (stack_end > other_stack_beg && stack_end < other_stack_end)) {
319 
320           /* Print the other stack values before the abort */
321           if (__kmp_storage_map)
322             __kmp_print_storage_map_gtid(
323                 -1, other_stack_beg, other_stack_end,
324                 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
325                 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
326 
327           __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
328                       __kmp_msg_null);
329         }
330       }
331     }
332   }
333   KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
334 }
335 
336 /* ------------------------------------------------------------------------ */
337 
338 void __kmp_infinite_loop(void) {
339   static int done = FALSE;
340 
341   while (!done) {
342     KMP_YIELD(TRUE);
343   }
344 }
345 
346 #define MAX_MESSAGE 512
347 
348 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
349                                   char const *format, ...) {
350   char buffer[MAX_MESSAGE];
351   va_list ap;
352 
353   va_start(ap, format);
354   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
355                p2, (unsigned long)size, format);
356   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
357   __kmp_vprintf(kmp_err, buffer, ap);
358 #if KMP_PRINT_DATA_PLACEMENT
359   int node;
360   if (gtid >= 0) {
361     if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
362       if (__kmp_storage_map_verbose) {
363         node = __kmp_get_host_node(p1);
364         if (node < 0) /* doesn't work, so don't try this next time */
365           __kmp_storage_map_verbose = FALSE;
366         else {
367           char *last;
368           int lastNode;
369           int localProc = __kmp_get_cpu_from_gtid(gtid);
370 
371           const int page_size = KMP_GET_PAGE_SIZE();
372 
373           p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
374           p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
375           if (localProc >= 0)
376             __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
377                                  localProc >> 1);
378           else
379             __kmp_printf_no_lock("  GTID %d\n", gtid);
380 #if KMP_USE_PRCTL
381           /* The more elaborate format is disabled for now because of the prctl
382            * hanging bug. */
383           do {
384             last = p1;
385             lastNode = node;
386             /* This loop collates adjacent pages with the same host node. */
387             do {
388               (char *)p1 += page_size;
389             } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
390             __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
391                                  lastNode);
392           } while (p1 <= p2);
393 #else
394           __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
395                                (char *)p1 + (page_size - 1),
396                                __kmp_get_host_node(p1));
397           if (p1 < p2) {
398             __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
399                                  (char *)p2 + (page_size - 1),
400                                  __kmp_get_host_node(p2));
401           }
402 #endif
403         }
404       }
405     } else
406       __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
407   }
408 #endif /* KMP_PRINT_DATA_PLACEMENT */
409   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
410 }
411 
412 void __kmp_warn(char const *format, ...) {
413   char buffer[MAX_MESSAGE];
414   va_list ap;
415 
416   if (__kmp_generate_warnings == kmp_warnings_off) {
417     return;
418   }
419 
420   va_start(ap, format);
421 
422   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
423   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
424   __kmp_vprintf(kmp_err, buffer, ap);
425   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
426 
427   va_end(ap);
428 }
429 
430 void __kmp_abort_process() {
431   // Later threads may stall here, but that's ok because abort() will kill them.
432   __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
433 
434   if (__kmp_debug_buf) {
435     __kmp_dump_debug_buffer();
436   }
437 
438   if (KMP_OS_WINDOWS) {
439     // Let other threads know of abnormal termination and prevent deadlock
440     // if abort happened during library initialization or shutdown
441     __kmp_global.g.g_abort = SIGABRT;
442 
443     /* On Windows* OS by default abort() causes pop-up error box, which stalls
444        nightly testing. Unfortunately, we cannot reliably suppress pop-up error
445        boxes. _set_abort_behavior() works well, but this function is not
446        available in VS7 (this is not problem for DLL, but it is a problem for
447        static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not
448        help, at least in some versions of MS C RTL.
449 
450        It seems following sequence is the only way to simulate abort() and
451        avoid pop-up error box. */
452     raise(SIGABRT);
453     _exit(3); // Just in case, if signal ignored, exit anyway.
454   } else {
455     __kmp_unregister_library();
456     abort();
457   }
458 
459   __kmp_infinite_loop();
460   __kmp_release_bootstrap_lock(&__kmp_exit_lock);
461 
462 } // __kmp_abort_process
463 
464 void __kmp_abort_thread(void) {
465   // TODO: Eliminate g_abort global variable and this function.
466   // In case of abort just call abort(), it will kill all the threads.
467   __kmp_infinite_loop();
468 } // __kmp_abort_thread
469 
470 /* Print out the storage map for the major kmp_info_t thread data structures
471    that are allocated together. */
472 
473 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
474   __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
475                                gtid);
476 
477   __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
478                                sizeof(kmp_desc_t), "th_%d.th_info", gtid);
479 
480   __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
481                                sizeof(kmp_local_t), "th_%d.th_local", gtid);
482 
483   __kmp_print_storage_map_gtid(
484       gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
485       sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
486 
487   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
488                                &thr->th.th_bar[bs_plain_barrier + 1],
489                                sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
490                                gtid);
491 
492   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
493                                &thr->th.th_bar[bs_forkjoin_barrier + 1],
494                                sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
495                                gtid);
496 
497 #if KMP_FAST_REDUCTION_BARRIER
498   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
499                                &thr->th.th_bar[bs_reduction_barrier + 1],
500                                sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
501                                gtid);
502 #endif // KMP_FAST_REDUCTION_BARRIER
503 }
504 
505 /* Print out the storage map for the major kmp_team_t team data structures
506    that are allocated together. */
507 
508 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
509                                          int team_id, int num_thr) {
510   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
511   __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
512                                header, team_id);
513 
514   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
515                                &team->t.t_bar[bs_last_barrier],
516                                sizeof(kmp_balign_team_t) * bs_last_barrier,
517                                "%s_%d.t_bar", header, team_id);
518 
519   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
520                                &team->t.t_bar[bs_plain_barrier + 1],
521                                sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
522                                header, team_id);
523 
524   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
525                                &team->t.t_bar[bs_forkjoin_barrier + 1],
526                                sizeof(kmp_balign_team_t),
527                                "%s_%d.t_bar[forkjoin]", header, team_id);
528 
529 #if KMP_FAST_REDUCTION_BARRIER
530   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
531                                &team->t.t_bar[bs_reduction_barrier + 1],
532                                sizeof(kmp_balign_team_t),
533                                "%s_%d.t_bar[reduction]", header, team_id);
534 #endif // KMP_FAST_REDUCTION_BARRIER
535 
536   __kmp_print_storage_map_gtid(
537       -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
538       sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
539 
540   __kmp_print_storage_map_gtid(
541       -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
542       sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
543 
544   __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
545                                &team->t.t_disp_buffer[num_disp_buff],
546                                sizeof(dispatch_shared_info_t) * num_disp_buff,
547                                "%s_%d.t_disp_buffer", header, team_id);
548 }
549 
550 static void __kmp_init_allocator() {
551   __kmp_init_memkind();
552   __kmp_init_target_mem();
553 }
554 static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
555 
556 /* ------------------------------------------------------------------------ */
557 
558 #if KMP_DYNAMIC_LIB
559 #if KMP_OS_WINDOWS
560 
561 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
562   //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
563 
564   switch (fdwReason) {
565 
566   case DLL_PROCESS_ATTACH:
567     KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
568 
569     return TRUE;
570 
571   case DLL_PROCESS_DETACH:
572     KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
573 
574     // According to Windows* documentation for DllMain entry point:
575     // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
576     //   lpReserved == NULL when FreeLibrary() is called,
577     //   lpReserved != NULL when the process is terminated.
578     // When FreeLibrary() is called, worker threads remain alive. So the
579     // runtime's state is consistent and executing proper shutdown is OK.
580     // When the process is terminated, worker threads have exited or been
581     // forcefully terminated by the OS and only the shutdown thread remains.
582     // This can leave the runtime in an inconsistent state.
583     // Hence, only attempt proper cleanup when FreeLibrary() is called.
584     // Otherwise, rely on OS to reclaim resources.
585     if (lpReserved == NULL)
586       __kmp_internal_end_library(__kmp_gtid_get_specific());
587 
588     return TRUE;
589 
590   case DLL_THREAD_ATTACH:
591     KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
592 
593     /* if we want to register new siblings all the time here call
594      * __kmp_get_gtid(); */
595     return TRUE;
596 
597   case DLL_THREAD_DETACH:
598     KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
599 
600     __kmp_internal_end_thread(__kmp_gtid_get_specific());
601     return TRUE;
602   }
603 
604   return TRUE;
605 }
606 
607 #endif /* KMP_OS_WINDOWS */
608 #endif /* KMP_DYNAMIC_LIB */
609 
610 /* __kmp_parallel_deo -- Wait until it's our turn. */
611 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
612   int gtid = *gtid_ref;
613 #ifdef BUILD_PARALLEL_ORDERED
614   kmp_team_t *team = __kmp_team_from_gtid(gtid);
615 #endif /* BUILD_PARALLEL_ORDERED */
616 
617   if (__kmp_env_consistency_check) {
618     if (__kmp_threads[gtid]->th.th_root->r.r_active)
619 #if KMP_USE_DYNAMIC_LOCK
620       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
621 #else
622       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
623 #endif
624   }
625 #ifdef BUILD_PARALLEL_ORDERED
626   if (!team->t.t_serialized) {
627     KMP_MB();
628     KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
629              NULL);
630     KMP_MB();
631   }
632 #endif /* BUILD_PARALLEL_ORDERED */
633 }
634 
635 /* __kmp_parallel_dxo -- Signal the next task. */
636 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
637   int gtid = *gtid_ref;
638 #ifdef BUILD_PARALLEL_ORDERED
639   int tid = __kmp_tid_from_gtid(gtid);
640   kmp_team_t *team = __kmp_team_from_gtid(gtid);
641 #endif /* BUILD_PARALLEL_ORDERED */
642 
643   if (__kmp_env_consistency_check) {
644     if (__kmp_threads[gtid]->th.th_root->r.r_active)
645       __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
646   }
647 #ifdef BUILD_PARALLEL_ORDERED
648   if (!team->t.t_serialized) {
649     KMP_MB(); /* Flush all pending memory write invalidates.  */
650 
651     /* use the tid of the next thread in this team */
652     /* TODO replace with general release procedure */
653     team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
654 
655     KMP_MB(); /* Flush all pending memory write invalidates.  */
656   }
657 #endif /* BUILD_PARALLEL_ORDERED */
658 }
659 
660 /* ------------------------------------------------------------------------ */
661 /* The BARRIER for a SINGLE process section is always explicit   */
662 
663 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
664   int status;
665   kmp_info_t *th;
666   kmp_team_t *team;
667 
668   if (!TCR_4(__kmp_init_parallel))
669     __kmp_parallel_initialize();
670   __kmp_resume_if_soft_paused();
671 
672   th = __kmp_threads[gtid];
673   team = th->th.th_team;
674   status = 0;
675 
676   th->th.th_ident = id_ref;
677 
678   if (team->t.t_serialized) {
679     status = 1;
680   } else {
681     kmp_int32 old_this = th->th.th_local.this_construct;
682 
683     ++th->th.th_local.this_construct;
684     /* try to set team count to thread count--success means thread got the
685        single block */
686     /* TODO: Should this be acquire or release? */
687     if (team->t.t_construct == old_this) {
688       status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
689                                               th->th.th_local.this_construct);
690     }
691 #if USE_ITT_BUILD
692     if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
693         KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
694         team->t.t_active_level == 1) {
695       // Only report metadata by primary thread of active team at level 1
696       __kmp_itt_metadata_single(id_ref);
697     }
698 #endif /* USE_ITT_BUILD */
699   }
700 
701   if (__kmp_env_consistency_check) {
702     if (status && push_ws) {
703       __kmp_push_workshare(gtid, ct_psingle, id_ref);
704     } else {
705       __kmp_check_workshare(gtid, ct_psingle, id_ref);
706     }
707   }
708 #if USE_ITT_BUILD
709   if (status) {
710     __kmp_itt_single_start(gtid);
711   }
712 #endif /* USE_ITT_BUILD */
713   return status;
714 }
715 
716 void __kmp_exit_single(int gtid) {
717 #if USE_ITT_BUILD
718   __kmp_itt_single_end(gtid);
719 #endif /* USE_ITT_BUILD */
720   if (__kmp_env_consistency_check)
721     __kmp_pop_workshare(gtid, ct_psingle, NULL);
722 }
723 
724 /* determine if we can go parallel or must use a serialized parallel region and
725  * how many threads we can use
726  * set_nproc is the number of threads requested for the team
727  * returns 0 if we should serialize or only use one thread,
728  * otherwise the number of threads to use
729  * The forkjoin lock is held by the caller. */
730 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
731                                  int master_tid, int set_nthreads,
732                                  int enter_teams) {
733   int capacity;
734   int new_nthreads;
735   KMP_DEBUG_ASSERT(__kmp_init_serial);
736   KMP_DEBUG_ASSERT(root && parent_team);
737   kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
738 
739   // If dyn-var is set, dynamically adjust the number of desired threads,
740   // according to the method specified by dynamic_mode.
741   new_nthreads = set_nthreads;
742   if (!get__dynamic_2(parent_team, master_tid)) {
743     ;
744   }
745 #ifdef USE_LOAD_BALANCE
746   else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
747     new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
748     if (new_nthreads == 1) {
749       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
750                     "reservation to 1 thread\n",
751                     master_tid));
752       return 1;
753     }
754     if (new_nthreads < set_nthreads) {
755       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
756                     "reservation to %d threads\n",
757                     master_tid, new_nthreads));
758     }
759   }
760 #endif /* USE_LOAD_BALANCE */
761   else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
762     new_nthreads = __kmp_avail_proc - __kmp_nth +
763                    (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
764     if (new_nthreads <= 1) {
765       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
766                     "reservation to 1 thread\n",
767                     master_tid));
768       return 1;
769     }
770     if (new_nthreads < set_nthreads) {
771       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
772                     "reservation to %d threads\n",
773                     master_tid, new_nthreads));
774     } else {
775       new_nthreads = set_nthreads;
776     }
777   } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
778     if (set_nthreads > 2) {
779       new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
780       new_nthreads = (new_nthreads % set_nthreads) + 1;
781       if (new_nthreads == 1) {
782         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
783                       "reservation to 1 thread\n",
784                       master_tid));
785         return 1;
786       }
787       if (new_nthreads < set_nthreads) {
788         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
789                       "reservation to %d threads\n",
790                       master_tid, new_nthreads));
791       }
792     }
793   } else {
794     KMP_ASSERT(0);
795   }
796 
797   // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
798   if (__kmp_nth + new_nthreads -
799           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
800       __kmp_max_nth) {
801     int tl_nthreads = __kmp_max_nth - __kmp_nth +
802                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
803     if (tl_nthreads <= 0) {
804       tl_nthreads = 1;
805     }
806 
807     // If dyn-var is false, emit a 1-time warning.
808     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
809       __kmp_reserve_warn = 1;
810       __kmp_msg(kmp_ms_warning,
811                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
812                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
813     }
814     if (tl_nthreads == 1) {
815       KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
816                     "reduced reservation to 1 thread\n",
817                     master_tid));
818       return 1;
819     }
820     KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
821                   "reservation to %d threads\n",
822                   master_tid, tl_nthreads));
823     new_nthreads = tl_nthreads;
824   }
825 
826   // Respect OMP_THREAD_LIMIT
827   int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
828   int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
829   if (cg_nthreads + new_nthreads -
830           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
831       max_cg_threads) {
832     int tl_nthreads = max_cg_threads - cg_nthreads +
833                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
834     if (tl_nthreads <= 0) {
835       tl_nthreads = 1;
836     }
837 
838     // If dyn-var is false, emit a 1-time warning.
839     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
840       __kmp_reserve_warn = 1;
841       __kmp_msg(kmp_ms_warning,
842                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
843                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
844     }
845     if (tl_nthreads == 1) {
846       KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
847                     "reduced reservation to 1 thread\n",
848                     master_tid));
849       return 1;
850     }
851     KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
852                   "reservation to %d threads\n",
853                   master_tid, tl_nthreads));
854     new_nthreads = tl_nthreads;
855   }
856 
857   // Check if the threads array is large enough, or needs expanding.
858   // See comment in __kmp_register_root() about the adjustment if
859   // __kmp_threads[0] == NULL.
860   capacity = __kmp_threads_capacity;
861   if (TCR_PTR(__kmp_threads[0]) == NULL) {
862     --capacity;
863   }
864   // If it is not for initializing the hidden helper team, we need to take
865   // __kmp_hidden_helper_threads_num out of the capacity because it is included
866   // in __kmp_threads_capacity.
867   if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
868     capacity -= __kmp_hidden_helper_threads_num;
869   }
870   if (__kmp_nth + new_nthreads -
871           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
872       capacity) {
873     // Expand the threads array.
874     int slotsRequired = __kmp_nth + new_nthreads -
875                         (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
876                         capacity;
877     int slotsAdded = __kmp_expand_threads(slotsRequired);
878     if (slotsAdded < slotsRequired) {
879       // The threads array was not expanded enough.
880       new_nthreads -= (slotsRequired - slotsAdded);
881       KMP_ASSERT(new_nthreads >= 1);
882 
883       // If dyn-var is false, emit a 1-time warning.
884       if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
885         __kmp_reserve_warn = 1;
886         if (__kmp_tp_cached) {
887           __kmp_msg(kmp_ms_warning,
888                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
889                     KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
890                     KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
891         } else {
892           __kmp_msg(kmp_ms_warning,
893                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
894                     KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
895         }
896       }
897     }
898   }
899 
900 #ifdef KMP_DEBUG
901   if (new_nthreads == 1) {
902     KC_TRACE(10,
903              ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
904               "dead roots and rechecking; requested %d threads\n",
905               __kmp_get_gtid(), set_nthreads));
906   } else {
907     KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
908                   " %d threads\n",
909                   __kmp_get_gtid(), new_nthreads, set_nthreads));
910   }
911 #endif // KMP_DEBUG
912   return new_nthreads;
913 }
914 
915 /* Allocate threads from the thread pool and assign them to the new team. We are
916    assured that there are enough threads available, because we checked on that
917    earlier within critical section forkjoin */
918 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
919                                     kmp_info_t *master_th, int master_gtid) {
920   int i;
921   int use_hot_team;
922 
923   KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
924   KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
925   KMP_MB();
926 
927   /* first, let's setup the primary thread */
928   master_th->th.th_info.ds.ds_tid = 0;
929   master_th->th.th_team = team;
930   master_th->th.th_team_nproc = team->t.t_nproc;
931   master_th->th.th_team_master = master_th;
932   master_th->th.th_team_serialized = FALSE;
933   master_th->th.th_dispatch = &team->t.t_dispatch[0];
934 
935 /* make sure we are not the optimized hot team */
936 #if KMP_NESTED_HOT_TEAMS
937   use_hot_team = 0;
938   kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
939   if (hot_teams) { // hot teams array is not allocated if
940     // KMP_HOT_TEAMS_MAX_LEVEL=0
941     int level = team->t.t_active_level - 1; // index in array of hot teams
942     if (master_th->th.th_teams_microtask) { // are we inside the teams?
943       if (master_th->th.th_teams_size.nteams > 1) {
944         ++level; // level was not increased in teams construct for
945         // team_of_masters
946       }
947       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
948           master_th->th.th_teams_level == team->t.t_level) {
949         ++level; // level was not increased in teams construct for
950         // team_of_workers before the parallel
951       } // team->t.t_level will be increased inside parallel
952     }
953     if (level < __kmp_hot_teams_max_level) {
954       if (hot_teams[level].hot_team) {
955         // hot team has already been allocated for given level
956         KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
957         use_hot_team = 1; // the team is ready to use
958       } else {
959         use_hot_team = 0; // AC: threads are not allocated yet
960         hot_teams[level].hot_team = team; // remember new hot team
961         hot_teams[level].hot_team_nth = team->t.t_nproc;
962       }
963     } else {
964       use_hot_team = 0;
965     }
966   }
967 #else
968   use_hot_team = team == root->r.r_hot_team;
969 #endif
970   if (!use_hot_team) {
971 
972     /* install the primary thread */
973     team->t.t_threads[0] = master_th;
974     __kmp_initialize_info(master_th, team, 0, master_gtid);
975 
976     /* now, install the worker threads */
977     for (i = 1; i < team->t.t_nproc; i++) {
978 
979       /* fork or reallocate a new thread and install it in team */
980       kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
981       team->t.t_threads[i] = thr;
982       KMP_DEBUG_ASSERT(thr);
983       KMP_DEBUG_ASSERT(thr->th.th_team == team);
984       /* align team and thread arrived states */
985       KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
986                     "T#%d(%d:%d) join =%llu, plain=%llu\n",
987                     __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
988                     __kmp_gtid_from_tid(i, team), team->t.t_id, i,
989                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
990                     team->t.t_bar[bs_plain_barrier].b_arrived));
991       thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
992       thr->th.th_teams_level = master_th->th.th_teams_level;
993       thr->th.th_teams_size = master_th->th.th_teams_size;
994       { // Initialize threads' barrier data.
995         int b;
996         kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
997         for (b = 0; b < bs_last_barrier; ++b) {
998           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
999           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1000 #if USE_DEBUGGER
1001           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1002 #endif
1003         }
1004       }
1005     }
1006 
1007 #if KMP_AFFINITY_SUPPORTED
1008     __kmp_partition_places(team);
1009 #endif
1010   }
1011 
1012   if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1013     for (i = 0; i < team->t.t_nproc; i++) {
1014       kmp_info_t *thr = team->t.t_threads[i];
1015       if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1016           thr->th.th_prev_level != team->t.t_level) {
1017         team->t.t_display_affinity = 1;
1018         break;
1019       }
1020     }
1021   }
1022 
1023   KMP_MB();
1024 }
1025 
1026 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1027 // Propagate any changes to the floating point control registers out to the team
1028 // We try to avoid unnecessary writes to the relevant cache line in the team
1029 // structure, so we don't make changes unless they are needed.
1030 inline static void propagateFPControl(kmp_team_t *team) {
1031   if (__kmp_inherit_fp_control) {
1032     kmp_int16 x87_fpu_control_word;
1033     kmp_uint32 mxcsr;
1034 
1035     // Get primary thread's values of FPU control flags (both X87 and vector)
1036     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1037     __kmp_store_mxcsr(&mxcsr);
1038     mxcsr &= KMP_X86_MXCSR_MASK;
1039 
1040     // There is no point looking at t_fp_control_saved here.
1041     // If it is TRUE, we still have to update the values if they are different
1042     // from those we now have. If it is FALSE we didn't save anything yet, but
1043     // our objective is the same. We have to ensure that the values in the team
1044     // are the same as those we have.
1045     // So, this code achieves what we need whether or not t_fp_control_saved is
1046     // true. By checking whether the value needs updating we avoid unnecessary
1047     // writes that would put the cache-line into a written state, causing all
1048     // threads in the team to have to read it again.
1049     KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1050     KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1051     // Although we don't use this value, other code in the runtime wants to know
1052     // whether it should restore them. So we must ensure it is correct.
1053     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1054   } else {
1055     // Similarly here. Don't write to this cache-line in the team structure
1056     // unless we have to.
1057     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1058   }
1059 }
1060 
1061 // Do the opposite, setting the hardware registers to the updated values from
1062 // the team.
1063 inline static void updateHWFPControl(kmp_team_t *team) {
1064   if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1065     // Only reset the fp control regs if they have been changed in the team.
1066     // the parallel region that we are exiting.
1067     kmp_int16 x87_fpu_control_word;
1068     kmp_uint32 mxcsr;
1069     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1070     __kmp_store_mxcsr(&mxcsr);
1071     mxcsr &= KMP_X86_MXCSR_MASK;
1072 
1073     if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1074       __kmp_clear_x87_fpu_status_word();
1075       __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1076     }
1077 
1078     if (team->t.t_mxcsr != mxcsr) {
1079       __kmp_load_mxcsr(&team->t.t_mxcsr);
1080     }
1081   }
1082 }
1083 #else
1084 #define propagateFPControl(x) ((void)0)
1085 #define updateHWFPControl(x) ((void)0)
1086 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1087 
1088 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1089                                      int realloc); // forward declaration
1090 
1091 /* Run a parallel region that has been serialized, so runs only in a team of the
1092    single primary thread. */
1093 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1094   kmp_info_t *this_thr;
1095   kmp_team_t *serial_team;
1096 
1097   KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1098 
1099   /* Skip all this code for autopar serialized loops since it results in
1100      unacceptable overhead */
1101   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1102     return;
1103 
1104   if (!TCR_4(__kmp_init_parallel))
1105     __kmp_parallel_initialize();
1106   __kmp_resume_if_soft_paused();
1107 
1108   this_thr = __kmp_threads[global_tid];
1109   serial_team = this_thr->th.th_serial_team;
1110 
1111   /* utilize the serialized team held by this thread */
1112   KMP_DEBUG_ASSERT(serial_team);
1113   KMP_MB();
1114 
1115   if (__kmp_tasking_mode != tskm_immediate_exec) {
1116     KMP_DEBUG_ASSERT(
1117         this_thr->th.th_task_team ==
1118         this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1119     KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1120                      NULL);
1121     KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1122                   "team %p, new task_team = NULL\n",
1123                   global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1124     this_thr->th.th_task_team = NULL;
1125   }
1126 
1127   kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1128   if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1129     proc_bind = proc_bind_false;
1130   } else if (proc_bind == proc_bind_default) {
1131     // No proc_bind clause was specified, so use the current value
1132     // of proc-bind-var for this parallel region.
1133     proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1134   }
1135   // Reset for next parallel region
1136   this_thr->th.th_set_proc_bind = proc_bind_default;
1137 
1138 #if OMPT_SUPPORT
1139   ompt_data_t ompt_parallel_data = ompt_data_none;
1140   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1141   if (ompt_enabled.enabled &&
1142       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1143 
1144     ompt_task_info_t *parent_task_info;
1145     parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1146 
1147     parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1148     if (ompt_enabled.ompt_callback_parallel_begin) {
1149       int team_size = 1;
1150 
1151       ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1152           &(parent_task_info->task_data), &(parent_task_info->frame),
1153           &ompt_parallel_data, team_size,
1154           ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1155     }
1156   }
1157 #endif // OMPT_SUPPORT
1158 
1159   if (this_thr->th.th_team != serial_team) {
1160     // Nested level will be an index in the nested nthreads array
1161     int level = this_thr->th.th_team->t.t_level;
1162 
1163     if (serial_team->t.t_serialized) {
1164       /* this serial team was already used
1165          TODO increase performance by making this locks more specific */
1166       kmp_team_t *new_team;
1167 
1168       __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1169 
1170       new_team =
1171           __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1172 #if OMPT_SUPPORT
1173                               ompt_parallel_data,
1174 #endif
1175                               proc_bind, &this_thr->th.th_current_task->td_icvs,
1176                               0 USE_NESTED_HOT_ARG(NULL));
1177       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1178       KMP_ASSERT(new_team);
1179 
1180       /* setup new serialized team and install it */
1181       new_team->t.t_threads[0] = this_thr;
1182       new_team->t.t_parent = this_thr->th.th_team;
1183       serial_team = new_team;
1184       this_thr->th.th_serial_team = serial_team;
1185 
1186       KF_TRACE(
1187           10,
1188           ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1189            global_tid, serial_team));
1190 
1191       /* TODO the above breaks the requirement that if we run out of resources,
1192          then we can still guarantee that serialized teams are ok, since we may
1193          need to allocate a new one */
1194     } else {
1195       KF_TRACE(
1196           10,
1197           ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1198            global_tid, serial_team));
1199     }
1200 
1201     /* we have to initialize this serial team */
1202     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1203     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1204     KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1205     serial_team->t.t_ident = loc;
1206     serial_team->t.t_serialized = 1;
1207     serial_team->t.t_nproc = 1;
1208     serial_team->t.t_parent = this_thr->th.th_team;
1209     serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1210     this_thr->th.th_team = serial_team;
1211     serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1212 
1213     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1214                   this_thr->th.th_current_task));
1215     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1216     this_thr->th.th_current_task->td_flags.executing = 0;
1217 
1218     __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1219 
1220     /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1221        implicit task for each serialized task represented by
1222        team->t.t_serialized? */
1223     copy_icvs(&this_thr->th.th_current_task->td_icvs,
1224               &this_thr->th.th_current_task->td_parent->td_icvs);
1225 
1226     // Thread value exists in the nested nthreads array for the next nested
1227     // level
1228     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1229       this_thr->th.th_current_task->td_icvs.nproc =
1230           __kmp_nested_nth.nth[level + 1];
1231     }
1232 
1233     if (__kmp_nested_proc_bind.used &&
1234         (level + 1 < __kmp_nested_proc_bind.used)) {
1235       this_thr->th.th_current_task->td_icvs.proc_bind =
1236           __kmp_nested_proc_bind.bind_types[level + 1];
1237     }
1238 
1239 #if USE_DEBUGGER
1240     serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1241 #endif
1242     this_thr->th.th_info.ds.ds_tid = 0;
1243 
1244     /* set thread cache values */
1245     this_thr->th.th_team_nproc = 1;
1246     this_thr->th.th_team_master = this_thr;
1247     this_thr->th.th_team_serialized = 1;
1248 
1249     serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1250     serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1251     serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1252 
1253     propagateFPControl(serial_team);
1254 
1255     /* check if we need to allocate dispatch buffers stack */
1256     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1257     if (!serial_team->t.t_dispatch->th_disp_buffer) {
1258       serial_team->t.t_dispatch->th_disp_buffer =
1259           (dispatch_private_info_t *)__kmp_allocate(
1260               sizeof(dispatch_private_info_t));
1261     }
1262     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1263 
1264     KMP_MB();
1265 
1266   } else {
1267     /* this serialized team is already being used,
1268      * that's fine, just add another nested level */
1269     KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1270     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1271     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1272     ++serial_team->t.t_serialized;
1273     this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1274 
1275     // Nested level will be an index in the nested nthreads array
1276     int level = this_thr->th.th_team->t.t_level;
1277     // Thread value exists in the nested nthreads array for the next nested
1278     // level
1279     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1280       this_thr->th.th_current_task->td_icvs.nproc =
1281           __kmp_nested_nth.nth[level + 1];
1282     }
1283     serial_team->t.t_level++;
1284     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1285                   "of serial team %p to %d\n",
1286                   global_tid, serial_team, serial_team->t.t_level));
1287 
1288     /* allocate/push dispatch buffers stack */
1289     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1290     {
1291       dispatch_private_info_t *disp_buffer =
1292           (dispatch_private_info_t *)__kmp_allocate(
1293               sizeof(dispatch_private_info_t));
1294       disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1295       serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1296     }
1297     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1298 
1299     KMP_MB();
1300   }
1301   KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1302 
1303   // Perform the display affinity functionality for
1304   // serialized parallel regions
1305   if (__kmp_display_affinity) {
1306     if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1307         this_thr->th.th_prev_num_threads != 1) {
1308       // NULL means use the affinity-format-var ICV
1309       __kmp_aux_display_affinity(global_tid, NULL);
1310       this_thr->th.th_prev_level = serial_team->t.t_level;
1311       this_thr->th.th_prev_num_threads = 1;
1312     }
1313   }
1314 
1315   if (__kmp_env_consistency_check)
1316     __kmp_push_parallel(global_tid, NULL);
1317 #if OMPT_SUPPORT
1318   serial_team->t.ompt_team_info.master_return_address = codeptr;
1319   if (ompt_enabled.enabled &&
1320       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1321     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1322         OMPT_GET_FRAME_ADDRESS(0);
1323 
1324     ompt_lw_taskteam_t lw_taskteam;
1325     __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1326                             &ompt_parallel_data, codeptr);
1327 
1328     __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1329     // don't use lw_taskteam after linking. content was swaped
1330 
1331     /* OMPT implicit task begin */
1332     if (ompt_enabled.ompt_callback_implicit_task) {
1333       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1334           ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1335           OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
1336           ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1337       OMPT_CUR_TASK_INFO(this_thr)->thread_num =
1338           __kmp_tid_from_gtid(global_tid);
1339     }
1340 
1341     /* OMPT state */
1342     this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1343     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1344         OMPT_GET_FRAME_ADDRESS(0);
1345   }
1346 #endif
1347 }
1348 
1349 /* most of the work for a fork */
1350 /* return true if we really went parallel, false if serialized */
1351 int __kmp_fork_call(ident_t *loc, int gtid,
1352                     enum fork_context_e call_context, // Intel, GNU, ...
1353                     kmp_int32 argc, microtask_t microtask, launch_t invoker,
1354                     kmp_va_list ap) {
1355   void **argv;
1356   int i;
1357   int master_tid;
1358   int master_this_cons;
1359   kmp_team_t *team;
1360   kmp_team_t *parent_team;
1361   kmp_info_t *master_th;
1362   kmp_root_t *root;
1363   int nthreads;
1364   int master_active;
1365   int master_set_numthreads;
1366   int level;
1367   int active_level;
1368   int teams_level;
1369 #if KMP_NESTED_HOT_TEAMS
1370   kmp_hot_team_ptr_t **p_hot_teams;
1371 #endif
1372   { // KMP_TIME_BLOCK
1373     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1374     KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1375 
1376     KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1377     if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1378       /* Some systems prefer the stack for the root thread(s) to start with */
1379       /* some gap from the parent stack to prevent false sharing. */
1380       void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1381       /* These 2 lines below are so this does not get optimized out */
1382       if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1383         __kmp_stkpadding += (short)((kmp_int64)dummy);
1384     }
1385 
1386     /* initialize if needed */
1387     KMP_DEBUG_ASSERT(
1388         __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1389     if (!TCR_4(__kmp_init_parallel))
1390       __kmp_parallel_initialize();
1391     __kmp_resume_if_soft_paused();
1392 
1393     /* setup current data */
1394     master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1395     // shutdown
1396     parent_team = master_th->th.th_team;
1397     master_tid = master_th->th.th_info.ds.ds_tid;
1398     master_this_cons = master_th->th.th_local.this_construct;
1399     root = master_th->th.th_root;
1400     master_active = root->r.r_active;
1401     master_set_numthreads = master_th->th.th_set_nproc;
1402 
1403 #if OMPT_SUPPORT
1404     ompt_data_t ompt_parallel_data = ompt_data_none;
1405     ompt_data_t *parent_task_data;
1406     ompt_frame_t *ompt_frame;
1407     ompt_data_t *implicit_task_data;
1408     void *return_address = NULL;
1409 
1410     if (ompt_enabled.enabled) {
1411       __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1412                                     NULL, NULL);
1413       return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1414     }
1415 #endif
1416 
1417     // Assign affinity to root thread if it hasn't happened yet
1418     __kmp_assign_root_init_mask();
1419 
1420     // Nested level will be an index in the nested nthreads array
1421     level = parent_team->t.t_level;
1422     // used to launch non-serial teams even if nested is not allowed
1423     active_level = parent_team->t.t_active_level;
1424     // needed to check nesting inside the teams
1425     teams_level = master_th->th.th_teams_level;
1426 #if KMP_NESTED_HOT_TEAMS
1427     p_hot_teams = &master_th->th.th_hot_teams;
1428     if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1429       *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1430           sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1431       (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1432       // it is either actual or not needed (when active_level > 0)
1433       (*p_hot_teams)[0].hot_team_nth = 1;
1434     }
1435 #endif
1436 
1437 #if OMPT_SUPPORT
1438     if (ompt_enabled.enabled) {
1439       if (ompt_enabled.ompt_callback_parallel_begin) {
1440         int team_size = master_set_numthreads
1441                             ? master_set_numthreads
1442                             : get__nproc_2(parent_team, master_tid);
1443         int flags = OMPT_INVOKER(call_context) |
1444                     ((microtask == (microtask_t)__kmp_teams_master)
1445                          ? ompt_parallel_league
1446                          : ompt_parallel_team);
1447         ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1448             parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1449             return_address);
1450       }
1451       master_th->th.ompt_thread_info.state = ompt_state_overhead;
1452     }
1453 #endif
1454 
1455     master_th->th.th_ident = loc;
1456 
1457     if (master_th->th.th_teams_microtask && ap &&
1458         microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1459       // AC: This is start of parallel that is nested inside teams construct.
1460       // The team is actual (hot), all workers are ready at the fork barrier.
1461       // No lock needed to initialize the team a bit, then free workers.
1462       parent_team->t.t_ident = loc;
1463       __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1464       parent_team->t.t_argc = argc;
1465       argv = (void **)parent_team->t.t_argv;
1466       for (i = argc - 1; i >= 0; --i)
1467         *argv++ = va_arg(kmp_va_deref(ap), void *);
1468       // Increment our nested depth levels, but not increase the serialization
1469       if (parent_team == master_th->th.th_serial_team) {
1470         // AC: we are in serialized parallel
1471         __kmpc_serialized_parallel(loc, gtid);
1472         KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1473 
1474         if (call_context == fork_context_gnu) {
1475           // AC: need to decrement t_serialized for enquiry functions to work
1476           // correctly, will restore at join time
1477           parent_team->t.t_serialized--;
1478           return TRUE;
1479         }
1480 
1481 #if OMPD_SUPPORT
1482         parent_team->t.t_pkfn = microtask;
1483 #endif
1484 
1485 #if OMPT_SUPPORT
1486         void *dummy;
1487         void **exit_frame_p;
1488 
1489         ompt_lw_taskteam_t lw_taskteam;
1490 
1491         if (ompt_enabled.enabled) {
1492           __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1493                                   &ompt_parallel_data, return_address);
1494           exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1495 
1496           __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1497           // don't use lw_taskteam after linking. content was swaped
1498 
1499           /* OMPT implicit task begin */
1500           implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1501           if (ompt_enabled.ompt_callback_implicit_task) {
1502             OMPT_CUR_TASK_INFO(master_th)->thread_num =
1503                 __kmp_tid_from_gtid(gtid);
1504             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1505                 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1506                 implicit_task_data, 1,
1507                 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1508           }
1509 
1510           /* OMPT state */
1511           master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1512         } else {
1513           exit_frame_p = &dummy;
1514         }
1515 #endif
1516         // AC: need to decrement t_serialized for enquiry functions to work
1517         // correctly, will restore at join time
1518         parent_team->t.t_serialized--;
1519 
1520         {
1521           KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1522           KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1523           __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1524 #if OMPT_SUPPORT
1525                                  ,
1526                                  exit_frame_p
1527 #endif
1528           );
1529         }
1530 
1531 #if OMPT_SUPPORT
1532         if (ompt_enabled.enabled) {
1533           *exit_frame_p = NULL;
1534           OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1535           if (ompt_enabled.ompt_callback_implicit_task) {
1536             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1537                 ompt_scope_end, NULL, implicit_task_data, 1,
1538                 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1539           }
1540           ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1541           __ompt_lw_taskteam_unlink(master_th);
1542           if (ompt_enabled.ompt_callback_parallel_end) {
1543             ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1544                 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1545                 OMPT_INVOKER(call_context) | ompt_parallel_team,
1546                 return_address);
1547           }
1548           master_th->th.ompt_thread_info.state = ompt_state_overhead;
1549         }
1550 #endif
1551         return TRUE;
1552       }
1553 
1554       parent_team->t.t_pkfn = microtask;
1555       parent_team->t.t_invoke = invoker;
1556       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1557       parent_team->t.t_active_level++;
1558       parent_team->t.t_level++;
1559       parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1560 
1561 #if OMPT_SUPPORT
1562       if (ompt_enabled.enabled) {
1563         ompt_lw_taskteam_t lw_taskteam;
1564         __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1565                                 &ompt_parallel_data, return_address);
1566         __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1567       }
1568 #endif
1569 
1570       /* Change number of threads in the team if requested */
1571       if (master_set_numthreads) { // The parallel has num_threads clause
1572         if (master_set_numthreads <= master_th->th.th_teams_size.nth) {
1573           // AC: only can reduce number of threads dynamically, can't increase
1574           kmp_info_t **other_threads = parent_team->t.t_threads;
1575           // NOTE: if using distributed barrier, we need to run this code block
1576           // even when the team size appears not to have changed from the max.
1577           int old_proc = master_th->th.th_teams_size.nth;
1578           if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
1579               bp_dist_bar) {
1580             __kmp_resize_dist_barrier(parent_team, old_proc,
1581                                       master_set_numthreads);
1582             __kmp_add_threads_to_team(parent_team, master_set_numthreads);
1583           }
1584           parent_team->t.t_nproc = master_set_numthreads;
1585           for (i = 0; i < master_set_numthreads; ++i) {
1586             other_threads[i]->th.th_team_nproc = master_set_numthreads;
1587           }
1588         }
1589         // Keep extra threads hot in the team for possible next parallels
1590         master_th->th.th_set_nproc = 0;
1591       }
1592 
1593 #if USE_DEBUGGER
1594       if (__kmp_debugging) { // Let debugger override number of threads.
1595         int nth = __kmp_omp_num_threads(loc);
1596         if (nth > 0) { // 0 means debugger doesn't want to change num threads
1597           master_set_numthreads = nth;
1598         }
1599       }
1600 #endif
1601 
1602 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1603       if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1604            KMP_ITT_DEBUG) &&
1605           __kmp_forkjoin_frames_mode == 3 &&
1606           parent_team->t.t_active_level == 1 // only report frames at level 1
1607           && master_th->th.th_teams_size.nteams == 1) {
1608         kmp_uint64 tmp_time = __itt_get_timestamp();
1609         master_th->th.th_frame_time = tmp_time;
1610         parent_team->t.t_region_time = tmp_time;
1611       }
1612       if (__itt_stack_caller_create_ptr) {
1613         KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
1614         // create new stack stitching id before entering fork barrier
1615         parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1616       }
1617 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1618 
1619       KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1620                     "master_th=%p, gtid=%d\n",
1621                     root, parent_team, master_th, gtid));
1622       __kmp_internal_fork(loc, gtid, parent_team);
1623       KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1624                     "master_th=%p, gtid=%d\n",
1625                     root, parent_team, master_th, gtid));
1626 
1627       if (call_context == fork_context_gnu)
1628         return TRUE;
1629 
1630       /* Invoke microtask for PRIMARY thread */
1631       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1632                     parent_team->t.t_id, parent_team->t.t_pkfn));
1633 
1634       if (!parent_team->t.t_invoke(gtid)) {
1635         KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
1636       }
1637       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1638                     parent_team->t.t_id, parent_team->t.t_pkfn));
1639       KMP_MB(); /* Flush all pending memory write invalidates.  */
1640 
1641       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1642 
1643       return TRUE;
1644     } // Parallel closely nested in teams construct
1645 
1646 #if KMP_DEBUG
1647     if (__kmp_tasking_mode != tskm_immediate_exec) {
1648       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1649                        parent_team->t.t_task_team[master_th->th.th_task_state]);
1650     }
1651 #endif
1652 
1653     // Need this to happen before we determine the number of threads, not while
1654     // we are allocating the team
1655     //__kmp_push_current_task_to_thread(master_th, parent_team, 0);
1656     int enter_teams = 0;
1657     if (parent_team->t.t_active_level >=
1658         master_th->th.th_current_task->td_icvs.max_active_levels) {
1659       nthreads = 1;
1660     } else {
1661       enter_teams = ((ap == NULL && active_level == 0) ||
1662                      (ap && teams_level > 0 && teams_level == level));
1663       nthreads = master_set_numthreads
1664                      ? master_set_numthreads
1665                      // TODO: get nproc directly from current task
1666                      : get__nproc_2(parent_team, master_tid);
1667       // Check if we need to take forkjoin lock? (no need for serialized
1668       // parallel out of teams construct). This code moved here from
1669       // __kmp_reserve_threads() to speedup nested serialized parallels.
1670       if (nthreads > 1) {
1671         if ((get__max_active_levels(master_th) == 1 &&
1672              (root->r.r_in_parallel && !enter_teams)) ||
1673             (__kmp_library == library_serial)) {
1674           KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1675                         " threads\n",
1676                         gtid, nthreads));
1677           nthreads = 1;
1678         }
1679       }
1680       if (nthreads > 1) {
1681         /* determine how many new threads we can use */
1682         __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1683         /* AC: If we execute teams from parallel region (on host), then teams
1684            should be created but each can only have 1 thread if nesting is
1685            disabled. If teams called from serial region, then teams and their
1686            threads should be created regardless of the nesting setting. */
1687         nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
1688                                          nthreads, enter_teams);
1689         if (nthreads == 1) {
1690           // Free lock for single thread execution here; for multi-thread
1691           // execution it will be freed later after team of threads created
1692           // and initialized
1693           __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1694         }
1695       }
1696     }
1697     KMP_DEBUG_ASSERT(nthreads > 0);
1698 
1699     // If we temporarily changed the set number of threads then restore it now
1700     master_th->th.th_set_nproc = 0;
1701 
1702     /* create a serialized parallel region? */
1703     if (nthreads == 1) {
1704 /* josh todo: hypothetical question: what do we do for OS X*? */
1705 #if KMP_OS_LINUX &&                                                            \
1706     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1707       void *args[argc];
1708 #else
1709       void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1710 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1711           KMP_ARCH_AARCH64) */
1712 
1713       KA_TRACE(20,
1714                ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1715 
1716       __kmpc_serialized_parallel(loc, gtid);
1717 
1718 #if OMPD_SUPPORT
1719       master_th->th.th_serial_team->t.t_pkfn = microtask;
1720 #endif
1721 
1722       if (call_context == fork_context_intel) {
1723         /* TODO this sucks, use the compiler itself to pass args! :) */
1724         master_th->th.th_serial_team->t.t_ident = loc;
1725         if (!ap) {
1726           // revert change made in __kmpc_serialized_parallel()
1727           master_th->th.th_serial_team->t.t_level--;
1728           // Get args from parent team for teams construct
1729 
1730 #if OMPT_SUPPORT
1731           void *dummy;
1732           void **exit_frame_p;
1733           ompt_task_info_t *task_info;
1734 
1735           ompt_lw_taskteam_t lw_taskteam;
1736 
1737           if (ompt_enabled.enabled) {
1738             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1739                                     &ompt_parallel_data, return_address);
1740 
1741             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1742             // don't use lw_taskteam after linking. content was swaped
1743 
1744             task_info = OMPT_CUR_TASK_INFO(master_th);
1745             exit_frame_p = &(task_info->frame.exit_frame.ptr);
1746             if (ompt_enabled.ompt_callback_implicit_task) {
1747               OMPT_CUR_TASK_INFO(master_th)->thread_num =
1748                   __kmp_tid_from_gtid(gtid);
1749               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1750                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1751                   &(task_info->task_data), 1,
1752                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1753                   ompt_task_implicit);
1754             }
1755 
1756             /* OMPT state */
1757             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1758           } else {
1759             exit_frame_p = &dummy;
1760           }
1761 #endif
1762 
1763           {
1764             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1765             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1766             __kmp_invoke_microtask(microtask, gtid, 0, argc,
1767                                    parent_team->t.t_argv
1768 #if OMPT_SUPPORT
1769                                    ,
1770                                    exit_frame_p
1771 #endif
1772             );
1773           }
1774 
1775 #if OMPT_SUPPORT
1776           if (ompt_enabled.enabled) {
1777             *exit_frame_p = NULL;
1778             if (ompt_enabled.ompt_callback_implicit_task) {
1779               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1780                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1781                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1782                   ompt_task_implicit);
1783             }
1784             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1785             __ompt_lw_taskteam_unlink(master_th);
1786             if (ompt_enabled.ompt_callback_parallel_end) {
1787               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1788                   &ompt_parallel_data, parent_task_data,
1789                   OMPT_INVOKER(call_context) | ompt_parallel_team,
1790                   return_address);
1791             }
1792             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1793           }
1794 #endif
1795         } else if (microtask == (microtask_t)__kmp_teams_master) {
1796           KMP_DEBUG_ASSERT(master_th->th.th_team ==
1797                            master_th->th.th_serial_team);
1798           team = master_th->th.th_team;
1799           // team->t.t_pkfn = microtask;
1800           team->t.t_invoke = invoker;
1801           __kmp_alloc_argv_entries(argc, team, TRUE);
1802           team->t.t_argc = argc;
1803           argv = (void **)team->t.t_argv;
1804           if (ap) {
1805             for (i = argc - 1; i >= 0; --i)
1806               *argv++ = va_arg(kmp_va_deref(ap), void *);
1807           } else {
1808             for (i = 0; i < argc; ++i)
1809               // Get args from parent team for teams construct
1810               argv[i] = parent_team->t.t_argv[i];
1811           }
1812           // AC: revert change made in __kmpc_serialized_parallel()
1813           //     because initial code in teams should have level=0
1814           team->t.t_level--;
1815           // AC: call special invoker for outer "parallel" of teams construct
1816           invoker(gtid);
1817 #if OMPT_SUPPORT
1818           if (ompt_enabled.enabled) {
1819             ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1820             if (ompt_enabled.ompt_callback_implicit_task) {
1821               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1822                   ompt_scope_end, NULL, &(task_info->task_data), 0,
1823                   OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1824             }
1825             if (ompt_enabled.ompt_callback_parallel_end) {
1826               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1827                   &ompt_parallel_data, parent_task_data,
1828                   OMPT_INVOKER(call_context) | ompt_parallel_league,
1829                   return_address);
1830             }
1831             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1832           }
1833 #endif
1834         } else {
1835           argv = args;
1836           for (i = argc - 1; i >= 0; --i)
1837             *argv++ = va_arg(kmp_va_deref(ap), void *);
1838           KMP_MB();
1839 
1840 #if OMPT_SUPPORT
1841           void *dummy;
1842           void **exit_frame_p;
1843           ompt_task_info_t *task_info;
1844 
1845           ompt_lw_taskteam_t lw_taskteam;
1846 
1847           if (ompt_enabled.enabled) {
1848             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1849                                     &ompt_parallel_data, return_address);
1850             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1851             // don't use lw_taskteam after linking. content was swaped
1852             task_info = OMPT_CUR_TASK_INFO(master_th);
1853             exit_frame_p = &(task_info->frame.exit_frame.ptr);
1854 
1855             /* OMPT implicit task begin */
1856             implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1857             if (ompt_enabled.ompt_callback_implicit_task) {
1858               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1859                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1860                   implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1861                   ompt_task_implicit);
1862               OMPT_CUR_TASK_INFO(master_th)->thread_num =
1863                   __kmp_tid_from_gtid(gtid);
1864             }
1865 
1866             /* OMPT state */
1867             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1868           } else {
1869             exit_frame_p = &dummy;
1870           }
1871 #endif
1872 
1873           {
1874             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1875             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1876             __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1877 #if OMPT_SUPPORT
1878                                    ,
1879                                    exit_frame_p
1880 #endif
1881             );
1882           }
1883 
1884 #if OMPT_SUPPORT
1885           if (ompt_enabled.enabled) {
1886             *exit_frame_p = NULL;
1887             if (ompt_enabled.ompt_callback_implicit_task) {
1888               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1889                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1890                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1891                   ompt_task_implicit);
1892             }
1893 
1894             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1895             __ompt_lw_taskteam_unlink(master_th);
1896             if (ompt_enabled.ompt_callback_parallel_end) {
1897               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1898                   &ompt_parallel_data, parent_task_data,
1899                   OMPT_INVOKER(call_context) | ompt_parallel_team,
1900                   return_address);
1901             }
1902             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1903           }
1904 #endif
1905         }
1906       } else if (call_context == fork_context_gnu) {
1907 #if OMPT_SUPPORT
1908         ompt_lw_taskteam_t lwt;
1909         __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1910                                 return_address);
1911 
1912         lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1913         __ompt_lw_taskteam_link(&lwt, master_th, 1);
1914 // don't use lw_taskteam after linking. content was swaped
1915 #endif
1916 
1917         // we were called from GNU native code
1918         KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1919         return FALSE;
1920       } else {
1921         KMP_ASSERT2(call_context < fork_context_last,
1922                     "__kmp_fork_call: unknown fork_context parameter");
1923       }
1924 
1925       KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1926       KMP_MB();
1927       return FALSE;
1928     } // if (nthreads == 1)
1929 
1930     // GEH: only modify the executing flag in the case when not serialized
1931     //      serialized case is handled in kmpc_serialized_parallel
1932     KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1933                   "curtask=%p, curtask_max_aclevel=%d\n",
1934                   parent_team->t.t_active_level, master_th,
1935                   master_th->th.th_current_task,
1936                   master_th->th.th_current_task->td_icvs.max_active_levels));
1937     // TODO: GEH - cannot do this assertion because root thread not set up as
1938     // executing
1939     // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1940     master_th->th.th_current_task->td_flags.executing = 0;
1941 
1942     if (!master_th->th.th_teams_microtask || level > teams_level) {
1943       /* Increment our nested depth level */
1944       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1945     }
1946 
1947     // See if we need to make a copy of the ICVs.
1948     int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1949     if ((level + 1 < __kmp_nested_nth.used) &&
1950         (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1951       nthreads_icv = __kmp_nested_nth.nth[level + 1];
1952     } else {
1953       nthreads_icv = 0; // don't update
1954     }
1955 
1956     // Figure out the proc_bind_policy for the new team.
1957     kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1958     kmp_proc_bind_t proc_bind_icv =
1959         proc_bind_default; // proc_bind_default means don't update
1960     if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1961       proc_bind = proc_bind_false;
1962     } else {
1963       if (proc_bind == proc_bind_default) {
1964         // No proc_bind clause specified; use current proc-bind-var for this
1965         // parallel region
1966         proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1967       }
1968       /* else: The proc_bind policy was specified explicitly on parallel clause.
1969          This overrides proc-bind-var for this parallel region, but does not
1970          change proc-bind-var. */
1971       // Figure the value of proc-bind-var for the child threads.
1972       if ((level + 1 < __kmp_nested_proc_bind.used) &&
1973           (__kmp_nested_proc_bind.bind_types[level + 1] !=
1974            master_th->th.th_current_task->td_icvs.proc_bind)) {
1975         proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1976       }
1977     }
1978 
1979     // Reset for next parallel region
1980     master_th->th.th_set_proc_bind = proc_bind_default;
1981 
1982     if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
1983       kmp_internal_control_t new_icvs;
1984       copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
1985       new_icvs.next = NULL;
1986       if (nthreads_icv > 0) {
1987         new_icvs.nproc = nthreads_icv;
1988       }
1989       if (proc_bind_icv != proc_bind_default) {
1990         new_icvs.proc_bind = proc_bind_icv;
1991       }
1992 
1993       /* allocate a new parallel team */
1994       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
1995       team = __kmp_allocate_team(root, nthreads, nthreads,
1996 #if OMPT_SUPPORT
1997                                  ompt_parallel_data,
1998 #endif
1999                                  proc_bind, &new_icvs,
2000                                  argc USE_NESTED_HOT_ARG(master_th));
2001       if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2002         copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs);
2003     } else {
2004       /* allocate a new parallel team */
2005       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2006       team = __kmp_allocate_team(root, nthreads, nthreads,
2007 #if OMPT_SUPPORT
2008                                  ompt_parallel_data,
2009 #endif
2010                                  proc_bind,
2011                                  &master_th->th.th_current_task->td_icvs,
2012                                  argc USE_NESTED_HOT_ARG(master_th));
2013       if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2014         copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs,
2015                   &master_th->th.th_current_task->td_icvs);
2016     }
2017     KF_TRACE(
2018         10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2019 
2020     /* setup the new team */
2021     KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2022     KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2023     KMP_CHECK_UPDATE(team->t.t_ident, loc);
2024     KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2025     KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2026 #if OMPT_SUPPORT
2027     KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2028                           return_address);
2029 #endif
2030     KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2031     // TODO: parent_team->t.t_level == INT_MAX ???
2032     if (!master_th->th.th_teams_microtask || level > teams_level) {
2033       int new_level = parent_team->t.t_level + 1;
2034       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2035       new_level = parent_team->t.t_active_level + 1;
2036       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2037     } else {
2038       // AC: Do not increase parallel level at start of the teams construct
2039       int new_level = parent_team->t.t_level;
2040       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2041       new_level = parent_team->t.t_active_level;
2042       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2043     }
2044     kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2045     // set primary thread's schedule as new run-time schedule
2046     KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2047 
2048     KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2049     KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2050 
2051     // Update the floating point rounding in the team if required.
2052     propagateFPControl(team);
2053 #if OMPD_SUPPORT
2054     if (ompd_state & OMPD_ENABLE_BP)
2055       ompd_bp_parallel_begin();
2056 #endif
2057 
2058     if (__kmp_tasking_mode != tskm_immediate_exec) {
2059       // Set primary thread's task team to team's task team. Unless this is hot
2060       // team, it should be NULL.
2061       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2062                        parent_team->t.t_task_team[master_th->th.th_task_state]);
2063       KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team "
2064                     "%p, new task_team %p / team %p\n",
2065                     __kmp_gtid_from_thread(master_th),
2066                     master_th->th.th_task_team, parent_team,
2067                     team->t.t_task_team[master_th->th.th_task_state], team));
2068 
2069       if (active_level || master_th->th.th_task_team) {
2070         // Take a memo of primary thread's task_state
2071         KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2072         if (master_th->th.th_task_state_top >=
2073             master_th->th.th_task_state_stack_sz) { // increase size
2074           kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2075           kmp_uint8 *old_stack, *new_stack;
2076           kmp_uint32 i;
2077           new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2078           for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2079             new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2080           }
2081           for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2082                ++i) { // zero-init rest of stack
2083             new_stack[i] = 0;
2084           }
2085           old_stack = master_th->th.th_task_state_memo_stack;
2086           master_th->th.th_task_state_memo_stack = new_stack;
2087           master_th->th.th_task_state_stack_sz = new_size;
2088           __kmp_free(old_stack);
2089         }
2090         // Store primary thread's task_state on stack
2091         master_th->th
2092             .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2093             master_th->th.th_task_state;
2094         master_th->th.th_task_state_top++;
2095 #if KMP_NESTED_HOT_TEAMS
2096         if (master_th->th.th_hot_teams &&
2097             active_level < __kmp_hot_teams_max_level &&
2098             team == master_th->th.th_hot_teams[active_level].hot_team) {
2099           // Restore primary thread's nested state if nested hot team
2100           master_th->th.th_task_state =
2101               master_th->th
2102                   .th_task_state_memo_stack[master_th->th.th_task_state_top];
2103         } else {
2104 #endif
2105           master_th->th.th_task_state = 0;
2106 #if KMP_NESTED_HOT_TEAMS
2107         }
2108 #endif
2109       }
2110 #if !KMP_NESTED_HOT_TEAMS
2111       KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2112                        (team == root->r.r_hot_team));
2113 #endif
2114     }
2115 
2116     KA_TRACE(
2117         20,
2118         ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2119          gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2120          team->t.t_nproc));
2121     KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2122                      (team->t.t_master_tid == 0 &&
2123                       (team->t.t_parent == root->r.r_root_team ||
2124                        team->t.t_parent->t.t_serialized)));
2125     KMP_MB();
2126 
2127     /* now, setup the arguments */
2128     argv = (void **)team->t.t_argv;
2129     if (ap) {
2130       for (i = argc - 1; i >= 0; --i) {
2131         void *new_argv = va_arg(kmp_va_deref(ap), void *);
2132         KMP_CHECK_UPDATE(*argv, new_argv);
2133         argv++;
2134       }
2135     } else {
2136       for (i = 0; i < argc; ++i) {
2137         // Get args from parent team for teams construct
2138         KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2139       }
2140     }
2141 
2142     /* now actually fork the threads */
2143     KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2144     if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2145       root->r.r_active = TRUE;
2146 
2147     __kmp_fork_team_threads(root, team, master_th, gtid);
2148     __kmp_setup_icv_copy(team, nthreads,
2149                          &master_th->th.th_current_task->td_icvs, loc);
2150 
2151 #if OMPT_SUPPORT
2152     master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2153 #endif
2154 
2155     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2156 
2157 #if USE_ITT_BUILD
2158     if (team->t.t_active_level == 1 // only report frames at level 1
2159         && !master_th->th.th_teams_microtask) { // not in teams construct
2160 #if USE_ITT_NOTIFY
2161       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2162           (__kmp_forkjoin_frames_mode == 3 ||
2163            __kmp_forkjoin_frames_mode == 1)) {
2164         kmp_uint64 tmp_time = 0;
2165         if (__itt_get_timestamp_ptr)
2166           tmp_time = __itt_get_timestamp();
2167         // Internal fork - report frame begin
2168         master_th->th.th_frame_time = tmp_time;
2169         if (__kmp_forkjoin_frames_mode == 3)
2170           team->t.t_region_time = tmp_time;
2171       } else
2172 // only one notification scheme (either "submit" or "forking/joined", not both)
2173 #endif /* USE_ITT_NOTIFY */
2174           if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2175               __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2176         // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2177         __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2178       }
2179     }
2180 #endif /* USE_ITT_BUILD */
2181 
2182     /* now go on and do the work */
2183     KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2184     KMP_MB();
2185     KF_TRACE(10,
2186              ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2187               root, team, master_th, gtid));
2188 
2189 #if USE_ITT_BUILD
2190     if (__itt_stack_caller_create_ptr) {
2191       // create new stack stitching id before entering fork barrier
2192       if (!enter_teams) {
2193         KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
2194         team->t.t_stack_id = __kmp_itt_stack_caller_create();
2195       } else if (parent_team->t.t_serialized) {
2196         // keep stack stitching id in the serialized parent_team;
2197         // current team will be used for parallel inside the teams;
2198         // if parent_team is active, then it already keeps stack stitching id
2199         // for the league of teams
2200         KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
2201         parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
2202       }
2203     }
2204 #endif /* USE_ITT_BUILD */
2205 
2206     // AC: skip __kmp_internal_fork at teams construct, let only primary
2207     // threads execute
2208     if (ap) {
2209       __kmp_internal_fork(loc, gtid, team);
2210       KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2211                     "master_th=%p, gtid=%d\n",
2212                     root, team, master_th, gtid));
2213     }
2214 
2215     if (call_context == fork_context_gnu) {
2216       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2217       return TRUE;
2218     }
2219 
2220     /* Invoke microtask for PRIMARY thread */
2221     KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2222                   team->t.t_id, team->t.t_pkfn));
2223   } // END of timer KMP_fork_call block
2224 
2225 #if KMP_STATS_ENABLED
2226   // If beginning a teams construct, then change thread state
2227   stats_state_e previous_state = KMP_GET_THREAD_STATE();
2228   if (!ap) {
2229     KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2230   }
2231 #endif
2232 
2233   if (!team->t.t_invoke(gtid)) {
2234     KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
2235   }
2236 
2237 #if KMP_STATS_ENABLED
2238   // If was beginning of a teams construct, then reset thread state
2239   if (!ap) {
2240     KMP_SET_THREAD_STATE(previous_state);
2241   }
2242 #endif
2243 
2244   KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2245                 team->t.t_id, team->t.t_pkfn));
2246   KMP_MB(); /* Flush all pending memory write invalidates.  */
2247 
2248   KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2249 #if OMPT_SUPPORT
2250   if (ompt_enabled.enabled) {
2251     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2252   }
2253 #endif
2254 
2255   return TRUE;
2256 }
2257 
2258 #if OMPT_SUPPORT
2259 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2260                                             kmp_team_t *team) {
2261   // restore state outside the region
2262   thread->th.ompt_thread_info.state =
2263       ((team->t.t_serialized) ? ompt_state_work_serial
2264                               : ompt_state_work_parallel);
2265 }
2266 
2267 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2268                                    kmp_team_t *team, ompt_data_t *parallel_data,
2269                                    int flags, void *codeptr) {
2270   ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2271   if (ompt_enabled.ompt_callback_parallel_end) {
2272     ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2273         parallel_data, &(task_info->task_data), flags, codeptr);
2274   }
2275 
2276   task_info->frame.enter_frame = ompt_data_none;
2277   __kmp_join_restore_state(thread, team);
2278 }
2279 #endif
2280 
2281 void __kmp_join_call(ident_t *loc, int gtid
2282 #if OMPT_SUPPORT
2283                      ,
2284                      enum fork_context_e fork_context
2285 #endif
2286                      ,
2287                      int exit_teams) {
2288   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2289   kmp_team_t *team;
2290   kmp_team_t *parent_team;
2291   kmp_info_t *master_th;
2292   kmp_root_t *root;
2293   int master_active;
2294 
2295   KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2296 
2297   /* setup current data */
2298   master_th = __kmp_threads[gtid];
2299   root = master_th->th.th_root;
2300   team = master_th->th.th_team;
2301   parent_team = team->t.t_parent;
2302 
2303   master_th->th.th_ident = loc;
2304 
2305 #if OMPT_SUPPORT
2306   void *team_microtask = (void *)team->t.t_pkfn;
2307   // For GOMP interface with serialized parallel, need the
2308   // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task
2309   // and end-parallel events.
2310   if (ompt_enabled.enabled &&
2311       !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2312     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2313   }
2314 #endif
2315 
2316 #if KMP_DEBUG
2317   if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2318     KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2319                   "th_task_team = %p\n",
2320                   __kmp_gtid_from_thread(master_th), team,
2321                   team->t.t_task_team[master_th->th.th_task_state],
2322                   master_th->th.th_task_team));
2323     KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2324                      team->t.t_task_team[master_th->th.th_task_state]);
2325   }
2326 #endif
2327 
2328   if (team->t.t_serialized) {
2329     if (master_th->th.th_teams_microtask) {
2330       // We are in teams construct
2331       int level = team->t.t_level;
2332       int tlevel = master_th->th.th_teams_level;
2333       if (level == tlevel) {
2334         // AC: we haven't incremented it earlier at start of teams construct,
2335         //     so do it here - at the end of teams construct
2336         team->t.t_level++;
2337       } else if (level == tlevel + 1) {
2338         // AC: we are exiting parallel inside teams, need to increment
2339         // serialization in order to restore it in the next call to
2340         // __kmpc_end_serialized_parallel
2341         team->t.t_serialized++;
2342       }
2343     }
2344     __kmpc_end_serialized_parallel(loc, gtid);
2345 
2346 #if OMPT_SUPPORT
2347     if (ompt_enabled.enabled) {
2348       __kmp_join_restore_state(master_th, parent_team);
2349     }
2350 #endif
2351 
2352     return;
2353   }
2354 
2355   master_active = team->t.t_master_active;
2356 
2357   if (!exit_teams) {
2358     // AC: No barrier for internal teams at exit from teams construct.
2359     //     But there is barrier for external team (league).
2360     __kmp_internal_join(loc, gtid, team);
2361 #if USE_ITT_BUILD
2362     if (__itt_stack_caller_create_ptr) {
2363       KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
2364       // destroy the stack stitching id after join barrier
2365       __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2366       team->t.t_stack_id = NULL;
2367     }
2368 #endif
2369   } else {
2370     master_th->th.th_task_state =
2371         0; // AC: no tasking in teams (out of any parallel)
2372 #if USE_ITT_BUILD
2373     if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
2374       KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
2375       // destroy the stack stitching id on exit from the teams construct
2376       // if parent_team is active, then the id will be destroyed later on
2377       // by master of the league of teams
2378       __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
2379       parent_team->t.t_stack_id = NULL;
2380     }
2381 #endif
2382 
2383     if (team->t.t_nproc > 1 &&
2384         __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2385       team->t.b->update_num_threads(team->t.t_nproc);
2386       __kmp_add_threads_to_team(team, team->t.t_nproc);
2387     }
2388   }
2389 
2390   KMP_MB();
2391 
2392 #if OMPT_SUPPORT
2393   ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2394   void *codeptr = team->t.ompt_team_info.master_return_address;
2395 #endif
2396 
2397 #if USE_ITT_BUILD
2398   // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2399   if (team->t.t_active_level == 1 &&
2400       (!master_th->th.th_teams_microtask || /* not in teams construct */
2401        master_th->th.th_teams_size.nteams == 1)) {
2402     master_th->th.th_ident = loc;
2403     // only one notification scheme (either "submit" or "forking/joined", not
2404     // both)
2405     if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2406         __kmp_forkjoin_frames_mode == 3)
2407       __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2408                              master_th->th.th_frame_time, 0, loc,
2409                              master_th->th.th_team_nproc, 1);
2410     else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2411              !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2412       __kmp_itt_region_joined(gtid);
2413   } // active_level == 1
2414 #endif /* USE_ITT_BUILD */
2415 
2416   if (master_th->th.th_teams_microtask && !exit_teams &&
2417       team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2418       team->t.t_level == master_th->th.th_teams_level + 1) {
2419 // AC: We need to leave the team structure intact at the end of parallel
2420 // inside the teams construct, so that at the next parallel same (hot) team
2421 // works, only adjust nesting levels
2422 #if OMPT_SUPPORT
2423     ompt_data_t ompt_parallel_data = ompt_data_none;
2424     if (ompt_enabled.enabled) {
2425       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2426       if (ompt_enabled.ompt_callback_implicit_task) {
2427         int ompt_team_size = team->t.t_nproc;
2428         ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2429             ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2430             OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2431       }
2432       task_info->frame.exit_frame = ompt_data_none;
2433       task_info->task_data = ompt_data_none;
2434       ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2435       __ompt_lw_taskteam_unlink(master_th);
2436     }
2437 #endif
2438     /* Decrement our nested depth level */
2439     team->t.t_level--;
2440     team->t.t_active_level--;
2441     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2442 
2443     // Restore number of threads in the team if needed. This code relies on
2444     // the proper adjustment of th_teams_size.nth after the fork in
2445     // __kmp_teams_master on each teams primary thread in the case that
2446     // __kmp_reserve_threads reduced it.
2447     if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2448       int old_num = master_th->th.th_team_nproc;
2449       int new_num = master_th->th.th_teams_size.nth;
2450       kmp_info_t **other_threads = team->t.t_threads;
2451       team->t.t_nproc = new_num;
2452       for (int i = 0; i < old_num; ++i) {
2453         other_threads[i]->th.th_team_nproc = new_num;
2454       }
2455       // Adjust states of non-used threads of the team
2456       for (int i = old_num; i < new_num; ++i) {
2457         // Re-initialize thread's barrier data.
2458         KMP_DEBUG_ASSERT(other_threads[i]);
2459         kmp_balign_t *balign = other_threads[i]->th.th_bar;
2460         for (int b = 0; b < bs_last_barrier; ++b) {
2461           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2462           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2463 #if USE_DEBUGGER
2464           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2465 #endif
2466         }
2467         if (__kmp_tasking_mode != tskm_immediate_exec) {
2468           // Synchronize thread's task state
2469           other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2470         }
2471       }
2472     }
2473 
2474 #if OMPT_SUPPORT
2475     if (ompt_enabled.enabled) {
2476       __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2477                       OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2478     }
2479 #endif
2480 
2481     return;
2482   }
2483 
2484   /* do cleanup and restore the parent team */
2485   master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2486   master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2487 
2488   master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2489 
2490   /* jc: The following lock has instructions with REL and ACQ semantics,
2491      separating the parallel user code called in this parallel region
2492      from the serial user code called after this function returns. */
2493   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2494 
2495   if (!master_th->th.th_teams_microtask ||
2496       team->t.t_level > master_th->th.th_teams_level) {
2497     /* Decrement our nested depth level */
2498     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2499   }
2500   KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2501 
2502 #if OMPT_SUPPORT
2503   if (ompt_enabled.enabled) {
2504     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2505     if (ompt_enabled.ompt_callback_implicit_task) {
2506       int flags = (team_microtask == (void *)__kmp_teams_master)
2507                       ? ompt_task_initial
2508                       : ompt_task_implicit;
2509       int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2510       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2511           ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2512           OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2513     }
2514     task_info->frame.exit_frame = ompt_data_none;
2515     task_info->task_data = ompt_data_none;
2516   }
2517 #endif
2518 
2519   KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2520                 master_th, team));
2521   __kmp_pop_current_task_from_thread(master_th);
2522 
2523 #if KMP_AFFINITY_SUPPORTED
2524   // Restore master thread's partition.
2525   master_th->th.th_first_place = team->t.t_first_place;
2526   master_th->th.th_last_place = team->t.t_last_place;
2527 #endif // KMP_AFFINITY_SUPPORTED
2528   master_th->th.th_def_allocator = team->t.t_def_allocator;
2529 
2530 #if OMPD_SUPPORT
2531   if (ompd_state & OMPD_ENABLE_BP)
2532     ompd_bp_parallel_end();
2533 #endif
2534   updateHWFPControl(team);
2535 
2536   if (root->r.r_active != master_active)
2537     root->r.r_active = master_active;
2538 
2539   __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2540                             master_th)); // this will free worker threads
2541 
2542   /* this race was fun to find. make sure the following is in the critical
2543      region otherwise assertions may fail occasionally since the old team may be
2544      reallocated and the hierarchy appears inconsistent. it is actually safe to
2545      run and won't cause any bugs, but will cause those assertion failures. it's
2546      only one deref&assign so might as well put this in the critical region */
2547   master_th->th.th_team = parent_team;
2548   master_th->th.th_team_nproc = parent_team->t.t_nproc;
2549   master_th->th.th_team_master = parent_team->t.t_threads[0];
2550   master_th->th.th_team_serialized = parent_team->t.t_serialized;
2551 
2552   /* restore serialized team, if need be */
2553   if (parent_team->t.t_serialized &&
2554       parent_team != master_th->th.th_serial_team &&
2555       parent_team != root->r.r_root_team) {
2556     __kmp_free_team(root,
2557                     master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2558     master_th->th.th_serial_team = parent_team;
2559   }
2560 
2561   if (__kmp_tasking_mode != tskm_immediate_exec) {
2562     if (master_th->th.th_task_state_top >
2563         0) { // Restore task state from memo stack
2564       KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2565       // Remember primary thread's state if we re-use this nested hot team
2566       master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2567           master_th->th.th_task_state;
2568       --master_th->th.th_task_state_top; // pop
2569       // Now restore state at this level
2570       master_th->th.th_task_state =
2571           master_th->th
2572               .th_task_state_memo_stack[master_th->th.th_task_state_top];
2573     }
2574     // Copy the task team from the parent team to the primary thread
2575     master_th->th.th_task_team =
2576         parent_team->t.t_task_team[master_th->th.th_task_state];
2577     KA_TRACE(20,
2578              ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
2579               __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2580               parent_team));
2581   }
2582 
2583   // TODO: GEH - cannot do this assertion because root thread not set up as
2584   // executing
2585   // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2586   master_th->th.th_current_task->td_flags.executing = 1;
2587 
2588   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2589 
2590 #if OMPT_SUPPORT
2591   int flags =
2592       OMPT_INVOKER(fork_context) |
2593       ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2594                                                       : ompt_parallel_team);
2595   if (ompt_enabled.enabled) {
2596     __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2597                     codeptr);
2598   }
2599 #endif
2600 
2601   KMP_MB();
2602   KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2603 }
2604 
2605 /* Check whether we should push an internal control record onto the
2606    serial team stack.  If so, do it.  */
2607 void __kmp_save_internal_controls(kmp_info_t *thread) {
2608 
2609   if (thread->th.th_team != thread->th.th_serial_team) {
2610     return;
2611   }
2612   if (thread->th.th_team->t.t_serialized > 1) {
2613     int push = 0;
2614 
2615     if (thread->th.th_team->t.t_control_stack_top == NULL) {
2616       push = 1;
2617     } else {
2618       if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2619           thread->th.th_team->t.t_serialized) {
2620         push = 1;
2621       }
2622     }
2623     if (push) { /* push a record on the serial team's stack */
2624       kmp_internal_control_t *control =
2625           (kmp_internal_control_t *)__kmp_allocate(
2626               sizeof(kmp_internal_control_t));
2627 
2628       copy_icvs(control, &thread->th.th_current_task->td_icvs);
2629 
2630       control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2631 
2632       control->next = thread->th.th_team->t.t_control_stack_top;
2633       thread->th.th_team->t.t_control_stack_top = control;
2634     }
2635   }
2636 }
2637 
2638 /* Changes set_nproc */
2639 void __kmp_set_num_threads(int new_nth, int gtid) {
2640   kmp_info_t *thread;
2641   kmp_root_t *root;
2642 
2643   KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2644   KMP_DEBUG_ASSERT(__kmp_init_serial);
2645 
2646   if (new_nth < 1)
2647     new_nth = 1;
2648   else if (new_nth > __kmp_max_nth)
2649     new_nth = __kmp_max_nth;
2650 
2651   KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2652   thread = __kmp_threads[gtid];
2653   if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2654     return; // nothing to do
2655 
2656   __kmp_save_internal_controls(thread);
2657 
2658   set__nproc(thread, new_nth);
2659 
2660   // If this omp_set_num_threads() call will cause the hot team size to be
2661   // reduced (in the absence of a num_threads clause), then reduce it now,
2662   // rather than waiting for the next parallel region.
2663   root = thread->th.th_root;
2664   if (__kmp_init_parallel && (!root->r.r_active) &&
2665       (root->r.r_hot_team->t.t_nproc > new_nth)
2666 #if KMP_NESTED_HOT_TEAMS
2667       && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2668 #endif
2669   ) {
2670     kmp_team_t *hot_team = root->r.r_hot_team;
2671     int f;
2672 
2673     __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2674 
2675     if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2676       __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth);
2677     }
2678     // Release the extra threads we don't need any more.
2679     for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2680       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2681       if (__kmp_tasking_mode != tskm_immediate_exec) {
2682         // When decreasing team size, threads no longer in the team should unref
2683         // task team.
2684         hot_team->t.t_threads[f]->th.th_task_team = NULL;
2685       }
2686       __kmp_free_thread(hot_team->t.t_threads[f]);
2687       hot_team->t.t_threads[f] = NULL;
2688     }
2689     hot_team->t.t_nproc = new_nth;
2690 #if KMP_NESTED_HOT_TEAMS
2691     if (thread->th.th_hot_teams) {
2692       KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2693       thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2694     }
2695 #endif
2696 
2697     if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2698       hot_team->t.b->update_num_threads(new_nth);
2699       __kmp_add_threads_to_team(hot_team, new_nth);
2700     }
2701 
2702     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2703 
2704     // Update the t_nproc field in the threads that are still active.
2705     for (f = 0; f < new_nth; f++) {
2706       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2707       hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2708     }
2709     // Special flag in case omp_set_num_threads() call
2710     hot_team->t.t_size_changed = -1;
2711   }
2712 }
2713 
2714 /* Changes max_active_levels */
2715 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2716   kmp_info_t *thread;
2717 
2718   KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2719                 "%d = (%d)\n",
2720                 gtid, max_active_levels));
2721   KMP_DEBUG_ASSERT(__kmp_init_serial);
2722 
2723   // validate max_active_levels
2724   if (max_active_levels < 0) {
2725     KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2726     // We ignore this call if the user has specified a negative value.
2727     // The current setting won't be changed. The last valid setting will be
2728     // used. A warning will be issued (if warnings are allowed as controlled by
2729     // the KMP_WARNINGS env var).
2730     KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2731                   "max_active_levels for thread %d = (%d)\n",
2732                   gtid, max_active_levels));
2733     return;
2734   }
2735   if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2736     // it's OK, the max_active_levels is within the valid range: [ 0;
2737     // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2738     // We allow a zero value. (implementation defined behavior)
2739   } else {
2740     KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2741                 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2742     max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2743     // Current upper limit is MAX_INT. (implementation defined behavior)
2744     // If the input exceeds the upper limit, we correct the input to be the
2745     // upper limit. (implementation defined behavior)
2746     // Actually, the flow should never get here until we use MAX_INT limit.
2747   }
2748   KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2749                 "max_active_levels for thread %d = (%d)\n",
2750                 gtid, max_active_levels));
2751 
2752   thread = __kmp_threads[gtid];
2753 
2754   __kmp_save_internal_controls(thread);
2755 
2756   set__max_active_levels(thread, max_active_levels);
2757 }
2758 
2759 /* Gets max_active_levels */
2760 int __kmp_get_max_active_levels(int gtid) {
2761   kmp_info_t *thread;
2762 
2763   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2764   KMP_DEBUG_ASSERT(__kmp_init_serial);
2765 
2766   thread = __kmp_threads[gtid];
2767   KMP_DEBUG_ASSERT(thread->th.th_current_task);
2768   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2769                 "curtask_maxaclevel=%d\n",
2770                 gtid, thread->th.th_current_task,
2771                 thread->th.th_current_task->td_icvs.max_active_levels));
2772   return thread->th.th_current_task->td_icvs.max_active_levels;
2773 }
2774 
2775 // nteams-var per-device ICV
2776 void __kmp_set_num_teams(int num_teams) {
2777   if (num_teams > 0)
2778     __kmp_nteams = num_teams;
2779 }
2780 int __kmp_get_max_teams(void) { return __kmp_nteams; }
2781 // teams-thread-limit-var per-device ICV
2782 void __kmp_set_teams_thread_limit(int limit) {
2783   if (limit > 0)
2784     __kmp_teams_thread_limit = limit;
2785 }
2786 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
2787 
2788 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2789 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2790 
2791 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2792 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2793   kmp_info_t *thread;
2794   kmp_sched_t orig_kind;
2795   //    kmp_team_t *team;
2796 
2797   KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2798                 gtid, (int)kind, chunk));
2799   KMP_DEBUG_ASSERT(__kmp_init_serial);
2800 
2801   // Check if the kind parameter is valid, correct if needed.
2802   // Valid parameters should fit in one of two intervals - standard or extended:
2803   //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2804   // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
2805   orig_kind = kind;
2806   kind = __kmp_sched_without_mods(kind);
2807 
2808   if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2809       (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2810     // TODO: Hint needs attention in case we change the default schedule.
2811     __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2812               KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2813               __kmp_msg_null);
2814     kind = kmp_sched_default;
2815     chunk = 0; // ignore chunk value in case of bad kind
2816   }
2817 
2818   thread = __kmp_threads[gtid];
2819 
2820   __kmp_save_internal_controls(thread);
2821 
2822   if (kind < kmp_sched_upper_std) {
2823     if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2824       // differ static chunked vs. unchunked:  chunk should be invalid to
2825       // indicate unchunked schedule (which is the default)
2826       thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2827     } else {
2828       thread->th.th_current_task->td_icvs.sched.r_sched_type =
2829           __kmp_sch_map[kind - kmp_sched_lower - 1];
2830     }
2831   } else {
2832     //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2833     //    kmp_sched_lower - 2 ];
2834     thread->th.th_current_task->td_icvs.sched.r_sched_type =
2835         __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2836                       kmp_sched_lower - 2];
2837   }
2838   __kmp_sched_apply_mods_intkind(
2839       orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2840   if (kind == kmp_sched_auto || chunk < 1) {
2841     // ignore parameter chunk for schedule auto
2842     thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2843   } else {
2844     thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2845   }
2846 }
2847 
2848 /* Gets def_sched_var ICV values */
2849 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2850   kmp_info_t *thread;
2851   enum sched_type th_type;
2852 
2853   KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2854   KMP_DEBUG_ASSERT(__kmp_init_serial);
2855 
2856   thread = __kmp_threads[gtid];
2857 
2858   th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2859   switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2860   case kmp_sch_static:
2861   case kmp_sch_static_greedy:
2862   case kmp_sch_static_balanced:
2863     *kind = kmp_sched_static;
2864     __kmp_sched_apply_mods_stdkind(kind, th_type);
2865     *chunk = 0; // chunk was not set, try to show this fact via zero value
2866     return;
2867   case kmp_sch_static_chunked:
2868     *kind = kmp_sched_static;
2869     break;
2870   case kmp_sch_dynamic_chunked:
2871     *kind = kmp_sched_dynamic;
2872     break;
2873   case kmp_sch_guided_chunked:
2874   case kmp_sch_guided_iterative_chunked:
2875   case kmp_sch_guided_analytical_chunked:
2876     *kind = kmp_sched_guided;
2877     break;
2878   case kmp_sch_auto:
2879     *kind = kmp_sched_auto;
2880     break;
2881   case kmp_sch_trapezoidal:
2882     *kind = kmp_sched_trapezoidal;
2883     break;
2884 #if KMP_STATIC_STEAL_ENABLED
2885   case kmp_sch_static_steal:
2886     *kind = kmp_sched_static_steal;
2887     break;
2888 #endif
2889   default:
2890     KMP_FATAL(UnknownSchedulingType, th_type);
2891   }
2892 
2893   __kmp_sched_apply_mods_stdkind(kind, th_type);
2894   *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2895 }
2896 
2897 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2898 
2899   int ii, dd;
2900   kmp_team_t *team;
2901   kmp_info_t *thr;
2902 
2903   KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2904   KMP_DEBUG_ASSERT(__kmp_init_serial);
2905 
2906   // validate level
2907   if (level == 0)
2908     return 0;
2909   if (level < 0)
2910     return -1;
2911   thr = __kmp_threads[gtid];
2912   team = thr->th.th_team;
2913   ii = team->t.t_level;
2914   if (level > ii)
2915     return -1;
2916 
2917   if (thr->th.th_teams_microtask) {
2918     // AC: we are in teams region where multiple nested teams have same level
2919     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2920     if (level <=
2921         tlevel) { // otherwise usual algorithm works (will not touch the teams)
2922       KMP_DEBUG_ASSERT(ii >= tlevel);
2923       // AC: As we need to pass by the teams league, we need to artificially
2924       // increase ii
2925       if (ii == tlevel) {
2926         ii += 2; // three teams have same level
2927       } else {
2928         ii++; // two teams have same level
2929       }
2930     }
2931   }
2932 
2933   if (ii == level)
2934     return __kmp_tid_from_gtid(gtid);
2935 
2936   dd = team->t.t_serialized;
2937   level++;
2938   while (ii > level) {
2939     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2940     }
2941     if ((team->t.t_serialized) && (!dd)) {
2942       team = team->t.t_parent;
2943       continue;
2944     }
2945     if (ii > level) {
2946       team = team->t.t_parent;
2947       dd = team->t.t_serialized;
2948       ii--;
2949     }
2950   }
2951 
2952   return (dd > 1) ? (0) : (team->t.t_master_tid);
2953 }
2954 
2955 int __kmp_get_team_size(int gtid, int level) {
2956 
2957   int ii, dd;
2958   kmp_team_t *team;
2959   kmp_info_t *thr;
2960 
2961   KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2962   KMP_DEBUG_ASSERT(__kmp_init_serial);
2963 
2964   // validate level
2965   if (level == 0)
2966     return 1;
2967   if (level < 0)
2968     return -1;
2969   thr = __kmp_threads[gtid];
2970   team = thr->th.th_team;
2971   ii = team->t.t_level;
2972   if (level > ii)
2973     return -1;
2974 
2975   if (thr->th.th_teams_microtask) {
2976     // AC: we are in teams region where multiple nested teams have same level
2977     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2978     if (level <=
2979         tlevel) { // otherwise usual algorithm works (will not touch the teams)
2980       KMP_DEBUG_ASSERT(ii >= tlevel);
2981       // AC: As we need to pass by the teams league, we need to artificially
2982       // increase ii
2983       if (ii == tlevel) {
2984         ii += 2; // three teams have same level
2985       } else {
2986         ii++; // two teams have same level
2987       }
2988     }
2989   }
2990 
2991   while (ii > level) {
2992     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2993     }
2994     if (team->t.t_serialized && (!dd)) {
2995       team = team->t.t_parent;
2996       continue;
2997     }
2998     if (ii > level) {
2999       team = team->t.t_parent;
3000       ii--;
3001     }
3002   }
3003 
3004   return team->t.t_nproc;
3005 }
3006 
3007 kmp_r_sched_t __kmp_get_schedule_global() {
3008   // This routine created because pairs (__kmp_sched, __kmp_chunk) and
3009   // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
3010   // independently. So one can get the updated schedule here.
3011 
3012   kmp_r_sched_t r_sched;
3013 
3014   // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
3015   // __kmp_guided. __kmp_sched should keep original value, so that user can set
3016   // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
3017   // different roots (even in OMP 2.5)
3018   enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
3019   enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
3020   if (s == kmp_sch_static) {
3021     // replace STATIC with more detailed schedule (balanced or greedy)
3022     r_sched.r_sched_type = __kmp_static;
3023   } else if (s == kmp_sch_guided_chunked) {
3024     // replace GUIDED with more detailed schedule (iterative or analytical)
3025     r_sched.r_sched_type = __kmp_guided;
3026   } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3027     r_sched.r_sched_type = __kmp_sched;
3028   }
3029   SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
3030 
3031   if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
3032     // __kmp_chunk may be wrong here (if it was not ever set)
3033     r_sched.chunk = KMP_DEFAULT_CHUNK;
3034   } else {
3035     r_sched.chunk = __kmp_chunk;
3036   }
3037 
3038   return r_sched;
3039 }
3040 
3041 /* Allocate (realloc == FALSE) * or reallocate (realloc == TRUE)
3042    at least argc number of *t_argv entries for the requested team. */
3043 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3044 
3045   KMP_DEBUG_ASSERT(team);
3046   if (!realloc || argc > team->t.t_max_argc) {
3047 
3048     KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3049                    "current entries=%d\n",
3050                    team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3051     /* if previously allocated heap space for args, free them */
3052     if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3053       __kmp_free((void *)team->t.t_argv);
3054 
3055     if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3056       /* use unused space in the cache line for arguments */
3057       team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3058       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3059                      "argv entries\n",
3060                      team->t.t_id, team->t.t_max_argc));
3061       team->t.t_argv = &team->t.t_inline_argv[0];
3062       if (__kmp_storage_map) {
3063         __kmp_print_storage_map_gtid(
3064             -1, &team->t.t_inline_argv[0],
3065             &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3066             (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3067             team->t.t_id);
3068       }
3069     } else {
3070       /* allocate space for arguments in the heap */
3071       team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3072                                ? KMP_MIN_MALLOC_ARGV_ENTRIES
3073                                : 2 * argc;
3074       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3075                      "argv entries\n",
3076                      team->t.t_id, team->t.t_max_argc));
3077       team->t.t_argv =
3078           (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3079       if (__kmp_storage_map) {
3080         __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3081                                      &team->t.t_argv[team->t.t_max_argc],
3082                                      sizeof(void *) * team->t.t_max_argc,
3083                                      "team_%d.t_argv", team->t.t_id);
3084       }
3085     }
3086   }
3087 }
3088 
3089 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3090   int i;
3091   int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3092   team->t.t_threads =
3093       (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3094   team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3095       sizeof(dispatch_shared_info_t) * num_disp_buff);
3096   team->t.t_dispatch =
3097       (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3098   team->t.t_implicit_task_taskdata =
3099       (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3100   team->t.t_max_nproc = max_nth;
3101 
3102   /* setup dispatch buffers */
3103   for (i = 0; i < num_disp_buff; ++i) {
3104     team->t.t_disp_buffer[i].buffer_index = i;
3105     team->t.t_disp_buffer[i].doacross_buf_idx = i;
3106   }
3107 }
3108 
3109 static void __kmp_free_team_arrays(kmp_team_t *team) {
3110   /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3111   int i;
3112   for (i = 0; i < team->t.t_max_nproc; ++i) {
3113     if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3114       __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3115       team->t.t_dispatch[i].th_disp_buffer = NULL;
3116     }
3117   }
3118 #if KMP_USE_HIER_SCHED
3119   __kmp_dispatch_free_hierarchies(team);
3120 #endif
3121   __kmp_free(team->t.t_threads);
3122   __kmp_free(team->t.t_disp_buffer);
3123   __kmp_free(team->t.t_dispatch);
3124   __kmp_free(team->t.t_implicit_task_taskdata);
3125   team->t.t_threads = NULL;
3126   team->t.t_disp_buffer = NULL;
3127   team->t.t_dispatch = NULL;
3128   team->t.t_implicit_task_taskdata = 0;
3129 }
3130 
3131 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3132   kmp_info_t **oldThreads = team->t.t_threads;
3133 
3134   __kmp_free(team->t.t_disp_buffer);
3135   __kmp_free(team->t.t_dispatch);
3136   __kmp_free(team->t.t_implicit_task_taskdata);
3137   __kmp_allocate_team_arrays(team, max_nth);
3138 
3139   KMP_MEMCPY(team->t.t_threads, oldThreads,
3140              team->t.t_nproc * sizeof(kmp_info_t *));
3141 
3142   __kmp_free(oldThreads);
3143 }
3144 
3145 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3146 
3147   kmp_r_sched_t r_sched =
3148       __kmp_get_schedule_global(); // get current state of scheduling globals
3149 
3150   KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3151 
3152   kmp_internal_control_t g_icvs = {
3153     0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3154     (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3155     // adjustment of threads (per thread)
3156     (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3157     // whether blocktime is explicitly set
3158     __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3159 #if KMP_USE_MONITOR
3160     __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3161 // intervals
3162 #endif
3163     __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3164     // next parallel region (per thread)
3165     // (use a max ub on value if __kmp_parallel_initialize not called yet)
3166     __kmp_cg_max_nth, // int thread_limit;
3167     __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3168     // for max_active_levels
3169     r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3170     // {sched,chunk} pair
3171     __kmp_nested_proc_bind.bind_types[0],
3172     __kmp_default_device,
3173     NULL // struct kmp_internal_control *next;
3174   };
3175 
3176   return g_icvs;
3177 }
3178 
3179 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3180 
3181   kmp_internal_control_t gx_icvs;
3182   gx_icvs.serial_nesting_level =
3183       0; // probably =team->t.t_serial like in save_inter_controls
3184   copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3185   gx_icvs.next = NULL;
3186 
3187   return gx_icvs;
3188 }
3189 
3190 static void __kmp_initialize_root(kmp_root_t *root) {
3191   int f;
3192   kmp_team_t *root_team;
3193   kmp_team_t *hot_team;
3194   int hot_team_max_nth;
3195   kmp_r_sched_t r_sched =
3196       __kmp_get_schedule_global(); // get current state of scheduling globals
3197   kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3198   KMP_DEBUG_ASSERT(root);
3199   KMP_ASSERT(!root->r.r_begin);
3200 
3201   /* setup the root state structure */
3202   __kmp_init_lock(&root->r.r_begin_lock);
3203   root->r.r_begin = FALSE;
3204   root->r.r_active = FALSE;
3205   root->r.r_in_parallel = 0;
3206   root->r.r_blocktime = __kmp_dflt_blocktime;
3207 #if KMP_AFFINITY_SUPPORTED
3208   root->r.r_affinity_assigned = FALSE;
3209 #endif
3210 
3211   /* setup the root team for this task */
3212   /* allocate the root team structure */
3213   KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3214 
3215   root_team =
3216       __kmp_allocate_team(root,
3217                           1, // new_nproc
3218                           1, // max_nproc
3219 #if OMPT_SUPPORT
3220                           ompt_data_none, // root parallel id
3221 #endif
3222                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3223                           0 // argc
3224                           USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3225                           );
3226 #if USE_DEBUGGER
3227   // Non-NULL value should be assigned to make the debugger display the root
3228   // team.
3229   TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3230 #endif
3231 
3232   KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3233 
3234   root->r.r_root_team = root_team;
3235   root_team->t.t_control_stack_top = NULL;
3236 
3237   /* initialize root team */
3238   root_team->t.t_threads[0] = NULL;
3239   root_team->t.t_nproc = 1;
3240   root_team->t.t_serialized = 1;
3241   // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3242   root_team->t.t_sched.sched = r_sched.sched;
3243   KA_TRACE(
3244       20,
3245       ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3246        root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3247 
3248   /* setup the  hot team for this task */
3249   /* allocate the hot team structure */
3250   KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3251 
3252   hot_team =
3253       __kmp_allocate_team(root,
3254                           1, // new_nproc
3255                           __kmp_dflt_team_nth_ub * 2, // max_nproc
3256 #if OMPT_SUPPORT
3257                           ompt_data_none, // root parallel id
3258 #endif
3259                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3260                           0 // argc
3261                           USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3262                           );
3263   KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3264 
3265   root->r.r_hot_team = hot_team;
3266   root_team->t.t_control_stack_top = NULL;
3267 
3268   /* first-time initialization */
3269   hot_team->t.t_parent = root_team;
3270 
3271   /* initialize hot team */
3272   hot_team_max_nth = hot_team->t.t_max_nproc;
3273   for (f = 0; f < hot_team_max_nth; ++f) {
3274     hot_team->t.t_threads[f] = NULL;
3275   }
3276   hot_team->t.t_nproc = 1;
3277   // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3278   hot_team->t.t_sched.sched = r_sched.sched;
3279   hot_team->t.t_size_changed = 0;
3280 }
3281 
3282 #ifdef KMP_DEBUG
3283 
3284 typedef struct kmp_team_list_item {
3285   kmp_team_p const *entry;
3286   struct kmp_team_list_item *next;
3287 } kmp_team_list_item_t;
3288 typedef kmp_team_list_item_t *kmp_team_list_t;
3289 
3290 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3291     kmp_team_list_t list, // List of teams.
3292     kmp_team_p const *team // Team to add.
3293 ) {
3294 
3295   // List must terminate with item where both entry and next are NULL.
3296   // Team is added to the list only once.
3297   // List is sorted in ascending order by team id.
3298   // Team id is *not* a key.
3299 
3300   kmp_team_list_t l;
3301 
3302   KMP_DEBUG_ASSERT(list != NULL);
3303   if (team == NULL) {
3304     return;
3305   }
3306 
3307   __kmp_print_structure_team_accum(list, team->t.t_parent);
3308   __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3309 
3310   // Search list for the team.
3311   l = list;
3312   while (l->next != NULL && l->entry != team) {
3313     l = l->next;
3314   }
3315   if (l->next != NULL) {
3316     return; // Team has been added before, exit.
3317   }
3318 
3319   // Team is not found. Search list again for insertion point.
3320   l = list;
3321   while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3322     l = l->next;
3323   }
3324 
3325   // Insert team.
3326   {
3327     kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3328         sizeof(kmp_team_list_item_t));
3329     *item = *l;
3330     l->entry = team;
3331     l->next = item;
3332   }
3333 }
3334 
3335 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team
3336 
3337 ) {
3338   __kmp_printf("%s", title);
3339   if (team != NULL) {
3340     __kmp_printf("%2x %p\n", team->t.t_id, team);
3341   } else {
3342     __kmp_printf(" - (nil)\n");
3343   }
3344 }
3345 
3346 static void __kmp_print_structure_thread(char const *title,
3347                                          kmp_info_p const *thread) {
3348   __kmp_printf("%s", title);
3349   if (thread != NULL) {
3350     __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3351   } else {
3352     __kmp_printf(" - (nil)\n");
3353   }
3354 }
3355 
3356 void __kmp_print_structure(void) {
3357 
3358   kmp_team_list_t list;
3359 
3360   // Initialize list of teams.
3361   list =
3362       (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3363   list->entry = NULL;
3364   list->next = NULL;
3365 
3366   __kmp_printf("\n------------------------------\nGlobal Thread "
3367                "Table\n------------------------------\n");
3368   {
3369     int gtid;
3370     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3371       __kmp_printf("%2d", gtid);
3372       if (__kmp_threads != NULL) {
3373         __kmp_printf(" %p", __kmp_threads[gtid]);
3374       }
3375       if (__kmp_root != NULL) {
3376         __kmp_printf(" %p", __kmp_root[gtid]);
3377       }
3378       __kmp_printf("\n");
3379     }
3380   }
3381 
3382   // Print out __kmp_threads array.
3383   __kmp_printf("\n------------------------------\nThreads\n--------------------"
3384                "----------\n");
3385   if (__kmp_threads != NULL) {
3386     int gtid;
3387     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3388       kmp_info_t const *thread = __kmp_threads[gtid];
3389       if (thread != NULL) {
3390         __kmp_printf("GTID %2d %p:\n", gtid, thread);
3391         __kmp_printf("    Our Root:        %p\n", thread->th.th_root);
3392         __kmp_print_structure_team("    Our Team:     ", thread->th.th_team);
3393         __kmp_print_structure_team("    Serial Team:  ",
3394                                    thread->th.th_serial_team);
3395         __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
3396         __kmp_print_structure_thread("    Primary:      ",
3397                                      thread->th.th_team_master);
3398         __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
3399         __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
3400         __kmp_printf("    Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3401         __kmp_print_structure_thread("    Next in pool: ",
3402                                      thread->th.th_next_pool);
3403         __kmp_printf("\n");
3404         __kmp_print_structure_team_accum(list, thread->th.th_team);
3405         __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3406       }
3407     }
3408   } else {
3409     __kmp_printf("Threads array is not allocated.\n");
3410   }
3411 
3412   // Print out __kmp_root array.
3413   __kmp_printf("\n------------------------------\nUbers\n----------------------"
3414                "--------\n");
3415   if (__kmp_root != NULL) {
3416     int gtid;
3417     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3418       kmp_root_t const *root = __kmp_root[gtid];
3419       if (root != NULL) {
3420         __kmp_printf("GTID %2d %p:\n", gtid, root);
3421         __kmp_print_structure_team("    Root Team:    ", root->r.r_root_team);
3422         __kmp_print_structure_team("    Hot Team:     ", root->r.r_hot_team);
3423         __kmp_print_structure_thread("    Uber Thread:  ",
3424                                      root->r.r_uber_thread);
3425         __kmp_printf("    Active?:      %2d\n", root->r.r_active);
3426         __kmp_printf("    In Parallel:  %2d\n",
3427                      KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3428         __kmp_printf("\n");
3429         __kmp_print_structure_team_accum(list, root->r.r_root_team);
3430         __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3431       }
3432     }
3433   } else {
3434     __kmp_printf("Ubers array is not allocated.\n");
3435   }
3436 
3437   __kmp_printf("\n------------------------------\nTeams\n----------------------"
3438                "--------\n");
3439   while (list->next != NULL) {
3440     kmp_team_p const *team = list->entry;
3441     int i;
3442     __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3443     __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
3444     __kmp_printf("    Primary TID:      %2d\n", team->t.t_master_tid);
3445     __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
3446     __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
3447     __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
3448     for (i = 0; i < team->t.t_nproc; ++i) {
3449       __kmp_printf("    Thread %2d:      ", i);
3450       __kmp_print_structure_thread("", team->t.t_threads[i]);
3451     }
3452     __kmp_print_structure_team("    Next in pool:     ", team->t.t_next_pool);
3453     __kmp_printf("\n");
3454     list = list->next;
3455   }
3456 
3457   // Print out __kmp_thread_pool and __kmp_team_pool.
3458   __kmp_printf("\n------------------------------\nPools\n----------------------"
3459                "--------\n");
3460   __kmp_print_structure_thread("Thread pool:          ",
3461                                CCAST(kmp_info_t *, __kmp_thread_pool));
3462   __kmp_print_structure_team("Team pool:            ",
3463                              CCAST(kmp_team_t *, __kmp_team_pool));
3464   __kmp_printf("\n");
3465 
3466   // Free team list.
3467   while (list != NULL) {
3468     kmp_team_list_item_t *item = list;
3469     list = list->next;
3470     KMP_INTERNAL_FREE(item);
3471   }
3472 }
3473 
3474 #endif
3475 
3476 //---------------------------------------------------------------------------
3477 //  Stuff for per-thread fast random number generator
3478 //  Table of primes
3479 static const unsigned __kmp_primes[] = {
3480     0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3481     0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3482     0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3483     0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3484     0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3485     0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3486     0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3487     0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3488     0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3489     0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3490     0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3491 
3492 //---------------------------------------------------------------------------
3493 //  __kmp_get_random: Get a random number using a linear congruential method.
3494 unsigned short __kmp_get_random(kmp_info_t *thread) {
3495   unsigned x = thread->th.th_x;
3496   unsigned short r = (unsigned short)(x >> 16);
3497 
3498   thread->th.th_x = x * thread->th.th_a + 1;
3499 
3500   KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3501                 thread->th.th_info.ds.ds_tid, r));
3502 
3503   return r;
3504 }
3505 //--------------------------------------------------------
3506 // __kmp_init_random: Initialize a random number generator
3507 void __kmp_init_random(kmp_info_t *thread) {
3508   unsigned seed = thread->th.th_info.ds.ds_tid;
3509 
3510   thread->th.th_a =
3511       __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3512   thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3513   KA_TRACE(30,
3514            ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3515 }
3516 
3517 #if KMP_OS_WINDOWS
3518 /* reclaim array entries for root threads that are already dead, returns number
3519  * reclaimed */
3520 static int __kmp_reclaim_dead_roots(void) {
3521   int i, r = 0;
3522 
3523   for (i = 0; i < __kmp_threads_capacity; ++i) {
3524     if (KMP_UBER_GTID(i) &&
3525         !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3526         !__kmp_root[i]
3527              ->r.r_active) { // AC: reclaim only roots died in non-active state
3528       r += __kmp_unregister_root_other_thread(i);
3529     }
3530   }
3531   return r;
3532 }
3533 #endif
3534 
3535 /* This function attempts to create free entries in __kmp_threads and
3536    __kmp_root, and returns the number of free entries generated.
3537 
3538    For Windows* OS static library, the first mechanism used is to reclaim array
3539    entries for root threads that are already dead.
3540 
3541    On all platforms, expansion is attempted on the arrays __kmp_threads_ and
3542    __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3543    capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3544    threadprivate cache array has been created. Synchronization with
3545    __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3546 
3547    After any dead root reclamation, if the clipping value allows array expansion
3548    to result in the generation of a total of nNeed free slots, the function does
3549    that expansion. If not, nothing is done beyond the possible initial root
3550    thread reclamation.
3551 
3552    If any argument is negative, the behavior is undefined. */
3553 static int __kmp_expand_threads(int nNeed) {
3554   int added = 0;
3555   int minimumRequiredCapacity;
3556   int newCapacity;
3557   kmp_info_t **newThreads;
3558   kmp_root_t **newRoot;
3559 
3560   // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3561   // resizing __kmp_threads does not need additional protection if foreign
3562   // threads are present
3563 
3564 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3565   /* only for Windows static library */
3566   /* reclaim array entries for root threads that are already dead */
3567   added = __kmp_reclaim_dead_roots();
3568 
3569   if (nNeed) {
3570     nNeed -= added;
3571     if (nNeed < 0)
3572       nNeed = 0;
3573   }
3574 #endif
3575   if (nNeed <= 0)
3576     return added;
3577 
3578   // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3579   // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3580   // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3581   // > __kmp_max_nth in one of two ways:
3582   //
3583   // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
3584   //    may not be reused by another thread, so we may need to increase
3585   //    __kmp_threads_capacity to __kmp_max_nth + 1.
3586   //
3587   // 2) New foreign root(s) are encountered.  We always register new foreign
3588   //    roots. This may cause a smaller # of threads to be allocated at
3589   //    subsequent parallel regions, but the worker threads hang around (and
3590   //    eventually go to sleep) and need slots in the __kmp_threads[] array.
3591   //
3592   // Anyway, that is the reason for moving the check to see if
3593   // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3594   // instead of having it performed here. -BB
3595 
3596   KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3597 
3598   /* compute expansion headroom to check if we can expand */
3599   if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3600     /* possible expansion too small -- give up */
3601     return added;
3602   }
3603   minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3604 
3605   newCapacity = __kmp_threads_capacity;
3606   do {
3607     newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3608                                                           : __kmp_sys_max_nth;
3609   } while (newCapacity < minimumRequiredCapacity);
3610   newThreads = (kmp_info_t **)__kmp_allocate(
3611       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3612   newRoot =
3613       (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3614   KMP_MEMCPY(newThreads, __kmp_threads,
3615              __kmp_threads_capacity * sizeof(kmp_info_t *));
3616   KMP_MEMCPY(newRoot, __kmp_root,
3617              __kmp_threads_capacity * sizeof(kmp_root_t *));
3618 
3619   kmp_info_t **temp_threads = __kmp_threads;
3620   *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3621   *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3622   __kmp_free(temp_threads);
3623   added += newCapacity - __kmp_threads_capacity;
3624   *(volatile int *)&__kmp_threads_capacity = newCapacity;
3625 
3626   if (newCapacity > __kmp_tp_capacity) {
3627     __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3628     if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3629       __kmp_threadprivate_resize_cache(newCapacity);
3630     } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3631       *(volatile int *)&__kmp_tp_capacity = newCapacity;
3632     }
3633     __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3634   }
3635 
3636   return added;
3637 }
3638 
3639 /* Register the current thread as a root thread and obtain our gtid. We must
3640    have the __kmp_initz_lock held at this point. Argument TRUE only if are the
3641    thread that calls from __kmp_do_serial_initialize() */
3642 int __kmp_register_root(int initial_thread) {
3643   kmp_info_t *root_thread;
3644   kmp_root_t *root;
3645   int gtid;
3646   int capacity;
3647   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3648   KA_TRACE(20, ("__kmp_register_root: entered\n"));
3649   KMP_MB();
3650 
3651   /* 2007-03-02:
3652      If initial thread did not invoke OpenMP RTL yet, and this thread is not an
3653      initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not
3654      work as expected -- it may return false (that means there is at least one
3655      empty slot in __kmp_threads array), but it is possible the only free slot
3656      is #0, which is reserved for initial thread and so cannot be used for this
3657      one. Following code workarounds this bug.
3658 
3659      However, right solution seems to be not reserving slot #0 for initial
3660      thread because:
3661      (1) there is no magic in slot #0,
3662      (2) we cannot detect initial thread reliably (the first thread which does
3663         serial initialization may be not a real initial thread).
3664   */
3665   capacity = __kmp_threads_capacity;
3666   if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3667     --capacity;
3668   }
3669 
3670   // If it is not for initializing the hidden helper team, we need to take
3671   // __kmp_hidden_helper_threads_num out of the capacity because it is included
3672   // in __kmp_threads_capacity.
3673   if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
3674     capacity -= __kmp_hidden_helper_threads_num;
3675   }
3676 
3677   /* see if there are too many threads */
3678   if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3679     if (__kmp_tp_cached) {
3680       __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3681                   KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3682                   KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3683     } else {
3684       __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3685                   __kmp_msg_null);
3686     }
3687   }
3688 
3689   // When hidden helper task is enabled, __kmp_threads is organized as follows:
3690   // 0: initial thread, also a regular OpenMP thread.
3691   // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3692   // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3693   // regular OpenMP threads.
3694   if (TCR_4(__kmp_init_hidden_helper_threads)) {
3695     // Find an available thread slot for hidden helper thread. Slots for hidden
3696     // helper threads start from 1 to __kmp_hidden_helper_threads_num.
3697     for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3698                    gtid <= __kmp_hidden_helper_threads_num;
3699          gtid++)
3700       ;
3701     KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3702     KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3703                  "hidden helper thread: T#%d\n",
3704                  gtid));
3705   } else {
3706     /* find an available thread slot */
3707     // Don't reassign the zero slot since we need that to only be used by
3708     // initial thread. Slots for hidden helper threads should also be skipped.
3709     if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3710       gtid = 0;
3711     } else {
3712       for (gtid = __kmp_hidden_helper_threads_num + 1;
3713            TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3714         ;
3715     }
3716     KA_TRACE(
3717         1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3718     KMP_ASSERT(gtid < __kmp_threads_capacity);
3719   }
3720 
3721   /* update global accounting */
3722   __kmp_all_nth++;
3723   TCW_4(__kmp_nth, __kmp_nth + 1);
3724 
3725   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3726   // numbers of procs, and method #2 (keyed API call) for higher numbers.
3727   if (__kmp_adjust_gtid_mode) {
3728     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3729       if (TCR_4(__kmp_gtid_mode) != 2) {
3730         TCW_4(__kmp_gtid_mode, 2);
3731       }
3732     } else {
3733       if (TCR_4(__kmp_gtid_mode) != 1) {
3734         TCW_4(__kmp_gtid_mode, 1);
3735       }
3736     }
3737   }
3738 
3739 #ifdef KMP_ADJUST_BLOCKTIME
3740   /* Adjust blocktime to zero if necessary            */
3741   /* Middle initialization might not have occurred yet */
3742   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3743     if (__kmp_nth > __kmp_avail_proc) {
3744       __kmp_zero_bt = TRUE;
3745     }
3746   }
3747 #endif /* KMP_ADJUST_BLOCKTIME */
3748 
3749   /* setup this new hierarchy */
3750   if (!(root = __kmp_root[gtid])) {
3751     root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3752     KMP_DEBUG_ASSERT(!root->r.r_root_team);
3753   }
3754 
3755 #if KMP_STATS_ENABLED
3756   // Initialize stats as soon as possible (right after gtid assignment).
3757   __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3758   __kmp_stats_thread_ptr->startLife();
3759   KMP_SET_THREAD_STATE(SERIAL_REGION);
3760   KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3761 #endif
3762   __kmp_initialize_root(root);
3763 
3764   /* setup new root thread structure */
3765   if (root->r.r_uber_thread) {
3766     root_thread = root->r.r_uber_thread;
3767   } else {
3768     root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3769     if (__kmp_storage_map) {
3770       __kmp_print_thread_storage_map(root_thread, gtid);
3771     }
3772     root_thread->th.th_info.ds.ds_gtid = gtid;
3773 #if OMPT_SUPPORT
3774     root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3775 #endif
3776     root_thread->th.th_root = root;
3777     if (__kmp_env_consistency_check) {
3778       root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3779     }
3780 #if USE_FAST_MEMORY
3781     __kmp_initialize_fast_memory(root_thread);
3782 #endif /* USE_FAST_MEMORY */
3783 
3784 #if KMP_USE_BGET
3785     KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3786     __kmp_initialize_bget(root_thread);
3787 #endif
3788     __kmp_init_random(root_thread); // Initialize random number generator
3789   }
3790 
3791   /* setup the serial team held in reserve by the root thread */
3792   if (!root_thread->th.th_serial_team) {
3793     kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3794     KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3795     root_thread->th.th_serial_team = __kmp_allocate_team(
3796         root, 1, 1,
3797 #if OMPT_SUPPORT
3798         ompt_data_none, // root parallel id
3799 #endif
3800         proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3801   }
3802   KMP_ASSERT(root_thread->th.th_serial_team);
3803   KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3804                 root_thread->th.th_serial_team));
3805 
3806   /* drop root_thread into place */
3807   TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3808 
3809   root->r.r_root_team->t.t_threads[0] = root_thread;
3810   root->r.r_hot_team->t.t_threads[0] = root_thread;
3811   root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3812   // AC: the team created in reserve, not for execution (it is unused for now).
3813   root_thread->th.th_serial_team->t.t_serialized = 0;
3814   root->r.r_uber_thread = root_thread;
3815 
3816   /* initialize the thread, get it ready to go */
3817   __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3818   TCW_4(__kmp_init_gtid, TRUE);
3819 
3820   /* prepare the primary thread for get_gtid() */
3821   __kmp_gtid_set_specific(gtid);
3822 
3823 #if USE_ITT_BUILD
3824   __kmp_itt_thread_name(gtid);
3825 #endif /* USE_ITT_BUILD */
3826 
3827 #ifdef KMP_TDATA_GTID
3828   __kmp_gtid = gtid;
3829 #endif
3830   __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3831   KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3832 
3833   KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3834                 "plain=%u\n",
3835                 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3836                 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3837                 KMP_INIT_BARRIER_STATE));
3838   { // Initialize barrier data.
3839     int b;
3840     for (b = 0; b < bs_last_barrier; ++b) {
3841       root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3842 #if USE_DEBUGGER
3843       root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3844 #endif
3845     }
3846   }
3847   KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3848                    KMP_INIT_BARRIER_STATE);
3849 
3850 #if KMP_AFFINITY_SUPPORTED
3851   root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3852   root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3853   root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3854   root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3855 #endif /* KMP_AFFINITY_SUPPORTED */
3856   root_thread->th.th_def_allocator = __kmp_def_allocator;
3857   root_thread->th.th_prev_level = 0;
3858   root_thread->th.th_prev_num_threads = 1;
3859 
3860   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3861   tmp->cg_root = root_thread;
3862   tmp->cg_thread_limit = __kmp_cg_max_nth;
3863   tmp->cg_nthreads = 1;
3864   KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3865                  " cg_nthreads init to 1\n",
3866                  root_thread, tmp));
3867   tmp->up = NULL;
3868   root_thread->th.th_cg_roots = tmp;
3869 
3870   __kmp_root_counter++;
3871 
3872 #if OMPT_SUPPORT
3873   if (!initial_thread && ompt_enabled.enabled) {
3874 
3875     kmp_info_t *root_thread = ompt_get_thread();
3876 
3877     ompt_set_thread_state(root_thread, ompt_state_overhead);
3878 
3879     if (ompt_enabled.ompt_callback_thread_begin) {
3880       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3881           ompt_thread_initial, __ompt_get_thread_data_internal());
3882     }
3883     ompt_data_t *task_data;
3884     ompt_data_t *parallel_data;
3885     __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
3886                                   NULL);
3887     if (ompt_enabled.ompt_callback_implicit_task) {
3888       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3889           ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3890     }
3891 
3892     ompt_set_thread_state(root_thread, ompt_state_work_serial);
3893   }
3894 #endif
3895 #if OMPD_SUPPORT
3896   if (ompd_state & OMPD_ENABLE_BP)
3897     ompd_bp_thread_begin();
3898 #endif
3899 
3900   KMP_MB();
3901   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3902 
3903   return gtid;
3904 }
3905 
3906 #if KMP_NESTED_HOT_TEAMS
3907 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3908                                 const int max_level) {
3909   int i, n, nth;
3910   kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3911   if (!hot_teams || !hot_teams[level].hot_team) {
3912     return 0;
3913   }
3914   KMP_DEBUG_ASSERT(level < max_level);
3915   kmp_team_t *team = hot_teams[level].hot_team;
3916   nth = hot_teams[level].hot_team_nth;
3917   n = nth - 1; // primary thread is not freed
3918   if (level < max_level - 1) {
3919     for (i = 0; i < nth; ++i) {
3920       kmp_info_t *th = team->t.t_threads[i];
3921       n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3922       if (i > 0 && th->th.th_hot_teams) {
3923         __kmp_free(th->th.th_hot_teams);
3924         th->th.th_hot_teams = NULL;
3925       }
3926     }
3927   }
3928   __kmp_free_team(root, team, NULL);
3929   return n;
3930 }
3931 #endif
3932 
3933 // Resets a root thread and clear its root and hot teams.
3934 // Returns the number of __kmp_threads entries directly and indirectly freed.
3935 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3936   kmp_team_t *root_team = root->r.r_root_team;
3937   kmp_team_t *hot_team = root->r.r_hot_team;
3938   int n = hot_team->t.t_nproc;
3939   int i;
3940 
3941   KMP_DEBUG_ASSERT(!root->r.r_active);
3942 
3943   root->r.r_root_team = NULL;
3944   root->r.r_hot_team = NULL;
3945   // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3946   // before call to __kmp_free_team().
3947   __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3948 #if KMP_NESTED_HOT_TEAMS
3949   if (__kmp_hot_teams_max_level >
3950       0) { // need to free nested hot teams and their threads if any
3951     for (i = 0; i < hot_team->t.t_nproc; ++i) {
3952       kmp_info_t *th = hot_team->t.t_threads[i];
3953       if (__kmp_hot_teams_max_level > 1) {
3954         n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3955       }
3956       if (th->th.th_hot_teams) {
3957         __kmp_free(th->th.th_hot_teams);
3958         th->th.th_hot_teams = NULL;
3959       }
3960     }
3961   }
3962 #endif
3963   __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3964 
3965   // Before we can reap the thread, we need to make certain that all other
3966   // threads in the teams that had this root as ancestor have stopped trying to
3967   // steal tasks.
3968   if (__kmp_tasking_mode != tskm_immediate_exec) {
3969     __kmp_wait_to_unref_task_teams();
3970   }
3971 
3972 #if KMP_OS_WINDOWS
3973   /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3974   KA_TRACE(
3975       10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3976            "\n",
3977            (LPVOID) & (root->r.r_uber_thread->th),
3978            root->r.r_uber_thread->th.th_info.ds.ds_thread));
3979   __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3980 #endif /* KMP_OS_WINDOWS */
3981 
3982 #if OMPD_SUPPORT
3983   if (ompd_state & OMPD_ENABLE_BP)
3984     ompd_bp_thread_end();
3985 #endif
3986 
3987 #if OMPT_SUPPORT
3988   ompt_data_t *task_data;
3989   ompt_data_t *parallel_data;
3990   __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
3991                                 NULL);
3992   if (ompt_enabled.ompt_callback_implicit_task) {
3993     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3994         ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
3995   }
3996   if (ompt_enabled.ompt_callback_thread_end) {
3997     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3998         &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3999   }
4000 #endif
4001 
4002   TCW_4(__kmp_nth,
4003         __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
4004   i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
4005   KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
4006                  " to %d\n",
4007                  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
4008                  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
4009   if (i == 1) {
4010     // need to free contention group structure
4011     KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
4012                      root->r.r_uber_thread->th.th_cg_roots->cg_root);
4013     KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
4014     __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
4015     root->r.r_uber_thread->th.th_cg_roots = NULL;
4016   }
4017   __kmp_reap_thread(root->r.r_uber_thread, 1);
4018 
4019   // We canot put root thread to __kmp_thread_pool, so we have to reap it
4020   // instead of freeing.
4021   root->r.r_uber_thread = NULL;
4022   /* mark root as no longer in use */
4023   root->r.r_begin = FALSE;
4024 
4025   return n;
4026 }
4027 
4028 void __kmp_unregister_root_current_thread(int gtid) {
4029   KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
4030   /* this lock should be ok, since unregister_root_current_thread is never
4031      called during an abort, only during a normal close. furthermore, if you
4032      have the forkjoin lock, you should never try to get the initz lock */
4033   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
4034   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
4035     KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
4036                   "exiting T#%d\n",
4037                   gtid));
4038     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4039     return;
4040   }
4041   kmp_root_t *root = __kmp_root[gtid];
4042 
4043   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4044   KMP_ASSERT(KMP_UBER_GTID(gtid));
4045   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4046   KMP_ASSERT(root->r.r_active == FALSE);
4047 
4048   KMP_MB();
4049 
4050   kmp_info_t *thread = __kmp_threads[gtid];
4051   kmp_team_t *team = thread->th.th_team;
4052   kmp_task_team_t *task_team = thread->th.th_task_team;
4053 
4054   // we need to wait for the proxy tasks before finishing the thread
4055   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
4056 #if OMPT_SUPPORT
4057     // the runtime is shutting down so we won't report any events
4058     thread->th.ompt_thread_info.state = ompt_state_undefined;
4059 #endif
4060     __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4061   }
4062 
4063   __kmp_reset_root(gtid, root);
4064 
4065   KMP_MB();
4066   KC_TRACE(10,
4067            ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4068 
4069   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4070 }
4071 
4072 #if KMP_OS_WINDOWS
4073 /* __kmp_forkjoin_lock must be already held
4074    Unregisters a root thread that is not the current thread.  Returns the number
4075    of __kmp_threads entries freed as a result. */
4076 static int __kmp_unregister_root_other_thread(int gtid) {
4077   kmp_root_t *root = __kmp_root[gtid];
4078   int r;
4079 
4080   KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4081   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4082   KMP_ASSERT(KMP_UBER_GTID(gtid));
4083   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4084   KMP_ASSERT(root->r.r_active == FALSE);
4085 
4086   r = __kmp_reset_root(gtid, root);
4087   KC_TRACE(10,
4088            ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4089   return r;
4090 }
4091 #endif
4092 
4093 #if KMP_DEBUG
4094 void __kmp_task_info() {
4095 
4096   kmp_int32 gtid = __kmp_entry_gtid();
4097   kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4098   kmp_info_t *this_thr = __kmp_threads[gtid];
4099   kmp_team_t *steam = this_thr->th.th_serial_team;
4100   kmp_team_t *team = this_thr->th.th_team;
4101 
4102   __kmp_printf(
4103       "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4104       "ptask=%p\n",
4105       gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4106       team->t.t_implicit_task_taskdata[tid].td_parent);
4107 }
4108 #endif // KMP_DEBUG
4109 
4110 /* TODO optimize with one big memclr, take out what isn't needed, split
4111    responsibility to workers as much as possible, and delay initialization of
4112    features as much as possible  */
4113 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4114                                   int tid, int gtid) {
4115   /* this_thr->th.th_info.ds.ds_gtid is setup in
4116      kmp_allocate_thread/create_worker.
4117      this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4118   KMP_DEBUG_ASSERT(this_thr != NULL);
4119   KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4120   KMP_DEBUG_ASSERT(team);
4121   KMP_DEBUG_ASSERT(team->t.t_threads);
4122   KMP_DEBUG_ASSERT(team->t.t_dispatch);
4123   kmp_info_t *master = team->t.t_threads[0];
4124   KMP_DEBUG_ASSERT(master);
4125   KMP_DEBUG_ASSERT(master->th.th_root);
4126 
4127   KMP_MB();
4128 
4129   TCW_SYNC_PTR(this_thr->th.th_team, team);
4130 
4131   this_thr->th.th_info.ds.ds_tid = tid;
4132   this_thr->th.th_set_nproc = 0;
4133   if (__kmp_tasking_mode != tskm_immediate_exec)
4134     // When tasking is possible, threads are not safe to reap until they are
4135     // done tasking; this will be set when tasking code is exited in wait
4136     this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4137   else // no tasking --> always safe to reap
4138     this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4139   this_thr->th.th_set_proc_bind = proc_bind_default;
4140 #if KMP_AFFINITY_SUPPORTED
4141   this_thr->th.th_new_place = this_thr->th.th_current_place;
4142 #endif
4143   this_thr->th.th_root = master->th.th_root;
4144 
4145   /* setup the thread's cache of the team structure */
4146   this_thr->th.th_team_nproc = team->t.t_nproc;
4147   this_thr->th.th_team_master = master;
4148   this_thr->th.th_team_serialized = team->t.t_serialized;
4149 
4150   KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4151 
4152   KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4153                 tid, gtid, this_thr, this_thr->th.th_current_task));
4154 
4155   __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4156                            team, tid, TRUE);
4157 
4158   KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4159                 tid, gtid, this_thr, this_thr->th.th_current_task));
4160   // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4161   // __kmp_initialize_team()?
4162 
4163   /* TODO no worksharing in speculative threads */
4164   this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4165 
4166   this_thr->th.th_local.this_construct = 0;
4167 
4168   if (!this_thr->th.th_pri_common) {
4169     this_thr->th.th_pri_common =
4170         (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4171     if (__kmp_storage_map) {
4172       __kmp_print_storage_map_gtid(
4173           gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4174           sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4175     }
4176     this_thr->th.th_pri_head = NULL;
4177   }
4178 
4179   if (this_thr != master && // Primary thread's CG root is initialized elsewhere
4180       this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4181     // Make new thread's CG root same as primary thread's
4182     KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4183     kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4184     if (tmp) {
4185       // worker changes CG, need to check if old CG should be freed
4186       int i = tmp->cg_nthreads--;
4187       KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4188                      " on node %p of thread %p to %d\n",
4189                      this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4190       if (i == 1) {
4191         __kmp_free(tmp); // last thread left CG --> free it
4192       }
4193     }
4194     this_thr->th.th_cg_roots = master->th.th_cg_roots;
4195     // Increment new thread's CG root's counter to add the new thread
4196     this_thr->th.th_cg_roots->cg_nthreads++;
4197     KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4198                    " node %p of thread %p to %d\n",
4199                    this_thr, this_thr->th.th_cg_roots,
4200                    this_thr->th.th_cg_roots->cg_root,
4201                    this_thr->th.th_cg_roots->cg_nthreads));
4202     this_thr->th.th_current_task->td_icvs.thread_limit =
4203         this_thr->th.th_cg_roots->cg_thread_limit;
4204   }
4205 
4206   /* Initialize dynamic dispatch */
4207   {
4208     volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4209     // Use team max_nproc since this will never change for the team.
4210     size_t disp_size =
4211         sizeof(dispatch_private_info_t) *
4212         (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4213     KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4214                   team->t.t_max_nproc));
4215     KMP_ASSERT(dispatch);
4216     KMP_DEBUG_ASSERT(team->t.t_dispatch);
4217     KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4218 
4219     dispatch->th_disp_index = 0;
4220     dispatch->th_doacross_buf_idx = 0;
4221     if (!dispatch->th_disp_buffer) {
4222       dispatch->th_disp_buffer =
4223           (dispatch_private_info_t *)__kmp_allocate(disp_size);
4224 
4225       if (__kmp_storage_map) {
4226         __kmp_print_storage_map_gtid(
4227             gtid, &dispatch->th_disp_buffer[0],
4228             &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4229                                           ? 1
4230                                           : __kmp_dispatch_num_buffers],
4231             disp_size,
4232             "th_%d.th_dispatch.th_disp_buffer "
4233             "(team_%d.t_dispatch[%d].th_disp_buffer)",
4234             gtid, team->t.t_id, gtid);
4235       }
4236     } else {
4237       memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4238     }
4239 
4240     dispatch->th_dispatch_pr_current = 0;
4241     dispatch->th_dispatch_sh_current = 0;
4242 
4243     dispatch->th_deo_fcn = 0; /* ORDERED     */
4244     dispatch->th_dxo_fcn = 0; /* END ORDERED */
4245   }
4246 
4247   this_thr->th.th_next_pool = NULL;
4248 
4249   if (!this_thr->th.th_task_state_memo_stack) {
4250     size_t i;
4251     this_thr->th.th_task_state_memo_stack =
4252         (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4253     this_thr->th.th_task_state_top = 0;
4254     this_thr->th.th_task_state_stack_sz = 4;
4255     for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4256          ++i) // zero init the stack
4257       this_thr->th.th_task_state_memo_stack[i] = 0;
4258   }
4259 
4260   KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4261   KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4262 
4263   KMP_MB();
4264 }
4265 
4266 /* allocate a new thread for the requesting team. this is only called from
4267    within a forkjoin critical section. we will first try to get an available
4268    thread from the thread pool. if none is available, we will fork a new one
4269    assuming we are able to create a new one. this should be assured, as the
4270    caller should check on this first. */
4271 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4272                                   int new_tid) {
4273   kmp_team_t *serial_team;
4274   kmp_info_t *new_thr;
4275   int new_gtid;
4276 
4277   KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4278   KMP_DEBUG_ASSERT(root && team);
4279 #if !KMP_NESTED_HOT_TEAMS
4280   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4281 #endif
4282   KMP_MB();
4283 
4284   /* first, try to get one from the thread pool */
4285   if (__kmp_thread_pool) {
4286     new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4287     __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4288     if (new_thr == __kmp_thread_pool_insert_pt) {
4289       __kmp_thread_pool_insert_pt = NULL;
4290     }
4291     TCW_4(new_thr->th.th_in_pool, FALSE);
4292     __kmp_suspend_initialize_thread(new_thr);
4293     __kmp_lock_suspend_mx(new_thr);
4294     if (new_thr->th.th_active_in_pool == TRUE) {
4295       KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4296       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4297       new_thr->th.th_active_in_pool = FALSE;
4298     }
4299     __kmp_unlock_suspend_mx(new_thr);
4300 
4301     KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4302                   __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4303     KMP_ASSERT(!new_thr->th.th_team);
4304     KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4305 
4306     /* setup the thread structure */
4307     __kmp_initialize_info(new_thr, team, new_tid,
4308                           new_thr->th.th_info.ds.ds_gtid);
4309     KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4310 
4311     TCW_4(__kmp_nth, __kmp_nth + 1);
4312 
4313     new_thr->th.th_task_state = 0;
4314     new_thr->th.th_task_state_top = 0;
4315     new_thr->th.th_task_state_stack_sz = 4;
4316 
4317     if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
4318       // Make sure pool thread has transitioned to waiting on own thread struct
4319       KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0);
4320       // Thread activated in __kmp_allocate_team when increasing team size
4321     }
4322 
4323 #ifdef KMP_ADJUST_BLOCKTIME
4324     /* Adjust blocktime back to zero if necessary */
4325     /* Middle initialization might not have occurred yet */
4326     if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4327       if (__kmp_nth > __kmp_avail_proc) {
4328         __kmp_zero_bt = TRUE;
4329       }
4330     }
4331 #endif /* KMP_ADJUST_BLOCKTIME */
4332 
4333 #if KMP_DEBUG
4334     // If thread entered pool via __kmp_free_thread, wait_flag should !=
4335     // KMP_BARRIER_PARENT_FLAG.
4336     int b;
4337     kmp_balign_t *balign = new_thr->th.th_bar;
4338     for (b = 0; b < bs_last_barrier; ++b)
4339       KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4340 #endif
4341 
4342     KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4343                   __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4344 
4345     KMP_MB();
4346     return new_thr;
4347   }
4348 
4349   /* no, well fork a new one */
4350   KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4351   KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4352 
4353 #if KMP_USE_MONITOR
4354   // If this is the first worker thread the RTL is creating, then also
4355   // launch the monitor thread.  We try to do this as early as possible.
4356   if (!TCR_4(__kmp_init_monitor)) {
4357     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4358     if (!TCR_4(__kmp_init_monitor)) {
4359       KF_TRACE(10, ("before __kmp_create_monitor\n"));
4360       TCW_4(__kmp_init_monitor, 1);
4361       __kmp_create_monitor(&__kmp_monitor);
4362       KF_TRACE(10, ("after __kmp_create_monitor\n"));
4363 #if KMP_OS_WINDOWS
4364       // AC: wait until monitor has started. This is a fix for CQ232808.
4365       // The reason is that if the library is loaded/unloaded in a loop with
4366       // small (parallel) work in between, then there is high probability that
4367       // monitor thread started after the library shutdown. At shutdown it is
4368       // too late to cope with the problem, because when the primary thread is
4369       // in DllMain (process detach) the monitor has no chances to start (it is
4370       // blocked), and primary thread has no means to inform the monitor that
4371       // the library has gone, because all the memory which the monitor can
4372       // access is going to be released/reset.
4373       while (TCR_4(__kmp_init_monitor) < 2) {
4374         KMP_YIELD(TRUE);
4375       }
4376       KF_TRACE(10, ("after monitor thread has started\n"));
4377 #endif
4378     }
4379     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4380   }
4381 #endif
4382 
4383   KMP_MB();
4384 
4385   {
4386     int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4387                              ? 1
4388                              : __kmp_hidden_helper_threads_num + 1;
4389 
4390     for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4391          ++new_gtid) {
4392       KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4393     }
4394 
4395     if (TCR_4(__kmp_init_hidden_helper_threads)) {
4396       KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4397     }
4398   }
4399 
4400   /* allocate space for it. */
4401   new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4402 
4403   TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4404 
4405 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4406   // suppress race conditions detection on synchronization flags in debug mode
4407   // this helps to analyze library internals eliminating false positives
4408   __itt_suppress_mark_range(
4409       __itt_suppress_range, __itt_suppress_threading_errors,
4410       &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4411   __itt_suppress_mark_range(
4412       __itt_suppress_range, __itt_suppress_threading_errors,
4413       &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4414 #if KMP_OS_WINDOWS
4415   __itt_suppress_mark_range(
4416       __itt_suppress_range, __itt_suppress_threading_errors,
4417       &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4418 #else
4419   __itt_suppress_mark_range(__itt_suppress_range,
4420                             __itt_suppress_threading_errors,
4421                             &new_thr->th.th_suspend_init_count,
4422                             sizeof(new_thr->th.th_suspend_init_count));
4423 #endif
4424   // TODO: check if we need to also suppress b_arrived flags
4425   __itt_suppress_mark_range(__itt_suppress_range,
4426                             __itt_suppress_threading_errors,
4427                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4428                             sizeof(new_thr->th.th_bar[0].bb.b_go));
4429   __itt_suppress_mark_range(__itt_suppress_range,
4430                             __itt_suppress_threading_errors,
4431                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4432                             sizeof(new_thr->th.th_bar[1].bb.b_go));
4433   __itt_suppress_mark_range(__itt_suppress_range,
4434                             __itt_suppress_threading_errors,
4435                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4436                             sizeof(new_thr->th.th_bar[2].bb.b_go));
4437 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4438   if (__kmp_storage_map) {
4439     __kmp_print_thread_storage_map(new_thr, new_gtid);
4440   }
4441 
4442   // add the reserve serialized team, initialized from the team's primary thread
4443   {
4444     kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4445     KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4446     new_thr->th.th_serial_team = serial_team =
4447         (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4448 #if OMPT_SUPPORT
4449                                           ompt_data_none, // root parallel id
4450 #endif
4451                                           proc_bind_default, &r_icvs,
4452                                           0 USE_NESTED_HOT_ARG(NULL));
4453   }
4454   KMP_ASSERT(serial_team);
4455   serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for
4456   // execution (it is unused for now).
4457   serial_team->t.t_threads[0] = new_thr;
4458   KF_TRACE(10,
4459            ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4460             new_thr));
4461 
4462   /* setup the thread structures */
4463   __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4464 
4465 #if USE_FAST_MEMORY
4466   __kmp_initialize_fast_memory(new_thr);
4467 #endif /* USE_FAST_MEMORY */
4468 
4469 #if KMP_USE_BGET
4470   KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4471   __kmp_initialize_bget(new_thr);
4472 #endif
4473 
4474   __kmp_init_random(new_thr); // Initialize random number generator
4475 
4476   /* Initialize these only once when thread is grabbed for a team allocation */
4477   KA_TRACE(20,
4478            ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4479             __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4480 
4481   int b;
4482   kmp_balign_t *balign = new_thr->th.th_bar;
4483   for (b = 0; b < bs_last_barrier; ++b) {
4484     balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4485     balign[b].bb.team = NULL;
4486     balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4487     balign[b].bb.use_oncore_barrier = 0;
4488   }
4489 
4490   TCW_PTR(new_thr->th.th_sleep_loc, NULL);
4491   new_thr->th.th_sleep_loc_type = flag_unset;
4492 
4493   new_thr->th.th_spin_here = FALSE;
4494   new_thr->th.th_next_waiting = 0;
4495 #if KMP_OS_UNIX
4496   new_thr->th.th_blocking = false;
4497 #endif
4498 
4499 #if KMP_AFFINITY_SUPPORTED
4500   new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4501   new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4502   new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4503   new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4504 #endif
4505   new_thr->th.th_def_allocator = __kmp_def_allocator;
4506   new_thr->th.th_prev_level = 0;
4507   new_thr->th.th_prev_num_threads = 1;
4508 
4509   TCW_4(new_thr->th.th_in_pool, FALSE);
4510   new_thr->th.th_active_in_pool = FALSE;
4511   TCW_4(new_thr->th.th_active, TRUE);
4512 
4513   /* adjust the global counters */
4514   __kmp_all_nth++;
4515   __kmp_nth++;
4516 
4517   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4518   // numbers of procs, and method #2 (keyed API call) for higher numbers.
4519   if (__kmp_adjust_gtid_mode) {
4520     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4521       if (TCR_4(__kmp_gtid_mode) != 2) {
4522         TCW_4(__kmp_gtid_mode, 2);
4523       }
4524     } else {
4525       if (TCR_4(__kmp_gtid_mode) != 1) {
4526         TCW_4(__kmp_gtid_mode, 1);
4527       }
4528     }
4529   }
4530 
4531 #ifdef KMP_ADJUST_BLOCKTIME
4532   /* Adjust blocktime back to zero if necessary       */
4533   /* Middle initialization might not have occurred yet */
4534   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4535     if (__kmp_nth > __kmp_avail_proc) {
4536       __kmp_zero_bt = TRUE;
4537     }
4538   }
4539 #endif /* KMP_ADJUST_BLOCKTIME */
4540 
4541   /* actually fork it and create the new worker thread */
4542   KF_TRACE(
4543       10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4544   __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4545   KF_TRACE(10,
4546            ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4547 
4548   KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4549                 new_gtid));
4550   KMP_MB();
4551   return new_thr;
4552 }
4553 
4554 /* Reinitialize team for reuse.
4555    The hot team code calls this case at every fork barrier, so EPCC barrier
4556    test are extremely sensitive to changes in it, esp. writes to the team
4557    struct, which cause a cache invalidation in all threads.
4558    IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4559 static void __kmp_reinitialize_team(kmp_team_t *team,
4560                                     kmp_internal_control_t *new_icvs,
4561                                     ident_t *loc) {
4562   KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4563                 team->t.t_threads[0], team));
4564   KMP_DEBUG_ASSERT(team && new_icvs);
4565   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4566   KMP_CHECK_UPDATE(team->t.t_ident, loc);
4567 
4568   KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4569   // Copy ICVs to the primary thread's implicit taskdata
4570   __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4571   copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4572 
4573   KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4574                 team->t.t_threads[0], team));
4575 }
4576 
4577 /* Initialize the team data structure.
4578    This assumes the t_threads and t_max_nproc are already set.
4579    Also, we don't touch the arguments */
4580 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4581                                   kmp_internal_control_t *new_icvs,
4582                                   ident_t *loc) {
4583   KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4584 
4585   /* verify */
4586   KMP_DEBUG_ASSERT(team);
4587   KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4588   KMP_DEBUG_ASSERT(team->t.t_threads);
4589   KMP_MB();
4590 
4591   team->t.t_master_tid = 0; /* not needed */
4592   /* team->t.t_master_bar;        not needed */
4593   team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4594   team->t.t_nproc = new_nproc;
4595 
4596   /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
4597   team->t.t_next_pool = NULL;
4598   /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4599    * up hot team */
4600 
4601   TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4602   team->t.t_invoke = NULL; /* not needed */
4603 
4604   // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
4605   team->t.t_sched.sched = new_icvs->sched.sched;
4606 
4607 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4608   team->t.t_fp_control_saved = FALSE; /* not needed */
4609   team->t.t_x87_fpu_control_word = 0; /* not needed */
4610   team->t.t_mxcsr = 0; /* not needed */
4611 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4612 
4613   team->t.t_construct = 0;
4614 
4615   team->t.t_ordered.dt.t_value = 0;
4616   team->t.t_master_active = FALSE;
4617 
4618 #ifdef KMP_DEBUG
4619   team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4620 #endif
4621 #if KMP_OS_WINDOWS
4622   team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4623 #endif
4624 
4625   team->t.t_control_stack_top = NULL;
4626 
4627   __kmp_reinitialize_team(team, new_icvs, loc);
4628 
4629   KMP_MB();
4630   KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4631 }
4632 
4633 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
4634 /* Sets full mask for thread and returns old mask, no changes to structures. */
4635 static void
4636 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4637   if (KMP_AFFINITY_CAPABLE()) {
4638     int status;
4639     if (old_mask != NULL) {
4640       status = __kmp_get_system_affinity(old_mask, TRUE);
4641       int error = errno;
4642       if (status != 0) {
4643         __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4644                     __kmp_msg_null);
4645       }
4646     }
4647     __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4648   }
4649 }
4650 #endif
4651 
4652 #if KMP_AFFINITY_SUPPORTED
4653 
4654 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4655 // It calculates the worker + primary thread's partition based upon the parent
4656 // thread's partition, and binds each worker to a thread in their partition.
4657 // The primary thread's partition should already include its current binding.
4658 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4659   // Do not partition places for the hidden helper team
4660   if (KMP_HIDDEN_HELPER_TEAM(team))
4661     return;
4662   // Copy the primary thread's place partition to the team struct
4663   kmp_info_t *master_th = team->t.t_threads[0];
4664   KMP_DEBUG_ASSERT(master_th != NULL);
4665   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4666   int first_place = master_th->th.th_first_place;
4667   int last_place = master_th->th.th_last_place;
4668   int masters_place = master_th->th.th_current_place;
4669   team->t.t_first_place = first_place;
4670   team->t.t_last_place = last_place;
4671 
4672   KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4673                 "bound to place %d partition = [%d,%d]\n",
4674                 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4675                 team->t.t_id, masters_place, first_place, last_place));
4676 
4677   switch (proc_bind) {
4678 
4679   case proc_bind_default:
4680     // Serial teams might have the proc_bind policy set to proc_bind_default.
4681     // Not an issue -- we don't rebind primary thread for any proc_bind policy.
4682     KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4683     break;
4684 
4685   case proc_bind_primary: {
4686     int f;
4687     int n_th = team->t.t_nproc;
4688     for (f = 1; f < n_th; f++) {
4689       kmp_info_t *th = team->t.t_threads[f];
4690       KMP_DEBUG_ASSERT(th != NULL);
4691       th->th.th_first_place = first_place;
4692       th->th.th_last_place = last_place;
4693       th->th.th_new_place = masters_place;
4694       if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4695           team->t.t_display_affinity != 1) {
4696         team->t.t_display_affinity = 1;
4697       }
4698 
4699       KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
4700                      "partition = [%d,%d]\n",
4701                      __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4702                      f, masters_place, first_place, last_place));
4703     }
4704   } break;
4705 
4706   case proc_bind_close: {
4707     int f;
4708     int n_th = team->t.t_nproc;
4709     int n_places;
4710     if (first_place <= last_place) {
4711       n_places = last_place - first_place + 1;
4712     } else {
4713       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4714     }
4715     if (n_th <= n_places) {
4716       int place = masters_place;
4717       for (f = 1; f < n_th; f++) {
4718         kmp_info_t *th = team->t.t_threads[f];
4719         KMP_DEBUG_ASSERT(th != NULL);
4720 
4721         if (place == last_place) {
4722           place = first_place;
4723         } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4724           place = 0;
4725         } else {
4726           place++;
4727         }
4728         th->th.th_first_place = first_place;
4729         th->th.th_last_place = last_place;
4730         th->th.th_new_place = place;
4731         if (__kmp_display_affinity && place != th->th.th_current_place &&
4732             team->t.t_display_affinity != 1) {
4733           team->t.t_display_affinity = 1;
4734         }
4735 
4736         KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4737                        "partition = [%d,%d]\n",
4738                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4739                        team->t.t_id, f, place, first_place, last_place));
4740       }
4741     } else {
4742       int S, rem, gap, s_count;
4743       S = n_th / n_places;
4744       s_count = 0;
4745       rem = n_th - (S * n_places);
4746       gap = rem > 0 ? n_places / rem : n_places;
4747       int place = masters_place;
4748       int gap_ct = gap;
4749       for (f = 0; f < n_th; f++) {
4750         kmp_info_t *th = team->t.t_threads[f];
4751         KMP_DEBUG_ASSERT(th != NULL);
4752 
4753         th->th.th_first_place = first_place;
4754         th->th.th_last_place = last_place;
4755         th->th.th_new_place = place;
4756         if (__kmp_display_affinity && place != th->th.th_current_place &&
4757             team->t.t_display_affinity != 1) {
4758           team->t.t_display_affinity = 1;
4759         }
4760         s_count++;
4761 
4762         if ((s_count == S) && rem && (gap_ct == gap)) {
4763           // do nothing, add an extra thread to place on next iteration
4764         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4765           // we added an extra thread to this place; move to next place
4766           if (place == last_place) {
4767             place = first_place;
4768           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4769             place = 0;
4770           } else {
4771             place++;
4772           }
4773           s_count = 0;
4774           gap_ct = 1;
4775           rem--;
4776         } else if (s_count == S) { // place full; don't add extra
4777           if (place == last_place) {
4778             place = first_place;
4779           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4780             place = 0;
4781           } else {
4782             place++;
4783           }
4784           gap_ct++;
4785           s_count = 0;
4786         }
4787 
4788         KA_TRACE(100,
4789                  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4790                   "partition = [%d,%d]\n",
4791                   __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4792                   th->th.th_new_place, first_place, last_place));
4793       }
4794       KMP_DEBUG_ASSERT(place == masters_place);
4795     }
4796   } break;
4797 
4798   case proc_bind_spread: {
4799     int f;
4800     int n_th = team->t.t_nproc;
4801     int n_places;
4802     int thidx;
4803     if (first_place <= last_place) {
4804       n_places = last_place - first_place + 1;
4805     } else {
4806       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4807     }
4808     if (n_th <= n_places) {
4809       int place = -1;
4810 
4811       if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4812         int S = n_places / n_th;
4813         int s_count, rem, gap, gap_ct;
4814 
4815         place = masters_place;
4816         rem = n_places - n_th * S;
4817         gap = rem ? n_th / rem : 1;
4818         gap_ct = gap;
4819         thidx = n_th;
4820         if (update_master_only == 1)
4821           thidx = 1;
4822         for (f = 0; f < thidx; f++) {
4823           kmp_info_t *th = team->t.t_threads[f];
4824           KMP_DEBUG_ASSERT(th != NULL);
4825 
4826           th->th.th_first_place = place;
4827           th->th.th_new_place = place;
4828           if (__kmp_display_affinity && place != th->th.th_current_place &&
4829               team->t.t_display_affinity != 1) {
4830             team->t.t_display_affinity = 1;
4831           }
4832           s_count = 1;
4833           while (s_count < S) {
4834             if (place == last_place) {
4835               place = first_place;
4836             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4837               place = 0;
4838             } else {
4839               place++;
4840             }
4841             s_count++;
4842           }
4843           if (rem && (gap_ct == gap)) {
4844             if (place == last_place) {
4845               place = first_place;
4846             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4847               place = 0;
4848             } else {
4849               place++;
4850             }
4851             rem--;
4852             gap_ct = 0;
4853           }
4854           th->th.th_last_place = place;
4855           gap_ct++;
4856 
4857           if (place == last_place) {
4858             place = first_place;
4859           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4860             place = 0;
4861           } else {
4862             place++;
4863           }
4864 
4865           KA_TRACE(100,
4866                    ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4867                     "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4868                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4869                     f, th->th.th_new_place, th->th.th_first_place,
4870                     th->th.th_last_place, __kmp_affinity_num_masks));
4871         }
4872       } else {
4873         /* Having uniform space of available computation places I can create
4874            T partitions of round(P/T) size and put threads into the first
4875            place of each partition. */
4876         double current = static_cast<double>(masters_place);
4877         double spacing =
4878             (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
4879         int first, last;
4880         kmp_info_t *th;
4881 
4882         thidx = n_th + 1;
4883         if (update_master_only == 1)
4884           thidx = 1;
4885         for (f = 0; f < thidx; f++) {
4886           first = static_cast<int>(current);
4887           last = static_cast<int>(current + spacing) - 1;
4888           KMP_DEBUG_ASSERT(last >= first);
4889           if (first >= n_places) {
4890             if (masters_place) {
4891               first -= n_places;
4892               last -= n_places;
4893               if (first == (masters_place + 1)) {
4894                 KMP_DEBUG_ASSERT(f == n_th);
4895                 first--;
4896               }
4897               if (last == masters_place) {
4898                 KMP_DEBUG_ASSERT(f == (n_th - 1));
4899                 last--;
4900               }
4901             } else {
4902               KMP_DEBUG_ASSERT(f == n_th);
4903               first = 0;
4904               last = 0;
4905             }
4906           }
4907           if (last >= n_places) {
4908             last = (n_places - 1);
4909           }
4910           place = first;
4911           current += spacing;
4912           if (f < n_th) {
4913             KMP_DEBUG_ASSERT(0 <= first);
4914             KMP_DEBUG_ASSERT(n_places > first);
4915             KMP_DEBUG_ASSERT(0 <= last);
4916             KMP_DEBUG_ASSERT(n_places > last);
4917             KMP_DEBUG_ASSERT(last_place >= first_place);
4918             th = team->t.t_threads[f];
4919             KMP_DEBUG_ASSERT(th);
4920             th->th.th_first_place = first;
4921             th->th.th_new_place = place;
4922             th->th.th_last_place = last;
4923             if (__kmp_display_affinity && place != th->th.th_current_place &&
4924                 team->t.t_display_affinity != 1) {
4925               team->t.t_display_affinity = 1;
4926             }
4927             KA_TRACE(100,
4928                      ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4929                       "partition = [%d,%d], spacing = %.4f\n",
4930                       __kmp_gtid_from_thread(team->t.t_threads[f]),
4931                       team->t.t_id, f, th->th.th_new_place,
4932                       th->th.th_first_place, th->th.th_last_place, spacing));
4933           }
4934         }
4935       }
4936       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4937     } else {
4938       int S, rem, gap, s_count;
4939       S = n_th / n_places;
4940       s_count = 0;
4941       rem = n_th - (S * n_places);
4942       gap = rem > 0 ? n_places / rem : n_places;
4943       int place = masters_place;
4944       int gap_ct = gap;
4945       thidx = n_th;
4946       if (update_master_only == 1)
4947         thidx = 1;
4948       for (f = 0; f < thidx; f++) {
4949         kmp_info_t *th = team->t.t_threads[f];
4950         KMP_DEBUG_ASSERT(th != NULL);
4951 
4952         th->th.th_first_place = place;
4953         th->th.th_last_place = place;
4954         th->th.th_new_place = place;
4955         if (__kmp_display_affinity && place != th->th.th_current_place &&
4956             team->t.t_display_affinity != 1) {
4957           team->t.t_display_affinity = 1;
4958         }
4959         s_count++;
4960 
4961         if ((s_count == S) && rem && (gap_ct == gap)) {
4962           // do nothing, add an extra thread to place on next iteration
4963         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4964           // we added an extra thread to this place; move on to next place
4965           if (place == last_place) {
4966             place = first_place;
4967           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4968             place = 0;
4969           } else {
4970             place++;
4971           }
4972           s_count = 0;
4973           gap_ct = 1;
4974           rem--;
4975         } else if (s_count == S) { // place is full; don't add extra thread
4976           if (place == last_place) {
4977             place = first_place;
4978           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4979             place = 0;
4980           } else {
4981             place++;
4982           }
4983           gap_ct++;
4984           s_count = 0;
4985         }
4986 
4987         KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4988                        "partition = [%d,%d]\n",
4989                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4990                        team->t.t_id, f, th->th.th_new_place,
4991                        th->th.th_first_place, th->th.th_last_place));
4992       }
4993       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4994     }
4995   } break;
4996 
4997   default:
4998     break;
4999   }
5000 
5001   KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
5002 }
5003 
5004 #endif // KMP_AFFINITY_SUPPORTED
5005 
5006 /* allocate a new team data structure to use.  take one off of the free pool if
5007    available */
5008 kmp_team_t *
5009 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
5010 #if OMPT_SUPPORT
5011                     ompt_data_t ompt_parallel_data,
5012 #endif
5013                     kmp_proc_bind_t new_proc_bind,
5014                     kmp_internal_control_t *new_icvs,
5015                     int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5016   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
5017   int f;
5018   kmp_team_t *team;
5019   int use_hot_team = !root->r.r_active;
5020   int level = 0;
5021 
5022   KA_TRACE(20, ("__kmp_allocate_team: called\n"));
5023   KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
5024   KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
5025   KMP_MB();
5026 
5027 #if KMP_NESTED_HOT_TEAMS
5028   kmp_hot_team_ptr_t *hot_teams;
5029   if (master) {
5030     team = master->th.th_team;
5031     level = team->t.t_active_level;
5032     if (master->th.th_teams_microtask) { // in teams construct?
5033       if (master->th.th_teams_size.nteams > 1 &&
5034           ( // #teams > 1
5035               team->t.t_pkfn ==
5036                   (microtask_t)__kmp_teams_master || // inner fork of the teams
5037               master->th.th_teams_level <
5038                   team->t.t_level)) { // or nested parallel inside the teams
5039         ++level; // not increment if #teams==1, or for outer fork of the teams;
5040         // increment otherwise
5041       }
5042     }
5043     hot_teams = master->th.th_hot_teams;
5044     if (level < __kmp_hot_teams_max_level && hot_teams &&
5045         hot_teams[level].hot_team) {
5046       // hot team has already been allocated for given level
5047       use_hot_team = 1;
5048     } else {
5049       use_hot_team = 0;
5050     }
5051   } else {
5052     // check we won't access uninitialized hot_teams, just in case
5053     KMP_DEBUG_ASSERT(new_nproc == 1);
5054   }
5055 #endif
5056   // Optimization to use a "hot" team
5057   if (use_hot_team && new_nproc > 1) {
5058     KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
5059 #if KMP_NESTED_HOT_TEAMS
5060     team = hot_teams[level].hot_team;
5061 #else
5062     team = root->r.r_hot_team;
5063 #endif
5064 #if KMP_DEBUG
5065     if (__kmp_tasking_mode != tskm_immediate_exec) {
5066       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5067                     "task_team[1] = %p before reinit\n",
5068                     team->t.t_task_team[0], team->t.t_task_team[1]));
5069     }
5070 #endif
5071 
5072     if (team->t.t_nproc != new_nproc &&
5073         __kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5074       // Distributed barrier may need a resize
5075       int old_nthr = team->t.t_nproc;
5076       __kmp_resize_dist_barrier(team, old_nthr, new_nproc);
5077     }
5078 
5079     // Has the number of threads changed?
5080     /* Let's assume the most common case is that the number of threads is
5081        unchanged, and put that case first. */
5082     if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5083       KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5084       // This case can mean that omp_set_num_threads() was called and the hot
5085       // team size was already reduced, so we check the special flag
5086       if (team->t.t_size_changed == -1) {
5087         team->t.t_size_changed = 1;
5088       } else {
5089         KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5090       }
5091 
5092       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5093       kmp_r_sched_t new_sched = new_icvs->sched;
5094       // set primary thread's schedule as new run-time schedule
5095       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5096 
5097       __kmp_reinitialize_team(team, new_icvs,
5098                               root->r.r_uber_thread->th.th_ident);
5099 
5100       KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5101                     team->t.t_threads[0], team));
5102       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5103 
5104 #if KMP_AFFINITY_SUPPORTED
5105       if ((team->t.t_size_changed == 0) &&
5106           (team->t.t_proc_bind == new_proc_bind)) {
5107         if (new_proc_bind == proc_bind_spread) {
5108           __kmp_partition_places(
5109               team, 1); // add flag to update only master for spread
5110         }
5111         KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5112                        "proc_bind = %d, partition = [%d,%d]\n",
5113                        team->t.t_id, new_proc_bind, team->t.t_first_place,
5114                        team->t.t_last_place));
5115       } else {
5116         KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5117         __kmp_partition_places(team);
5118       }
5119 #else
5120       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5121 #endif /* KMP_AFFINITY_SUPPORTED */
5122     } else if (team->t.t_nproc > new_nproc) {
5123       KA_TRACE(20,
5124                ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5125                 new_nproc));
5126 
5127       team->t.t_size_changed = 1;
5128       if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5129         // Barrier size already reduced earlier in this function
5130         // Activate team threads via th_used_in_team
5131         __kmp_add_threads_to_team(team, new_nproc);
5132       }
5133 #if KMP_NESTED_HOT_TEAMS
5134       if (__kmp_hot_teams_mode == 0) {
5135         // AC: saved number of threads should correspond to team's value in this
5136         // mode, can be bigger in mode 1, when hot team has threads in reserve
5137         KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5138         hot_teams[level].hot_team_nth = new_nproc;
5139 #endif // KMP_NESTED_HOT_TEAMS
5140         /* release the extra threads we don't need any more */
5141         for (f = new_nproc; f < team->t.t_nproc; f++) {
5142           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5143           if (__kmp_tasking_mode != tskm_immediate_exec) {
5144             // When decreasing team size, threads no longer in the team should
5145             // unref task team.
5146             team->t.t_threads[f]->th.th_task_team = NULL;
5147           }
5148           __kmp_free_thread(team->t.t_threads[f]);
5149           team->t.t_threads[f] = NULL;
5150         }
5151 #if KMP_NESTED_HOT_TEAMS
5152       } // (__kmp_hot_teams_mode == 0)
5153       else {
5154         // When keeping extra threads in team, switch threads to wait on own
5155         // b_go flag
5156         for (f = new_nproc; f < team->t.t_nproc; ++f) {
5157           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5158           kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5159           for (int b = 0; b < bs_last_barrier; ++b) {
5160             if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5161               balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5162             }
5163             KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5164           }
5165         }
5166       }
5167 #endif // KMP_NESTED_HOT_TEAMS
5168       team->t.t_nproc = new_nproc;
5169       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5170       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5171       __kmp_reinitialize_team(team, new_icvs,
5172                               root->r.r_uber_thread->th.th_ident);
5173 
5174       // Update remaining threads
5175       for (f = 0; f < new_nproc; ++f) {
5176         team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5177       }
5178 
5179       // restore the current task state of the primary thread: should be the
5180       // implicit task
5181       KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5182                     team->t.t_threads[0], team));
5183 
5184       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5185 
5186 #ifdef KMP_DEBUG
5187       for (f = 0; f < team->t.t_nproc; f++) {
5188         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5189                          team->t.t_threads[f]->th.th_team_nproc ==
5190                              team->t.t_nproc);
5191       }
5192 #endif
5193 
5194       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5195 #if KMP_AFFINITY_SUPPORTED
5196       __kmp_partition_places(team);
5197 #endif
5198     } else { // team->t.t_nproc < new_nproc
5199 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5200       kmp_affin_mask_t *old_mask;
5201       if (KMP_AFFINITY_CAPABLE()) {
5202         KMP_CPU_ALLOC(old_mask);
5203       }
5204 #endif
5205 
5206       KA_TRACE(20,
5207                ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5208                 new_nproc));
5209       int old_nproc = team->t.t_nproc; // save old value and use to update only
5210       team->t.t_size_changed = 1;
5211 
5212 #if KMP_NESTED_HOT_TEAMS
5213       int avail_threads = hot_teams[level].hot_team_nth;
5214       if (new_nproc < avail_threads)
5215         avail_threads = new_nproc;
5216       kmp_info_t **other_threads = team->t.t_threads;
5217       for (f = team->t.t_nproc; f < avail_threads; ++f) {
5218         // Adjust barrier data of reserved threads (if any) of the team
5219         // Other data will be set in __kmp_initialize_info() below.
5220         int b;
5221         kmp_balign_t *balign = other_threads[f]->th.th_bar;
5222         for (b = 0; b < bs_last_barrier; ++b) {
5223           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5224           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5225 #if USE_DEBUGGER
5226           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5227 #endif
5228         }
5229       }
5230       if (hot_teams[level].hot_team_nth >= new_nproc) {
5231         // we have all needed threads in reserve, no need to allocate any
5232         // this only possible in mode 1, cannot have reserved threads in mode 0
5233         KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5234         team->t.t_nproc = new_nproc; // just get reserved threads involved
5235       } else {
5236         // We may have some threads in reserve, but not enough;
5237         // get reserved threads involved if any.
5238         team->t.t_nproc = hot_teams[level].hot_team_nth;
5239         hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5240 #endif // KMP_NESTED_HOT_TEAMS
5241         if (team->t.t_max_nproc < new_nproc) {
5242           /* reallocate larger arrays */
5243           __kmp_reallocate_team_arrays(team, new_nproc);
5244           __kmp_reinitialize_team(team, new_icvs, NULL);
5245         }
5246 
5247 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5248         /* Temporarily set full mask for primary thread before creation of
5249            workers. The reason is that workers inherit the affinity from the
5250            primary thread, so if a lot of workers are created on the single
5251            core quickly, they don't get a chance to set their own affinity for
5252            a long time. */
5253         __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5254 #endif
5255 
5256         /* allocate new threads for the hot team */
5257         for (f = team->t.t_nproc; f < new_nproc; f++) {
5258           kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5259           KMP_DEBUG_ASSERT(new_worker);
5260           team->t.t_threads[f] = new_worker;
5261 
5262           KA_TRACE(20,
5263                    ("__kmp_allocate_team: team %d init T#%d arrived: "
5264                     "join=%llu, plain=%llu\n",
5265                     team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5266                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5267                     team->t.t_bar[bs_plain_barrier].b_arrived));
5268 
5269           { // Initialize barrier data for new threads.
5270             int b;
5271             kmp_balign_t *balign = new_worker->th.th_bar;
5272             for (b = 0; b < bs_last_barrier; ++b) {
5273               balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5274               KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5275                                KMP_BARRIER_PARENT_FLAG);
5276 #if USE_DEBUGGER
5277               balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5278 #endif
5279             }
5280           }
5281         }
5282 
5283 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5284         if (KMP_AFFINITY_CAPABLE()) {
5285           /* Restore initial primary thread's affinity mask */
5286           __kmp_set_system_affinity(old_mask, TRUE);
5287           KMP_CPU_FREE(old_mask);
5288         }
5289 #endif
5290 #if KMP_NESTED_HOT_TEAMS
5291       } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5292 #endif // KMP_NESTED_HOT_TEAMS
5293       if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5294         // Barrier size already increased earlier in this function
5295         // Activate team threads via th_used_in_team
5296         __kmp_add_threads_to_team(team, new_nproc);
5297       }
5298       /* make sure everyone is syncronized */
5299       // new threads below
5300       __kmp_initialize_team(team, new_nproc, new_icvs,
5301                             root->r.r_uber_thread->th.th_ident);
5302 
5303       /* reinitialize the threads */
5304       KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5305       for (f = 0; f < team->t.t_nproc; ++f)
5306         __kmp_initialize_info(team->t.t_threads[f], team, f,
5307                               __kmp_gtid_from_tid(f, team));
5308 
5309       if (level) { // set th_task_state for new threads in nested hot team
5310         // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5311         // only need to set the th_task_state for the new threads. th_task_state
5312         // for primary thread will not be accurate until after this in
5313         // __kmp_fork_call(), so we look to the primary thread's memo_stack to
5314         // get the correct value.
5315         for (f = old_nproc; f < team->t.t_nproc; ++f)
5316           team->t.t_threads[f]->th.th_task_state =
5317               team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5318       } else { // set th_task_state for new threads in non-nested hot team
5319         // copy primary thread's state
5320         kmp_uint8 old_state = team->t.t_threads[0]->th.th_task_state;
5321         for (f = old_nproc; f < team->t.t_nproc; ++f)
5322           team->t.t_threads[f]->th.th_task_state = old_state;
5323       }
5324 
5325 #ifdef KMP_DEBUG
5326       for (f = 0; f < team->t.t_nproc; ++f) {
5327         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5328                          team->t.t_threads[f]->th.th_team_nproc ==
5329                              team->t.t_nproc);
5330       }
5331 #endif
5332 
5333       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5334 #if KMP_AFFINITY_SUPPORTED
5335       __kmp_partition_places(team);
5336 #endif
5337     } // Check changes in number of threads
5338 
5339     kmp_info_t *master = team->t.t_threads[0];
5340     if (master->th.th_teams_microtask) {
5341       for (f = 1; f < new_nproc; ++f) {
5342         // propagate teams construct specific info to workers
5343         kmp_info_t *thr = team->t.t_threads[f];
5344         thr->th.th_teams_microtask = master->th.th_teams_microtask;
5345         thr->th.th_teams_level = master->th.th_teams_level;
5346         thr->th.th_teams_size = master->th.th_teams_size;
5347       }
5348     }
5349 #if KMP_NESTED_HOT_TEAMS
5350     if (level) {
5351       // Sync barrier state for nested hot teams, not needed for outermost hot
5352       // team.
5353       for (f = 1; f < new_nproc; ++f) {
5354         kmp_info_t *thr = team->t.t_threads[f];
5355         int b;
5356         kmp_balign_t *balign = thr->th.th_bar;
5357         for (b = 0; b < bs_last_barrier; ++b) {
5358           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5359           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5360 #if USE_DEBUGGER
5361           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5362 #endif
5363         }
5364       }
5365     }
5366 #endif // KMP_NESTED_HOT_TEAMS
5367 
5368     /* reallocate space for arguments if necessary */
5369     __kmp_alloc_argv_entries(argc, team, TRUE);
5370     KMP_CHECK_UPDATE(team->t.t_argc, argc);
5371     // The hot team re-uses the previous task team,
5372     // if untouched during the previous release->gather phase.
5373 
5374     KF_TRACE(10, (" hot_team = %p\n", team));
5375 
5376 #if KMP_DEBUG
5377     if (__kmp_tasking_mode != tskm_immediate_exec) {
5378       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5379                     "task_team[1] = %p after reinit\n",
5380                     team->t.t_task_team[0], team->t.t_task_team[1]));
5381     }
5382 #endif
5383 
5384 #if OMPT_SUPPORT
5385     __ompt_team_assign_id(team, ompt_parallel_data);
5386 #endif
5387 
5388     KMP_MB();
5389 
5390     return team;
5391   }
5392 
5393   /* next, let's try to take one from the team pool */
5394   KMP_MB();
5395   for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5396     /* TODO: consider resizing undersized teams instead of reaping them, now
5397        that we have a resizing mechanism */
5398     if (team->t.t_max_nproc >= max_nproc) {
5399       /* take this team from the team pool */
5400       __kmp_team_pool = team->t.t_next_pool;
5401 
5402       if (max_nproc > 1 &&
5403           __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5404         if (!team->t.b) { // Allocate barrier structure
5405           team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5406         }
5407       }
5408 
5409       /* setup the team for fresh use */
5410       __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5411 
5412       KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5413                     "task_team[1] %p to NULL\n",
5414                     &team->t.t_task_team[0], &team->t.t_task_team[1]));
5415       team->t.t_task_team[0] = NULL;
5416       team->t.t_task_team[1] = NULL;
5417 
5418       /* reallocate space for arguments if necessary */
5419       __kmp_alloc_argv_entries(argc, team, TRUE);
5420       KMP_CHECK_UPDATE(team->t.t_argc, argc);
5421 
5422       KA_TRACE(
5423           20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5424                team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5425       { // Initialize barrier data.
5426         int b;
5427         for (b = 0; b < bs_last_barrier; ++b) {
5428           team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5429 #if USE_DEBUGGER
5430           team->t.t_bar[b].b_master_arrived = 0;
5431           team->t.t_bar[b].b_team_arrived = 0;
5432 #endif
5433         }
5434       }
5435 
5436       team->t.t_proc_bind = new_proc_bind;
5437 
5438       KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5439                     team->t.t_id));
5440 
5441 #if OMPT_SUPPORT
5442       __ompt_team_assign_id(team, ompt_parallel_data);
5443 #endif
5444 
5445       KMP_MB();
5446 
5447       return team;
5448     }
5449 
5450     /* reap team if it is too small, then loop back and check the next one */
5451     // not sure if this is wise, but, will be redone during the hot-teams
5452     // rewrite.
5453     /* TODO: Use technique to find the right size hot-team, don't reap them */
5454     team = __kmp_reap_team(team);
5455     __kmp_team_pool = team;
5456   }
5457 
5458   /* nothing available in the pool, no matter, make a new team! */
5459   KMP_MB();
5460   team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5461 
5462   /* and set it up */
5463   team->t.t_max_nproc = max_nproc;
5464   if (max_nproc > 1 &&
5465       __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5466     // Allocate barrier structure
5467     team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5468   }
5469 
5470   /* NOTE well, for some reason allocating one big buffer and dividing it up
5471      seems to really hurt performance a lot on the P4, so, let's not use this */
5472   __kmp_allocate_team_arrays(team, max_nproc);
5473 
5474   KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5475   __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5476 
5477   KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5478                 "%p to NULL\n",
5479                 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5480   team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5481   // memory, no need to duplicate
5482   team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5483   // memory, no need to duplicate
5484 
5485   if (__kmp_storage_map) {
5486     __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5487   }
5488 
5489   /* allocate space for arguments */
5490   __kmp_alloc_argv_entries(argc, team, FALSE);
5491   team->t.t_argc = argc;
5492 
5493   KA_TRACE(20,
5494            ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5495             team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5496   { // Initialize barrier data.
5497     int b;
5498     for (b = 0; b < bs_last_barrier; ++b) {
5499       team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5500 #if USE_DEBUGGER
5501       team->t.t_bar[b].b_master_arrived = 0;
5502       team->t.t_bar[b].b_team_arrived = 0;
5503 #endif
5504     }
5505   }
5506 
5507   team->t.t_proc_bind = new_proc_bind;
5508 
5509 #if OMPT_SUPPORT
5510   __ompt_team_assign_id(team, ompt_parallel_data);
5511   team->t.ompt_serialized_team_info = NULL;
5512 #endif
5513 
5514   KMP_MB();
5515 
5516   KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5517                 team->t.t_id));
5518 
5519   return team;
5520 }
5521 
5522 /* TODO implement hot-teams at all levels */
5523 /* TODO implement lazy thread release on demand (disband request) */
5524 
5525 /* free the team.  return it to the team pool.  release all the threads
5526  * associated with it */
5527 void __kmp_free_team(kmp_root_t *root,
5528                      kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5529   int f;
5530   KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5531                 team->t.t_id));
5532 
5533   /* verify state */
5534   KMP_DEBUG_ASSERT(root);
5535   KMP_DEBUG_ASSERT(team);
5536   KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5537   KMP_DEBUG_ASSERT(team->t.t_threads);
5538 
5539   int use_hot_team = team == root->r.r_hot_team;
5540 #if KMP_NESTED_HOT_TEAMS
5541   int level;
5542   kmp_hot_team_ptr_t *hot_teams;
5543   if (master) {
5544     level = team->t.t_active_level - 1;
5545     if (master->th.th_teams_microtask) { // in teams construct?
5546       if (master->th.th_teams_size.nteams > 1) {
5547         ++level; // level was not increased in teams construct for
5548         // team_of_masters
5549       }
5550       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5551           master->th.th_teams_level == team->t.t_level) {
5552         ++level; // level was not increased in teams construct for
5553         // team_of_workers before the parallel
5554       } // team->t.t_level will be increased inside parallel
5555     }
5556     hot_teams = master->th.th_hot_teams;
5557     if (level < __kmp_hot_teams_max_level) {
5558       KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5559       use_hot_team = 1;
5560     }
5561   }
5562 #endif // KMP_NESTED_HOT_TEAMS
5563 
5564   /* team is done working */
5565   TCW_SYNC_PTR(team->t.t_pkfn,
5566                NULL); // Important for Debugging Support Library.
5567 #if KMP_OS_WINDOWS
5568   team->t.t_copyin_counter = 0; // init counter for possible reuse
5569 #endif
5570   // Do not reset pointer to parent team to NULL for hot teams.
5571 
5572   /* if we are non-hot team, release our threads */
5573   if (!use_hot_team) {
5574     if (__kmp_tasking_mode != tskm_immediate_exec) {
5575       // Wait for threads to reach reapable state
5576       for (f = 1; f < team->t.t_nproc; ++f) {
5577         KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5578         kmp_info_t *th = team->t.t_threads[f];
5579         volatile kmp_uint32 *state = &th->th.th_reap_state;
5580         while (*state != KMP_SAFE_TO_REAP) {
5581 #if KMP_OS_WINDOWS
5582           // On Windows a thread can be killed at any time, check this
5583           DWORD ecode;
5584           if (!__kmp_is_thread_alive(th, &ecode)) {
5585             *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5586             break;
5587           }
5588 #endif
5589           // first check if thread is sleeping
5590           kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5591           if (fl.is_sleeping())
5592             fl.resume(__kmp_gtid_from_thread(th));
5593           KMP_CPU_PAUSE();
5594         }
5595       }
5596 
5597       // Delete task teams
5598       int tt_idx;
5599       for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5600         kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5601         if (task_team != NULL) {
5602           for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5603             KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5604             team->t.t_threads[f]->th.th_task_team = NULL;
5605           }
5606           KA_TRACE(
5607               20,
5608               ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5609                __kmp_get_gtid(), task_team, team->t.t_id));
5610 #if KMP_NESTED_HOT_TEAMS
5611           __kmp_free_task_team(master, task_team);
5612 #endif
5613           team->t.t_task_team[tt_idx] = NULL;
5614         }
5615       }
5616     }
5617 
5618     // Reset pointer to parent team only for non-hot teams.
5619     team->t.t_parent = NULL;
5620     team->t.t_level = 0;
5621     team->t.t_active_level = 0;
5622 
5623     /* free the worker threads */
5624     for (f = 1; f < team->t.t_nproc; ++f) {
5625       KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5626       if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5627         KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team),
5628                                     1, 2);
5629       }
5630       __kmp_free_thread(team->t.t_threads[f]);
5631     }
5632 
5633     if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5634       if (team->t.b) {
5635         // wake up thread at old location
5636         team->t.b->go_release();
5637         if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5638           for (f = 1; f < team->t.t_nproc; ++f) {
5639             if (team->t.b->sleep[f].sleep) {
5640               __kmp_atomic_resume_64(
5641                   team->t.t_threads[f]->th.th_info.ds.ds_gtid,
5642                   (kmp_atomic_flag_64<> *)NULL);
5643             }
5644           }
5645         }
5646         // Wait for threads to be removed from team
5647         for (int f = 1; f < team->t.t_nproc; ++f) {
5648           while (team->t.t_threads[f]->th.th_used_in_team.load() != 0)
5649             KMP_CPU_PAUSE();
5650         }
5651       }
5652     }
5653 
5654     for (f = 1; f < team->t.t_nproc; ++f) {
5655       team->t.t_threads[f] = NULL;
5656     }
5657 
5658     if (team->t.t_max_nproc > 1 &&
5659         __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5660       distributedBarrier::deallocate(team->t.b);
5661       team->t.b = NULL;
5662     }
5663     /* put the team back in the team pool */
5664     /* TODO limit size of team pool, call reap_team if pool too large */
5665     team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5666     __kmp_team_pool = (volatile kmp_team_t *)team;
5667   } else { // Check if team was created for primary threads in teams construct
5668     // See if first worker is a CG root
5669     KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5670                      team->t.t_threads[1]->th.th_cg_roots);
5671     if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5672       // Clean up the CG root nodes on workers so that this team can be re-used
5673       for (f = 1; f < team->t.t_nproc; ++f) {
5674         kmp_info_t *thr = team->t.t_threads[f];
5675         KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5676                          thr->th.th_cg_roots->cg_root == thr);
5677         // Pop current CG root off list
5678         kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5679         thr->th.th_cg_roots = tmp->up;
5680         KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5681                        " up to node %p. cg_nthreads was %d\n",
5682                        thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5683         int i = tmp->cg_nthreads--;
5684         if (i == 1) {
5685           __kmp_free(tmp); // free CG if we are the last thread in it
5686         }
5687         // Restore current task's thread_limit from CG root
5688         if (thr->th.th_cg_roots)
5689           thr->th.th_current_task->td_icvs.thread_limit =
5690               thr->th.th_cg_roots->cg_thread_limit;
5691       }
5692     }
5693   }
5694 
5695   KMP_MB();
5696 }
5697 
5698 /* reap the team.  destroy it, reclaim all its resources and free its memory */
5699 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5700   kmp_team_t *next_pool = team->t.t_next_pool;
5701 
5702   KMP_DEBUG_ASSERT(team);
5703   KMP_DEBUG_ASSERT(team->t.t_dispatch);
5704   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5705   KMP_DEBUG_ASSERT(team->t.t_threads);
5706   KMP_DEBUG_ASSERT(team->t.t_argv);
5707 
5708   /* TODO clean the threads that are a part of this? */
5709 
5710   /* free stuff */
5711   __kmp_free_team_arrays(team);
5712   if (team->t.t_argv != &team->t.t_inline_argv[0])
5713     __kmp_free((void *)team->t.t_argv);
5714   __kmp_free(team);
5715 
5716   KMP_MB();
5717   return next_pool;
5718 }
5719 
5720 // Free the thread.  Don't reap it, just place it on the pool of available
5721 // threads.
5722 //
5723 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5724 // binding for the affinity mechanism to be useful.
5725 //
5726 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5727 // However, we want to avoid a potential performance problem by always
5728 // scanning through the list to find the correct point at which to insert
5729 // the thread (potential N**2 behavior).  To do this we keep track of the
5730 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5731 // With single-level parallelism, threads will always be added to the tail
5732 // of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
5733 // parallelism, all bets are off and we may need to scan through the entire
5734 // free list.
5735 //
5736 // This change also has a potentially large performance benefit, for some
5737 // applications.  Previously, as threads were freed from the hot team, they
5738 // would be placed back on the free list in inverse order.  If the hot team
5739 // grew back to it's original size, then the freed thread would be placed
5740 // back on the hot team in reverse order.  This could cause bad cache
5741 // locality problems on programs where the size of the hot team regularly
5742 // grew and shrunk.
5743 //
5744 // Now, for single-level parallelism, the OMP tid is always == gtid.
5745 void __kmp_free_thread(kmp_info_t *this_th) {
5746   int gtid;
5747   kmp_info_t **scan;
5748 
5749   KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5750                 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5751 
5752   KMP_DEBUG_ASSERT(this_th);
5753 
5754   // When moving thread to pool, switch thread to wait on own b_go flag, and
5755   // uninitialized (NULL team).
5756   int b;
5757   kmp_balign_t *balign = this_th->th.th_bar;
5758   for (b = 0; b < bs_last_barrier; ++b) {
5759     if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5760       balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5761     balign[b].bb.team = NULL;
5762     balign[b].bb.leaf_kids = 0;
5763   }
5764   this_th->th.th_task_state = 0;
5765   this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5766 
5767   /* put thread back on the free pool */
5768   TCW_PTR(this_th->th.th_team, NULL);
5769   TCW_PTR(this_th->th.th_root, NULL);
5770   TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5771 
5772   while (this_th->th.th_cg_roots) {
5773     this_th->th.th_cg_roots->cg_nthreads--;
5774     KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5775                    " %p of thread  %p to %d\n",
5776                    this_th, this_th->th.th_cg_roots,
5777                    this_th->th.th_cg_roots->cg_root,
5778                    this_th->th.th_cg_roots->cg_nthreads));
5779     kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5780     if (tmp->cg_root == this_th) { // Thread is a cg_root
5781       KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5782       KA_TRACE(
5783           5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5784       this_th->th.th_cg_roots = tmp->up;
5785       __kmp_free(tmp);
5786     } else { // Worker thread
5787       if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5788         __kmp_free(tmp);
5789       }
5790       this_th->th.th_cg_roots = NULL;
5791       break;
5792     }
5793   }
5794 
5795   /* If the implicit task assigned to this thread can be used by other threads
5796    * -> multiple threads can share the data and try to free the task at
5797    * __kmp_reap_thread at exit. This duplicate use of the task data can happen
5798    * with higher probability when hot team is disabled but can occurs even when
5799    * the hot team is enabled */
5800   __kmp_free_implicit_task(this_th);
5801   this_th->th.th_current_task = NULL;
5802 
5803   // If the __kmp_thread_pool_insert_pt is already past the new insert
5804   // point, then we need to re-scan the entire list.
5805   gtid = this_th->th.th_info.ds.ds_gtid;
5806   if (__kmp_thread_pool_insert_pt != NULL) {
5807     KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5808     if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5809       __kmp_thread_pool_insert_pt = NULL;
5810     }
5811   }
5812 
5813   // Scan down the list to find the place to insert the thread.
5814   // scan is the address of a link in the list, possibly the address of
5815   // __kmp_thread_pool itself.
5816   //
5817   // In the absence of nested parallelism, the for loop will have 0 iterations.
5818   if (__kmp_thread_pool_insert_pt != NULL) {
5819     scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5820   } else {
5821     scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5822   }
5823   for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5824        scan = &((*scan)->th.th_next_pool))
5825     ;
5826 
5827   // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5828   // to its address.
5829   TCW_PTR(this_th->th.th_next_pool, *scan);
5830   __kmp_thread_pool_insert_pt = *scan = this_th;
5831   KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5832                    (this_th->th.th_info.ds.ds_gtid <
5833                     this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5834   TCW_4(this_th->th.th_in_pool, TRUE);
5835   __kmp_suspend_initialize_thread(this_th);
5836   __kmp_lock_suspend_mx(this_th);
5837   if (this_th->th.th_active == TRUE) {
5838     KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5839     this_th->th.th_active_in_pool = TRUE;
5840   }
5841 #if KMP_DEBUG
5842   else {
5843     KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5844   }
5845 #endif
5846   __kmp_unlock_suspend_mx(this_th);
5847 
5848   TCW_4(__kmp_nth, __kmp_nth - 1);
5849 
5850 #ifdef KMP_ADJUST_BLOCKTIME
5851   /* Adjust blocktime back to user setting or default if necessary */
5852   /* Middle initialization might never have occurred                */
5853   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5854     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5855     if (__kmp_nth <= __kmp_avail_proc) {
5856       __kmp_zero_bt = FALSE;
5857     }
5858   }
5859 #endif /* KMP_ADJUST_BLOCKTIME */
5860 
5861   KMP_MB();
5862 }
5863 
5864 /* ------------------------------------------------------------------------ */
5865 
5866 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5867 #if OMP_PROFILING_SUPPORT
5868   ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
5869   // TODO: add a configuration option for time granularity
5870   if (ProfileTraceFile)
5871     llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
5872 #endif
5873 
5874   int gtid = this_thr->th.th_info.ds.ds_gtid;
5875   /*    void                 *stack_data;*/
5876   kmp_team_t **volatile pteam;
5877 
5878   KMP_MB();
5879   KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5880 
5881   if (__kmp_env_consistency_check) {
5882     this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5883   }
5884 
5885 #if OMPD_SUPPORT
5886   if (ompd_state & OMPD_ENABLE_BP)
5887     ompd_bp_thread_begin();
5888 #endif
5889 
5890 #if OMPT_SUPPORT
5891   ompt_data_t *thread_data = nullptr;
5892   if (ompt_enabled.enabled) {
5893     thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5894     *thread_data = ompt_data_none;
5895 
5896     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5897     this_thr->th.ompt_thread_info.wait_id = 0;
5898     this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5899     this_thr->th.ompt_thread_info.parallel_flags = 0;
5900     if (ompt_enabled.ompt_callback_thread_begin) {
5901       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5902           ompt_thread_worker, thread_data);
5903     }
5904     this_thr->th.ompt_thread_info.state = ompt_state_idle;
5905   }
5906 #endif
5907 
5908   /* This is the place where threads wait for work */
5909   while (!TCR_4(__kmp_global.g.g_done)) {
5910     KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5911     KMP_MB();
5912 
5913     /* wait for work to do */
5914     KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5915 
5916     /* No tid yet since not part of a team */
5917     __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5918 
5919 #if OMPT_SUPPORT
5920     if (ompt_enabled.enabled) {
5921       this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5922     }
5923 #endif
5924 
5925     pteam = &this_thr->th.th_team;
5926 
5927     /* have we been allocated? */
5928     if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5929       /* we were just woken up, so run our new task */
5930       if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5931         int rc;
5932         KA_TRACE(20,
5933                  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5934                   gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5935                   (*pteam)->t.t_pkfn));
5936 
5937         updateHWFPControl(*pteam);
5938 
5939 #if OMPT_SUPPORT
5940         if (ompt_enabled.enabled) {
5941           this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5942         }
5943 #endif
5944 
5945         rc = (*pteam)->t.t_invoke(gtid);
5946         KMP_ASSERT(rc);
5947 
5948         KMP_MB();
5949         KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5950                       gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5951                       (*pteam)->t.t_pkfn));
5952       }
5953 #if OMPT_SUPPORT
5954       if (ompt_enabled.enabled) {
5955         /* no frame set while outside task */
5956         __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
5957 
5958         this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5959       }
5960 #endif
5961       /* join barrier after parallel region */
5962       __kmp_join_barrier(gtid);
5963     }
5964   }
5965   TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5966 
5967 #if OMPD_SUPPORT
5968   if (ompd_state & OMPD_ENABLE_BP)
5969     ompd_bp_thread_end();
5970 #endif
5971 
5972 #if OMPT_SUPPORT
5973   if (ompt_enabled.ompt_callback_thread_end) {
5974     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5975   }
5976 #endif
5977 
5978   this_thr->th.th_task_team = NULL;
5979   /* run the destructors for the threadprivate data for this thread */
5980   __kmp_common_destroy_gtid(gtid);
5981 
5982   KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5983   KMP_MB();
5984 
5985 #if OMP_PROFILING_SUPPORT
5986   llvm::timeTraceProfilerFinishThread();
5987 #endif
5988   return this_thr;
5989 }
5990 
5991 /* ------------------------------------------------------------------------ */
5992 
5993 void __kmp_internal_end_dest(void *specific_gtid) {
5994   // Make sure no significant bits are lost
5995   int gtid;
5996   __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
5997 
5998   KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5999   /* NOTE: the gtid is stored as gitd+1 in the thread-local-storage
6000    * this is because 0 is reserved for the nothing-stored case */
6001 
6002   __kmp_internal_end_thread(gtid);
6003 }
6004 
6005 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
6006 
6007 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
6008   __kmp_internal_end_atexit();
6009 }
6010 
6011 #endif
6012 
6013 /* [Windows] josh: when the atexit handler is called, there may still be more
6014    than one thread alive */
6015 void __kmp_internal_end_atexit(void) {
6016   KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
6017   /* [Windows]
6018      josh: ideally, we want to completely shutdown the library in this atexit
6019      handler, but stat code that depends on thread specific data for gtid fails
6020      because that data becomes unavailable at some point during the shutdown, so
6021      we call __kmp_internal_end_thread instead. We should eventually remove the
6022      dependency on __kmp_get_specific_gtid in the stat code and use
6023      __kmp_internal_end_library to cleanly shutdown the library.
6024 
6025      // TODO: Can some of this comment about GVS be removed?
6026      I suspect that the offending stat code is executed when the calling thread
6027      tries to clean up a dead root thread's data structures, resulting in GVS
6028      code trying to close the GVS structures for that thread, but since the stat
6029      code uses __kmp_get_specific_gtid to get the gtid with the assumption that
6030      the calling thread is cleaning up itself instead of another thread, it get
6031      confused. This happens because allowing a thread to unregister and cleanup
6032      another thread is a recent modification for addressing an issue.
6033      Based on the current design (20050722), a thread may end up
6034      trying to unregister another thread only if thread death does not trigger
6035      the calling of __kmp_internal_end_thread.  For Linux* OS, there is the
6036      thread specific data destructor function to detect thread death. For
6037      Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
6038      is nothing.  Thus, the workaround is applicable only for Windows static
6039      stat library. */
6040   __kmp_internal_end_library(-1);
6041 #if KMP_OS_WINDOWS
6042   __kmp_close_console();
6043 #endif
6044 }
6045 
6046 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
6047   // It is assumed __kmp_forkjoin_lock is acquired.
6048 
6049   int gtid;
6050 
6051   KMP_DEBUG_ASSERT(thread != NULL);
6052 
6053   gtid = thread->th.th_info.ds.ds_gtid;
6054 
6055   if (!is_root) {
6056     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
6057       /* Assume the threads are at the fork barrier here */
6058       KA_TRACE(
6059           20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
6060                gtid));
6061       if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
6062         while (
6063             !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3))
6064           KMP_CPU_PAUSE();
6065         __kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL);
6066       } else {
6067         /* Need release fence here to prevent seg faults for tree forkjoin
6068            barrier (GEH) */
6069         ANNOTATE_HAPPENS_BEFORE(thread);
6070         kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
6071                            thread);
6072         __kmp_release_64(&flag);
6073       }
6074     }
6075 
6076     // Terminate OS thread.
6077     __kmp_reap_worker(thread);
6078 
6079     // The thread was killed asynchronously.  If it was actively
6080     // spinning in the thread pool, decrement the global count.
6081     //
6082     // There is a small timing hole here - if the worker thread was just waking
6083     // up after sleeping in the pool, had reset it's th_active_in_pool flag but
6084     // not decremented the global counter __kmp_thread_pool_active_nth yet, then
6085     // the global counter might not get updated.
6086     //
6087     // Currently, this can only happen as the library is unloaded,
6088     // so there are no harmful side effects.
6089     if (thread->th.th_active_in_pool) {
6090       thread->th.th_active_in_pool = FALSE;
6091       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
6092       KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
6093     }
6094   }
6095 
6096   __kmp_free_implicit_task(thread);
6097 
6098 // Free the fast memory for tasking
6099 #if USE_FAST_MEMORY
6100   __kmp_free_fast_memory(thread);
6101 #endif /* USE_FAST_MEMORY */
6102 
6103   __kmp_suspend_uninitialize_thread(thread);
6104 
6105   KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
6106   TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
6107 
6108   --__kmp_all_nth;
6109   // __kmp_nth was decremented when thread is added to the pool.
6110 
6111 #ifdef KMP_ADJUST_BLOCKTIME
6112   /* Adjust blocktime back to user setting or default if necessary */
6113   /* Middle initialization might never have occurred                */
6114   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6115     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6116     if (__kmp_nth <= __kmp_avail_proc) {
6117       __kmp_zero_bt = FALSE;
6118     }
6119   }
6120 #endif /* KMP_ADJUST_BLOCKTIME */
6121 
6122   /* free the memory being used */
6123   if (__kmp_env_consistency_check) {
6124     if (thread->th.th_cons) {
6125       __kmp_free_cons_stack(thread->th.th_cons);
6126       thread->th.th_cons = NULL;
6127     }
6128   }
6129 
6130   if (thread->th.th_pri_common != NULL) {
6131     __kmp_free(thread->th.th_pri_common);
6132     thread->th.th_pri_common = NULL;
6133   }
6134 
6135   if (thread->th.th_task_state_memo_stack != NULL) {
6136     __kmp_free(thread->th.th_task_state_memo_stack);
6137     thread->th.th_task_state_memo_stack = NULL;
6138   }
6139 
6140 #if KMP_USE_BGET
6141   if (thread->th.th_local.bget_data != NULL) {
6142     __kmp_finalize_bget(thread);
6143   }
6144 #endif
6145 
6146 #if KMP_AFFINITY_SUPPORTED
6147   if (thread->th.th_affin_mask != NULL) {
6148     KMP_CPU_FREE(thread->th.th_affin_mask);
6149     thread->th.th_affin_mask = NULL;
6150   }
6151 #endif /* KMP_AFFINITY_SUPPORTED */
6152 
6153 #if KMP_USE_HIER_SCHED
6154   if (thread->th.th_hier_bar_data != NULL) {
6155     __kmp_free(thread->th.th_hier_bar_data);
6156     thread->th.th_hier_bar_data = NULL;
6157   }
6158 #endif
6159 
6160   __kmp_reap_team(thread->th.th_serial_team);
6161   thread->th.th_serial_team = NULL;
6162   __kmp_free(thread);
6163 
6164   KMP_MB();
6165 
6166 } // __kmp_reap_thread
6167 
6168 static void __kmp_internal_end(void) {
6169   int i;
6170 
6171   /* First, unregister the library */
6172   __kmp_unregister_library();
6173 
6174 #if KMP_OS_WINDOWS
6175   /* In Win static library, we can't tell when a root actually dies, so we
6176      reclaim the data structures for any root threads that have died but not
6177      unregistered themselves, in order to shut down cleanly.
6178      In Win dynamic library we also can't tell when a thread dies.  */
6179   __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6180 // dead roots
6181 #endif
6182 
6183   for (i = 0; i < __kmp_threads_capacity; i++)
6184     if (__kmp_root[i])
6185       if (__kmp_root[i]->r.r_active)
6186         break;
6187   KMP_MB(); /* Flush all pending memory write invalidates.  */
6188   TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6189 
6190   if (i < __kmp_threads_capacity) {
6191 #if KMP_USE_MONITOR
6192     // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6193     KMP_MB(); /* Flush all pending memory write invalidates.  */
6194 
6195     // Need to check that monitor was initialized before reaping it. If we are
6196     // called form __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6197     // __kmp_monitor will appear to contain valid data, but it is only valid in
6198     // the parent process, not the child.
6199     // New behavior (201008): instead of keying off of the flag
6200     // __kmp_init_parallel, the monitor thread creation is keyed off
6201     // of the new flag __kmp_init_monitor.
6202     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6203     if (TCR_4(__kmp_init_monitor)) {
6204       __kmp_reap_monitor(&__kmp_monitor);
6205       TCW_4(__kmp_init_monitor, 0);
6206     }
6207     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6208     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6209 #endif // KMP_USE_MONITOR
6210   } else {
6211 /* TODO move this to cleanup code */
6212 #ifdef KMP_DEBUG
6213     /* make sure that everything has properly ended */
6214     for (i = 0; i < __kmp_threads_capacity; i++) {
6215       if (__kmp_root[i]) {
6216         //                    KMP_ASSERT( ! KMP_UBER_GTID( i ) );         // AC:
6217         //                    there can be uber threads alive here
6218         KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6219       }
6220     }
6221 #endif
6222 
6223     KMP_MB();
6224 
6225     // Reap the worker threads.
6226     // This is valid for now, but be careful if threads are reaped sooner.
6227     while (__kmp_thread_pool != NULL) { // Loop thru all the thread in the pool.
6228       // Get the next thread from the pool.
6229       kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6230       __kmp_thread_pool = thread->th.th_next_pool;
6231       // Reap it.
6232       KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6233       thread->th.th_next_pool = NULL;
6234       thread->th.th_in_pool = FALSE;
6235       __kmp_reap_thread(thread, 0);
6236     }
6237     __kmp_thread_pool_insert_pt = NULL;
6238 
6239     // Reap teams.
6240     while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
6241       // Get the next team from the pool.
6242       kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6243       __kmp_team_pool = team->t.t_next_pool;
6244       // Reap it.
6245       team->t.t_next_pool = NULL;
6246       __kmp_reap_team(team);
6247     }
6248 
6249     __kmp_reap_task_teams();
6250 
6251 #if KMP_OS_UNIX
6252     // Threads that are not reaped should not access any resources since they
6253     // are going to be deallocated soon, so the shutdown sequence should wait
6254     // until all threads either exit the final spin-waiting loop or begin
6255     // sleeping after the given blocktime.
6256     for (i = 0; i < __kmp_threads_capacity; i++) {
6257       kmp_info_t *thr = __kmp_threads[i];
6258       while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6259         KMP_CPU_PAUSE();
6260     }
6261 #endif
6262 
6263     for (i = 0; i < __kmp_threads_capacity; ++i) {
6264       // TBD: Add some checking...
6265       // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6266     }
6267 
6268     /* Make sure all threadprivate destructors get run by joining with all
6269        worker threads before resetting this flag */
6270     TCW_SYNC_4(__kmp_init_common, FALSE);
6271 
6272     KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6273     KMP_MB();
6274 
6275 #if KMP_USE_MONITOR
6276     // See note above: One of the possible fixes for CQ138434 / CQ140126
6277     //
6278     // FIXME: push both code fragments down and CSE them?
6279     // push them into __kmp_cleanup() ?
6280     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6281     if (TCR_4(__kmp_init_monitor)) {
6282       __kmp_reap_monitor(&__kmp_monitor);
6283       TCW_4(__kmp_init_monitor, 0);
6284     }
6285     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6286     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6287 #endif
6288   } /* else !__kmp_global.t_active */
6289   TCW_4(__kmp_init_gtid, FALSE);
6290   KMP_MB(); /* Flush all pending memory write invalidates.  */
6291 
6292   __kmp_cleanup();
6293 #if OMPT_SUPPORT
6294   ompt_fini();
6295 #endif
6296 }
6297 
6298 void __kmp_internal_end_library(int gtid_req) {
6299   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6300   /* this shouldn't be a race condition because __kmp_internal_end() is the
6301      only place to clear __kmp_serial_init */
6302   /* we'll check this later too, after we get the lock */
6303   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6304   // redundant, because the next check will work in any case.
6305   if (__kmp_global.g.g_abort) {
6306     KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6307     /* TODO abort? */
6308     return;
6309   }
6310   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6311     KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6312     return;
6313   }
6314 
6315   KMP_MB(); /* Flush all pending memory write invalidates.  */
6316   /* find out who we are and what we should do */
6317   {
6318     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6319     KA_TRACE(
6320         10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req));
6321     if (gtid == KMP_GTID_SHUTDOWN) {
6322       KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6323                     "already shutdown\n"));
6324       return;
6325     } else if (gtid == KMP_GTID_MONITOR) {
6326       KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6327                     "registered, or system shutdown\n"));
6328       return;
6329     } else if (gtid == KMP_GTID_DNE) {
6330       KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6331                     "shutdown\n"));
6332       /* we don't know who we are, but we may still shutdown the library */
6333     } else if (KMP_UBER_GTID(gtid)) {
6334       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6335       if (__kmp_root[gtid]->r.r_active) {
6336         __kmp_global.g.g_abort = -1;
6337         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6338         __kmp_unregister_library();
6339         KA_TRACE(10,
6340                  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6341                   gtid));
6342         return;
6343       } else {
6344         KA_TRACE(
6345             10,
6346             ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6347         __kmp_unregister_root_current_thread(gtid);
6348       }
6349     } else {
6350 /* worker threads may call this function through the atexit handler, if they
6351  * call exit() */
6352 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6353    TODO: do a thorough shutdown instead */
6354 #ifdef DUMP_DEBUG_ON_EXIT
6355       if (__kmp_debug_buf)
6356         __kmp_dump_debug_buffer();
6357 #endif
6358       // added unregister library call here when we switch to shm linux
6359       // if we don't, it will leave lots of files in /dev/shm
6360       // cleanup shared memory file before exiting.
6361       __kmp_unregister_library();
6362       return;
6363     }
6364   }
6365   /* synchronize the termination process */
6366   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6367 
6368   /* have we already finished */
6369   if (__kmp_global.g.g_abort) {
6370     KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6371     /* TODO abort? */
6372     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6373     return;
6374   }
6375   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6376     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6377     return;
6378   }
6379 
6380   /* We need this lock to enforce mutex between this reading of
6381      __kmp_threads_capacity and the writing by __kmp_register_root.
6382      Alternatively, we can use a counter of roots that is atomically updated by
6383      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6384      __kmp_internal_end_*.  */
6385   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6386 
6387   /* now we can safely conduct the actual termination */
6388   __kmp_internal_end();
6389 
6390   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6391   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6392 
6393   KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6394 
6395 #ifdef DUMP_DEBUG_ON_EXIT
6396   if (__kmp_debug_buf)
6397     __kmp_dump_debug_buffer();
6398 #endif
6399 
6400 #if KMP_OS_WINDOWS
6401   __kmp_close_console();
6402 #endif
6403 
6404   __kmp_fini_allocator();
6405 
6406 } // __kmp_internal_end_library
6407 
6408 void __kmp_internal_end_thread(int gtid_req) {
6409   int i;
6410 
6411   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6412   /* this shouldn't be a race condition because __kmp_internal_end() is the
6413    * only place to clear __kmp_serial_init */
6414   /* we'll check this later too, after we get the lock */
6415   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6416   // redundant, because the next check will work in any case.
6417   if (__kmp_global.g.g_abort) {
6418     KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6419     /* TODO abort? */
6420     return;
6421   }
6422   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6423     KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6424     return;
6425   }
6426 
6427   // If hidden helper team has been initialized, we need to deinit it
6428   if (TCR_4(__kmp_init_hidden_helper)) {
6429     TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6430     // First release the main thread to let it continue its work
6431     __kmp_hidden_helper_main_thread_release();
6432     // Wait until the hidden helper team has been destroyed
6433     __kmp_hidden_helper_threads_deinitz_wait();
6434   }
6435 
6436   KMP_MB(); /* Flush all pending memory write invalidates.  */
6437 
6438   /* find out who we are and what we should do */
6439   {
6440     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6441     KA_TRACE(10,
6442              ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req));
6443     if (gtid == KMP_GTID_SHUTDOWN) {
6444       KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6445                     "already shutdown\n"));
6446       return;
6447     } else if (gtid == KMP_GTID_MONITOR) {
6448       KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6449                     "registered, or system shutdown\n"));
6450       return;
6451     } else if (gtid == KMP_GTID_DNE) {
6452       KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6453                     "shutdown\n"));
6454       return;
6455       /* we don't know who we are */
6456     } else if (KMP_UBER_GTID(gtid)) {
6457       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6458       if (__kmp_root[gtid]->r.r_active) {
6459         __kmp_global.g.g_abort = -1;
6460         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6461         KA_TRACE(10,
6462                  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6463                   gtid));
6464         return;
6465       } else {
6466         KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6467                       gtid));
6468         __kmp_unregister_root_current_thread(gtid);
6469       }
6470     } else {
6471       /* just a worker thread, let's leave */
6472       KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6473 
6474       if (gtid >= 0) {
6475         __kmp_threads[gtid]->th.th_task_team = NULL;
6476       }
6477 
6478       KA_TRACE(10,
6479                ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6480                 gtid));
6481       return;
6482     }
6483   }
6484 #if KMP_DYNAMIC_LIB
6485   if (__kmp_pause_status != kmp_hard_paused)
6486   // AC: lets not shutdown the dynamic library at the exit of uber thread,
6487   // because we will better shutdown later in the library destructor.
6488   {
6489     KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6490     return;
6491   }
6492 #endif
6493   /* synchronize the termination process */
6494   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6495 
6496   /* have we already finished */
6497   if (__kmp_global.g.g_abort) {
6498     KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6499     /* TODO abort? */
6500     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6501     return;
6502   }
6503   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6504     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6505     return;
6506   }
6507 
6508   /* We need this lock to enforce mutex between this reading of
6509      __kmp_threads_capacity and the writing by __kmp_register_root.
6510      Alternatively, we can use a counter of roots that is atomically updated by
6511      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6512      __kmp_internal_end_*.  */
6513 
6514   /* should we finish the run-time?  are all siblings done? */
6515   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6516 
6517   for (i = 0; i < __kmp_threads_capacity; ++i) {
6518     if (KMP_UBER_GTID(i)) {
6519       KA_TRACE(
6520           10,
6521           ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6522       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6523       __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6524       return;
6525     }
6526   }
6527 
6528   /* now we can safely conduct the actual termination */
6529 
6530   __kmp_internal_end();
6531 
6532   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6533   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6534 
6535   KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6536 
6537 #ifdef DUMP_DEBUG_ON_EXIT
6538   if (__kmp_debug_buf)
6539     __kmp_dump_debug_buffer();
6540 #endif
6541 } // __kmp_internal_end_thread
6542 
6543 // -----------------------------------------------------------------------------
6544 // Library registration stuff.
6545 
6546 static long __kmp_registration_flag = 0;
6547 // Random value used to indicate library initialization.
6548 static char *__kmp_registration_str = NULL;
6549 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6550 
6551 static inline char *__kmp_reg_status_name() {
6552 /* On RHEL 3u5 if linked statically, getpid() returns different values in
6553    each thread. If registration and unregistration go in different threads
6554    (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env
6555    env var can not be found, because the name will contain different pid. */
6556 // macOS* complains about name being too long with additional getuid()
6557 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6558   return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6559                           (int)getuid());
6560 #else
6561   return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6562 #endif
6563 } // __kmp_reg_status_get
6564 
6565 void __kmp_register_library_startup(void) {
6566 
6567   char *name = __kmp_reg_status_name(); // Name of the environment variable.
6568   int done = 0;
6569   union {
6570     double dtime;
6571     long ltime;
6572   } time;
6573 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6574   __kmp_initialize_system_tick();
6575 #endif
6576   __kmp_read_system_time(&time.dtime);
6577   __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6578   __kmp_registration_str =
6579       __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6580                        __kmp_registration_flag, KMP_LIBRARY_FILE);
6581 
6582   KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6583                 __kmp_registration_str));
6584 
6585   while (!done) {
6586 
6587     char *value = NULL; // Actual value of the environment variable.
6588 
6589 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6590     char *shm_name = __kmp_str_format("/%s", name);
6591     int shm_preexist = 0;
6592     char *data1;
6593     int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6594     if ((fd1 == -1) && (errno == EEXIST)) {
6595       // file didn't open because it already exists.
6596       // try opening existing file
6597       fd1 = shm_open(shm_name, O_RDWR, 0666);
6598       if (fd1 == -1) { // file didn't open
6599         // error out here
6600         __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0),
6601                     __kmp_msg_null);
6602       } else {
6603         // able to open existing file
6604         shm_preexist = 1;
6605       }
6606     } else if (fd1 == -1) { // SHM didn't open; it was due to error other than
6607       // already exists.
6608       // error out here.
6609       __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno),
6610                   __kmp_msg_null);
6611     }
6612     if (shm_preexist == 0) {
6613       // we created SHM now set size
6614       if (ftruncate(fd1, SHM_SIZE) == -1) {
6615         // error occured setting size;
6616         __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"),
6617                     KMP_ERR(errno), __kmp_msg_null);
6618       }
6619     }
6620     data1 =
6621         (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
6622     if (data1 == MAP_FAILED) {
6623       // failed to map shared memory
6624       __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno),
6625                   __kmp_msg_null);
6626     }
6627     if (shm_preexist == 0) { // set data to SHM, set value
6628       KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6629     }
6630     // Read value from either what we just wrote or existing file.
6631     value = __kmp_str_format("%s", data1); // read value from SHM
6632     munmap(data1, SHM_SIZE);
6633     close(fd1);
6634 #else // Windows and unix with static library
6635     // Set environment variable, but do not overwrite if it is exist.
6636     __kmp_env_set(name, __kmp_registration_str, 0);
6637     // read value to see if it got set
6638     value = __kmp_env_get(name);
6639 #endif
6640 
6641     if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6642       done = 1; // Ok, environment variable set successfully, exit the loop.
6643     } else {
6644       // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6645       // Check whether it alive or dead.
6646       int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6647       char *tail = value;
6648       char *flag_addr_str = NULL;
6649       char *flag_val_str = NULL;
6650       char const *file_name = NULL;
6651       __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6652       __kmp_str_split(tail, '-', &flag_val_str, &tail);
6653       file_name = tail;
6654       if (tail != NULL) {
6655         long *flag_addr = 0;
6656         unsigned long flag_val = 0;
6657         KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6658         KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6659         if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6660           // First, check whether environment-encoded address is mapped into
6661           // addr space.
6662           // If so, dereference it to see if it still has the right value.
6663           if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6664             neighbor = 1;
6665           } else {
6666             // If not, then we know the other copy of the library is no longer
6667             // running.
6668             neighbor = 2;
6669           }
6670         }
6671       }
6672       switch (neighbor) {
6673       case 0: // Cannot parse environment variable -- neighbor status unknown.
6674         // Assume it is the incompatible format of future version of the
6675         // library. Assume the other library is alive.
6676         // WARN( ... ); // TODO: Issue a warning.
6677         file_name = "unknown library";
6678         KMP_FALLTHROUGH();
6679       // Attention! Falling to the next case. That's intentional.
6680       case 1: { // Neighbor is alive.
6681         // Check it is allowed.
6682         char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6683         if (!__kmp_str_match_true(duplicate_ok)) {
6684           // That's not allowed. Issue fatal error.
6685           __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6686                       KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6687         }
6688         KMP_INTERNAL_FREE(duplicate_ok);
6689         __kmp_duplicate_library_ok = 1;
6690         done = 1; // Exit the loop.
6691       } break;
6692       case 2: { // Neighbor is dead.
6693 
6694 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6695         // close shared memory.
6696         shm_unlink(shm_name); // this removes file in /dev/shm
6697 #else
6698         // Clear the variable and try to register library again.
6699         __kmp_env_unset(name);
6700 #endif
6701       } break;
6702       default: {
6703         KMP_DEBUG_ASSERT(0);
6704       } break;
6705       }
6706     }
6707     KMP_INTERNAL_FREE((void *)value);
6708 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6709     KMP_INTERNAL_FREE((void *)shm_name);
6710 #endif
6711   } // while
6712   KMP_INTERNAL_FREE((void *)name);
6713 
6714 } // func __kmp_register_library_startup
6715 
6716 void __kmp_unregister_library(void) {
6717 
6718   char *name = __kmp_reg_status_name();
6719   char *value = NULL;
6720 
6721 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6722   char *shm_name = __kmp_str_format("/%s", name);
6723   int fd1 = shm_open(shm_name, O_RDONLY, 0666);
6724   if (fd1 == -1) {
6725     // file did not open. return.
6726     return;
6727   }
6728   char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6729   if (data1 != MAP_FAILED) {
6730     value = __kmp_str_format("%s", data1); // read value from SHM
6731     munmap(data1, SHM_SIZE);
6732   }
6733   close(fd1);
6734 #else
6735   value = __kmp_env_get(name);
6736 #endif
6737 
6738   KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6739   KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6740   if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6741 //  Ok, this is our variable. Delete it.
6742 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6743     shm_unlink(shm_name); // this removes file in /dev/shm
6744 #else
6745     __kmp_env_unset(name);
6746 #endif
6747   }
6748 
6749 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6750   KMP_INTERNAL_FREE(shm_name);
6751 #endif
6752 
6753   KMP_INTERNAL_FREE(__kmp_registration_str);
6754   KMP_INTERNAL_FREE(value);
6755   KMP_INTERNAL_FREE(name);
6756 
6757   __kmp_registration_flag = 0;
6758   __kmp_registration_str = NULL;
6759 
6760 } // __kmp_unregister_library
6761 
6762 // End of Library registration stuff.
6763 // -----------------------------------------------------------------------------
6764 
6765 #if KMP_MIC_SUPPORTED
6766 
6767 static void __kmp_check_mic_type() {
6768   kmp_cpuid_t cpuid_state = {0};
6769   kmp_cpuid_t *cs_p = &cpuid_state;
6770   __kmp_x86_cpuid(1, 0, cs_p);
6771   // We don't support mic1 at the moment
6772   if ((cs_p->eax & 0xff0) == 0xB10) {
6773     __kmp_mic_type = mic2;
6774   } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6775     __kmp_mic_type = mic3;
6776   } else {
6777     __kmp_mic_type = non_mic;
6778   }
6779 }
6780 
6781 #endif /* KMP_MIC_SUPPORTED */
6782 
6783 #if KMP_HAVE_UMWAIT
6784 static void __kmp_user_level_mwait_init() {
6785   struct kmp_cpuid buf;
6786   __kmp_x86_cpuid(7, 0, &buf);
6787   __kmp_umwait_enabled = ((buf.ecx >> 5) & 1) && __kmp_user_level_mwait;
6788   KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
6789                 __kmp_umwait_enabled));
6790 }
6791 #elif KMP_HAVE_MWAIT
6792 #ifndef AT_INTELPHIUSERMWAIT
6793 // Spurious, non-existent value that should always fail to return anything.
6794 // Will be replaced with the correct value when we know that.
6795 #define AT_INTELPHIUSERMWAIT 10000
6796 #endif
6797 // getauxval() function is available in RHEL7 and SLES12. If a system with an
6798 // earlier OS is used to build the RTL, we'll use the following internal
6799 // function when the entry is not found.
6800 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
6801 unsigned long getauxval(unsigned long) { return 0; }
6802 
6803 static void __kmp_user_level_mwait_init() {
6804   // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available
6805   // use them to find if the user-level mwait is enabled. Otherwise, forcibly
6806   // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable
6807   // KMP_USER_LEVEL_MWAIT was set to TRUE.
6808   if (__kmp_mic_type == mic3) {
6809     unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
6810     if ((res & 0x1) || __kmp_user_level_mwait) {
6811       __kmp_mwait_enabled = TRUE;
6812       if (__kmp_user_level_mwait) {
6813         KMP_INFORM(EnvMwaitWarn);
6814       }
6815     } else {
6816       __kmp_mwait_enabled = FALSE;
6817     }
6818   }
6819   KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
6820                 "__kmp_mwait_enabled = %d\n",
6821                 __kmp_mic_type, __kmp_mwait_enabled));
6822 }
6823 #endif /* KMP_HAVE_UMWAIT */
6824 
6825 static void __kmp_do_serial_initialize(void) {
6826   int i, gtid;
6827   size_t size;
6828 
6829   KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6830 
6831   KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6832   KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6833   KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6834   KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6835   KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6836 
6837 #if OMPT_SUPPORT
6838   ompt_pre_init();
6839 #endif
6840 #if OMPD_SUPPORT
6841   __kmp_env_dump();
6842   ompd_init();
6843 #endif
6844 
6845   __kmp_validate_locks();
6846 
6847   /* Initialize internal memory allocator */
6848   __kmp_init_allocator();
6849 
6850   /* Register the library startup via an environment variable and check to see
6851      whether another copy of the library is already registered. */
6852 
6853   __kmp_register_library_startup();
6854 
6855   /* TODO reinitialization of library */
6856   if (TCR_4(__kmp_global.g.g_done)) {
6857     KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6858   }
6859 
6860   __kmp_global.g.g_abort = 0;
6861   TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6862 
6863 /* initialize the locks */
6864 #if KMP_USE_ADAPTIVE_LOCKS
6865 #if KMP_DEBUG_ADAPTIVE_LOCKS
6866   __kmp_init_speculative_stats();
6867 #endif
6868 #endif
6869 #if KMP_STATS_ENABLED
6870   __kmp_stats_init();
6871 #endif
6872   __kmp_init_lock(&__kmp_global_lock);
6873   __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6874   __kmp_init_lock(&__kmp_debug_lock);
6875   __kmp_init_atomic_lock(&__kmp_atomic_lock);
6876   __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6877   __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6878   __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6879   __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6880   __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6881   __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6882   __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6883   __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6884   __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6885   __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6886   __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6887   __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6888   __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6889   __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6890 #if KMP_USE_MONITOR
6891   __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6892 #endif
6893   __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6894 
6895   /* conduct initialization and initial setup of configuration */
6896 
6897   __kmp_runtime_initialize();
6898 
6899 #if KMP_MIC_SUPPORTED
6900   __kmp_check_mic_type();
6901 #endif
6902 
6903 // Some global variable initialization moved here from kmp_env_initialize()
6904 #ifdef KMP_DEBUG
6905   kmp_diag = 0;
6906 #endif
6907   __kmp_abort_delay = 0;
6908 
6909   // From __kmp_init_dflt_team_nth()
6910   /* assume the entire machine will be used */
6911   __kmp_dflt_team_nth_ub = __kmp_xproc;
6912   if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6913     __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6914   }
6915   if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6916     __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6917   }
6918   __kmp_max_nth = __kmp_sys_max_nth;
6919   __kmp_cg_max_nth = __kmp_sys_max_nth;
6920   __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6921   if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6922     __kmp_teams_max_nth = __kmp_sys_max_nth;
6923   }
6924 
6925   // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6926   // part
6927   __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6928 #if KMP_USE_MONITOR
6929   __kmp_monitor_wakeups =
6930       KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6931   __kmp_bt_intervals =
6932       KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6933 #endif
6934   // From "KMP_LIBRARY" part of __kmp_env_initialize()
6935   __kmp_library = library_throughput;
6936   // From KMP_SCHEDULE initialization
6937   __kmp_static = kmp_sch_static_balanced;
6938 // AC: do not use analytical here, because it is non-monotonous
6939 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6940 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6941 // need to repeat assignment
6942 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6943 // bit control and barrier method control parts
6944 #if KMP_FAST_REDUCTION_BARRIER
6945 #define kmp_reduction_barrier_gather_bb ((int)1)
6946 #define kmp_reduction_barrier_release_bb ((int)1)
6947 #define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt
6948 #define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt
6949 #endif // KMP_FAST_REDUCTION_BARRIER
6950   for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6951     __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6952     __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6953     __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6954     __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6955 #if KMP_FAST_REDUCTION_BARRIER
6956     if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
6957       // lin_64 ): hyper,1
6958       __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6959       __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6960       __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6961       __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6962     }
6963 #endif // KMP_FAST_REDUCTION_BARRIER
6964   }
6965 #if KMP_FAST_REDUCTION_BARRIER
6966 #undef kmp_reduction_barrier_release_pat
6967 #undef kmp_reduction_barrier_gather_pat
6968 #undef kmp_reduction_barrier_release_bb
6969 #undef kmp_reduction_barrier_gather_bb
6970 #endif // KMP_FAST_REDUCTION_BARRIER
6971 #if KMP_MIC_SUPPORTED
6972   if (__kmp_mic_type == mic2) { // KNC
6973     // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6974     __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6975     __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6976         1; // forkjoin release
6977     __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6978     __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6979   }
6980 #if KMP_FAST_REDUCTION_BARRIER
6981   if (__kmp_mic_type == mic2) { // KNC
6982     __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6983     __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6984   }
6985 #endif // KMP_FAST_REDUCTION_BARRIER
6986 #endif // KMP_MIC_SUPPORTED
6987 
6988 // From KMP_CHECKS initialization
6989 #ifdef KMP_DEBUG
6990   __kmp_env_checks = TRUE; /* development versions have the extra checks */
6991 #else
6992   __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6993 #endif
6994 
6995   // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6996   __kmp_foreign_tp = TRUE;
6997 
6998   __kmp_global.g.g_dynamic = FALSE;
6999   __kmp_global.g.g_dynamic_mode = dynamic_default;
7000 
7001   __kmp_init_nesting_mode();
7002 
7003   __kmp_env_initialize(NULL);
7004 
7005 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
7006   __kmp_user_level_mwait_init();
7007 #endif
7008 // Print all messages in message catalog for testing purposes.
7009 #ifdef KMP_DEBUG
7010   char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
7011   if (__kmp_str_match_true(val)) {
7012     kmp_str_buf_t buffer;
7013     __kmp_str_buf_init(&buffer);
7014     __kmp_i18n_dump_catalog(&buffer);
7015     __kmp_printf("%s", buffer.str);
7016     __kmp_str_buf_free(&buffer);
7017   }
7018   __kmp_env_free(&val);
7019 #endif
7020 
7021   __kmp_threads_capacity =
7022       __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
7023   // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
7024   __kmp_tp_capacity = __kmp_default_tp_capacity(
7025       __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
7026 
7027   // If the library is shut down properly, both pools must be NULL. Just in
7028   // case, set them to NULL -- some memory may leak, but subsequent code will
7029   // work even if pools are not freed.
7030   KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
7031   KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
7032   KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
7033   __kmp_thread_pool = NULL;
7034   __kmp_thread_pool_insert_pt = NULL;
7035   __kmp_team_pool = NULL;
7036 
7037   /* Allocate all of the variable sized records */
7038   /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
7039    * expandable */
7040   /* Since allocation is cache-aligned, just add extra padding at the end */
7041   size =
7042       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
7043       CACHE_LINE;
7044   __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
7045   __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
7046                                sizeof(kmp_info_t *) * __kmp_threads_capacity);
7047 
7048   /* init thread counts */
7049   KMP_DEBUG_ASSERT(__kmp_all_nth ==
7050                    0); // Asserts fail if the library is reinitializing and
7051   KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
7052   __kmp_all_nth = 0;
7053   __kmp_nth = 0;
7054 
7055   /* setup the uber master thread and hierarchy */
7056   gtid = __kmp_register_root(TRUE);
7057   KA_TRACE(10, ("__kmp_do_serial_initialize  T#%d\n", gtid));
7058   KMP_ASSERT(KMP_UBER_GTID(gtid));
7059   KMP_ASSERT(KMP_INITIAL_GTID(gtid));
7060 
7061   KMP_MB(); /* Flush all pending memory write invalidates.  */
7062 
7063   __kmp_common_initialize();
7064 
7065 #if KMP_OS_UNIX
7066   /* invoke the child fork handler */
7067   __kmp_register_atfork();
7068 #endif
7069 
7070 #if !KMP_DYNAMIC_LIB
7071   {
7072     /* Invoke the exit handler when the program finishes, only for static
7073        library. For dynamic library, we already have _fini and DllMain. */
7074     int rc = atexit(__kmp_internal_end_atexit);
7075     if (rc != 0) {
7076       __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
7077                   __kmp_msg_null);
7078     }
7079   }
7080 #endif
7081 
7082 #if KMP_HANDLE_SIGNALS
7083 #if KMP_OS_UNIX
7084   /* NOTE: make sure that this is called before the user installs their own
7085      signal handlers so that the user handlers are called first. this way they
7086      can return false, not call our handler, avoid terminating the library, and
7087      continue execution where they left off. */
7088   __kmp_install_signals(FALSE);
7089 #endif /* KMP_OS_UNIX */
7090 #if KMP_OS_WINDOWS
7091   __kmp_install_signals(TRUE);
7092 #endif /* KMP_OS_WINDOWS */
7093 #endif
7094 
7095   /* we have finished the serial initialization */
7096   __kmp_init_counter++;
7097 
7098   __kmp_init_serial = TRUE;
7099 
7100   if (__kmp_settings) {
7101     __kmp_env_print();
7102   }
7103 
7104   if (__kmp_display_env || __kmp_display_env_verbose) {
7105     __kmp_env_print_2();
7106   }
7107 
7108 #if OMPT_SUPPORT
7109   ompt_post_init();
7110 #endif
7111 
7112   KMP_MB();
7113 
7114   KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
7115 }
7116 
7117 void __kmp_serial_initialize(void) {
7118   if (__kmp_init_serial) {
7119     return;
7120   }
7121   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7122   if (__kmp_init_serial) {
7123     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7124     return;
7125   }
7126   __kmp_do_serial_initialize();
7127   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7128 }
7129 
7130 static void __kmp_do_middle_initialize(void) {
7131   int i, j;
7132   int prev_dflt_team_nth;
7133 
7134   if (!__kmp_init_serial) {
7135     __kmp_do_serial_initialize();
7136   }
7137 
7138   KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
7139 
7140   // Save the previous value for the __kmp_dflt_team_nth so that
7141   // we can avoid some reinitialization if it hasn't changed.
7142   prev_dflt_team_nth = __kmp_dflt_team_nth;
7143 
7144 #if KMP_AFFINITY_SUPPORTED
7145   // __kmp_affinity_initialize() will try to set __kmp_ncores to the
7146   // number of cores on the machine.
7147   __kmp_affinity_initialize();
7148 
7149 #endif /* KMP_AFFINITY_SUPPORTED */
7150 
7151   KMP_ASSERT(__kmp_xproc > 0);
7152   if (__kmp_avail_proc == 0) {
7153     __kmp_avail_proc = __kmp_xproc;
7154   }
7155 
7156   // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
7157   // correct them now
7158   j = 0;
7159   while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
7160     __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7161         __kmp_avail_proc;
7162     j++;
7163   }
7164 
7165   if (__kmp_dflt_team_nth == 0) {
7166 #ifdef KMP_DFLT_NTH_CORES
7167     // Default #threads = #cores
7168     __kmp_dflt_team_nth = __kmp_ncores;
7169     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7170                   "__kmp_ncores (%d)\n",
7171                   __kmp_dflt_team_nth));
7172 #else
7173     // Default #threads = #available OS procs
7174     __kmp_dflt_team_nth = __kmp_avail_proc;
7175     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7176                   "__kmp_avail_proc(%d)\n",
7177                   __kmp_dflt_team_nth));
7178 #endif /* KMP_DFLT_NTH_CORES */
7179   }
7180 
7181   if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7182     __kmp_dflt_team_nth = KMP_MIN_NTH;
7183   }
7184   if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7185     __kmp_dflt_team_nth = __kmp_sys_max_nth;
7186   }
7187 
7188   if (__kmp_nesting_mode > 0)
7189     __kmp_set_nesting_mode_threads();
7190 
7191   // There's no harm in continuing if the following check fails,
7192   // but it indicates an error in the previous logic.
7193   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7194 
7195   if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7196     // Run through the __kmp_threads array and set the num threads icv for each
7197     // root thread that is currently registered with the RTL (which has not
7198     // already explicitly set its nthreads-var with a call to
7199     // omp_set_num_threads()).
7200     for (i = 0; i < __kmp_threads_capacity; i++) {
7201       kmp_info_t *thread = __kmp_threads[i];
7202       if (thread == NULL)
7203         continue;
7204       if (thread->th.th_current_task->td_icvs.nproc != 0)
7205         continue;
7206 
7207       set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7208     }
7209   }
7210   KA_TRACE(
7211       20,
7212       ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7213        __kmp_dflt_team_nth));
7214 
7215 #ifdef KMP_ADJUST_BLOCKTIME
7216   /* Adjust blocktime to zero if necessary  now that __kmp_avail_proc is set */
7217   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7218     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7219     if (__kmp_nth > __kmp_avail_proc) {
7220       __kmp_zero_bt = TRUE;
7221     }
7222   }
7223 #endif /* KMP_ADJUST_BLOCKTIME */
7224 
7225   /* we have finished middle initialization */
7226   TCW_SYNC_4(__kmp_init_middle, TRUE);
7227 
7228   KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7229 }
7230 
7231 void __kmp_middle_initialize(void) {
7232   if (__kmp_init_middle) {
7233     return;
7234   }
7235   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7236   if (__kmp_init_middle) {
7237     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7238     return;
7239   }
7240   __kmp_do_middle_initialize();
7241   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7242 }
7243 
7244 void __kmp_parallel_initialize(void) {
7245   int gtid = __kmp_entry_gtid(); // this might be a new root
7246 
7247   /* synchronize parallel initialization (for sibling) */
7248   if (TCR_4(__kmp_init_parallel))
7249     return;
7250   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7251   if (TCR_4(__kmp_init_parallel)) {
7252     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7253     return;
7254   }
7255 
7256   /* TODO reinitialization after we have already shut down */
7257   if (TCR_4(__kmp_global.g.g_done)) {
7258     KA_TRACE(
7259         10,
7260         ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7261     __kmp_infinite_loop();
7262   }
7263 
7264   /* jc: The lock __kmp_initz_lock is already held, so calling
7265      __kmp_serial_initialize would cause a deadlock.  So we call
7266      __kmp_do_serial_initialize directly. */
7267   if (!__kmp_init_middle) {
7268     __kmp_do_middle_initialize();
7269   }
7270   __kmp_assign_root_init_mask();
7271   __kmp_resume_if_hard_paused();
7272 
7273   /* begin initialization */
7274   KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7275   KMP_ASSERT(KMP_UBER_GTID(gtid));
7276 
7277 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7278   // Save the FP control regs.
7279   // Worker threads will set theirs to these values at thread startup.
7280   __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7281   __kmp_store_mxcsr(&__kmp_init_mxcsr);
7282   __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7283 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7284 
7285 #if KMP_OS_UNIX
7286 #if KMP_HANDLE_SIGNALS
7287   /*  must be after __kmp_serial_initialize  */
7288   __kmp_install_signals(TRUE);
7289 #endif
7290 #endif
7291 
7292   __kmp_suspend_initialize();
7293 
7294 #if defined(USE_LOAD_BALANCE)
7295   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7296     __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7297   }
7298 #else
7299   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7300     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7301   }
7302 #endif
7303 
7304   if (__kmp_version) {
7305     __kmp_print_version_2();
7306   }
7307 
7308   /* we have finished parallel initialization */
7309   TCW_SYNC_4(__kmp_init_parallel, TRUE);
7310 
7311   KMP_MB();
7312   KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7313 
7314   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7315 }
7316 
7317 void __kmp_hidden_helper_initialize() {
7318   if (TCR_4(__kmp_init_hidden_helper))
7319     return;
7320 
7321   // __kmp_parallel_initialize is required before we initialize hidden helper
7322   if (!TCR_4(__kmp_init_parallel))
7323     __kmp_parallel_initialize();
7324 
7325   // Double check. Note that this double check should not be placed before
7326   // __kmp_parallel_initialize as it will cause dead lock.
7327   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7328   if (TCR_4(__kmp_init_hidden_helper)) {
7329     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7330     return;
7331   }
7332 
7333   // Set the count of hidden helper tasks to be executed to zero
7334   KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7335 
7336   // Set the global variable indicating that we're initializing hidden helper
7337   // team/threads
7338   TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7339 
7340   // Platform independent initialization
7341   __kmp_do_initialize_hidden_helper_threads();
7342 
7343   // Wait here for the finish of initialization of hidden helper teams
7344   __kmp_hidden_helper_threads_initz_wait();
7345 
7346   // We have finished hidden helper initialization
7347   TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7348 
7349   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7350 }
7351 
7352 /* ------------------------------------------------------------------------ */
7353 
7354 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7355                                    kmp_team_t *team) {
7356   kmp_disp_t *dispatch;
7357 
7358   KMP_MB();
7359 
7360   /* none of the threads have encountered any constructs, yet. */
7361   this_thr->th.th_local.this_construct = 0;
7362 #if KMP_CACHE_MANAGE
7363   KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7364 #endif /* KMP_CACHE_MANAGE */
7365   dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7366   KMP_DEBUG_ASSERT(dispatch);
7367   KMP_DEBUG_ASSERT(team->t.t_dispatch);
7368   // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7369   // this_thr->th.th_info.ds.ds_tid ] );
7370 
7371   dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7372   dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7373   if (__kmp_env_consistency_check)
7374     __kmp_push_parallel(gtid, team->t.t_ident);
7375 
7376   KMP_MB(); /* Flush all pending memory write invalidates.  */
7377 }
7378 
7379 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7380                                   kmp_team_t *team) {
7381   if (__kmp_env_consistency_check)
7382     __kmp_pop_parallel(gtid, team->t.t_ident);
7383 
7384   __kmp_finish_implicit_task(this_thr);
7385 }
7386 
7387 int __kmp_invoke_task_func(int gtid) {
7388   int rc;
7389   int tid = __kmp_tid_from_gtid(gtid);
7390   kmp_info_t *this_thr = __kmp_threads[gtid];
7391   kmp_team_t *team = this_thr->th.th_team;
7392 
7393   __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7394 #if USE_ITT_BUILD
7395   if (__itt_stack_caller_create_ptr) {
7396     // inform ittnotify about entering user's code
7397     if (team->t.t_stack_id != NULL) {
7398       __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id);
7399     } else {
7400       KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7401       __kmp_itt_stack_callee_enter(
7402           (__itt_caller)team->t.t_parent->t.t_stack_id);
7403     }
7404   }
7405 #endif /* USE_ITT_BUILD */
7406 #if INCLUDE_SSC_MARKS
7407   SSC_MARK_INVOKING();
7408 #endif
7409 
7410 #if OMPT_SUPPORT
7411   void *dummy;
7412   void **exit_frame_p;
7413   ompt_data_t *my_task_data;
7414   ompt_data_t *my_parallel_data;
7415   int ompt_team_size;
7416 
7417   if (ompt_enabled.enabled) {
7418     exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
7419                          .ompt_task_info.frame.exit_frame.ptr);
7420   } else {
7421     exit_frame_p = &dummy;
7422   }
7423 
7424   my_task_data =
7425       &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7426   my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7427   if (ompt_enabled.ompt_callback_implicit_task) {
7428     ompt_team_size = team->t.t_nproc;
7429     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7430         ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7431         __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7432     OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7433   }
7434 #endif
7435 
7436 #if KMP_STATS_ENABLED
7437   stats_state_e previous_state = KMP_GET_THREAD_STATE();
7438   if (previous_state == stats_state_e::TEAMS_REGION) {
7439     KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7440   } else {
7441     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7442   }
7443   KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7444 #endif
7445 
7446   rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7447                               tid, (int)team->t.t_argc, (void **)team->t.t_argv
7448 #if OMPT_SUPPORT
7449                               ,
7450                               exit_frame_p
7451 #endif
7452   );
7453 #if OMPT_SUPPORT
7454   *exit_frame_p = NULL;
7455   this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7456 #endif
7457 
7458 #if KMP_STATS_ENABLED
7459   if (previous_state == stats_state_e::TEAMS_REGION) {
7460     KMP_SET_THREAD_STATE(previous_state);
7461   }
7462   KMP_POP_PARTITIONED_TIMER();
7463 #endif
7464 
7465 #if USE_ITT_BUILD
7466   if (__itt_stack_caller_create_ptr) {
7467     // inform ittnotify about leaving user's code
7468     if (team->t.t_stack_id != NULL) {
7469       __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id);
7470     } else {
7471       KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7472       __kmp_itt_stack_callee_leave(
7473           (__itt_caller)team->t.t_parent->t.t_stack_id);
7474     }
7475   }
7476 #endif /* USE_ITT_BUILD */
7477   __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7478 
7479   return rc;
7480 }
7481 
7482 void __kmp_teams_master(int gtid) {
7483   // This routine is called by all primary threads in teams construct
7484   kmp_info_t *thr = __kmp_threads[gtid];
7485   kmp_team_t *team = thr->th.th_team;
7486   ident_t *loc = team->t.t_ident;
7487   thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7488   KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7489   KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7490   KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7491                 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7492 
7493   // This thread is a new CG root.  Set up the proper variables.
7494   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7495   tmp->cg_root = thr; // Make thr the CG root
7496   // Init to thread limit stored when league primary threads were forked
7497   tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7498   tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7499   KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7500                  " cg_nthreads to 1\n",
7501                  thr, tmp));
7502   tmp->up = thr->th.th_cg_roots;
7503   thr->th.th_cg_roots = tmp;
7504 
7505 // Launch league of teams now, but not let workers execute
7506 // (they hang on fork barrier until next parallel)
7507 #if INCLUDE_SSC_MARKS
7508   SSC_MARK_FORKING();
7509 #endif
7510   __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7511                   (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7512                   VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7513 #if INCLUDE_SSC_MARKS
7514   SSC_MARK_JOINING();
7515 #endif
7516   // If the team size was reduced from the limit, set it to the new size
7517   if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7518     thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7519   // AC: last parameter "1" eliminates join barrier which won't work because
7520   // worker threads are in a fork barrier waiting for more parallel regions
7521   __kmp_join_call(loc, gtid
7522 #if OMPT_SUPPORT
7523                   ,
7524                   fork_context_intel
7525 #endif
7526                   ,
7527                   1);
7528 }
7529 
7530 int __kmp_invoke_teams_master(int gtid) {
7531   kmp_info_t *this_thr = __kmp_threads[gtid];
7532   kmp_team_t *team = this_thr->th.th_team;
7533 #if KMP_DEBUG
7534   if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7535     KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7536                      (void *)__kmp_teams_master);
7537 #endif
7538   __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7539 #if OMPT_SUPPORT
7540   int tid = __kmp_tid_from_gtid(gtid);
7541   ompt_data_t *task_data =
7542       &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7543   ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7544   if (ompt_enabled.ompt_callback_implicit_task) {
7545     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7546         ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7547         ompt_task_initial);
7548     OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7549   }
7550 #endif
7551   __kmp_teams_master(gtid);
7552 #if OMPT_SUPPORT
7553   this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7554 #endif
7555   __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7556   return 1;
7557 }
7558 
7559 /* this sets the requested number of threads for the next parallel region
7560    encountered by this team. since this should be enclosed in the forkjoin
7561    critical section it should avoid race conditions with asymmetrical nested
7562    parallelism */
7563 
7564 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7565   kmp_info_t *thr = __kmp_threads[gtid];
7566 
7567   if (num_threads > 0)
7568     thr->th.th_set_nproc = num_threads;
7569 }
7570 
7571 static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
7572                                     int num_threads) {
7573   KMP_DEBUG_ASSERT(thr);
7574   // Remember the number of threads for inner parallel regions
7575   if (!TCR_4(__kmp_init_middle))
7576     __kmp_middle_initialize(); // get internal globals calculated
7577   __kmp_assign_root_init_mask();
7578   KMP_DEBUG_ASSERT(__kmp_avail_proc);
7579   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7580 
7581   if (num_threads == 0) {
7582     if (__kmp_teams_thread_limit > 0) {
7583       num_threads = __kmp_teams_thread_limit;
7584     } else {
7585       num_threads = __kmp_avail_proc / num_teams;
7586     }
7587     // adjust num_threads w/o warning as it is not user setting
7588     // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7589     // no thread_limit clause specified -  do not change thread-limit-var ICV
7590     if (num_threads > __kmp_dflt_team_nth) {
7591       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7592     }
7593     if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7594       num_threads = thr->th.th_current_task->td_icvs.thread_limit;
7595     } // prevent team size to exceed thread-limit-var
7596     if (num_teams * num_threads > __kmp_teams_max_nth) {
7597       num_threads = __kmp_teams_max_nth / num_teams;
7598     }
7599     if (num_threads == 0) {
7600       num_threads = 1;
7601     }
7602   } else {
7603     // This thread will be the primary thread of the league primary threads
7604     // Store new thread limit; old limit is saved in th_cg_roots list
7605     thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7606     // num_threads = min(num_threads, nthreads-var)
7607     if (num_threads > __kmp_dflt_team_nth) {
7608       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7609     }
7610     if (num_teams * num_threads > __kmp_teams_max_nth) {
7611       int new_threads = __kmp_teams_max_nth / num_teams;
7612       if (new_threads == 0) {
7613         new_threads = 1;
7614       }
7615       if (new_threads != num_threads) {
7616         if (!__kmp_reserve_warn) { // user asked for too many threads
7617           __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7618           __kmp_msg(kmp_ms_warning,
7619                     KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7620                     KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7621         }
7622       }
7623       num_threads = new_threads;
7624     }
7625   }
7626   thr->th.th_teams_size.nth = num_threads;
7627 }
7628 
7629 /* this sets the requested number of teams for the teams region and/or
7630    the number of threads for the next parallel region encountered  */
7631 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7632                           int num_threads) {
7633   kmp_info_t *thr = __kmp_threads[gtid];
7634   KMP_DEBUG_ASSERT(num_teams >= 0);
7635   KMP_DEBUG_ASSERT(num_threads >= 0);
7636 
7637   if (num_teams == 0) {
7638     if (__kmp_nteams > 0) {
7639       num_teams = __kmp_nteams;
7640     } else {
7641       num_teams = 1; // default number of teams is 1.
7642     }
7643   }
7644   if (num_teams > __kmp_teams_max_nth) { // if too many teams requested?
7645     if (!__kmp_reserve_warn) {
7646       __kmp_reserve_warn = 1;
7647       __kmp_msg(kmp_ms_warning,
7648                 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7649                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7650     }
7651     num_teams = __kmp_teams_max_nth;
7652   }
7653   // Set number of teams (number of threads in the outer "parallel" of the
7654   // teams)
7655   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7656 
7657   __kmp_push_thread_limit(thr, num_teams, num_threads);
7658 }
7659 
7660 /* This sets the requested number of teams for the teams region and/or
7661    the number of threads for the next parallel region encountered  */
7662 void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
7663                              int num_teams_ub, int num_threads) {
7664   kmp_info_t *thr = __kmp_threads[gtid];
7665   KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
7666   KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
7667   KMP_DEBUG_ASSERT(num_threads >= 0);
7668 
7669   if (num_teams_lb > num_teams_ub) {
7670     __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
7671                 KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
7672   }
7673 
7674   int num_teams = 1; // defalt number of teams is 1.
7675 
7676   if (num_teams_lb == 0 && num_teams_ub > 0)
7677     num_teams_lb = num_teams_ub;
7678 
7679   if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
7680     num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
7681     if (num_teams > __kmp_teams_max_nth) {
7682       if (!__kmp_reserve_warn) {
7683         __kmp_reserve_warn = 1;
7684         __kmp_msg(kmp_ms_warning,
7685                   KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7686                   KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7687       }
7688       num_teams = __kmp_teams_max_nth;
7689     }
7690   } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
7691     num_teams = num_teams_ub;
7692   } else { // num_teams_lb <= num_teams <= num_teams_ub
7693     if (num_threads == 0) {
7694       if (num_teams_ub > __kmp_teams_max_nth) {
7695         num_teams = num_teams_lb;
7696       } else {
7697         num_teams = num_teams_ub;
7698       }
7699     } else {
7700       num_teams = (num_threads > __kmp_teams_max_nth)
7701                       ? num_teams
7702                       : __kmp_teams_max_nth / num_threads;
7703       if (num_teams < num_teams_lb) {
7704         num_teams = num_teams_lb;
7705       } else if (num_teams > num_teams_ub) {
7706         num_teams = num_teams_ub;
7707       }
7708     }
7709   }
7710   // Set number of teams (number of threads in the outer "parallel" of the
7711   // teams)
7712   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7713 
7714   __kmp_push_thread_limit(thr, num_teams, num_threads);
7715 }
7716 
7717 // Set the proc_bind var to use in the following parallel region.
7718 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7719   kmp_info_t *thr = __kmp_threads[gtid];
7720   thr->th.th_set_proc_bind = proc_bind;
7721 }
7722 
7723 /* Launch the worker threads into the microtask. */
7724 
7725 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7726   kmp_info_t *this_thr = __kmp_threads[gtid];
7727 
7728 #ifdef KMP_DEBUG
7729   int f;
7730 #endif /* KMP_DEBUG */
7731 
7732   KMP_DEBUG_ASSERT(team);
7733   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7734   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7735   KMP_MB(); /* Flush all pending memory write invalidates.  */
7736 
7737   team->t.t_construct = 0; /* no single directives seen yet */
7738   team->t.t_ordered.dt.t_value =
7739       0; /* thread 0 enters the ordered section first */
7740 
7741   /* Reset the identifiers on the dispatch buffer */
7742   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7743   if (team->t.t_max_nproc > 1) {
7744     int i;
7745     for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7746       team->t.t_disp_buffer[i].buffer_index = i;
7747       team->t.t_disp_buffer[i].doacross_buf_idx = i;
7748     }
7749   } else {
7750     team->t.t_disp_buffer[0].buffer_index = 0;
7751     team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7752   }
7753 
7754   KMP_MB(); /* Flush all pending memory write invalidates.  */
7755   KMP_ASSERT(this_thr->th.th_team == team);
7756 
7757 #ifdef KMP_DEBUG
7758   for (f = 0; f < team->t.t_nproc; f++) {
7759     KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7760                      team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7761   }
7762 #endif /* KMP_DEBUG */
7763 
7764   /* release the worker threads so they may begin working */
7765   __kmp_fork_barrier(gtid, 0);
7766 }
7767 
7768 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7769   kmp_info_t *this_thr = __kmp_threads[gtid];
7770 
7771   KMP_DEBUG_ASSERT(team);
7772   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7773   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7774   KMP_MB(); /* Flush all pending memory write invalidates.  */
7775 
7776   /* Join barrier after fork */
7777 
7778 #ifdef KMP_DEBUG
7779   if (__kmp_threads[gtid] &&
7780       __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7781     __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7782                  __kmp_threads[gtid]);
7783     __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7784                  "team->t.t_nproc=%d\n",
7785                  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7786                  team->t.t_nproc);
7787     __kmp_print_structure();
7788   }
7789   KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7790                    __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7791 #endif /* KMP_DEBUG */
7792 
7793   __kmp_join_barrier(gtid); /* wait for everyone */
7794 #if OMPT_SUPPORT
7795   if (ompt_enabled.enabled &&
7796       this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7797     int ds_tid = this_thr->th.th_info.ds.ds_tid;
7798     ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7799     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7800 #if OMPT_OPTIONAL
7801     void *codeptr = NULL;
7802     if (KMP_MASTER_TID(ds_tid) &&
7803         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7804          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7805       codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7806 
7807     if (ompt_enabled.ompt_callback_sync_region_wait) {
7808       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7809           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7810           codeptr);
7811     }
7812     if (ompt_enabled.ompt_callback_sync_region) {
7813       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7814           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7815           codeptr);
7816     }
7817 #endif
7818     if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7819       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7820           ompt_scope_end, NULL, task_data, 0, ds_tid,
7821           ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7822     }
7823   }
7824 #endif
7825 
7826   KMP_MB(); /* Flush all pending memory write invalidates.  */
7827   KMP_ASSERT(this_thr->th.th_team == team);
7828 }
7829 
7830 /* ------------------------------------------------------------------------ */
7831 
7832 #ifdef USE_LOAD_BALANCE
7833 
7834 // Return the worker threads actively spinning in the hot team, if we
7835 // are at the outermost level of parallelism.  Otherwise, return 0.
7836 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7837   int i;
7838   int retval;
7839   kmp_team_t *hot_team;
7840 
7841   if (root->r.r_active) {
7842     return 0;
7843   }
7844   hot_team = root->r.r_hot_team;
7845   if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7846     return hot_team->t.t_nproc - 1; // Don't count primary thread
7847   }
7848 
7849   // Skip the primary thread - it is accounted for elsewhere.
7850   retval = 0;
7851   for (i = 1; i < hot_team->t.t_nproc; i++) {
7852     if (hot_team->t.t_threads[i]->th.th_active) {
7853       retval++;
7854     }
7855   }
7856   return retval;
7857 }
7858 
7859 // Perform an automatic adjustment to the number of
7860 // threads used by the next parallel region.
7861 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7862   int retval;
7863   int pool_active;
7864   int hot_team_active;
7865   int team_curr_active;
7866   int system_active;
7867 
7868   KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7869                 set_nproc));
7870   KMP_DEBUG_ASSERT(root);
7871   KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7872                        ->th.th_current_task->td_icvs.dynamic == TRUE);
7873   KMP_DEBUG_ASSERT(set_nproc > 1);
7874 
7875   if (set_nproc == 1) {
7876     KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7877     return 1;
7878   }
7879 
7880   // Threads that are active in the thread pool, active in the hot team for this
7881   // particular root (if we are at the outer par level), and the currently
7882   // executing thread (to become the primary thread) are available to add to the
7883   // new team, but are currently contributing to the system load, and must be
7884   // accounted for.
7885   pool_active = __kmp_thread_pool_active_nth;
7886   hot_team_active = __kmp_active_hot_team_nproc(root);
7887   team_curr_active = pool_active + hot_team_active + 1;
7888 
7889   // Check the system load.
7890   system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7891   KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7892                 "hot team active = %d\n",
7893                 system_active, pool_active, hot_team_active));
7894 
7895   if (system_active < 0) {
7896     // There was an error reading the necessary info from /proc, so use the
7897     // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7898     // = dynamic_thread_limit, we shouldn't wind up getting back here.
7899     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7900     KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7901 
7902     // Make this call behave like the thread limit algorithm.
7903     retval = __kmp_avail_proc - __kmp_nth +
7904              (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7905     if (retval > set_nproc) {
7906       retval = set_nproc;
7907     }
7908     if (retval < KMP_MIN_NTH) {
7909       retval = KMP_MIN_NTH;
7910     }
7911 
7912     KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7913                   retval));
7914     return retval;
7915   }
7916 
7917   // There is a slight delay in the load balance algorithm in detecting new
7918   // running procs. The real system load at this instant should be at least as
7919   // large as the #active omp thread that are available to add to the team.
7920   if (system_active < team_curr_active) {
7921     system_active = team_curr_active;
7922   }
7923   retval = __kmp_avail_proc - system_active + team_curr_active;
7924   if (retval > set_nproc) {
7925     retval = set_nproc;
7926   }
7927   if (retval < KMP_MIN_NTH) {
7928     retval = KMP_MIN_NTH;
7929   }
7930 
7931   KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7932   return retval;
7933 } // __kmp_load_balance_nproc()
7934 
7935 #endif /* USE_LOAD_BALANCE */
7936 
7937 /* ------------------------------------------------------------------------ */
7938 
7939 /* NOTE: this is called with the __kmp_init_lock held */
7940 void __kmp_cleanup(void) {
7941   int f;
7942 
7943   KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7944 
7945   if (TCR_4(__kmp_init_parallel)) {
7946 #if KMP_HANDLE_SIGNALS
7947     __kmp_remove_signals();
7948 #endif
7949     TCW_4(__kmp_init_parallel, FALSE);
7950   }
7951 
7952   if (TCR_4(__kmp_init_middle)) {
7953 #if KMP_AFFINITY_SUPPORTED
7954     __kmp_affinity_uninitialize();
7955 #endif /* KMP_AFFINITY_SUPPORTED */
7956     __kmp_cleanup_hierarchy();
7957     TCW_4(__kmp_init_middle, FALSE);
7958   }
7959 
7960   KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7961 
7962   if (__kmp_init_serial) {
7963     __kmp_runtime_destroy();
7964     __kmp_init_serial = FALSE;
7965   }
7966 
7967   __kmp_cleanup_threadprivate_caches();
7968 
7969   for (f = 0; f < __kmp_threads_capacity; f++) {
7970     if (__kmp_root[f] != NULL) {
7971       __kmp_free(__kmp_root[f]);
7972       __kmp_root[f] = NULL;
7973     }
7974   }
7975   __kmp_free(__kmp_threads);
7976   // __kmp_threads and __kmp_root were allocated at once, as single block, so
7977   // there is no need in freeing __kmp_root.
7978   __kmp_threads = NULL;
7979   __kmp_root = NULL;
7980   __kmp_threads_capacity = 0;
7981 
7982 #if KMP_USE_DYNAMIC_LOCK
7983   __kmp_cleanup_indirect_user_locks();
7984 #else
7985   __kmp_cleanup_user_locks();
7986 #endif
7987 #if OMPD_SUPPORT
7988   if (ompd_state) {
7989     __kmp_free(ompd_env_block);
7990     ompd_env_block = NULL;
7991     ompd_env_block_size = 0;
7992   }
7993 #endif
7994 
7995 #if KMP_AFFINITY_SUPPORTED
7996   KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7997   __kmp_cpuinfo_file = NULL;
7998 #endif /* KMP_AFFINITY_SUPPORTED */
7999 
8000 #if KMP_USE_ADAPTIVE_LOCKS
8001 #if KMP_DEBUG_ADAPTIVE_LOCKS
8002   __kmp_print_speculative_stats();
8003 #endif
8004 #endif
8005   KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
8006   __kmp_nested_nth.nth = NULL;
8007   __kmp_nested_nth.size = 0;
8008   __kmp_nested_nth.used = 0;
8009   KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
8010   __kmp_nested_proc_bind.bind_types = NULL;
8011   __kmp_nested_proc_bind.size = 0;
8012   __kmp_nested_proc_bind.used = 0;
8013   if (__kmp_affinity_format) {
8014     KMP_INTERNAL_FREE(__kmp_affinity_format);
8015     __kmp_affinity_format = NULL;
8016   }
8017 
8018   __kmp_i18n_catclose();
8019 
8020 #if KMP_USE_HIER_SCHED
8021   __kmp_hier_scheds.deallocate();
8022 #endif
8023 
8024 #if KMP_STATS_ENABLED
8025   __kmp_stats_fini();
8026 #endif
8027 
8028   KA_TRACE(10, ("__kmp_cleanup: exit\n"));
8029 }
8030 
8031 /* ------------------------------------------------------------------------ */
8032 
8033 int __kmp_ignore_mppbeg(void) {
8034   char *env;
8035 
8036   if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
8037     if (__kmp_str_match_false(env))
8038       return FALSE;
8039   }
8040   // By default __kmpc_begin() is no-op.
8041   return TRUE;
8042 }
8043 
8044 int __kmp_ignore_mppend(void) {
8045   char *env;
8046 
8047   if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
8048     if (__kmp_str_match_false(env))
8049       return FALSE;
8050   }
8051   // By default __kmpc_end() is no-op.
8052   return TRUE;
8053 }
8054 
8055 void __kmp_internal_begin(void) {
8056   int gtid;
8057   kmp_root_t *root;
8058 
8059   /* this is a very important step as it will register new sibling threads
8060      and assign these new uber threads a new gtid */
8061   gtid = __kmp_entry_gtid();
8062   root = __kmp_threads[gtid]->th.th_root;
8063   KMP_ASSERT(KMP_UBER_GTID(gtid));
8064 
8065   if (root->r.r_begin)
8066     return;
8067   __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
8068   if (root->r.r_begin) {
8069     __kmp_release_lock(&root->r.r_begin_lock, gtid);
8070     return;
8071   }
8072 
8073   root->r.r_begin = TRUE;
8074 
8075   __kmp_release_lock(&root->r.r_begin_lock, gtid);
8076 }
8077 
8078 /* ------------------------------------------------------------------------ */
8079 
8080 void __kmp_user_set_library(enum library_type arg) {
8081   int gtid;
8082   kmp_root_t *root;
8083   kmp_info_t *thread;
8084 
8085   /* first, make sure we are initialized so we can get our gtid */
8086 
8087   gtid = __kmp_entry_gtid();
8088   thread = __kmp_threads[gtid];
8089 
8090   root = thread->th.th_root;
8091 
8092   KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
8093                 library_serial));
8094   if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
8095                                   thread */
8096     KMP_WARNING(SetLibraryIncorrectCall);
8097     return;
8098   }
8099 
8100   switch (arg) {
8101   case library_serial:
8102     thread->th.th_set_nproc = 0;
8103     set__nproc(thread, 1);
8104     break;
8105   case library_turnaround:
8106     thread->th.th_set_nproc = 0;
8107     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8108                                            : __kmp_dflt_team_nth_ub);
8109     break;
8110   case library_throughput:
8111     thread->th.th_set_nproc = 0;
8112     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8113                                            : __kmp_dflt_team_nth_ub);
8114     break;
8115   default:
8116     KMP_FATAL(UnknownLibraryType, arg);
8117   }
8118 
8119   __kmp_aux_set_library(arg);
8120 }
8121 
8122 void __kmp_aux_set_stacksize(size_t arg) {
8123   if (!__kmp_init_serial)
8124     __kmp_serial_initialize();
8125 
8126 #if KMP_OS_DARWIN
8127   if (arg & (0x1000 - 1)) {
8128     arg &= ~(0x1000 - 1);
8129     if (arg + 0x1000) /* check for overflow if we round up */
8130       arg += 0x1000;
8131   }
8132 #endif
8133   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8134 
8135   /* only change the default stacksize before the first parallel region */
8136   if (!TCR_4(__kmp_init_parallel)) {
8137     size_t value = arg; /* argument is in bytes */
8138 
8139     if (value < __kmp_sys_min_stksize)
8140       value = __kmp_sys_min_stksize;
8141     else if (value > KMP_MAX_STKSIZE)
8142       value = KMP_MAX_STKSIZE;
8143 
8144     __kmp_stksize = value;
8145 
8146     __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
8147   }
8148 
8149   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8150 }
8151 
8152 /* set the behaviour of the runtime library */
8153 /* TODO this can cause some odd behaviour with sibling parallelism... */
8154 void __kmp_aux_set_library(enum library_type arg) {
8155   __kmp_library = arg;
8156 
8157   switch (__kmp_library) {
8158   case library_serial: {
8159     KMP_INFORM(LibraryIsSerial);
8160   } break;
8161   case library_turnaround:
8162     if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
8163       __kmp_use_yield = 2; // only yield when oversubscribed
8164     break;
8165   case library_throughput:
8166     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
8167       __kmp_dflt_blocktime = 200;
8168     break;
8169   default:
8170     KMP_FATAL(UnknownLibraryType, arg);
8171   }
8172 }
8173 
8174 /* Getting team information common for all team API */
8175 // Returns NULL if not in teams construct
8176 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
8177   kmp_info_t *thr = __kmp_entry_thread();
8178   teams_serialized = 0;
8179   if (thr->th.th_teams_microtask) {
8180     kmp_team_t *team = thr->th.th_team;
8181     int tlevel = thr->th.th_teams_level; // the level of the teams construct
8182     int ii = team->t.t_level;
8183     teams_serialized = team->t.t_serialized;
8184     int level = tlevel + 1;
8185     KMP_DEBUG_ASSERT(ii >= tlevel);
8186     while (ii > level) {
8187       for (teams_serialized = team->t.t_serialized;
8188            (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
8189       }
8190       if (team->t.t_serialized && (!teams_serialized)) {
8191         team = team->t.t_parent;
8192         continue;
8193       }
8194       if (ii > level) {
8195         team = team->t.t_parent;
8196         ii--;
8197       }
8198     }
8199     return team;
8200   }
8201   return NULL;
8202 }
8203 
8204 int __kmp_aux_get_team_num() {
8205   int serialized;
8206   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8207   if (team) {
8208     if (serialized > 1) {
8209       return 0; // teams region is serialized ( 1 team of 1 thread ).
8210     } else {
8211       return team->t.t_master_tid;
8212     }
8213   }
8214   return 0;
8215 }
8216 
8217 int __kmp_aux_get_num_teams() {
8218   int serialized;
8219   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8220   if (team) {
8221     if (serialized > 1) {
8222       return 1;
8223     } else {
8224       return team->t.t_parent->t.t_nproc;
8225     }
8226   }
8227   return 1;
8228 }
8229 
8230 /* ------------------------------------------------------------------------ */
8231 
8232 /*
8233  * Affinity Format Parser
8234  *
8235  * Field is in form of: %[[[0].]size]type
8236  * % and type are required (%% means print a literal '%')
8237  * type is either single char or long name surrounded by {},
8238  * e.g., N or {num_threads}
8239  * 0 => leading zeros
8240  * . => right justified when size is specified
8241  * by default output is left justified
8242  * size is the *minimum* field length
8243  * All other characters are printed as is
8244  *
8245  * Available field types:
8246  * L {thread_level}      - omp_get_level()
8247  * n {thread_num}        - omp_get_thread_num()
8248  * h {host}              - name of host machine
8249  * P {process_id}        - process id (integer)
8250  * T {thread_identifier} - native thread identifier (integer)
8251  * N {num_threads}       - omp_get_num_threads()
8252  * A {ancestor_tnum}     - omp_get_ancestor_thread_num(omp_get_level()-1)
8253  * a {thread_affinity}   - comma separated list of integers or integer ranges
8254  *                         (values of affinity mask)
8255  *
8256  * Implementation-specific field types can be added
8257  * If a type is unknown, print "undefined"
8258  */
8259 
8260 // Structure holding the short name, long name, and corresponding data type
8261 // for snprintf.  A table of these will represent the entire valid keyword
8262 // field types.
8263 typedef struct kmp_affinity_format_field_t {
8264   char short_name; // from spec e.g., L -> thread level
8265   const char *long_name; // from spec thread_level -> thread level
8266   char field_format; // data type for snprintf (typically 'd' or 's'
8267   // for integer or string)
8268 } kmp_affinity_format_field_t;
8269 
8270 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
8271 #if KMP_AFFINITY_SUPPORTED
8272     {'A', "thread_affinity", 's'},
8273 #endif
8274     {'t', "team_num", 'd'},
8275     {'T', "num_teams", 'd'},
8276     {'L', "nesting_level", 'd'},
8277     {'n', "thread_num", 'd'},
8278     {'N', "num_threads", 'd'},
8279     {'a', "ancestor_tnum", 'd'},
8280     {'H', "host", 's'},
8281     {'P', "process_id", 'd'},
8282     {'i', "native_thread_id", 'd'}};
8283 
8284 // Return the number of characters it takes to hold field
8285 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8286                                             const char **ptr,
8287                                             kmp_str_buf_t *field_buffer) {
8288   int rc, format_index, field_value;
8289   const char *width_left, *width_right;
8290   bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8291   static const int FORMAT_SIZE = 20;
8292   char format[FORMAT_SIZE] = {0};
8293   char absolute_short_name = 0;
8294 
8295   KMP_DEBUG_ASSERT(gtid >= 0);
8296   KMP_DEBUG_ASSERT(th);
8297   KMP_DEBUG_ASSERT(**ptr == '%');
8298   KMP_DEBUG_ASSERT(field_buffer);
8299 
8300   __kmp_str_buf_clear(field_buffer);
8301 
8302   // Skip the initial %
8303   (*ptr)++;
8304 
8305   // Check for %% first
8306   if (**ptr == '%') {
8307     __kmp_str_buf_cat(field_buffer, "%", 1);
8308     (*ptr)++; // skip over the second %
8309     return 1;
8310   }
8311 
8312   // Parse field modifiers if they are present
8313   pad_zeros = false;
8314   if (**ptr == '0') {
8315     pad_zeros = true;
8316     (*ptr)++; // skip over 0
8317   }
8318   right_justify = false;
8319   if (**ptr == '.') {
8320     right_justify = true;
8321     (*ptr)++; // skip over .
8322   }
8323   // Parse width of field: [width_left, width_right)
8324   width_left = width_right = NULL;
8325   if (**ptr >= '0' && **ptr <= '9') {
8326     width_left = *ptr;
8327     SKIP_DIGITS(*ptr);
8328     width_right = *ptr;
8329   }
8330 
8331   // Create the format for KMP_SNPRINTF based on flags parsed above
8332   format_index = 0;
8333   format[format_index++] = '%';
8334   if (!right_justify)
8335     format[format_index++] = '-';
8336   if (pad_zeros)
8337     format[format_index++] = '0';
8338   if (width_left && width_right) {
8339     int i = 0;
8340     // Only allow 8 digit number widths.
8341     // This also prevents overflowing format variable
8342     while (i < 8 && width_left < width_right) {
8343       format[format_index++] = *width_left;
8344       width_left++;
8345       i++;
8346     }
8347   }
8348 
8349   // Parse a name (long or short)
8350   // Canonicalize the name into absolute_short_name
8351   found_valid_name = false;
8352   parse_long_name = (**ptr == '{');
8353   if (parse_long_name)
8354     (*ptr)++; // skip initial left brace
8355   for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8356                              sizeof(__kmp_affinity_format_table[0]);
8357        ++i) {
8358     char short_name = __kmp_affinity_format_table[i].short_name;
8359     const char *long_name = __kmp_affinity_format_table[i].long_name;
8360     char field_format = __kmp_affinity_format_table[i].field_format;
8361     if (parse_long_name) {
8362       size_t length = KMP_STRLEN(long_name);
8363       if (strncmp(*ptr, long_name, length) == 0) {
8364         found_valid_name = true;
8365         (*ptr) += length; // skip the long name
8366       }
8367     } else if (**ptr == short_name) {
8368       found_valid_name = true;
8369       (*ptr)++; // skip the short name
8370     }
8371     if (found_valid_name) {
8372       format[format_index++] = field_format;
8373       format[format_index++] = '\0';
8374       absolute_short_name = short_name;
8375       break;
8376     }
8377   }
8378   if (parse_long_name) {
8379     if (**ptr != '}') {
8380       absolute_short_name = 0;
8381     } else {
8382       (*ptr)++; // skip over the right brace
8383     }
8384   }
8385 
8386   // Attempt to fill the buffer with the requested
8387   // value using snprintf within __kmp_str_buf_print()
8388   switch (absolute_short_name) {
8389   case 't':
8390     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8391     break;
8392   case 'T':
8393     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8394     break;
8395   case 'L':
8396     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8397     break;
8398   case 'n':
8399     rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8400     break;
8401   case 'H': {
8402     static const int BUFFER_SIZE = 256;
8403     char buf[BUFFER_SIZE];
8404     __kmp_expand_host_name(buf, BUFFER_SIZE);
8405     rc = __kmp_str_buf_print(field_buffer, format, buf);
8406   } break;
8407   case 'P':
8408     rc = __kmp_str_buf_print(field_buffer, format, getpid());
8409     break;
8410   case 'i':
8411     rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8412     break;
8413   case 'N':
8414     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8415     break;
8416   case 'a':
8417     field_value =
8418         __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8419     rc = __kmp_str_buf_print(field_buffer, format, field_value);
8420     break;
8421 #if KMP_AFFINITY_SUPPORTED
8422   case 'A': {
8423     kmp_str_buf_t buf;
8424     __kmp_str_buf_init(&buf);
8425     __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8426     rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8427     __kmp_str_buf_free(&buf);
8428   } break;
8429 #endif
8430   default:
8431     // According to spec, If an implementation does not have info for field
8432     // type, then "undefined" is printed
8433     rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8434     // Skip the field
8435     if (parse_long_name) {
8436       SKIP_TOKEN(*ptr);
8437       if (**ptr == '}')
8438         (*ptr)++;
8439     } else {
8440       (*ptr)++;
8441     }
8442   }
8443 
8444   KMP_ASSERT(format_index <= FORMAT_SIZE);
8445   return rc;
8446 }
8447 
8448 /*
8449  * Return number of characters needed to hold the affinity string
8450  * (not including null byte character)
8451  * The resultant string is printed to buffer, which the caller can then
8452  * handle afterwards
8453  */
8454 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8455                                   kmp_str_buf_t *buffer) {
8456   const char *parse_ptr;
8457   size_t retval;
8458   const kmp_info_t *th;
8459   kmp_str_buf_t field;
8460 
8461   KMP_DEBUG_ASSERT(buffer);
8462   KMP_DEBUG_ASSERT(gtid >= 0);
8463 
8464   __kmp_str_buf_init(&field);
8465   __kmp_str_buf_clear(buffer);
8466 
8467   th = __kmp_threads[gtid];
8468   retval = 0;
8469 
8470   // If format is NULL or zero-length string, then we use
8471   // affinity-format-var ICV
8472   parse_ptr = format;
8473   if (parse_ptr == NULL || *parse_ptr == '\0') {
8474     parse_ptr = __kmp_affinity_format;
8475   }
8476   KMP_DEBUG_ASSERT(parse_ptr);
8477 
8478   while (*parse_ptr != '\0') {
8479     // Parse a field
8480     if (*parse_ptr == '%') {
8481       // Put field in the buffer
8482       int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8483       __kmp_str_buf_catbuf(buffer, &field);
8484       retval += rc;
8485     } else {
8486       // Put literal character in buffer
8487       __kmp_str_buf_cat(buffer, parse_ptr, 1);
8488       retval++;
8489       parse_ptr++;
8490     }
8491   }
8492   __kmp_str_buf_free(&field);
8493   return retval;
8494 }
8495 
8496 // Displays the affinity string to stdout
8497 void __kmp_aux_display_affinity(int gtid, const char *format) {
8498   kmp_str_buf_t buf;
8499   __kmp_str_buf_init(&buf);
8500   __kmp_aux_capture_affinity(gtid, format, &buf);
8501   __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8502   __kmp_str_buf_free(&buf);
8503 }
8504 
8505 /* ------------------------------------------------------------------------ */
8506 
8507 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8508   int blocktime = arg; /* argument is in milliseconds */
8509 #if KMP_USE_MONITOR
8510   int bt_intervals;
8511 #endif
8512   kmp_int8 bt_set;
8513 
8514   __kmp_save_internal_controls(thread);
8515 
8516   /* Normalize and set blocktime for the teams */
8517   if (blocktime < KMP_MIN_BLOCKTIME)
8518     blocktime = KMP_MIN_BLOCKTIME;
8519   else if (blocktime > KMP_MAX_BLOCKTIME)
8520     blocktime = KMP_MAX_BLOCKTIME;
8521 
8522   set__blocktime_team(thread->th.th_team, tid, blocktime);
8523   set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8524 
8525 #if KMP_USE_MONITOR
8526   /* Calculate and set blocktime intervals for the teams */
8527   bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8528 
8529   set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8530   set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8531 #endif
8532 
8533   /* Set whether blocktime has been set to "TRUE" */
8534   bt_set = TRUE;
8535 
8536   set__bt_set_team(thread->th.th_team, tid, bt_set);
8537   set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8538 #if KMP_USE_MONITOR
8539   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8540                 "bt_intervals=%d, monitor_updates=%d\n",
8541                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8542                 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8543                 __kmp_monitor_wakeups));
8544 #else
8545   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8546                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8547                 thread->th.th_team->t.t_id, tid, blocktime));
8548 #endif
8549 }
8550 
8551 void __kmp_aux_set_defaults(char const *str, size_t len) {
8552   if (!__kmp_init_serial) {
8553     __kmp_serial_initialize();
8554   }
8555   __kmp_env_initialize(str);
8556 
8557   if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8558     __kmp_env_print();
8559   }
8560 } // __kmp_aux_set_defaults
8561 
8562 /* ------------------------------------------------------------------------ */
8563 /* internal fast reduction routines */
8564 
8565 PACKED_REDUCTION_METHOD_T
8566 __kmp_determine_reduction_method(
8567     ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8568     void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8569     kmp_critical_name *lck) {
8570 
8571   // Default reduction method: critical construct ( lck != NULL, like in current
8572   // PAROPT )
8573   // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8574   // can be selected by RTL
8575   // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8576   // can be selected by RTL
8577   // Finally, it's up to OpenMP RTL to make a decision on which method to select
8578   // among generated by PAROPT.
8579 
8580   PACKED_REDUCTION_METHOD_T retval;
8581 
8582   int team_size;
8583 
8584   KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8585   KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8586 
8587 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
8588   ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
8589 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8590 
8591   retval = critical_reduce_block;
8592 
8593   // another choice of getting a team size (with 1 dynamic deference) is slower
8594   team_size = __kmp_get_team_num_threads(global_tid);
8595   if (team_size == 1) {
8596 
8597     retval = empty_reduce_block;
8598 
8599   } else {
8600 
8601     int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8602 
8603 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 ||                   \
8604     KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64
8605 
8606 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
8607     KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8608 
8609     int teamsize_cutoff = 4;
8610 
8611 #if KMP_MIC_SUPPORTED
8612     if (__kmp_mic_type != non_mic) {
8613       teamsize_cutoff = 8;
8614     }
8615 #endif
8616     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8617     if (tree_available) {
8618       if (team_size <= teamsize_cutoff) {
8619         if (atomic_available) {
8620           retval = atomic_reduce_block;
8621         }
8622       } else {
8623         retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8624       }
8625     } else if (atomic_available) {
8626       retval = atomic_reduce_block;
8627     }
8628 #else
8629 #error "Unknown or unsupported OS"
8630 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8631        // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8632 
8633 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8634 
8635 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD
8636 
8637     // basic tuning
8638 
8639     if (atomic_available) {
8640       if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8641         retval = atomic_reduce_block;
8642       }
8643     } // otherwise: use critical section
8644 
8645 #elif KMP_OS_DARWIN
8646 
8647     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8648     if (atomic_available && (num_vars <= 3)) {
8649       retval = atomic_reduce_block;
8650     } else if (tree_available) {
8651       if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8652           (reduce_size < (2000 * sizeof(kmp_real64)))) {
8653         retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8654       }
8655     } // otherwise: use critical section
8656 
8657 #else
8658 #error "Unknown or unsupported OS"
8659 #endif
8660 
8661 #else
8662 #error "Unknown or unsupported architecture"
8663 #endif
8664   }
8665 
8666   // KMP_FORCE_REDUCTION
8667 
8668   // If the team is serialized (team_size == 1), ignore the forced reduction
8669   // method and stay with the unsynchronized method (empty_reduce_block)
8670   if (__kmp_force_reduction_method != reduction_method_not_defined &&
8671       team_size != 1) {
8672 
8673     PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8674 
8675     int atomic_available, tree_available;
8676 
8677     switch ((forced_retval = __kmp_force_reduction_method)) {
8678     case critical_reduce_block:
8679       KMP_ASSERT(lck); // lck should be != 0
8680       break;
8681 
8682     case atomic_reduce_block:
8683       atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8684       if (!atomic_available) {
8685         KMP_WARNING(RedMethodNotSupported, "atomic");
8686         forced_retval = critical_reduce_block;
8687       }
8688       break;
8689 
8690     case tree_reduce_block:
8691       tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8692       if (!tree_available) {
8693         KMP_WARNING(RedMethodNotSupported, "tree");
8694         forced_retval = critical_reduce_block;
8695       } else {
8696 #if KMP_FAST_REDUCTION_BARRIER
8697         forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8698 #endif
8699       }
8700       break;
8701 
8702     default:
8703       KMP_ASSERT(0); // "unsupported method specified"
8704     }
8705 
8706     retval = forced_retval;
8707   }
8708 
8709   KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8710 
8711 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
8712 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8713 
8714   return (retval);
8715 }
8716 // this function is for testing set/get/determine reduce method
8717 kmp_int32 __kmp_get_reduce_method(void) {
8718   return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8719 }
8720 
8721 // Soft pause sets up threads to ignore blocktime and just go to sleep.
8722 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
8723 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8724 
8725 // Hard pause shuts down the runtime completely.  Resume happens naturally when
8726 // OpenMP is used subsequently.
8727 void __kmp_hard_pause() {
8728   __kmp_pause_status = kmp_hard_paused;
8729   __kmp_internal_end_thread(-1);
8730 }
8731 
8732 // Soft resume sets __kmp_pause_status, and wakes up all threads.
8733 void __kmp_resume_if_soft_paused() {
8734   if (__kmp_pause_status == kmp_soft_paused) {
8735     __kmp_pause_status = kmp_not_paused;
8736 
8737     for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8738       kmp_info_t *thread = __kmp_threads[gtid];
8739       if (thread) { // Wake it if sleeping
8740         kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
8741                          thread);
8742         if (fl.is_sleeping())
8743           fl.resume(gtid);
8744         else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8745           __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8746         } else { // thread holds the lock and may sleep soon
8747           do { // until either the thread sleeps, or we can get the lock
8748             if (fl.is_sleeping()) {
8749               fl.resume(gtid);
8750               break;
8751             } else if (__kmp_try_suspend_mx(thread)) {
8752               __kmp_unlock_suspend_mx(thread);
8753               break;
8754             }
8755           } while (1);
8756         }
8757       }
8758     }
8759   }
8760 }
8761 
8762 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
8763 // TODO: add warning messages
8764 int __kmp_pause_resource(kmp_pause_status_t level) {
8765   if (level == kmp_not_paused) { // requesting resume
8766     if (__kmp_pause_status == kmp_not_paused) {
8767       // error message about runtime not being paused, so can't resume
8768       return 1;
8769     } else {
8770       KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
8771                        __kmp_pause_status == kmp_hard_paused);
8772       __kmp_pause_status = kmp_not_paused;
8773       return 0;
8774     }
8775   } else if (level == kmp_soft_paused) { // requesting soft pause
8776     if (__kmp_pause_status != kmp_not_paused) {
8777       // error message about already being paused
8778       return 1;
8779     } else {
8780       __kmp_soft_pause();
8781       return 0;
8782     }
8783   } else if (level == kmp_hard_paused) { // requesting hard pause
8784     if (__kmp_pause_status != kmp_not_paused) {
8785       // error message about already being paused
8786       return 1;
8787     } else {
8788       __kmp_hard_pause();
8789       return 0;
8790     }
8791   } else {
8792     // error message about invalid level
8793     return 1;
8794   }
8795 }
8796 
8797 void __kmp_omp_display_env(int verbose) {
8798   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8799   if (__kmp_init_serial == 0)
8800     __kmp_do_serial_initialize();
8801   __kmp_display_env_impl(!verbose, verbose);
8802   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8803 }
8804 
8805 // The team size is changing, so distributed barrier must be modified
8806 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
8807                                int new_nthreads) {
8808   KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
8809                    bp_dist_bar);
8810   kmp_info_t **other_threads = team->t.t_threads;
8811 
8812   // We want all the workers to stop waiting on the barrier while we adjust the
8813   // size of the team.
8814   for (int f = 1; f < old_nthreads; ++f) {
8815     KMP_DEBUG_ASSERT(other_threads[f] != NULL);
8816     // Ignore threads that are already inactive or not present in the team
8817     if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) {
8818       // teams construct causes thread_limit to get passed in, and some of
8819       // those could be inactive; just ignore them
8820       continue;
8821     }
8822     // If thread is transitioning still to in_use state, wait for it
8823     if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) {
8824       while (team->t.t_threads[f]->th.th_used_in_team.load() == 3)
8825         KMP_CPU_PAUSE();
8826     }
8827     // The thread should be in_use now
8828     KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1);
8829     // Transition to unused state
8830     team->t.t_threads[f]->th.th_used_in_team.store(2);
8831     KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2);
8832   }
8833   // Release all the workers
8834   kmp_uint64 new_value; // new value for go
8835   new_value = team->t.b->go_release();
8836 
8837   KMP_MFENCE();
8838 
8839   // Workers should see transition status 2 and move to 0; but may need to be
8840   // woken up first
8841   size_t my_go_index;
8842   int count = old_nthreads - 1;
8843   while (count > 0) {
8844     count = old_nthreads - 1;
8845     for (int f = 1; f < old_nthreads; ++f) {
8846       my_go_index = f / team->t.b->threads_per_go;
8847       if (other_threads[f]->th.th_used_in_team.load() != 0) {
8848         if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers
8849           kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST(
8850               void *, other_threads[f]->th.th_sleep_loc);
8851           __kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag);
8852         }
8853       } else {
8854         KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0);
8855         count--;
8856       }
8857     }
8858   }
8859   // Now update the barrier size
8860   team->t.b->update_num_threads(new_nthreads);
8861   team->t.b->go_reset();
8862 }
8863 
8864 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) {
8865   // Add the threads back to the team
8866   KMP_DEBUG_ASSERT(team);
8867   // Threads were paused and pointed at th_used_in_team temporarily during a
8868   // resize of the team. We're going to set th_used_in_team to 3 to indicate to
8869   // the thread that it should transition itself back into the team. Then, if
8870   // blocktime isn't infinite, the thread could be sleeping, so we send a resume
8871   // to wake it up.
8872   for (int f = 1; f < new_nthreads; ++f) {
8873     KMP_DEBUG_ASSERT(team->t.t_threads[f]);
8874     KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 0,
8875                                 3);
8876     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads
8877       __kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid,
8878                       (kmp_flag_32<false, false> *)NULL);
8879     }
8880   }
8881   // The threads should be transitioning to the team; when they are done, they
8882   // should have set th_used_in_team to 1. This loop forces master to wait until
8883   // all threads have moved into the team and are waiting in the barrier.
8884   int count = new_nthreads - 1;
8885   while (count > 0) {
8886     count = new_nthreads - 1;
8887     for (int f = 1; f < new_nthreads; ++f) {
8888       if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) {
8889         count--;
8890       }
8891     }
8892   }
8893 }
8894 
8895 // Globals and functions for hidden helper task
8896 kmp_info_t **__kmp_hidden_helper_threads;
8897 kmp_info_t *__kmp_hidden_helper_main_thread;
8898 kmp_int32 __kmp_hidden_helper_threads_num = 8;
8899 std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
8900 #if KMP_OS_LINUX
8901 kmp_int32 __kmp_enable_hidden_helper = TRUE;
8902 #else
8903 kmp_int32 __kmp_enable_hidden_helper = FALSE;
8904 #endif
8905 
8906 namespace {
8907 std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
8908 
8909 void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
8910   // This is an explicit synchronization on all hidden helper threads in case
8911   // that when a regular thread pushes a hidden helper task to one hidden
8912   // helper thread, the thread has not been awaken once since they're released
8913   // by the main thread after creating the team.
8914   KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
8915   while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
8916          __kmp_hidden_helper_threads_num)
8917     ;
8918 
8919   // If main thread, then wait for signal
8920   if (__kmpc_master(nullptr, *gtid)) {
8921     // First, unset the initial state and release the initial thread
8922     TCW_4(__kmp_init_hidden_helper_threads, FALSE);
8923     __kmp_hidden_helper_initz_release();
8924     __kmp_hidden_helper_main_thread_wait();
8925     // Now wake up all worker threads
8926     for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
8927       __kmp_hidden_helper_worker_thread_signal();
8928     }
8929   }
8930 }
8931 } // namespace
8932 
8933 void __kmp_hidden_helper_threads_initz_routine() {
8934   // Create a new root for hidden helper team/threads
8935   const int gtid = __kmp_register_root(TRUE);
8936   __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
8937   __kmp_hidden_helper_threads = &__kmp_threads[gtid];
8938   __kmp_hidden_helper_main_thread->th.th_set_nproc =
8939       __kmp_hidden_helper_threads_num;
8940 
8941   KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
8942 
8943   __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
8944 
8945   // Set the initialization flag to FALSE
8946   TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
8947 
8948   __kmp_hidden_helper_threads_deinitz_release();
8949 }
8950 
8951 /* Nesting Mode:
8952    Set via KMP_NESTING_MODE, which takes an integer.
8953    Note: we skip duplicate topology levels, and skip levels with only
8954       one entity.
8955    KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
8956    KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
8957       in the topology, and initializes the number of threads at each of those
8958       levels to the number of entities at each level, respectively, below the
8959       entity at the parent level.
8960    KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
8961       but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
8962       the user to turn nesting on explicitly. This is an even more experimental
8963       option to this experimental feature, and may change or go away in the
8964       future.
8965 */
8966 
8967 // Allocate space to store nesting levels
8968 void __kmp_init_nesting_mode() {
8969   int levels = KMP_HW_LAST;
8970   __kmp_nesting_mode_nlevels = levels;
8971   __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
8972   for (int i = 0; i < levels; ++i)
8973     __kmp_nesting_nth_level[i] = 0;
8974   if (__kmp_nested_nth.size < levels) {
8975     __kmp_nested_nth.nth =
8976         (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
8977     __kmp_nested_nth.size = levels;
8978   }
8979 }
8980 
8981 // Set # threads for top levels of nesting; must be called after topology set
8982 void __kmp_set_nesting_mode_threads() {
8983   kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];
8984 
8985   if (__kmp_nesting_mode == 1)
8986     __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
8987   else if (__kmp_nesting_mode > 1)
8988     __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
8989 
8990   if (__kmp_topology) { // use topology info
8991     int loc, hw_level;
8992     for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
8993                                 loc < __kmp_nesting_mode_nlevels;
8994          loc++, hw_level++) {
8995       __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
8996       if (__kmp_nesting_nth_level[loc] == 1)
8997         loc--;
8998     }
8999     // Make sure all cores are used
9000     if (__kmp_nesting_mode > 1 && loc > 1) {
9001       int core_level = __kmp_topology->get_level(KMP_HW_CORE);
9002       int num_cores = __kmp_topology->get_count(core_level);
9003       int upper_levels = 1;
9004       for (int level = 0; level < loc - 1; ++level)
9005         upper_levels *= __kmp_nesting_nth_level[level];
9006       if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
9007         __kmp_nesting_nth_level[loc - 1] =
9008             num_cores / __kmp_nesting_nth_level[loc - 2];
9009     }
9010     __kmp_nesting_mode_nlevels = loc;
9011     __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9012   } else { // no topology info available; provide a reasonable guesstimation
9013     if (__kmp_avail_proc >= 4) {
9014       __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
9015       __kmp_nesting_nth_level[1] = 2;
9016       __kmp_nesting_mode_nlevels = 2;
9017     } else {
9018       __kmp_nesting_nth_level[0] = __kmp_avail_proc;
9019       __kmp_nesting_mode_nlevels = 1;
9020     }
9021     __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9022   }
9023   for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
9024     __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
9025   }
9026   set__nproc(thread, __kmp_nesting_nth_level[0]);
9027   if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
9028     __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9029   if (get__max_active_levels(thread) > 1) {
9030     // if max levels was set, set nesting mode levels to same
9031     __kmp_nesting_mode_nlevels = get__max_active_levels(thread);
9032   }
9033   if (__kmp_nesting_mode == 1) // turn on nesting for this case only
9034     set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
9035 }
9036