1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
29 #endif
30 
31 #if OMPT_SUPPORT
32 #include "ompt-specific.h"
33 #endif
34 
35 /* these are temporary issues to be dealt with */
36 #define KMP_USE_PRCTL 0
37 
38 #if KMP_OS_WINDOWS
39 #include <process.h>
40 #endif
41 
42 #include "tsan_annotations.h"
43 
44 #if defined(KMP_GOMP_COMPAT)
45 char const __kmp_version_alt_comp[] =
46     KMP_VERSION_PREFIX "alternative compiler support: yes";
47 #endif /* defined(KMP_GOMP_COMPAT) */
48 
49 char const __kmp_version_omp_api[] =
50     KMP_VERSION_PREFIX "API version: 5.0 (201611)";
51 
52 #ifdef KMP_DEBUG
53 char const __kmp_version_lock[] =
54     KMP_VERSION_PREFIX "lock type: run time selectable";
55 #endif /* KMP_DEBUG */
56 
57 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
58 
59 /* ------------------------------------------------------------------------ */
60 
61 #if KMP_USE_MONITOR
62 kmp_info_t __kmp_monitor;
63 #endif
64 
65 /* Forward declarations */
66 
67 void __kmp_cleanup(void);
68 
69 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
70                                   int gtid);
71 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
72                                   kmp_internal_control_t *new_icvs,
73                                   ident_t *loc);
74 #if KMP_AFFINITY_SUPPORTED
75 static void __kmp_partition_places(kmp_team_t *team,
76                                    int update_master_only = 0);
77 #endif
78 static void __kmp_do_serial_initialize(void);
79 void __kmp_fork_barrier(int gtid, int tid);
80 void __kmp_join_barrier(int gtid);
81 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
82                           kmp_internal_control_t *new_icvs, ident_t *loc);
83 
84 #ifdef USE_LOAD_BALANCE
85 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
86 #endif
87 
88 static int __kmp_expand_threads(int nNeed);
89 #if KMP_OS_WINDOWS
90 static int __kmp_unregister_root_other_thread(int gtid);
91 #endif
92 static void __kmp_unregister_library(void); // called by __kmp_internal_end()
93 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
94 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
95 
96 /* Calculate the identifier of the current thread */
97 /* fast (and somewhat portable) way to get a unique identifier for the
98    executing thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
99 int __kmp_get_global_thread_id() {
100   int i;
101   kmp_info_t **other_threads;
102   size_t stack_data;
103   char *stack_addr;
104   size_t stack_size;
105   char *stack_base;
106 
107   KA_TRACE(
108       1000,
109       ("*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
110        __kmp_nth, __kmp_all_nth));
111 
112   /* JPH - To handle the case where __kmpc_end(0) is called immediately prior
113      to a parallel region, this returns KMP_GTID_DNE to force the caller to run
114      serial_initialize. Every call site must handle KMP_GTID_DNE, or else
115      __kmp_init_gtid must be guaranteed for this to work. */
116 
117   if (!TCR_4(__kmp_init_gtid))
118     return KMP_GTID_DNE;
119 
120 #ifdef KMP_TDATA_GTID
121   if (TCR_4(__kmp_gtid_mode) >= 3) {
122     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
123     return __kmp_gtid;
124   }
125 #endif
126   if (TCR_4(__kmp_gtid_mode) >= 2) {
127     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
128     return __kmp_gtid_get_specific();
129   }
130   KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
131 
132   stack_addr = (char *)&stack_data;
133   other_threads = __kmp_threads;
134 
135   /* ATT: The code below is a source of potential bugs due to unsynchronized
136      access to __kmp_threads array. For example:
137      1. Current thread loads other_threads[i] to thr and checks it, it is
138         non-NULL.
139      2. Current thread is suspended by OS.
140      3. Another thread unregisters and finishes (debug versions of free()
141         may fill memory with something like 0xEF).
142      4. Current thread is resumed.
143      5. Current thread reads junk from *thr.
144      TODO: Fix it.  --ln  */
145 
146   for (i = 0; i < __kmp_threads_capacity; i++) {
147 
148     kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
149     if (!thr)
150       continue;
151 
152     stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
153     stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
154 
155     /* stack grows down -- search through all of the active threads */
156 
157     if (stack_addr <= stack_base) {
158       size_t stack_diff = stack_base - stack_addr;
159 
160       if (stack_diff <= stack_size) {
161         /* The only way we can be closer than the allocated */
162         /* stack size is if we are running on this thread. */
163         KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
164         return i;
165       }
166     }
167   }
168 
169   /* get specific to try and determine our gtid */
170   KA_TRACE(1000,
171            ("*** __kmp_get_global_thread_id: internal alg. failed to find "
172             "thread, using TLS\n"));
173   i = __kmp_gtid_get_specific();
174 
175   /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
176 
177   /* if we haven't been assigned a gtid, then return the code */
178   if (i < 0)
179     return i;
180 
181   /* dynamically updated stack window for uber threads to avoid get_specific
182      call */
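  /* If the uber thread's recorded stack window can no longer grow, an address
     outside of it is treated as a stack overflow. */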
183   if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
184     KMP_FATAL(StackOverflow, i);
185   }
186 
187   stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
188   if (stack_addr > stack_base) {
189     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
190     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
191             other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
192                 stack_base);
193   } else {
194     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
195             stack_base - stack_addr);
196   }
197 
198   /* Reprint stack bounds for ubermaster since they have been refined */
199   if (__kmp_storage_map) {
200     char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
201     char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
202     __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
203                                  other_threads[i]->th.th_info.ds.ds_stacksize,
204                                  "th_%d stack (refinement)", i);
205   }
206   return i;
207 }
208 
209 int __kmp_get_global_thread_id_reg() {
210   int gtid;
211 
212   if (!__kmp_init_serial) {
213     gtid = KMP_GTID_DNE;
214   } else
215 #ifdef KMP_TDATA_GTID
216       if (TCR_4(__kmp_gtid_mode) >= 3) {
217     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
218     gtid = __kmp_gtid;
219   } else
220 #endif
221       if (TCR_4(__kmp_gtid_mode) >= 2) {
222     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
223     gtid = __kmp_gtid_get_specific();
224   } else {
225     KA_TRACE(1000,
226              ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
227     gtid = __kmp_get_global_thread_id();
228   }
229 
230   /* we must be a new uber master sibling thread */
231   if (gtid == KMP_GTID_DNE) {
232     KA_TRACE(10,
233              ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
234               "Registering a new gtid.\n"));
235     __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
236     if (!__kmp_init_serial) {
237       __kmp_do_serial_initialize();
238       gtid = __kmp_gtid_get_specific();
239     } else {
240       gtid = __kmp_register_root(FALSE);
241     }
242     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
243     /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
244   }
245 
246   KMP_DEBUG_ASSERT(gtid >= 0);
247 
248   return gtid;
249 }
250 
251 /* caller must hold forkjoin_lock */
252 void __kmp_check_stack_overlap(kmp_info_t *th) {
253   int f;
254   char *stack_beg = NULL;
255   char *stack_end = NULL;
256   int gtid;
257 
258   KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
259   if (__kmp_storage_map) {
260     stack_end = (char *)th->th.th_info.ds.ds_stackbase;
261     stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
262 
263     gtid = __kmp_gtid_from_thread(th);
264 
265     if (gtid == KMP_GTID_MONITOR) {
266       __kmp_print_storage_map_gtid(
267           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
268           "th_%s stack (%s)", "mon",
269           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
270     } else {
271       __kmp_print_storage_map_gtid(
272           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
273           "th_%d stack (%s)", gtid,
274           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
275     }
276   }
277 
278   /* No point in checking ubermaster threads since they use refinement and
279    * cannot overlap */
280   gtid = __kmp_gtid_from_thread(th);
281   if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
282     KA_TRACE(10,
283              ("__kmp_check_stack_overlap: performing extensive checking\n"));
284     if (stack_beg == NULL) {
285       stack_end = (char *)th->th.th_info.ds.ds_stackbase;
286       stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
287     }
288 
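    // Compare this thread's stack bounds against every other registered
    // thread's stack; overlapping ranges are a fatal error.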
289     for (f = 0; f < __kmp_threads_capacity; f++) {
290       kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
291 
292       if (f_th && f_th != th) {
293         char *other_stack_end =
294             (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
295         char *other_stack_beg =
296             other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
297         if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
298             (stack_end > other_stack_beg && stack_end < other_stack_end)) {
299 
300           /* Print the other stack values before the abort */
301           if (__kmp_storage_map)
302             __kmp_print_storage_map_gtid(
303                 -1, other_stack_beg, other_stack_end,
304                 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
305                 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
306 
307           __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
308                       __kmp_msg_null);
309         }
310       }
311     }
312   }
313   KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
314 }
315 
316 /* ------------------------------------------------------------------------ */
317 
318 void __kmp_infinite_loop(void) {
319   static int done = FALSE;
320 
321   while (!done) {
322     KMP_YIELD(TRUE);
323   }
324 }
325 
326 #define MAX_MESSAGE 512
327 
328 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
329                                   char const *format, ...) {
330   char buffer[MAX_MESSAGE];
331   va_list ap;
332 
333   va_start(ap, format);
334   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
335                p2, (unsigned long)size, format);
336   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
337   __kmp_vprintf(kmp_err, buffer, ap);
338 #if KMP_PRINT_DATA_PLACEMENT
339   int node;
340   if (gtid >= 0) {
341     if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
342       if (__kmp_storage_map_verbose) {
343         node = __kmp_get_host_node(p1);
344         if (node < 0) /* doesn't work, so don't try this next time */
345           __kmp_storage_map_verbose = FALSE;
346         else {
347           char *last;
348           int lastNode;
349           int localProc = __kmp_get_cpu_from_gtid(gtid);
350 
351           const int page_size = KMP_GET_PAGE_SIZE();
352 
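          // Round p1 down to its page boundary and p2-1 down to its page
          // boundary so that whole pages are reported below.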
353           p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
354           p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
355           if (localProc >= 0)
356             __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
357                                  localProc >> 1);
358           else
359             __kmp_printf_no_lock("  GTID %d\n", gtid);
360 #if KMP_USE_PRCTL
361           /* The more elaborate format is disabled for now because of the prctl
362            * hanging bug. */
363           do {
364             last = p1;
365             lastNode = node;
366             /* This loop collates adjacent pages with the same host node. */
367             do {
368               p1 = (char *)p1 + page_size;
369             } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
370             __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
371                                  lastNode);
372           } while (p1 <= p2);
373 #else
374           __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
375                                (char *)p1 + (page_size - 1),
376                                __kmp_get_host_node(p1));
377           if (p1 < p2) {
378             __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
379                                  (char *)p2 + (page_size - 1),
380                                  __kmp_get_host_node(p2));
381           }
382 #endif
383         }
384       }
385     } else
386       __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
387   }
388 #endif /* KMP_PRINT_DATA_PLACEMENT */
389   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
  va_end(ap);
390 }
391 
392 void __kmp_warn(char const *format, ...) {
393   char buffer[MAX_MESSAGE];
394   va_list ap;
395 
396   if (__kmp_generate_warnings == kmp_warnings_off) {
397     return;
398   }
399 
400   va_start(ap, format);
401 
402   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
403   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
404   __kmp_vprintf(kmp_err, buffer, ap);
405   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
406 
407   va_end(ap);
408 }
409 
410 void __kmp_abort_process() {
411   // Later threads may stall here, but that's ok because abort() will kill them.
412   __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
413 
414   if (__kmp_debug_buf) {
415     __kmp_dump_debug_buffer();
416   }
417 
418   if (KMP_OS_WINDOWS) {
419     // Let other threads know of abnormal termination and prevent deadlock
420     // if abort happened during library initialization or shutdown
421     __kmp_global.g.g_abort = SIGABRT;
422 
423     /* On Windows* OS, abort() by default causes a pop-up error box, which
424        stalls nightly testing. Unfortunately, we cannot reliably suppress these
425        pop-up error boxes. _set_abort_behavior() works well, but this function
426        is not available in VS7 (this is not a problem for a DLL, but it is a
427        problem for a static OpenMP RTL). SetErrorMode (and so, the timelimit
428        utility) does not help, at least in some versions of the MS C RTL.
429 
430        It seems the following sequence is the only way to simulate abort() and
431        avoid the pop-up error box. */
432     raise(SIGABRT);
433     _exit(3); // Just in case, if signal ignored, exit anyway.
434   } else {
435     abort();
436   }
437 
438   __kmp_infinite_loop();
439   __kmp_release_bootstrap_lock(&__kmp_exit_lock);
440 
441 } // __kmp_abort_process
442 
443 void __kmp_abort_thread(void) {
444   // TODO: Eliminate g_abort global variable and this function.
445   // In case of abort just call abort(), it will kill all the threads.
446   __kmp_infinite_loop();
447 } // __kmp_abort_thread
448 
449 /* Print out the storage map for the major kmp_info_t thread data structures
450    that are allocated together. */
451 
452 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
453   __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
454                                gtid);
455 
456   __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
457                                sizeof(kmp_desc_t), "th_%d.th_info", gtid);
458 
459   __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
460                                sizeof(kmp_local_t), "th_%d.th_local", gtid);
461 
462   __kmp_print_storage_map_gtid(
463       gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
464       sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
465 
466   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
467                                &thr->th.th_bar[bs_plain_barrier + 1],
468                                sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
469                                gtid);
470 
471   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
472                                &thr->th.th_bar[bs_forkjoin_barrier + 1],
473                                sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
474                                gtid);
475 
476 #if KMP_FAST_REDUCTION_BARRIER
477   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
478                                &thr->th.th_bar[bs_reduction_barrier + 1],
479                                sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
480                                gtid);
481 #endif // KMP_FAST_REDUCTION_BARRIER
482 }
483 
484 /* Print out the storage map for the major kmp_team_t team data structures
485    that are allocated together. */
486 
487 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
488                                          int team_id, int num_thr) {
489   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
490   __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
491                                header, team_id);
492 
493   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
494                                &team->t.t_bar[bs_last_barrier],
495                                sizeof(kmp_balign_team_t) * bs_last_barrier,
496                                "%s_%d.t_bar", header, team_id);
497 
498   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
499                                &team->t.t_bar[bs_plain_barrier + 1],
500                                sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
501                                header, team_id);
502 
503   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
504                                &team->t.t_bar[bs_forkjoin_barrier + 1],
505                                sizeof(kmp_balign_team_t),
506                                "%s_%d.t_bar[forkjoin]", header, team_id);
507 
508 #if KMP_FAST_REDUCTION_BARRIER
509   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
510                                &team->t.t_bar[bs_reduction_barrier + 1],
511                                sizeof(kmp_balign_team_t),
512                                "%s_%d.t_bar[reduction]", header, team_id);
513 #endif // KMP_FAST_REDUCTION_BARRIER
514 
515   __kmp_print_storage_map_gtid(
516       -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
517       sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
518 
519   __kmp_print_storage_map_gtid(
520       -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
521       sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
522 
523   __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
524                                &team->t.t_disp_buffer[num_disp_buff],
525                                sizeof(dispatch_shared_info_t) * num_disp_buff,
526                                "%s_%d.t_disp_buffer", header, team_id);
527 }
528 
529 static void __kmp_init_allocator() { __kmp_init_memkind(); }
530 static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
531 
532 /* ------------------------------------------------------------------------ */
533 
534 #if KMP_DYNAMIC_LIB
535 #if KMP_OS_WINDOWS
536 
537 static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
538   // TODO: Change to __kmp_break_bootstrap_lock().
539   __kmp_init_bootstrap_lock(lck); // make the lock released
540 }
541 
542 static void __kmp_reset_locks_on_process_detach(int gtid_req) {
543   int i;
544   int thread_count;
545 
546   // PROCESS_DETACH is expected to be called by a thread that executes
547   // ProcessExit() or FreeLibrary(). The OS terminates the other threads (except
548   // the one calling ProcessExit or FreeLibrary), so it might seem safe to
549   // access __kmp_threads[] without taking the forkjoin_lock. In fact, however,
550   // some threads can still be alive here, although they are about to be
551   // terminated. The threads in the array with ds_thread==0 are the most
552   // suspicious, so it may actually be unsafe to access __kmp_threads[].
553 
554   // TODO: does it make sense to check __kmp_roots[] ?
555 
556   // Check that there are no other live threads registered with the OpenMP
557   // library.
558   while (1) {
559     thread_count = 0;
560     for (i = 0; i < __kmp_threads_capacity; ++i) {
561       if (!__kmp_threads)
562         continue;
563       kmp_info_t *th = __kmp_threads[i];
564       if (th == NULL)
565         continue;
566       int gtid = th->th.th_info.ds.ds_gtid;
567       if (gtid == gtid_req)
568         continue;
569       if (gtid < 0)
570         continue;
571       DWORD exit_val;
572       int alive = __kmp_is_thread_alive(th, &exit_val);
573       if (alive) {
574         ++thread_count;
575       }
576     }
577     if (thread_count == 0)
578       break; // success
579   }
580 
581   // Assume that I'm alone. Now it might be safe to check and reset locks.
582   // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
583   __kmp_reset_lock(&__kmp_forkjoin_lock);
584 #ifdef KMP_DEBUG
585   __kmp_reset_lock(&__kmp_stdio_lock);
586 #endif // KMP_DEBUG
587 }
588 
589 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
590   //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
591 
592   switch (fdwReason) {
593 
594   case DLL_PROCESS_ATTACH:
595     KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
596 
597     return TRUE;
598 
599   case DLL_PROCESS_DETACH:
600     KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
601 
602     if (lpReserved != NULL) {
603       // lpReserved is used for telling the difference:
604       //   lpReserved == NULL when FreeLibrary() was called,
605       //   lpReserved != NULL when the process terminates.
606       // When FreeLibrary() is called, worker threads remain alive. So they will
607       // release the forkjoin lock by themselves. When the process terminates,
608       // worker threads disappear triggering the problem of unreleased forkjoin
609       // lock as described below.
610 
611       // A worker thread can take the forkjoin lock. The problem comes up if
612       // that worker thread becomes dead before it releases the forkjoin lock.
613       // The forkjoin lock remains taken, while the thread executing
614       // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try
615       // to take the forkjoin lock and will always fail, so that the application
616       // will never finish [normally]. This scenario is possible if
617       // __kmpc_end() has not been executed. This is not just a corner case;
618       // rather, it covers several common cases:
619       // - the main function was compiled by an alternative compiler;
620       // - the main function was compiled by icl but without /Qopenmp
621       //   (application with plugins);
622       // - application terminates by calling C exit(), Fortran CALL EXIT() or
623       //   Fortran STOP.
624       // - alive foreign thread prevented __kmpc_end from doing cleanup.
625       //
626       // This is a hack to work around the problem.
627       // TODO: !!! figure out something better.
628       __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
629     }
630 
631     __kmp_internal_end_library(__kmp_gtid_get_specific());
632 
633     return TRUE;
634 
635   case DLL_THREAD_ATTACH:
636     KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
637 
638     /* if we want to register new sibling threads all the time, call
639      * __kmp_get_gtid() here */
640     return TRUE;
641 
642   case DLL_THREAD_DETACH:
643     KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
644 
645     __kmp_internal_end_thread(__kmp_gtid_get_specific());
646     return TRUE;
647   }
648 
649   return TRUE;
650 }
651 
652 #endif /* KMP_OS_WINDOWS */
653 #endif /* KMP_DYNAMIC_LIB */
654 
655 /* __kmp_parallel_deo -- Wait until it's our turn. */
656 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
657   int gtid = *gtid_ref;
658 #ifdef BUILD_PARALLEL_ORDERED
659   kmp_team_t *team = __kmp_team_from_gtid(gtid);
660 #endif /* BUILD_PARALLEL_ORDERED */
661 
662   if (__kmp_env_consistency_check) {
663     if (__kmp_threads[gtid]->th.th_root->r.r_active)
664 #if KMP_USE_DYNAMIC_LOCK
665       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
666 #else
667       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
668 #endif
669   }
670 #ifdef BUILD_PARALLEL_ORDERED
671   if (!team->t.t_serialized) {
672     KMP_MB();
673     KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
674              NULL);
675     KMP_MB();
676   }
677 #endif /* BUILD_PARALLEL_ORDERED */
678 }
679 
680 /* __kmp_parallel_dxo -- Signal the next task. */
681 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
682   int gtid = *gtid_ref;
683 #ifdef BUILD_PARALLEL_ORDERED
684   int tid = __kmp_tid_from_gtid(gtid);
685   kmp_team_t *team = __kmp_team_from_gtid(gtid);
686 #endif /* BUILD_PARALLEL_ORDERED */
687 
688   if (__kmp_env_consistency_check) {
689     if (__kmp_threads[gtid]->th.th_root->r.r_active)
690       __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
691   }
692 #ifdef BUILD_PARALLEL_ORDERED
693   if (!team->t.t_serialized) {
694     KMP_MB(); /* Flush all pending memory write invalidates.  */
695 
696     /* use the tid of the next thread in this team */
697     /* TODO replace with general release procedure */
698     team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
699 
700     KMP_MB(); /* Flush all pending memory write invalidates.  */
701   }
702 #endif /* BUILD_PARALLEL_ORDERED */
703 }
704 
705 /* ------------------------------------------------------------------------ */
706 /* The BARRIER for a SINGLE process section is always explicit   */
707 
708 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
709   int status;
710   kmp_info_t *th;
711   kmp_team_t *team;
712 
713   if (!TCR_4(__kmp_init_parallel))
714     __kmp_parallel_initialize();
715   __kmp_resume_if_soft_paused();
716 
717   th = __kmp_threads[gtid];
718   team = th->th.th_team;
719   status = 0;
720 
721   th->th.th_ident = id_ref;
722 
723   if (team->t.t_serialized) {
724     status = 1;
725   } else {
726     kmp_int32 old_this = th->th.th_local.this_construct;
727 
728     ++th->th.th_local.this_construct;
729     /* try to set team count to thread count--success means thread got the
730        single block */
731     /* TODO: Should this be acquire or release? */
732     if (team->t.t_construct == old_this) {
733       status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
734                                               th->th.th_local.this_construct);
735     }
736 #if USE_ITT_BUILD
737     if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
738         KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
739         team->t.t_active_level ==
740             1) { // Only report metadata by master of active team at level 1
741       __kmp_itt_metadata_single(id_ref);
742     }
743 #endif /* USE_ITT_BUILD */
744   }
745 
746   if (__kmp_env_consistency_check) {
747     if (status && push_ws) {
748       __kmp_push_workshare(gtid, ct_psingle, id_ref);
749     } else {
750       __kmp_check_workshare(gtid, ct_psingle, id_ref);
751     }
752   }
753 #if USE_ITT_BUILD
754   if (status) {
755     __kmp_itt_single_start(gtid);
756   }
757 #endif /* USE_ITT_BUILD */
758   return status;
759 }
760 
761 void __kmp_exit_single(int gtid) {
762 #if USE_ITT_BUILD
763   __kmp_itt_single_end(gtid);
764 #endif /* USE_ITT_BUILD */
765   if (__kmp_env_consistency_check)
766     __kmp_pop_workshare(gtid, ct_psingle, NULL);
767 }
768 
769 /* Determine whether we can go parallel or must use a serialized parallel
770  * region, and how many threads we can use.
771  * set_nthreads is the number of threads requested for the team.
772  * Returns 1 if we should serialize or only use one thread,
773  * otherwise the number of threads to use.
774  * The forkjoin lock is held by the caller. */
775 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
776                                  int master_tid, int set_nthreads,
777                                  int enter_teams) {
778   int capacity;
779   int new_nthreads;
780   KMP_DEBUG_ASSERT(__kmp_init_serial);
781   KMP_DEBUG_ASSERT(root && parent_team);
782   kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
783 
784   // If dyn-var is set, dynamically adjust the number of desired threads,
785   // according to the method specified by dynamic_mode.
786   new_nthreads = set_nthreads;
787   if (!get__dynamic_2(parent_team, master_tid)) {
788     ;
789   }
790 #ifdef USE_LOAD_BALANCE
791   else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
792     new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
793     if (new_nthreads == 1) {
794       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
795                     "reservation to 1 thread\n",
796                     master_tid));
797       return 1;
798     }
799     if (new_nthreads < set_nthreads) {
800       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
801                     "reservation to %d threads\n",
802                     master_tid, new_nthreads));
803     }
804   }
805 #endif /* USE_LOAD_BALANCE */
806   else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
807     new_nthreads = __kmp_avail_proc - __kmp_nth +
808                    (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
809     if (new_nthreads <= 1) {
810       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
811                     "reservation to 1 thread\n",
812                     master_tid));
813       return 1;
814     }
815     if (new_nthreads < set_nthreads) {
816       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
817                     "reservation to %d threads\n",
818                     master_tid, new_nthreads));
819     } else {
820       new_nthreads = set_nthreads;
821     }
822   } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
823     if (set_nthreads > 2) {
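      // Pick a pseudo-random team size in the range 1..set_nthreads.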
824       new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
825       new_nthreads = (new_nthreads % set_nthreads) + 1;
826       if (new_nthreads == 1) {
827         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
828                       "reservation to 1 thread\n",
829                       master_tid));
830         return 1;
831       }
832       if (new_nthreads < set_nthreads) {
833         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
834                       "reservation to %d threads\n",
835                       master_tid, new_nthreads));
836       }
837     }
838   } else {
839     KMP_ASSERT(0);
840   }
841 
842   // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
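  // The ternary below discounts threads already counted in __kmp_nth: the
  // master thread when the root is active, otherwise the root's hot team.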
843   if (__kmp_nth + new_nthreads -
844           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
845       __kmp_max_nth) {
846     int tl_nthreads = __kmp_max_nth - __kmp_nth +
847                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
848     if (tl_nthreads <= 0) {
849       tl_nthreads = 1;
850     }
851 
852     // If dyn-var is false, emit a 1-time warning.
853     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
854       __kmp_reserve_warn = 1;
855       __kmp_msg(kmp_ms_warning,
856                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
857                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
858     }
859     if (tl_nthreads == 1) {
860       KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
861                     "reduced reservation to 1 thread\n",
862                     master_tid));
863       return 1;
864     }
865     KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
866                   "reservation to %d threads\n",
867                   master_tid, tl_nthreads));
868     new_nthreads = tl_nthreads;
869   }
870 
871   // Respect OMP_THREAD_LIMIT
872   int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
873   int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
874   if (cg_nthreads + new_nthreads -
875           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
876       max_cg_threads) {
877     int tl_nthreads = max_cg_threads - cg_nthreads +
878                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
879     if (tl_nthreads <= 0) {
880       tl_nthreads = 1;
881     }
882 
883     // If dyn-var is false, emit a 1-time warning.
884     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
885       __kmp_reserve_warn = 1;
886       __kmp_msg(kmp_ms_warning,
887                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
888                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
889     }
890     if (tl_nthreads == 1) {
891       KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
892                     "reduced reservation to 1 thread\n",
893                     master_tid));
894       return 1;
895     }
896     KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
897                   "reservation to %d threads\n",
898                   master_tid, tl_nthreads));
899     new_nthreads = tl_nthreads;
900   }
901 
902   // Check if the threads array is large enough, or needs expanding.
903   // See comment in __kmp_register_root() about the adjustment if
904   // __kmp_threads[0] == NULL.
905   capacity = __kmp_threads_capacity;
906   if (TCR_PTR(__kmp_threads[0]) == NULL) {
907     --capacity;
908   }
909   if (__kmp_nth + new_nthreads -
910           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
911       capacity) {
912     // Expand the threads array.
913     int slotsRequired = __kmp_nth + new_nthreads -
914                         (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
915                         capacity;
916     int slotsAdded = __kmp_expand_threads(slotsRequired);
917     if (slotsAdded < slotsRequired) {
918       // The threads array was not expanded enough.
919       new_nthreads -= (slotsRequired - slotsAdded);
920       KMP_ASSERT(new_nthreads >= 1);
921 
922       // If dyn-var is false, emit a 1-time warning.
923       if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
924         __kmp_reserve_warn = 1;
925         if (__kmp_tp_cached) {
926           __kmp_msg(kmp_ms_warning,
927                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
928                     KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
929                     KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
930         } else {
931           __kmp_msg(kmp_ms_warning,
932                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
933                     KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
934         }
935       }
936     }
937   }
938 
939 #ifdef KMP_DEBUG
940   if (new_nthreads == 1) {
941     KC_TRACE(10,
942              ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
943               "dead roots and rechecking; requested %d threads\n",
944               __kmp_get_gtid(), set_nthreads));
945   } else {
946     KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
947                   " %d threads\n",
948                   __kmp_get_gtid(), new_nthreads, set_nthreads));
949   }
950 #endif // KMP_DEBUG
951   return new_nthreads;
952 }
953 
954 /* Allocate threads from the thread pool and assign them to the new team. We
955    are assured that there are enough threads available, because we checked on
956    that earlier while holding the forkjoin lock. */
957 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
958                                     kmp_info_t *master_th, int master_gtid) {
959   int i;
960   int use_hot_team;
961 
962   KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
963   KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
964   KMP_MB();
965 
966   /* first, let's setup the master thread */
967   master_th->th.th_info.ds.ds_tid = 0;
968   master_th->th.th_team = team;
969   master_th->th.th_team_nproc = team->t.t_nproc;
970   master_th->th.th_team_master = master_th;
971   master_th->th.th_team_serialized = FALSE;
972   master_th->th.th_dispatch = &team->t.t_dispatch[0];
973 
974 /* make sure we are not the optimized hot team */
975 #if KMP_NESTED_HOT_TEAMS
976   use_hot_team = 0;
977   kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
978   if (hot_teams) { // hot teams array is not allocated if
979     // KMP_HOT_TEAMS_MAX_LEVEL=0
980     int level = team->t.t_active_level - 1; // index in array of hot teams
981     if (master_th->th.th_teams_microtask) { // are we inside the teams?
982       if (master_th->th.th_teams_size.nteams > 1) {
983         ++level; // level was not increased in teams construct for
984         // team_of_masters
985       }
986       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
987           master_th->th.th_teams_level == team->t.t_level) {
988         ++level; // level was not increased in teams construct for
989         // team_of_workers before the parallel
990       } // team->t.t_level will be increased inside parallel
991     }
992     if (level < __kmp_hot_teams_max_level) {
993       if (hot_teams[level].hot_team) {
994         // hot team has already been allocated for given level
995         KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
996         use_hot_team = 1; // the team is ready to use
997       } else {
998         use_hot_team = 0; // AC: threads are not allocated yet
999         hot_teams[level].hot_team = team; // remember new hot team
1000         hot_teams[level].hot_team_nth = team->t.t_nproc;
1001       }
1002     } else {
1003       use_hot_team = 0;
1004     }
1005   }
1006 #else
1007   use_hot_team = team == root->r.r_hot_team;
1008 #endif
1009   if (!use_hot_team) {
1010 
1011     /* install the master thread */
1012     team->t.t_threads[0] = master_th;
1013     __kmp_initialize_info(master_th, team, 0, master_gtid);
1014 
1015     /* now, install the worker threads */
1016     for (i = 1; i < team->t.t_nproc; i++) {
1017 
1018       /* fork or reallocate a new thread and install it in team */
1019       kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1020       team->t.t_threads[i] = thr;
1021       KMP_DEBUG_ASSERT(thr);
1022       KMP_DEBUG_ASSERT(thr->th.th_team == team);
1023       /* align team and thread arrived states */
1024       KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1025                     "T#%d(%d:%d) join =%llu, plain=%llu\n",
1026                     __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1027                     __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1028                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1029                     team->t.t_bar[bs_plain_barrier].b_arrived));
1030       thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1031       thr->th.th_teams_level = master_th->th.th_teams_level;
1032       thr->th.th_teams_size = master_th->th.th_teams_size;
1033       { // Initialize threads' barrier data.
1034         int b;
1035         kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1036         for (b = 0; b < bs_last_barrier; ++b) {
1037           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1038           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1039 #if USE_DEBUGGER
1040           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1041 #endif
1042         }
1043       }
1044     }
1045 
1046 #if KMP_AFFINITY_SUPPORTED
1047     __kmp_partition_places(team);
1048 #endif
1049   }
1050 
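  // If any thread's nesting level or team size has changed since its previous
  // parallel region, mark the team so that affinity is displayed again.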
1051   if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1052     for (i = 0; i < team->t.t_nproc; i++) {
1053       kmp_info_t *thr = team->t.t_threads[i];
1054       if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1055           thr->th.th_prev_level != team->t.t_level) {
1056         team->t.t_display_affinity = 1;
1057         break;
1058       }
1059     }
1060   }
1061 
1062   KMP_MB();
1063 }
1064 
1065 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1066 // Propagate any changes to the floating point control registers out to the
1067 // team. We try to avoid unnecessary writes to the relevant cache line in the
1068 // team structure, so we don't make changes unless they are needed.
1069 inline static void propagateFPControl(kmp_team_t *team) {
1070   if (__kmp_inherit_fp_control) {
1071     kmp_int16 x87_fpu_control_word;
1072     kmp_uint32 mxcsr;
1073 
1074     // Get master values of FPU control flags (both X87 and vector)
1075     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1076     __kmp_store_mxcsr(&mxcsr);
1077     mxcsr &= KMP_X86_MXCSR_MASK;
1078 
1079     // There is no point looking at t_fp_control_saved here.
1080     // If it is TRUE, we still have to update the values if they are different
1081     // from those we now have. If it is FALSE we didn't save anything yet, but
1082     // our objective is the same. We have to ensure that the values in the team
1083     // are the same as those we have.
1084     // So, this code achieves what we need whether or not t_fp_control_saved is
1085     // true. By checking whether the value needs updating we avoid unnecessary
1086     // writes that would put the cache-line into a written state, causing all
1087     // threads in the team to have to read it again.
1088     KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1089     KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1090     // Although we don't use this value, other code in the runtime wants to know
1091     // whether it should restore them. So we must ensure it is correct.
1092     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1093   } else {
1094     // Similarly here. Don't write to this cache-line in the team structure
1095     // unless we have to.
1096     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1097   }
1098 }
1099 
1100 // Do the opposite, setting the hardware registers to the updated values from
1101 // the team.
1102 inline static void updateHWFPControl(kmp_team_t *team) {
1103   if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1104     // Only reset the fp control regs if they have been changed in the team
1105     // during the parallel region that we are exiting.
1106     kmp_int16 x87_fpu_control_word;
1107     kmp_uint32 mxcsr;
1108     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1109     __kmp_store_mxcsr(&mxcsr);
1110     mxcsr &= KMP_X86_MXCSR_MASK;
1111 
1112     if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1113       __kmp_clear_x87_fpu_status_word();
1114       __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1115     }
1116 
1117     if (team->t.t_mxcsr != mxcsr) {
1118       __kmp_load_mxcsr(&team->t.t_mxcsr);
1119     }
1120   }
1121 }
1122 #else
1123 #define propagateFPControl(x) ((void)0)
1124 #define updateHWFPControl(x) ((void)0)
1125 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1126 
1127 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1128                                      int realloc); // forward declaration
1129 
1130 /* Run a parallel region that has been serialized, so it runs only in a team
1131    consisting of the single master thread. */
1132 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1133   kmp_info_t *this_thr;
1134   kmp_team_t *serial_team;
1135 
1136   KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1137 
1138   /* Skip all this code for autopar serialized loops since it results in
1139      unacceptable overhead */
1140   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1141     return;
1142 
1143   if (!TCR_4(__kmp_init_parallel))
1144     __kmp_parallel_initialize();
1145   __kmp_resume_if_soft_paused();
1146 
1147   this_thr = __kmp_threads[global_tid];
1148   serial_team = this_thr->th.th_serial_team;
1149 
1150   /* utilize the serialized team held by this thread */
1151   KMP_DEBUG_ASSERT(serial_team);
1152   KMP_MB();
1153 
1154   if (__kmp_tasking_mode != tskm_immediate_exec) {
1155     KMP_DEBUG_ASSERT(
1156         this_thr->th.th_task_team ==
1157         this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1158     KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1159                      NULL);
1160     KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1161                   "team %p, new task_team = NULL\n",
1162                   global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1163     this_thr->th.th_task_team = NULL;
1164   }
1165 
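  // Resolve the proc_bind value for this region: proc-bind-var=false wins,
  // otherwise a missing proc_bind clause falls back to proc-bind-var.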
1166   kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1167   if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1168     proc_bind = proc_bind_false;
1169   } else if (proc_bind == proc_bind_default) {
1170     // No proc_bind clause was specified, so use the current value
1171     // of proc-bind-var for this parallel region.
1172     proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1173   }
1174   // Reset for next parallel region
1175   this_thr->th.th_set_proc_bind = proc_bind_default;
1176 
1177 #if OMPT_SUPPORT
1178   ompt_data_t ompt_parallel_data = ompt_data_none;
1179   ompt_data_t *implicit_task_data;
1180   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1181   if (ompt_enabled.enabled &&
1182       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1183 
1184     ompt_task_info_t *parent_task_info;
1185     parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1186 
1187     parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1188     if (ompt_enabled.ompt_callback_parallel_begin) {
1189       int team_size = 1;
1190 
1191       ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1192           &(parent_task_info->task_data), &(parent_task_info->frame),
1193           &ompt_parallel_data, team_size,
1194           ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1195     }
1196   }
1197 #endif // OMPT_SUPPORT
1198 
1199   if (this_thr->th.th_team != serial_team) {
1200     // Nested level will be an index in the nested nthreads array
1201     int level = this_thr->th.th_team->t.t_level;
1202 
1203     if (serial_team->t.t_serialized) {
1204       /* this serial team was already used
1205          TODO increase performance by making these locks more specific */
1206       kmp_team_t *new_team;
1207 
1208       __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1209 
1210       new_team =
1211           __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1212 #if OMPT_SUPPORT
1213                               ompt_parallel_data,
1214 #endif
1215                               proc_bind, &this_thr->th.th_current_task->td_icvs,
1216                               0 USE_NESTED_HOT_ARG(NULL));
1217       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1218       KMP_ASSERT(new_team);
1219 
1220       /* setup new serialized team and install it */
1221       new_team->t.t_threads[0] = this_thr;
1222       new_team->t.t_parent = this_thr->th.th_team;
1223       serial_team = new_team;
1224       this_thr->th.th_serial_team = serial_team;
1225 
1226       KF_TRACE(
1227           10,
1228           ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1229            global_tid, serial_team));
1230 
1231       /* TODO the above breaks the requirement that, if we run out of
1232          resources, serialized teams are still guaranteed to be OK, since we
1233          may need to allocate a new one here */
1234     } else {
1235       KF_TRACE(
1236           10,
1237           ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1238            global_tid, serial_team));
1239     }
1240 
1241     /* we have to initialize this serial team */
1242     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1243     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1244     KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1245     serial_team->t.t_ident = loc;
1246     serial_team->t.t_serialized = 1;
1247     serial_team->t.t_nproc = 1;
1248     serial_team->t.t_parent = this_thr->th.th_team;
1249     serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1250     this_thr->th.th_team = serial_team;
1251     serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1252 
1253     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1254                   this_thr->th.th_current_task));
1255     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1256     this_thr->th.th_current_task->td_flags.executing = 0;
1257 
1258     __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1259 
1260     /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1261        implicit task for each serialized task represented by
1262        team->t.t_serialized? */
1263     copy_icvs(&this_thr->th.th_current_task->td_icvs,
1264               &this_thr->th.th_current_task->td_parent->td_icvs);
1265 
1266     // Thread value exists in the nested nthreads array for the next nested
1267     // level
1268     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1269       this_thr->th.th_current_task->td_icvs.nproc =
1270           __kmp_nested_nth.nth[level + 1];
1271     }
1272 
1273     if (__kmp_nested_proc_bind.used &&
1274         (level + 1 < __kmp_nested_proc_bind.used)) {
1275       this_thr->th.th_current_task->td_icvs.proc_bind =
1276           __kmp_nested_proc_bind.bind_types[level + 1];
1277     }
1278 
1279 #if USE_DEBUGGER
1280     serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1281 #endif
1282     this_thr->th.th_info.ds.ds_tid = 0;
1283 
1284     /* set thread cache values */
1285     this_thr->th.th_team_nproc = 1;
1286     this_thr->th.th_team_master = this_thr;
1287     this_thr->th.th_team_serialized = 1;
1288 
1289     serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1290     serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1291     serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1292 
1293     propagateFPControl(serial_team);
1294 
1295     /* check if we need to allocate dispatch buffers stack */
1296     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1297     if (!serial_team->t.t_dispatch->th_disp_buffer) {
1298       serial_team->t.t_dispatch->th_disp_buffer =
1299           (dispatch_private_info_t *)__kmp_allocate(
1300               sizeof(dispatch_private_info_t));
1301     }
1302     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1303 
1304     KMP_MB();
1305 
1306   } else {
1307     /* this serialized team is already being used,
1308      * that's fine, just add another nested level */
1309     KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1310     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1311     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1312     ++serial_team->t.t_serialized;
1313     this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1314 
1315     // Nested level will be an index in the nested nthreads array
1316     int level = this_thr->th.th_team->t.t_level;
1317     // Thread value exists in the nested nthreads array for the next nested
1318     // level
1319     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1320       this_thr->th.th_current_task->td_icvs.nproc =
1321           __kmp_nested_nth.nth[level + 1];
1322     }
1323     serial_team->t.t_level++;
1324     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1325                   "of serial team %p to %d\n",
1326                   global_tid, serial_team, serial_team->t.t_level));
1327 
1328     /* allocate/push dispatch buffers stack */
1329     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1330     {
1331       dispatch_private_info_t *disp_buffer =
1332           (dispatch_private_info_t *)__kmp_allocate(
1333               sizeof(dispatch_private_info_t));
1334       disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1335       serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1336     }
1337     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1338 
1339     KMP_MB();
1340   }
1341   KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1342 
1343   // Perform the display affinity functionality for
1344   // serialized parallel regions
1345   if (__kmp_display_affinity) {
1346     if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1347         this_thr->th.th_prev_num_threads != 1) {
1348       // NULL means use the affinity-format-var ICV
1349       __kmp_aux_display_affinity(global_tid, NULL);
1350       this_thr->th.th_prev_level = serial_team->t.t_level;
1351       this_thr->th.th_prev_num_threads = 1;
1352     }
1353   }
1354 
1355   if (__kmp_env_consistency_check)
1356     __kmp_push_parallel(global_tid, NULL);
1357 #if OMPT_SUPPORT
1358   serial_team->t.ompt_team_info.master_return_address = codeptr;
1359   if (ompt_enabled.enabled &&
1360       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1361     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1362 
1363     ompt_lw_taskteam_t lw_taskteam;
1364     __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1365                             &ompt_parallel_data, codeptr);
1366 
1367     __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1368     // don't use lw_taskteam after linking. Its contents were swapped.
1369 
1370     /* OMPT implicit task begin */
1371     implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
1372     if (ompt_enabled.ompt_callback_implicit_task) {
1373       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1374           ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1375           OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1376       OMPT_CUR_TASK_INFO(this_thr)
1377           ->thread_num = __kmp_tid_from_gtid(global_tid);
1378     }
1379 
1380     /* OMPT state */
1381     this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1382     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1383   }
1384 #endif
1385 }
1386 
1387 /* most of the work for a fork */
1388 /* return true if we really went parallel, false if serialized */
1389 int __kmp_fork_call(ident_t *loc, int gtid,
1390                     enum fork_context_e call_context, // Intel, GNU, ...
1391                     kmp_int32 argc, microtask_t microtask, launch_t invoker,
1392                     kmp_va_list ap) {
1393   void **argv;
1394   int i;
1395   int master_tid;
1396   int master_this_cons;
1397   kmp_team_t *team;
1398   kmp_team_t *parent_team;
1399   kmp_info_t *master_th;
1400   kmp_root_t *root;
1401   int nthreads;
1402   int master_active;
1403   int master_set_numthreads;
1404   int level;
1405   int active_level;
1406   int teams_level;
1407 #if KMP_NESTED_HOT_TEAMS
1408   kmp_hot_team_ptr_t **p_hot_teams;
1409 #endif
1410   { // KMP_TIME_BLOCK
1411     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1412     KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1413 
1414     KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1415     if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
      /* Some systems prefer the stack for the root thread(s) to start with
         some gap from the parent stack to prevent false sharing. */
1418       void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1419       /* These 2 lines below are so this does not get optimized out */
1420       if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1421         __kmp_stkpadding += (short)((kmp_int64)dummy);
1422     }
1423 
1424     /* initialize if needed */
1425     KMP_DEBUG_ASSERT(
1426         __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1427     if (!TCR_4(__kmp_init_parallel))
1428       __kmp_parallel_initialize();
1429     __kmp_resume_if_soft_paused();
1430 
1431     /* setup current data */
1432     master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1433     // shutdown
1434     parent_team = master_th->th.th_team;
1435     master_tid = master_th->th.th_info.ds.ds_tid;
1436     master_this_cons = master_th->th.th_local.this_construct;
1437     root = master_th->th.th_root;
1438     master_active = root->r.r_active;
1439     master_set_numthreads = master_th->th.th_set_nproc;
1440 
1441 #if OMPT_SUPPORT
1442     ompt_data_t ompt_parallel_data = ompt_data_none;
1443     ompt_data_t *parent_task_data;
1444     ompt_frame_t *ompt_frame;
1445     ompt_data_t *implicit_task_data;
1446     void *return_address = NULL;
1447 
1448     if (ompt_enabled.enabled) {
1449       __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1450                                     NULL, NULL);
1451       return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1452     }
1453 #endif
1454 
1455     // Nested level will be an index in the nested nthreads array
1456     level = parent_team->t.t_level;
    // used to launch non-serial teams even if nesting is not allowed
1458     active_level = parent_team->t.t_active_level;
1459     // needed to check nesting inside the teams
1460     teams_level = master_th->th.th_teams_level;
1461 #if KMP_NESTED_HOT_TEAMS
1462     p_hot_teams = &master_th->th.th_hot_teams;
1463     if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1464       *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1465           sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1466       (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1467       // it is either actual or not needed (when active_level > 0)
1468       (*p_hot_teams)[0].hot_team_nth = 1;
1469     }
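    // Roughly: the hot teams array is indexed by nesting level
    // (0 .. __kmp_hot_teams_max_level - 1); each entry caches a team so that a
    // nested parallel at that level can reuse its threads instead of
    // re-forking them.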
1470 #endif
1471 
1472 #if OMPT_SUPPORT
1473     if (ompt_enabled.enabled) {
1474       if (ompt_enabled.ompt_callback_parallel_begin) {
1475         int team_size = master_set_numthreads
1476                             ? master_set_numthreads
1477                             : get__nproc_2(parent_team, master_tid);
1478         int flags = OMPT_INVOKER(call_context) |
1479                     ((microtask == (microtask_t)__kmp_teams_master)
1480                          ? ompt_parallel_league
1481                          : ompt_parallel_team);
1482         ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1483             parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1484             return_address);
1485       }
1486       master_th->th.ompt_thread_info.state = ompt_state_overhead;
1487     }
1488 #endif
1489 
1490     master_th->th.th_ident = loc;
1491 
1492     if (master_th->th.th_teams_microtask && ap &&
1493         microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1494       // AC: This is start of parallel that is nested inside teams construct.
1495       // The team is actual (hot), all workers are ready at the fork barrier.
1496       // No lock needed to initialize the team a bit, then free workers.
1497       parent_team->t.t_ident = loc;
1498       __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1499       parent_team->t.t_argc = argc;
1500       argv = (void **)parent_team->t.t_argv;
1501       for (i = argc - 1; i >= 0; --i)
1502         *argv++ = va_arg(kmp_va_deref(ap), void *);
      // Increment our nested depth levels, but do not increase serialization
1504       if (parent_team == master_th->th.th_serial_team) {
1505         // AC: we are in serialized parallel
1506         __kmpc_serialized_parallel(loc, gtid);
1507         KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1508 
1509 #if OMPT_SUPPORT
1510         void *dummy;
1511         void **exit_frame_p;
1512 
1513         ompt_lw_taskteam_t lw_taskteam;
1514 
1515         if (ompt_enabled.enabled) {
1516           __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1517                                   &ompt_parallel_data, return_address);
1518           exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1519 
1520           __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
          // don't use lw_taskteam after linking. content was swapped
1522 
1523           /* OMPT implicit task begin */
1524           implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1525           if (ompt_enabled.ompt_callback_implicit_task) {
1526             OMPT_CUR_TASK_INFO(master_th)
1527                 ->thread_num = __kmp_tid_from_gtid(gtid);
1528             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1529                 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1530                 implicit_task_data, 1,
1531                 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1532           }
1533 
1534           /* OMPT state */
1535           master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1536         } else {
1537           exit_frame_p = &dummy;
1538         }
1539 #endif
1540         // AC: need to decrement t_serialized for enquiry functions to work
1541         // correctly, will restore at join time
1542         parent_team->t.t_serialized--;
1543 
1544         {
1545           KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1546           KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1547           __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1548 #if OMPT_SUPPORT
1549                                  ,
1550                                  exit_frame_p
1551 #endif
1552                                  );
1553         }
1554 
1555 #if OMPT_SUPPORT
1556         if (ompt_enabled.enabled) {
1557           *exit_frame_p = NULL;
1558           OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1559           if (ompt_enabled.ompt_callback_implicit_task) {
1560             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1561                 ompt_scope_end, NULL, implicit_task_data, 1,
1562                 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1563           }
1564           ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1565           __ompt_lw_taskteam_unlink(master_th);
1566           if (ompt_enabled.ompt_callback_parallel_end) {
1567             ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1568                 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1569                 OMPT_INVOKER(call_context) | ompt_parallel_team,
1570                 return_address);
1571           }
1572           master_th->th.ompt_thread_info.state = ompt_state_overhead;
1573         }
1574 #endif
1575         return TRUE;
1576       }
1577 
1578       parent_team->t.t_pkfn = microtask;
1579       parent_team->t.t_invoke = invoker;
1580       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1581       parent_team->t.t_active_level++;
1582       parent_team->t.t_level++;
1583       parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1584 
1585 #if OMPT_SUPPORT
1586       if (ompt_enabled.enabled) {
1587         ompt_lw_taskteam_t lw_taskteam;
1588         __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1589                                 &ompt_parallel_data, return_address);
1590         __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1591       }
1592 #endif
1593 
1594       /* Change number of threads in the team if requested */
1595       if (master_set_numthreads) { // The parallel has num_threads clause
1596         if (master_set_numthreads < master_th->th.th_teams_size.nth) {
          // AC: can only reduce number of threads dynamically, can't increase
1598           kmp_info_t **other_threads = parent_team->t.t_threads;
1599           parent_team->t.t_nproc = master_set_numthreads;
1600           for (i = 0; i < master_set_numthreads; ++i) {
1601             other_threads[i]->th.th_team_nproc = master_set_numthreads;
1602           }
1603           // Keep extra threads hot in the team for possible next parallels
1604         }
1605         master_th->th.th_set_nproc = 0;
1606       }
1607 
1608 #if USE_DEBUGGER
1609       if (__kmp_debugging) { // Let debugger override number of threads.
1610         int nth = __kmp_omp_num_threads(loc);
1611         if (nth > 0) { // 0 means debugger doesn't want to change num threads
1612           master_set_numthreads = nth;
1613         }
1614       }
1615 #endif
1616 
1617 #if USE_ITT_BUILD
1618       if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1619            KMP_ITT_DEBUG) &&
1620           __kmp_forkjoin_frames_mode == 3 &&
1621           parent_team->t.t_active_level == 1 // only report frames at level 1
1622           && master_th->th.th_teams_size.nteams == 1) {
1623         kmp_uint64 tmp_time = __itt_get_timestamp();
1624         master_th->th.th_frame_time = tmp_time;
1625         parent_team->t.t_region_time = tmp_time;
1626       }
1627       if (__itt_stack_caller_create_ptr) {
1628         // create new stack stitching id before entering fork barrier
1629         parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1630       }
1631 #endif /* USE_ITT_BUILD */
1632 
1633       KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1634                     "master_th=%p, gtid=%d\n",
1635                     root, parent_team, master_th, gtid));
1636       __kmp_internal_fork(loc, gtid, parent_team);
1637       KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1638                     "master_th=%p, gtid=%d\n",
1639                     root, parent_team, master_th, gtid));
1640 
1641       /* Invoke microtask for MASTER thread */
1642       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1643                     parent_team->t.t_id, parent_team->t.t_pkfn));
1644 
1645       if (!parent_team->t.t_invoke(gtid)) {
1646         KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
1647       }
1648       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1649                     parent_team->t.t_id, parent_team->t.t_pkfn));
1650       KMP_MB(); /* Flush all pending memory write invalidates.  */
1651 
1652       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1653 
1654       return TRUE;
1655     } // Parallel closely nested in teams construct
1656 
1657 #if KMP_DEBUG
1658     if (__kmp_tasking_mode != tskm_immediate_exec) {
1659       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1660                        parent_team->t.t_task_team[master_th->th.th_task_state]);
1661     }
1662 #endif
1663 
1664     if (parent_team->t.t_active_level >=
1665         master_th->th.th_current_task->td_icvs.max_active_levels) {
1666       nthreads = 1;
1667     } else {
1668       int enter_teams = ((ap == NULL && active_level == 0) ||
1669                          (ap && teams_level > 0 && teams_level == level));
1670       nthreads =
1671           master_set_numthreads
1672               ? master_set_numthreads
1673               : get__nproc_2(
1674                     parent_team,
1675                     master_tid); // TODO: get nproc directly from current task
1676 
      // Do we need to take the forkjoin lock? (There is no need for a
      // serialized parallel outside of a teams construct.) This code was moved
      // here from __kmp_reserve_threads() to speed up nested serialized
      // parallels.
1680       if (nthreads > 1) {
1681         if ((get__max_active_levels(master_th) == 1 &&
1682              (root->r.r_in_parallel && !enter_teams)) ||
1683             (__kmp_library == library_serial)) {
1684           KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1685                         " threads\n",
1686                         gtid, nthreads));
1687           nthreads = 1;
1688         }
1689       }
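      // Illustrative reading of the condition above: with
      // OMP_MAX_ACTIVE_LEVELS=1, a parallel region encountered while another
      // one is already active under this root is serialized here (as is any
      // region when KMP_LIBRARY=serial), without ever taking the forkjoin
      // lock.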
1690       if (nthreads > 1) {
1691         /* determine how many new threads we can use */
1692         __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
        /* AC: If we execute teams from a parallel region (on the host), then
           teams should be created, but each can have only 1 thread if nesting
           is disabled. If teams is called from a serial region, then teams
           and their threads should be created regardless of the nesting
           setting. */
1697         nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
1698                                          nthreads, enter_teams);
1699         if (nthreads == 1) {
          // Free the lock for single-thread execution here; for multi-thread
          // execution it will be freed later, after the team of threads has
          // been created and initialized
1703           __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1704         }
1705       }
1706     }
1707     KMP_DEBUG_ASSERT(nthreads > 0);
1708 
1709     // If we temporarily changed the set number of threads then restore it now
1710     master_th->th.th_set_nproc = 0;
1711 
1712     /* create a serialized parallel region? */
1713     if (nthreads == 1) {
1714 /* josh todo: hypothetical question: what do we do for OS X*? */
1715 #if KMP_OS_LINUX &&                                                            \
1716     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1717       void *args[argc];
1718 #else
1719       void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1720 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1721           KMP_ARCH_AARCH64) */
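      // Either way, args is stack storage for argc microtask argument
      // pointers; the VLA form is presumably limited to toolchains where it is
      // known to work in C++, with KMP_ALLOCA as the portable fallback.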
1722 
1723       KA_TRACE(20,
1724                ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1725 
1726       __kmpc_serialized_parallel(loc, gtid);
1727 
1728       if (call_context == fork_context_intel) {
1729         /* TODO this sucks, use the compiler itself to pass args! :) */
1730         master_th->th.th_serial_team->t.t_ident = loc;
1731         if (!ap) {
1732           // revert change made in __kmpc_serialized_parallel()
1733           master_th->th.th_serial_team->t.t_level--;
1734 // Get args from parent team for teams construct
1735 
1736 #if OMPT_SUPPORT
1737           void *dummy;
1738           void **exit_frame_p;
1739           ompt_task_info_t *task_info;
1740 
1741           ompt_lw_taskteam_t lw_taskteam;
1742 
1743           if (ompt_enabled.enabled) {
1744             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1745                                     &ompt_parallel_data, return_address);
1746 
1747             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
            // don't use lw_taskteam after linking. content was swapped
1749 
1750             task_info = OMPT_CUR_TASK_INFO(master_th);
1751             exit_frame_p = &(task_info->frame.exit_frame.ptr);
1752             if (ompt_enabled.ompt_callback_implicit_task) {
1753               OMPT_CUR_TASK_INFO(master_th)
1754                   ->thread_num = __kmp_tid_from_gtid(gtid);
1755               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1756                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1757                   &(task_info->task_data), 1,
1758                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1759                   ompt_task_implicit);
1760             }
1761 
1762             /* OMPT state */
1763             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1764           } else {
1765             exit_frame_p = &dummy;
1766           }
1767 #endif
1768 
1769           {
1770             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1771             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1772             __kmp_invoke_microtask(microtask, gtid, 0, argc,
1773                                    parent_team->t.t_argv
1774 #if OMPT_SUPPORT
1775                                    ,
1776                                    exit_frame_p
1777 #endif
1778                                    );
1779           }
1780 
1781 #if OMPT_SUPPORT
1782           if (ompt_enabled.enabled) {
1783             *exit_frame_p = NULL;
1784             if (ompt_enabled.ompt_callback_implicit_task) {
1785               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1786                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1787                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1788                   ompt_task_implicit);
1789             }
1790             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1791             __ompt_lw_taskteam_unlink(master_th);
1792             if (ompt_enabled.ompt_callback_parallel_end) {
1793               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1794                   &ompt_parallel_data, parent_task_data,
1795                   OMPT_INVOKER(call_context) | ompt_parallel_team,
1796                   return_address);
1797             }
1798             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1799           }
1800 #endif
1801         } else if (microtask == (microtask_t)__kmp_teams_master) {
1802           KMP_DEBUG_ASSERT(master_th->th.th_team ==
1803                            master_th->th.th_serial_team);
1804           team = master_th->th.th_team;
1805           // team->t.t_pkfn = microtask;
1806           team->t.t_invoke = invoker;
1807           __kmp_alloc_argv_entries(argc, team, TRUE);
1808           team->t.t_argc = argc;
1809           argv = (void **)team->t.t_argv;
1810           if (ap) {
1811             for (i = argc - 1; i >= 0; --i)
1812               *argv++ = va_arg(kmp_va_deref(ap), void *);
1813           } else {
1814             for (i = 0; i < argc; ++i)
1815               // Get args from parent team for teams construct
1816               argv[i] = parent_team->t.t_argv[i];
1817           }
1818           // AC: revert change made in __kmpc_serialized_parallel()
1819           //     because initial code in teams should have level=0
1820           team->t.t_level--;
1821           // AC: call special invoker for outer "parallel" of teams construct
1822           invoker(gtid);
1823 #if OMPT_SUPPORT
1824           if (ompt_enabled.enabled) {
1825             ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1826             if (ompt_enabled.ompt_callback_implicit_task) {
1827               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1828                   ompt_scope_end, NULL, &(task_info->task_data), 0,
1829                   OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1830             }
1831             if (ompt_enabled.ompt_callback_parallel_end) {
1832               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1833                   &ompt_parallel_data, parent_task_data,
1834                   OMPT_INVOKER(call_context) | ompt_parallel_league,
1835                   return_address);
1836             }
1837             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1838           }
1839 #endif
1840         } else {
1841           argv = args;
1842           for (i = argc - 1; i >= 0; --i)
1843             *argv++ = va_arg(kmp_va_deref(ap), void *);
1844           KMP_MB();
1845 
1846 #if OMPT_SUPPORT
1847           void *dummy;
1848           void **exit_frame_p;
1849           ompt_task_info_t *task_info;
1850 
1851           ompt_lw_taskteam_t lw_taskteam;
1852 
1853           if (ompt_enabled.enabled) {
1854             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1855                                     &ompt_parallel_data, return_address);
1856             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
            // don't use lw_taskteam after linking. content was swapped
1858             task_info = OMPT_CUR_TASK_INFO(master_th);
1859             exit_frame_p = &(task_info->frame.exit_frame.ptr);
1860 
1861             /* OMPT implicit task begin */
1862             implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1863             if (ompt_enabled.ompt_callback_implicit_task) {
1864               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1865                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1866                   implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1867                   ompt_task_implicit);
1868               OMPT_CUR_TASK_INFO(master_th)
1869                   ->thread_num = __kmp_tid_from_gtid(gtid);
1870             }
1871 
1872             /* OMPT state */
1873             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1874           } else {
1875             exit_frame_p = &dummy;
1876           }
1877 #endif
1878 
1879           {
1880             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1881             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1882             __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1883 #if OMPT_SUPPORT
1884                                    ,
1885                                    exit_frame_p
1886 #endif
1887                                    );
1888           }
1889 
1890 #if OMPT_SUPPORT
1891           if (ompt_enabled.enabled) {
1892             *exit_frame_p = NULL;
1893             if (ompt_enabled.ompt_callback_implicit_task) {
1894               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1895                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1896                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1897                   ompt_task_implicit);
1898             }
1899 
1900             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1901             __ompt_lw_taskteam_unlink(master_th);
1902             if (ompt_enabled.ompt_callback_parallel_end) {
1903               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1904                   &ompt_parallel_data, parent_task_data,
1905                   OMPT_INVOKER(call_context) | ompt_parallel_team,
1906                   return_address);
1907             }
1908             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1909           }
1910 #endif
1911         }
1912       } else if (call_context == fork_context_gnu) {
1913 #if OMPT_SUPPORT
1914         ompt_lw_taskteam_t lwt;
1915         __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1916                                 return_address);
1917 
1918         lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1919         __ompt_lw_taskteam_link(&lwt, master_th, 1);
// don't use lw_taskteam after linking. content was swapped
1921 #endif
1922 
1923         // we were called from GNU native code
1924         KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1925         return FALSE;
1926       } else {
1927         KMP_ASSERT2(call_context < fork_context_last,
1928                     "__kmp_fork_call: unknown fork_context parameter");
1929       }
1930 
1931       KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1932       KMP_MB();
1933       return FALSE;
1934     } // if (nthreads == 1)
1935 
    // GEH: only modify the executing flag in the case when not serialized;
    //      the serialized case is handled in __kmpc_serialized_parallel
1938     KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1939                   "curtask=%p, curtask_max_aclevel=%d\n",
1940                   parent_team->t.t_active_level, master_th,
1941                   master_th->th.th_current_task,
1942                   master_th->th.th_current_task->td_icvs.max_active_levels));
1943     // TODO: GEH - cannot do this assertion because root thread not set up as
1944     // executing
1945     // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1946     master_th->th.th_current_task->td_flags.executing = 0;
1947 
1948     if (!master_th->th.th_teams_microtask || level > teams_level) {
1949       /* Increment our nested depth level */
1950       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1951     }
1952 
1953     // See if we need to make a copy of the ICVs.
1954     int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1955     if ((level + 1 < __kmp_nested_nth.used) &&
1956         (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1957       nthreads_icv = __kmp_nested_nth.nth[level + 1];
1958     } else {
1959       nthreads_icv = 0; // don't update
1960     }
1961 
1962     // Figure out the proc_bind_policy for the new team.
1963     kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1964     kmp_proc_bind_t proc_bind_icv =
1965         proc_bind_default; // proc_bind_default means don't update
1966     if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1967       proc_bind = proc_bind_false;
1968     } else {
1969       if (proc_bind == proc_bind_default) {
1970         // No proc_bind clause specified; use current proc-bind-var for this
1971         // parallel region
1972         proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1973       }
1974       /* else: The proc_bind policy was specified explicitly on parallel clause.
1975          This overrides proc-bind-var for this parallel region, but does not
1976          change proc-bind-var. */
1977       // Figure the value of proc-bind-var for the child threads.
1978       if ((level + 1 < __kmp_nested_proc_bind.used) &&
1979           (__kmp_nested_proc_bind.bind_types[level + 1] !=
1980            master_th->th.th_current_task->td_icvs.proc_bind)) {
1981         proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1982       }
1983     }
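    // Illustrative example (environment settings, not runtime code), assuming
    // the nested bind types are populated from an OMP_PROC_BIND list:
    //   OMP_PROC_BIND=spread,close  =>  bind_types == {spread, close}
    // A region forked at level 0 with no proc_bind clause then uses spread for
    // its own placement, while proc_bind_icv becomes close so the children
    // inherit it as their proc-bind-var.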
1984 
1985     // Reset for next parallel region
1986     master_th->th.th_set_proc_bind = proc_bind_default;
1987 
1988     if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
1989       kmp_internal_control_t new_icvs;
1990       copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
1991       new_icvs.next = NULL;
1992       if (nthreads_icv > 0) {
1993         new_icvs.nproc = nthreads_icv;
1994       }
1995       if (proc_bind_icv != proc_bind_default) {
1996         new_icvs.proc_bind = proc_bind_icv;
1997       }
1998 
1999       /* allocate a new parallel team */
2000       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2001       team = __kmp_allocate_team(root, nthreads, nthreads,
2002 #if OMPT_SUPPORT
2003                                  ompt_parallel_data,
2004 #endif
2005                                  proc_bind, &new_icvs,
2006                                  argc USE_NESTED_HOT_ARG(master_th));
2007     } else {
2008       /* allocate a new parallel team */
2009       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2010       team = __kmp_allocate_team(root, nthreads, nthreads,
2011 #if OMPT_SUPPORT
2012                                  ompt_parallel_data,
2013 #endif
2014                                  proc_bind,
2015                                  &master_th->th.th_current_task->td_icvs,
2016                                  argc USE_NESTED_HOT_ARG(master_th));
2017     }
2018     KF_TRACE(
2019         10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2020 
2021     /* setup the new team */
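    // KMP_CHECK_UPDATE / KMP_CHECK_UPDATE_SYNC are assumed to store only when
    // the new value differs from the current one (see kmp.h), so reusing a hot
    // team does not dirty cache lines that already hold the right values.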
2022     KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2023     KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2024     KMP_CHECK_UPDATE(team->t.t_ident, loc);
2025     KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2026     KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2027 #if OMPT_SUPPORT
2028     KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2029                           return_address);
2030 #endif
2031     KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2032     // TODO: parent_team->t.t_level == INT_MAX ???
2033     if (!master_th->th.th_teams_microtask || level > teams_level) {
2034       int new_level = parent_team->t.t_level + 1;
2035       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2036       new_level = parent_team->t.t_active_level + 1;
2037       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2038     } else {
2039       // AC: Do not increase parallel level at start of the teams construct
2040       int new_level = parent_team->t.t_level;
2041       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2042       new_level = parent_team->t.t_active_level;
2043       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2044     }
2045     kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2046     // set master's schedule as new run-time schedule
2047     KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2048 
2049     KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2050     KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2051 
2052     // Update the floating point rounding in the team if required.
2053     propagateFPControl(team);
2054 
2055     if (__kmp_tasking_mode != tskm_immediate_exec) {
2056       // Set master's task team to team's task team. Unless this is hot team, it
2057       // should be NULL.
2058       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2059                        parent_team->t.t_task_team[master_th->th.th_task_state]);
2060       KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
2061                     "%p, new task_team %p / team %p\n",
2062                     __kmp_gtid_from_thread(master_th),
2063                     master_th->th.th_task_team, parent_team,
2064                     team->t.t_task_team[master_th->th.th_task_state], team));
2065 
2066       if (active_level || master_th->th.th_task_team) {
2067         // Take a memo of master's task_state
2068         KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2069         if (master_th->th.th_task_state_top >=
2070             master_th->th.th_task_state_stack_sz) { // increase size
2071           kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2072           kmp_uint8 *old_stack, *new_stack;
2073           kmp_uint32 i;
2074           new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2075           for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2076             new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2077           }
2078           for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2079                ++i) { // zero-init rest of stack
2080             new_stack[i] = 0;
2081           }
2082           old_stack = master_th->th.th_task_state_memo_stack;
2083           master_th->th.th_task_state_memo_stack = new_stack;
2084           master_th->th.th_task_state_stack_sz = new_size;
2085           __kmp_free(old_stack);
2086         }
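        // The memo stack grows geometrically (e.g. 4 -> 8 -> 16 entries), so
        // deeper nesting costs amortized O(1) copying per push.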
2087         // Store master's task_state on stack
2088         master_th->th
2089             .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2090             master_th->th.th_task_state;
2091         master_th->th.th_task_state_top++;
2092 #if KMP_NESTED_HOT_TEAMS
2093         if (master_th->th.th_hot_teams &&
2094             active_level < __kmp_hot_teams_max_level &&
2095             team == master_th->th.th_hot_teams[active_level].hot_team) {
2096           // Restore master's nested state if nested hot team
2097           master_th->th.th_task_state =
2098               master_th->th
2099                   .th_task_state_memo_stack[master_th->th.th_task_state_top];
2100         } else {
2101 #endif
2102           master_th->th.th_task_state = 0;
2103 #if KMP_NESTED_HOT_TEAMS
2104         }
2105 #endif
2106       }
2107 #if !KMP_NESTED_HOT_TEAMS
2108       KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2109                        (team == root->r.r_hot_team));
2110 #endif
2111     }
2112 
2113     KA_TRACE(
2114         20,
2115         ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2116          gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2117          team->t.t_nproc));
2118     KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2119                      (team->t.t_master_tid == 0 &&
2120                       (team->t.t_parent == root->r.r_root_team ||
2121                        team->t.t_parent->t.t_serialized)));
2122     KMP_MB();
2123 
2124     /* now, setup the arguments */
2125     argv = (void **)team->t.t_argv;
2126     if (ap) {
2127       for (i = argc - 1; i >= 0; --i) {
2128         void *new_argv = va_arg(kmp_va_deref(ap), void *);
2129         KMP_CHECK_UPDATE(*argv, new_argv);
2130         argv++;
2131       }
2132     } else {
2133       for (i = 0; i < argc; ++i) {
2134         // Get args from parent team for teams construct
2135         KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2136       }
2137     }
2138 
2139     /* now actually fork the threads */
2140     KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2141     if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2142       root->r.r_active = TRUE;
2143 
2144     __kmp_fork_team_threads(root, team, master_th, gtid);
2145     __kmp_setup_icv_copy(team, nthreads,
2146                          &master_th->th.th_current_task->td_icvs, loc);
2147 
2148 #if OMPT_SUPPORT
2149     master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2150 #endif
2151 
2152     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2153 
2154 #if USE_ITT_BUILD
2155     if (team->t.t_active_level == 1 // only report frames at level 1
2156         && !master_th->th.th_teams_microtask) { // not in teams construct
2157 #if USE_ITT_NOTIFY
2158       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2159           (__kmp_forkjoin_frames_mode == 3 ||
2160            __kmp_forkjoin_frames_mode == 1)) {
2161         kmp_uint64 tmp_time = 0;
2162         if (__itt_get_timestamp_ptr)
2163           tmp_time = __itt_get_timestamp();
2164         // Internal fork - report frame begin
2165         master_th->th.th_frame_time = tmp_time;
2166         if (__kmp_forkjoin_frames_mode == 3)
2167           team->t.t_region_time = tmp_time;
2168       } else
2169 // only one notification scheme (either "submit" or "forking/joined", not both)
2170 #endif /* USE_ITT_NOTIFY */
2171           if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2172               __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2173         // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2174         __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2175       }
2176     }
2177 #endif /* USE_ITT_BUILD */
2178 
2179     /* now go on and do the work */
2180     KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2181     KMP_MB();
2182     KF_TRACE(10,
2183              ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2184               root, team, master_th, gtid));
2185 
2186 #if USE_ITT_BUILD
2187     if (__itt_stack_caller_create_ptr) {
2188       team->t.t_stack_id =
2189           __kmp_itt_stack_caller_create(); // create new stack stitching id
2190       // before entering fork barrier
2191     }
2192 #endif /* USE_ITT_BUILD */
2193 
2194     // AC: skip __kmp_internal_fork at teams construct, let only master
2195     // threads execute
2196     if (ap) {
2197       __kmp_internal_fork(loc, gtid, team);
2198       KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2199                     "master_th=%p, gtid=%d\n",
2200                     root, team, master_th, gtid));
2201     }
2202 
2203     if (call_context == fork_context_gnu) {
2204       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2205       return TRUE;
2206     }
2207 
2208     /* Invoke microtask for MASTER thread */
2209     KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2210                   team->t.t_id, team->t.t_pkfn));
2211   } // END of timer KMP_fork_call block
2212 
2213 #if KMP_STATS_ENABLED
2214   // If beginning a teams construct, then change thread state
2215   stats_state_e previous_state = KMP_GET_THREAD_STATE();
2216   if (!ap) {
2217     KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2218   }
2219 #endif
2220 
2221   if (!team->t.t_invoke(gtid)) {
2222     KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
2223   }
2224 
2225 #if KMP_STATS_ENABLED
2226   // If was beginning of a teams construct, then reset thread state
2227   if (!ap) {
2228     KMP_SET_THREAD_STATE(previous_state);
2229   }
2230 #endif
2231 
2232   KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2233                 team->t.t_id, team->t.t_pkfn));
2234   KMP_MB(); /* Flush all pending memory write invalidates.  */
2235 
2236   KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2237 
2238 #if OMPT_SUPPORT
2239   if (ompt_enabled.enabled) {
2240     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2241   }
2242 #endif
2243 
2244   return TRUE;
2245 }
2246 
2247 #if OMPT_SUPPORT
2248 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2249                                             kmp_team_t *team) {
2250   // restore state outside the region
2251   thread->th.ompt_thread_info.state =
2252       ((team->t.t_serialized) ? ompt_state_work_serial
2253                               : ompt_state_work_parallel);
2254 }
2255 
2256 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2257                                    kmp_team_t *team, ompt_data_t *parallel_data,
2258                                    int flags, void *codeptr) {
2259   ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2260   if (ompt_enabled.ompt_callback_parallel_end) {
2261     ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2262         parallel_data, &(task_info->task_data), flags, codeptr);
2263   }
2264 
2265   task_info->frame.enter_frame = ompt_data_none;
2266   __kmp_join_restore_state(thread, team);
2267 }
2268 #endif
2269 
2270 void __kmp_join_call(ident_t *loc, int gtid
2271 #if OMPT_SUPPORT
2272                      ,
2273                      enum fork_context_e fork_context
2274 #endif
2275                      ,
2276                      int exit_teams) {
2277   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2278   kmp_team_t *team;
2279   kmp_team_t *parent_team;
2280   kmp_info_t *master_th;
2281   kmp_root_t *root;
2282   int master_active;
2283 
2284   KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2285 
2286   /* setup current data */
2287   master_th = __kmp_threads[gtid];
2288   root = master_th->th.th_root;
2289   team = master_th->th.th_team;
2290   parent_team = team->t.t_parent;
2291 
2292   master_th->th.th_ident = loc;
2293 
2294 #if OMPT_SUPPORT
2295   void *team_microtask = (void *)team->t.t_pkfn;
2296   if (ompt_enabled.enabled) {
2297     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2298   }
2299 #endif
2300 
2301 #if KMP_DEBUG
2302   if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2303     KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2304                   "th_task_team = %p\n",
2305                   __kmp_gtid_from_thread(master_th), team,
2306                   team->t.t_task_team[master_th->th.th_task_state],
2307                   master_th->th.th_task_team));
2308     KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2309                      team->t.t_task_team[master_th->th.th_task_state]);
2310   }
2311 #endif
2312 
2313   if (team->t.t_serialized) {
2314     if (master_th->th.th_teams_microtask) {
2315       // We are in teams construct
2316       int level = team->t.t_level;
2317       int tlevel = master_th->th.th_teams_level;
2318       if (level == tlevel) {
2319         // AC: we haven't incremented it earlier at start of teams construct,
2320         //     so do it here - at the end of teams construct
2321         team->t.t_level++;
2322       } else if (level == tlevel + 1) {
2323         // AC: we are exiting parallel inside teams, need to increment
2324         // serialization in order to restore it in the next call to
2325         // __kmpc_end_serialized_parallel
2326         team->t.t_serialized++;
2327       }
2328     }
2329     __kmpc_end_serialized_parallel(loc, gtid);
2330 
2331 #if OMPT_SUPPORT
2332     if (ompt_enabled.enabled) {
2333       __kmp_join_restore_state(master_th, parent_team);
2334     }
2335 #endif
2336 
2337     return;
2338   }
2339 
2340   master_active = team->t.t_master_active;
2341 
2342   if (!exit_teams) {
2343     // AC: No barrier for internal teams at exit from teams construct.
2344     //     But there is barrier for external team (league).
2345     __kmp_internal_join(loc, gtid, team);
2346   } else {
2347     master_th->th.th_task_state =
2348         0; // AC: no tasking in teams (out of any parallel)
2349   }
2350 
2351   KMP_MB();
2352 
2353 #if OMPT_SUPPORT
2354   ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2355   void *codeptr = team->t.ompt_team_info.master_return_address;
2356 #endif
2357 
2358 #if USE_ITT_BUILD
2359   if (__itt_stack_caller_create_ptr) {
2360     // destroy the stack stitching id after join barrier
2361     __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2362   }
2363   // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2364   if (team->t.t_active_level == 1 &&
2365       (!master_th->th.th_teams_microtask || /* not in teams construct */
2366        master_th->th.th_teams_size.nteams == 1)) {
2367     master_th->th.th_ident = loc;
2368     // only one notification scheme (either "submit" or "forking/joined", not
2369     // both)
2370     if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2371         __kmp_forkjoin_frames_mode == 3)
2372       __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2373                              master_th->th.th_frame_time, 0, loc,
2374                              master_th->th.th_team_nproc, 1);
2375     else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2376              !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2377       __kmp_itt_region_joined(gtid);
2378   } // active_level == 1
2379 #endif /* USE_ITT_BUILD */
2380 
2381   if (master_th->th.th_teams_microtask && !exit_teams &&
2382       team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2383       team->t.t_level == master_th->th.th_teams_level + 1) {
// AC: We need to leave the team structure intact at the end of a parallel
// inside the teams construct, so that the same (hot) team works at the next
// parallel region; only adjust the nesting levels.
2387 #if OMPT_SUPPORT
2388     ompt_data_t ompt_parallel_data = ompt_data_none;
2389     if (ompt_enabled.enabled) {
2390       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2391       if (ompt_enabled.ompt_callback_implicit_task) {
2392         int ompt_team_size = team->t.t_nproc;
2393         ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2394             ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2395             OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2396       }
2397       task_info->frame.exit_frame = ompt_data_none;
2398       task_info->task_data = ompt_data_none;
2399       ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2400       __ompt_lw_taskteam_unlink(master_th);
2401     }
2402 #endif
2403     /* Decrement our nested depth level */
2404     team->t.t_level--;
2405     team->t.t_active_level--;
2406     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2407 
2408     // Restore number of threads in the team if needed. This code relies on
2409     // the proper adjustment of th_teams_size.nth after the fork in
2410     // __kmp_teams_master on each teams master in the case that
2411     // __kmp_reserve_threads reduced it.
2412     if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2413       int old_num = master_th->th.th_team_nproc;
2414       int new_num = master_th->th.th_teams_size.nth;
2415       kmp_info_t **other_threads = team->t.t_threads;
2416       team->t.t_nproc = new_num;
2417       for (int i = 0; i < old_num; ++i) {
2418         other_threads[i]->th.th_team_nproc = new_num;
2419       }
      // Adjust states of the unused threads of the team
2421       for (int i = old_num; i < new_num; ++i) {
2422         // Re-initialize thread's barrier data.
2423         KMP_DEBUG_ASSERT(other_threads[i]);
2424         kmp_balign_t *balign = other_threads[i]->th.th_bar;
2425         for (int b = 0; b < bs_last_barrier; ++b) {
2426           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2427           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2428 #if USE_DEBUGGER
2429           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2430 #endif
2431         }
2432         if (__kmp_tasking_mode != tskm_immediate_exec) {
2433           // Synchronize thread's task state
2434           other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2435         }
2436       }
2437     }
2438 
2439 #if OMPT_SUPPORT
2440     if (ompt_enabled.enabled) {
2441       __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2442                       OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2443     }
2444 #endif
2445 
2446     return;
2447   }
2448 
2449   /* do cleanup and restore the parent team */
2450   master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2451   master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2452 
2453   master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2454 
2455   /* jc: The following lock has instructions with REL and ACQ semantics,
2456      separating the parallel user code called in this parallel region
2457      from the serial user code called after this function returns. */
2458   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2459 
2460   if (!master_th->th.th_teams_microtask ||
2461       team->t.t_level > master_th->th.th_teams_level) {
2462     /* Decrement our nested depth level */
2463     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2464   }
2465   KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2466 
2467 #if OMPT_SUPPORT
2468   if (ompt_enabled.enabled) {
2469     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2470     if (ompt_enabled.ompt_callback_implicit_task) {
2471       int flags = (team_microtask == (void *)__kmp_teams_master)
2472                       ? ompt_task_initial
2473                       : ompt_task_implicit;
2474       int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2475       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2476           ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2477           OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2478     }
2479     task_info->frame.exit_frame = ompt_data_none;
2480     task_info->task_data = ompt_data_none;
2481   }
2482 #endif
2483 
2484   KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2485                 master_th, team));
2486   __kmp_pop_current_task_from_thread(master_th);
2487 
2488 #if KMP_AFFINITY_SUPPORTED
2489   // Restore master thread's partition.
2490   master_th->th.th_first_place = team->t.t_first_place;
2491   master_th->th.th_last_place = team->t.t_last_place;
2492 #endif // KMP_AFFINITY_SUPPORTED
2493   master_th->th.th_def_allocator = team->t.t_def_allocator;
2494 
2495   updateHWFPControl(team);
2496 
2497   if (root->r.r_active != master_active)
2498     root->r.r_active = master_active;
2499 
2500   __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2501                             master_th)); // this will free worker threads
2502 
  /* This race was fun to find. Make sure the following is in the critical
     region; otherwise assertions may fail occasionally since the old team may
     be reallocated and the hierarchy appears inconsistent. It is actually
     safe to run and won't cause any bugs, but it will cause those assertion
     failures. It's only one deref & assign, so we might as well keep it in
     the critical region. */
2508   master_th->th.th_team = parent_team;
2509   master_th->th.th_team_nproc = parent_team->t.t_nproc;
2510   master_th->th.th_team_master = parent_team->t.t_threads[0];
2511   master_th->th.th_team_serialized = parent_team->t.t_serialized;
2512 
2513   /* restore serialized team, if need be */
2514   if (parent_team->t.t_serialized &&
2515       parent_team != master_th->th.th_serial_team &&
2516       parent_team != root->r.r_root_team) {
2517     __kmp_free_team(root,
2518                     master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2519     master_th->th.th_serial_team = parent_team;
2520   }
2521 
2522   if (__kmp_tasking_mode != tskm_immediate_exec) {
2523     if (master_th->th.th_task_state_top >
2524         0) { // Restore task state from memo stack
2525       KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2526       // Remember master's state if we re-use this nested hot team
2527       master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2528           master_th->th.th_task_state;
2529       --master_th->th.th_task_state_top; // pop
2530       // Now restore state at this level
2531       master_th->th.th_task_state =
2532           master_th->th
2533               .th_task_state_memo_stack[master_th->th.th_task_state_top];
2534     }
2535     // Copy the task team from the parent team to the master thread
2536     master_th->th.th_task_team =
2537         parent_team->t.t_task_team[master_th->th.th_task_state];
2538     KA_TRACE(20,
2539              ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2540               __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2541               parent_team));
2542   }
2543 
2544   // TODO: GEH - cannot do this assertion because root thread not set up as
2545   // executing
2546   // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2547   master_th->th.th_current_task->td_flags.executing = 1;
2548 
2549   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2550 
2551 #if OMPT_SUPPORT
2552   int flags =
2553       OMPT_INVOKER(fork_context) |
2554       ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2555                                                       : ompt_parallel_team);
2556   if (ompt_enabled.enabled) {
2557     __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2558                     codeptr);
2559   }
2560 #endif
2561 
2562   KMP_MB();
2563   KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2564 }
2565 
2566 /* Check whether we should push an internal control record onto the
2567    serial team stack.  If so, do it.  */
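/* Illustrative sketch (user code, not part of the runtime): a record is
   pushed when an ICV is changed inside a nested serialized region (i.e. when
   t_serialized > 1), so the enclosing level's values can be restored on exit:

     #pragma omp parallel num_threads(1)    // outer serialized region
     {
       #pragma omp parallel num_threads(1)  // nested serialized region
       { omp_set_num_threads(8); }          // __kmp_save_internal_controls
     }                                      //   pushes a record on this call
*/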
2568 void __kmp_save_internal_controls(kmp_info_t *thread) {
2569 
2570   if (thread->th.th_team != thread->th.th_serial_team) {
2571     return;
2572   }
2573   if (thread->th.th_team->t.t_serialized > 1) {
2574     int push = 0;
2575 
2576     if (thread->th.th_team->t.t_control_stack_top == NULL) {
2577       push = 1;
2578     } else {
2579       if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2580           thread->th.th_team->t.t_serialized) {
2581         push = 1;
2582       }
2583     }
2584     if (push) { /* push a record on the serial team's stack */
2585       kmp_internal_control_t *control =
2586           (kmp_internal_control_t *)__kmp_allocate(
2587               sizeof(kmp_internal_control_t));
2588 
2589       copy_icvs(control, &thread->th.th_current_task->td_icvs);
2590 
2591       control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2592 
2593       control->next = thread->th.th_team->t.t_control_stack_top;
2594       thread->th.th_team->t.t_control_stack_top = control;
2595     }
2596   }
2597 }
2598 
2599 /* Changes set_nproc */
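/* If the hot team is currently larger than the new value and no parallel
   region is active, the extra workers are released right away (assuming
   __kmp_free_thread returns them to the thread pool) rather than at the next
   fork; e.g. calling omp_set_num_threads(2) after running a region with 8
   threads trims the hot team to 2 immediately. */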
2600 void __kmp_set_num_threads(int new_nth, int gtid) {
2601   kmp_info_t *thread;
2602   kmp_root_t *root;
2603 
2604   KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2605   KMP_DEBUG_ASSERT(__kmp_init_serial);
2606 
2607   if (new_nth < 1)
2608     new_nth = 1;
2609   else if (new_nth > __kmp_max_nth)
2610     new_nth = __kmp_max_nth;
2611 
2612   KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2613   thread = __kmp_threads[gtid];
2614   if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2615     return; // nothing to do
2616 
2617   __kmp_save_internal_controls(thread);
2618 
2619   set__nproc(thread, new_nth);
2620 
2621   // If this omp_set_num_threads() call will cause the hot team size to be
2622   // reduced (in the absence of a num_threads clause), then reduce it now,
2623   // rather than waiting for the next parallel region.
2624   root = thread->th.th_root;
2625   if (__kmp_init_parallel && (!root->r.r_active) &&
2626       (root->r.r_hot_team->t.t_nproc > new_nth)
2627 #if KMP_NESTED_HOT_TEAMS
2628       && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2629 #endif
2630       ) {
2631     kmp_team_t *hot_team = root->r.r_hot_team;
2632     int f;
2633 
2634     __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2635 
2636     // Release the extra threads we don't need any more.
2637     for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2638       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2639       if (__kmp_tasking_mode != tskm_immediate_exec) {
2640         // When decreasing team size, threads no longer in the team should unref
2641         // task team.
2642         hot_team->t.t_threads[f]->th.th_task_team = NULL;
2643       }
2644       __kmp_free_thread(hot_team->t.t_threads[f]);
2645       hot_team->t.t_threads[f] = NULL;
2646     }
2647     hot_team->t.t_nproc = new_nth;
2648 #if KMP_NESTED_HOT_TEAMS
2649     if (thread->th.th_hot_teams) {
2650       KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2651       thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2652     }
2653 #endif
2654 
2655     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2656 
2657     // Update the t_nproc field in the threads that are still active.
2658     for (f = 0; f < new_nth; f++) {
2659       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2660       hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2661     }
    // Special flag to mark that the size change came from an
    // omp_set_num_threads() call
2663     hot_team->t.t_size_changed = -1;
2664   }
2665 }
2666 
2667 /* Changes max_active_levels */
2668 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2669   kmp_info_t *thread;
2670 
2671   KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2672                 "%d = (%d)\n",
2673                 gtid, max_active_levels));
2674   KMP_DEBUG_ASSERT(__kmp_init_serial);
2675 
2676   // validate max_active_levels
2677   if (max_active_levels < 0) {
2678     KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2679     // We ignore this call if the user has specified a negative value.
2680     // The current setting won't be changed. The last valid setting will be
2681     // used. A warning will be issued (if warnings are allowed as controlled by
2682     // the KMP_WARNINGS env var).
2683     KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2684                   "max_active_levels for thread %d = (%d)\n",
2685                   gtid, max_active_levels));
2686     return;
2687   }
2688   if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2689     // it's OK, the max_active_levels is within the valid range: [ 0;
2690     // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2691     // We allow a zero value. (implementation defined behavior)
2692   } else {
2693     KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2694                 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2695     max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
    // The current upper limit is MAX_INT. (implementation defined behavior)
    // If the input exceeds the upper limit, we correct the input to be the
    // upper limit. (implementation defined behavior)
    // In practice, control should never reach here while the limit is MAX_INT.
2700   }
2701   KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2702                 "max_active_levels for thread %d = (%d)\n",
2703                 gtid, max_active_levels));
2704 
2705   thread = __kmp_threads[gtid];
2706 
2707   __kmp_save_internal_controls(thread);
2708 
2709   set__max_active_levels(thread, max_active_levels);
2710 }
2711 
2712 /* Gets max_active_levels */
2713 int __kmp_get_max_active_levels(int gtid) {
2714   kmp_info_t *thread;
2715 
2716   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2717   KMP_DEBUG_ASSERT(__kmp_init_serial);
2718 
2719   thread = __kmp_threads[gtid];
2720   KMP_DEBUG_ASSERT(thread->th.th_current_task);
2721   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2722                 "curtask_maxaclevel=%d\n",
2723                 gtid, thread->th.th_current_task,
2724                 thread->th.th_current_task->td_icvs.max_active_levels));
2725   return thread->th.th_current_task->td_icvs.max_active_levels;
2726 }
2727 
2728 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2729 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2730 
2731 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2732 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2733   kmp_info_t *thread;
2734   kmp_sched_t orig_kind;
2735   //    kmp_team_t *team;
2736 
2737   KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2738                 gtid, (int)kind, chunk));
2739   KMP_DEBUG_ASSERT(__kmp_init_serial);
2740 
2741   // Check if the kind parameter is valid, correct if needed.
2742   // Valid parameters should fit in one of two intervals - standard or extended:
2743   //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2744   // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
2745   orig_kind = kind;
2746   kind = __kmp_sched_without_mods(kind);
2747 
2748   if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2749       (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2750     // TODO: Hint needs attention in case we change the default schedule.
2751     __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2752               KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2753               __kmp_msg_null);
2754     kind = kmp_sched_default;
2755     chunk = 0; // ignore chunk value in case of bad kind
2756   }
2757 
2758   thread = __kmp_threads[gtid];
2759 
2760   __kmp_save_internal_controls(thread);
2761 
2762   if (kind < kmp_sched_upper_std) {
2763     if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
      // distinguish static chunked vs. unchunked: an invalid chunk indicates
      // the unchunked schedule (which is the default)
2766       thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2767     } else {
2768       thread->th.th_current_task->td_icvs.sched.r_sched_type =
2769           __kmp_sch_map[kind - kmp_sched_lower - 1];
2770     }
2771   } else {
2772     //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2773     //    kmp_sched_lower - 2 ];
2774     thread->th.th_current_task->td_icvs.sched.r_sched_type =
2775         __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2776                       kmp_sched_lower - 2];
2777   }
2778   __kmp_sched_apply_mods_intkind(
2779       orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2780   if (kind == kmp_sched_auto || chunk < 1) {
2781     // ignore parameter chunk for schedule auto
2782     thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2783   } else {
2784     thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2785   }
2786 }
2787 
2788 /* Gets def_sched_var ICV values */
2789 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2790   kmp_info_t *thread;
2791   enum sched_type th_type;
2792 
2793   KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2794   KMP_DEBUG_ASSERT(__kmp_init_serial);
2795 
2796   thread = __kmp_threads[gtid];
2797 
2798   th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2799   switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2800   case kmp_sch_static:
2801   case kmp_sch_static_greedy:
2802   case kmp_sch_static_balanced:
2803     *kind = kmp_sched_static;
2804     __kmp_sched_apply_mods_stdkind(kind, th_type);
2805     *chunk = 0; // chunk was not set, try to show this fact via zero value
2806     return;
2807   case kmp_sch_static_chunked:
2808     *kind = kmp_sched_static;
2809     break;
2810   case kmp_sch_dynamic_chunked:
2811     *kind = kmp_sched_dynamic;
2812     break;
2813   case kmp_sch_guided_chunked:
2814   case kmp_sch_guided_iterative_chunked:
2815   case kmp_sch_guided_analytical_chunked:
2816     *kind = kmp_sched_guided;
2817     break;
2818   case kmp_sch_auto:
2819     *kind = kmp_sched_auto;
2820     break;
2821   case kmp_sch_trapezoidal:
2822     *kind = kmp_sched_trapezoidal;
2823     break;
2824 #if KMP_STATIC_STEAL_ENABLED
2825   case kmp_sch_static_steal:
2826     *kind = kmp_sched_static_steal;
2827     break;
2828 #endif
2829   default:
2830     KMP_FATAL(UnknownSchedulingType, th_type);
2831   }
2832 
2833   __kmp_sched_apply_mods_stdkind(kind, th_type);
2834   *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2835 }
2836 
2837 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2838 
2839   int ii, dd;
2840   kmp_team_t *team;
2841   kmp_info_t *thr;
2842 
2843   KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2844   KMP_DEBUG_ASSERT(__kmp_init_serial);
2845 
2846   // validate level
2847   if (level == 0)
2848     return 0;
2849   if (level < 0)
2850     return -1;
2851   thr = __kmp_threads[gtid];
2852   team = thr->th.th_team;
2853   ii = team->t.t_level;
2854   if (level > ii)
2855     return -1;
2856 
2857   if (thr->th.th_teams_microtask) {
2858     // AC: we are in teams region where multiple nested teams have same level
2859     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2860     if (level <=
2861         tlevel) { // otherwise usual algorithm works (will not touch the teams)
2862       KMP_DEBUG_ASSERT(ii >= tlevel);
2863       // AC: As we need to pass by the teams league, we need to artificially
2864       // increase ii
2865       if (ii == tlevel) {
2866         ii += 2; // three teams have same level
2867       } else {
2868         ii++; // two teams have same level
2869       }
2870     }
2871   }
2872 
2873   if (ii == level)
2874     return __kmp_tid_from_gtid(gtid);
2875 
2876   dd = team->t.t_serialized;
2877   level++;
2878   while (ii > level) {
2879     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2880     }
2881     if ((team->t.t_serialized) && (!dd)) {
2882       team = team->t.t_parent;
2883       continue;
2884     }
2885     if (ii > level) {
2886       team = team->t.t_parent;
2887       dd = team->t.t_serialized;
2888       ii--;
2889     }
2890   }
2891 
2892   return (dd > 1) ? (0) : (team->t.t_master_tid);
2893 }
2894 
2895 int __kmp_get_team_size(int gtid, int level) {
2896 
2897   int ii, dd;
2898   kmp_team_t *team;
2899   kmp_info_t *thr;
2900 
2901   KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2902   KMP_DEBUG_ASSERT(__kmp_init_serial);
2903 
2904   // validate level
2905   if (level == 0)
2906     return 1;
2907   if (level < 0)
2908     return -1;
2909   thr = __kmp_threads[gtid];
2910   team = thr->th.th_team;
2911   ii = team->t.t_level;
2912   if (level > ii)
2913     return -1;
2914 
2915   if (thr->th.th_teams_microtask) {
2916     // AC: we are in teams region where multiple nested teams have same level
2917     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2918     if (level <=
2919         tlevel) { // otherwise usual algorithm works (will not touch the teams)
2920       KMP_DEBUG_ASSERT(ii >= tlevel);
2921       // AC: As we need to pass by the teams league, we need to artificially
2922       // increase ii
2923       if (ii == tlevel) {
2924         ii += 2; // three teams have same level
2925       } else {
2926         ii++; // two teams have same level
2927       }
2928     }
2929   }
2930 
2931   while (ii > level) {
2932     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2933     }
2934     if (team->t.t_serialized && (!dd)) {
2935       team = team->t.t_parent;
2936       continue;
2937     }
2938     if (ii > level) {
2939       team = team->t.t_parent;
2940       ii--;
2941     }
2942   }
2943 
2944   return team->t.t_nproc;
2945 }
2946 
2947 kmp_r_sched_t __kmp_get_schedule_global() {
  // This routine exists because the pairs (__kmp_sched, __kmp_chunk) and
  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
  // independently, so the updated schedule can be obtained here.
2951 
2952   kmp_r_sched_t r_sched;
2953 
  // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
  // __kmp_guided. __kmp_sched should keep its original value, so that the user
  // can set KMP_SCHEDULE multiple times and thus have different run-time
  // schedules in different roots (even in OMP 2.5).
2958   enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
2959   enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
2960   if (s == kmp_sch_static) {
2961     // replace STATIC with more detailed schedule (balanced or greedy)
2962     r_sched.r_sched_type = __kmp_static;
2963   } else if (s == kmp_sch_guided_chunked) {
2964     // replace GUIDED with more detailed schedule (iterative or analytical)
2965     r_sched.r_sched_type = __kmp_guided;
2966   } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2967     r_sched.r_sched_type = __kmp_sched;
2968   }
2969   SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
2970 
2971   if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
    // __kmp_chunk may be wrong here (if it was never set)
2973     r_sched.chunk = KMP_DEFAULT_CHUNK;
2974   } else {
2975     r_sched.chunk = __kmp_chunk;
2976   }
2977 
2978   return r_sched;
2979 }
2980 
/* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
   at least argc number of *t_argv entries for the requested team. */
2983 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
2984 
2985   KMP_DEBUG_ASSERT(team);
2986   if (!realloc || argc > team->t.t_max_argc) {
2987 
2988     KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
2989                    "current entries=%d\n",
2990                    team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
2991     /* if previously allocated heap space for args, free them */
2992     if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
2993       __kmp_free((void *)team->t.t_argv);
2994 
2995     if (argc <= KMP_INLINE_ARGV_ENTRIES) {
2996       /* use unused space in the cache line for arguments */
2997       team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
2998       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
2999                      "argv entries\n",
3000                      team->t.t_id, team->t.t_max_argc));
3001       team->t.t_argv = &team->t.t_inline_argv[0];
3002       if (__kmp_storage_map) {
3003         __kmp_print_storage_map_gtid(
3004             -1, &team->t.t_inline_argv[0],
3005             &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3006             (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3007             team->t.t_id);
3008       }
3009     } else {
3010       /* allocate space for arguments in the heap */
3011       team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3012                                ? KMP_MIN_MALLOC_ARGV_ENTRIES
3013                                : 2 * argc;
3014       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3015                      "argv entries\n",
3016                      team->t.t_id, team->t.t_max_argc));
3017       team->t.t_argv =
3018           (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3019       if (__kmp_storage_map) {
3020         __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3021                                      &team->t.t_argv[team->t.t_max_argc],
3022                                      sizeof(void *) * team->t.t_max_argc,
3023                                      "team_%d.t_argv", team->t.t_id);
3024       }
3025     }
3026   }
3027 }
3028 
3029 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3030   int i;
3031   int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3032   team->t.t_threads =
3033       (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3034   team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3035       sizeof(dispatch_shared_info_t) * num_disp_buff);
3036   team->t.t_dispatch =
3037       (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3038   team->t.t_implicit_task_taskdata =
3039       (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3040   team->t.t_max_nproc = max_nth;
3041 
3042   /* setup dispatch buffers */
3043   for (i = 0; i < num_disp_buff; ++i) {
3044     team->t.t_disp_buffer[i].buffer_index = i;
3045     team->t.t_disp_buffer[i].doacross_buf_idx = i;
3046   }
3047 }
3048 
3049 static void __kmp_free_team_arrays(kmp_team_t *team) {
3050   /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3051   int i;
3052   for (i = 0; i < team->t.t_max_nproc; ++i) {
3053     if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3054       __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3055       team->t.t_dispatch[i].th_disp_buffer = NULL;
3056     }
3057   }
3058 #if KMP_USE_HIER_SCHED
3059   __kmp_dispatch_free_hierarchies(team);
3060 #endif
3061   __kmp_free(team->t.t_threads);
3062   __kmp_free(team->t.t_disp_buffer);
3063   __kmp_free(team->t.t_dispatch);
3064   __kmp_free(team->t.t_implicit_task_taskdata);
3065   team->t.t_threads = NULL;
3066   team->t.t_disp_buffer = NULL;
3067   team->t.t_dispatch = NULL;
3068   team->t.t_implicit_task_taskdata = 0;
3069 }
3070 
3071 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3072   kmp_info_t **oldThreads = team->t.t_threads;
3073 
3074   __kmp_free(team->t.t_disp_buffer);
3075   __kmp_free(team->t.t_dispatch);
3076   __kmp_free(team->t.t_implicit_task_taskdata);
3077   __kmp_allocate_team_arrays(team, max_nth);
3078 
3079   KMP_MEMCPY(team->t.t_threads, oldThreads,
3080              team->t.t_nproc * sizeof(kmp_info_t *));
3081 
3082   __kmp_free(oldThreads);
3083 }
3084 
3085 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3086 
3087   kmp_r_sched_t r_sched =
3088       __kmp_get_schedule_global(); // get current state of scheduling globals
3089 
3090   KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3091 
3092   kmp_internal_control_t g_icvs = {
3093     0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3094     (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3095     // adjustment of threads (per thread)
3096     (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3097     // whether blocktime is explicitly set
3098     __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3099 #if KMP_USE_MONITOR
3100     __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3101 // intervals
3102 #endif
3103     __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3104     // next parallel region (per thread)
3105     // (use a max ub on value if __kmp_parallel_initialize not called yet)
3106     __kmp_cg_max_nth, // int thread_limit;
3107     __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3108     // for max_active_levels
3109     r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3110     // {sched,chunk} pair
3111     __kmp_nested_proc_bind.bind_types[0],
3112     __kmp_default_device,
3113     NULL // struct kmp_internal_control *next;
3114   };
3115 
3116   return g_icvs;
3117 }
3118 
3119 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3120 
3121   kmp_internal_control_t gx_icvs;
3122   gx_icvs.serial_nesting_level =
      0; // probably = team->t.t_serialized, as in save_internal_controls
3124   copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3125   gx_icvs.next = NULL;
3126 
3127   return gx_icvs;
3128 }
3129 
3130 static void __kmp_initialize_root(kmp_root_t *root) {
3131   int f;
3132   kmp_team_t *root_team;
3133   kmp_team_t *hot_team;
3134   int hot_team_max_nth;
3135   kmp_r_sched_t r_sched =
3136       __kmp_get_schedule_global(); // get current state of scheduling globals
3137   kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3138   KMP_DEBUG_ASSERT(root);
3139   KMP_ASSERT(!root->r.r_begin);
3140 
3141   /* setup the root state structure */
3142   __kmp_init_lock(&root->r.r_begin_lock);
3143   root->r.r_begin = FALSE;
3144   root->r.r_active = FALSE;
3145   root->r.r_in_parallel = 0;
3146   root->r.r_blocktime = __kmp_dflt_blocktime;
3147 
3148   /* setup the root team for this task */
3149   /* allocate the root team structure */
3150   KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3151 
3152   root_team =
3153       __kmp_allocate_team(root,
3154                           1, // new_nproc
3155                           1, // max_nproc
3156 #if OMPT_SUPPORT
3157                           ompt_data_none, // root parallel id
3158 #endif
3159                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3160                           0 // argc
3161                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3162                           );
3163 #if USE_DEBUGGER
3164   // Non-NULL value should be assigned to make the debugger display the root
3165   // team.
3166   TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3167 #endif
3168 
3169   KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3170 
3171   root->r.r_root_team = root_team;
3172   root_team->t.t_control_stack_top = NULL;
3173 
3174   /* initialize root team */
3175   root_team->t.t_threads[0] = NULL;
3176   root_team->t.t_nproc = 1;
3177   root_team->t.t_serialized = 1;
3178   // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3179   root_team->t.t_sched.sched = r_sched.sched;
3180   KA_TRACE(
3181       20,
3182       ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3183        root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3184 
3185   /* setup the  hot team for this task */
3186   /* allocate the hot team structure */
3187   KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3188 
3189   hot_team =
3190       __kmp_allocate_team(root,
3191                           1, // new_nproc
3192                           __kmp_dflt_team_nth_ub * 2, // max_nproc
3193 #if OMPT_SUPPORT
3194                           ompt_data_none, // root parallel id
3195 #endif
3196                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3197                           0 // argc
3198                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3199                           );
3200   KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3201 
3202   root->r.r_hot_team = hot_team;
3203   root_team->t.t_control_stack_top = NULL;
3204 
3205   /* first-time initialization */
3206   hot_team->t.t_parent = root_team;
3207 
3208   /* initialize hot team */
3209   hot_team_max_nth = hot_team->t.t_max_nproc;
3210   for (f = 0; f < hot_team_max_nth; ++f) {
3211     hot_team->t.t_threads[f] = NULL;
3212   }
3213   hot_team->t.t_nproc = 1;
3214   // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3215   hot_team->t.t_sched.sched = r_sched.sched;
3216   hot_team->t.t_size_changed = 0;
3217 }
3218 
3219 #ifdef KMP_DEBUG
3220 
3221 typedef struct kmp_team_list_item {
3222   kmp_team_p const *entry;
3223   struct kmp_team_list_item *next;
3224 } kmp_team_list_item_t;
3225 typedef kmp_team_list_item_t *kmp_team_list_t;
3226 
3227 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3228     kmp_team_list_t list, // List of teams.
3229     kmp_team_p const *team // Team to add.
3230     ) {
3231 
3232   // List must terminate with item where both entry and next are NULL.
3233   // Team is added to the list only once.
3234   // List is sorted in ascending order by team id.
3235   // Team id is *not* a key.
3236 
3237   kmp_team_list_t l;
3238 
3239   KMP_DEBUG_ASSERT(list != NULL);
3240   if (team == NULL) {
3241     return;
3242   }
3243 
3244   __kmp_print_structure_team_accum(list, team->t.t_parent);
3245   __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3246 
3247   // Search list for the team.
3248   l = list;
3249   while (l->next != NULL && l->entry != team) {
3250     l = l->next;
3251   }
3252   if (l->next != NULL) {
3253     return; // Team has been added before, exit.
3254   }
3255 
3256   // Team is not found. Search list again for insertion point.
3257   l = list;
3258   while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3259     l = l->next;
3260   }
3261 
3262   // Insert team.
3263   {
3264     kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3265         sizeof(kmp_team_list_item_t));
3266     *item = *l;
3267     l->entry = team;
3268     l->next = item;
3269   }
3270 }
3271 
static void __kmp_print_structure_team(char const *title,
                                       kmp_team_p const *team) {
3275   __kmp_printf("%s", title);
3276   if (team != NULL) {
3277     __kmp_printf("%2x %p\n", team->t.t_id, team);
3278   } else {
3279     __kmp_printf(" - (nil)\n");
3280   }
3281 }
3282 
3283 static void __kmp_print_structure_thread(char const *title,
3284                                          kmp_info_p const *thread) {
3285   __kmp_printf("%s", title);
3286   if (thread != NULL) {
3287     __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3288   } else {
3289     __kmp_printf(" - (nil)\n");
3290   }
3291 }
3292 
3293 void __kmp_print_structure(void) {
3294 
3295   kmp_team_list_t list;
3296 
3297   // Initialize list of teams.
3298   list =
3299       (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3300   list->entry = NULL;
3301   list->next = NULL;
3302 
3303   __kmp_printf("\n------------------------------\nGlobal Thread "
3304                "Table\n------------------------------\n");
3305   {
3306     int gtid;
3307     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3308       __kmp_printf("%2d", gtid);
3309       if (__kmp_threads != NULL) {
3310         __kmp_printf(" %p", __kmp_threads[gtid]);
3311       }
3312       if (__kmp_root != NULL) {
3313         __kmp_printf(" %p", __kmp_root[gtid]);
3314       }
3315       __kmp_printf("\n");
3316     }
3317   }
3318 
3319   // Print out __kmp_threads array.
3320   __kmp_printf("\n------------------------------\nThreads\n--------------------"
3321                "----------\n");
3322   if (__kmp_threads != NULL) {
3323     int gtid;
3324     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3325       kmp_info_t const *thread = __kmp_threads[gtid];
3326       if (thread != NULL) {
3327         __kmp_printf("GTID %2d %p:\n", gtid, thread);
3328         __kmp_printf("    Our Root:        %p\n", thread->th.th_root);
3329         __kmp_print_structure_team("    Our Team:     ", thread->th.th_team);
3330         __kmp_print_structure_team("    Serial Team:  ",
3331                                    thread->th.th_serial_team);
3332         __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
3333         __kmp_print_structure_thread("    Master:       ",
3334                                      thread->th.th_team_master);
3335         __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
3336         __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
3337         __kmp_printf("    Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3338         __kmp_print_structure_thread("    Next in pool: ",
3339                                      thread->th.th_next_pool);
3340         __kmp_printf("\n");
3341         __kmp_print_structure_team_accum(list, thread->th.th_team);
3342         __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3343       }
3344     }
3345   } else {
3346     __kmp_printf("Threads array is not allocated.\n");
3347   }
3348 
3349   // Print out __kmp_root array.
3350   __kmp_printf("\n------------------------------\nUbers\n----------------------"
3351                "--------\n");
3352   if (__kmp_root != NULL) {
3353     int gtid;
3354     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3355       kmp_root_t const *root = __kmp_root[gtid];
3356       if (root != NULL) {
3357         __kmp_printf("GTID %2d %p:\n", gtid, root);
3358         __kmp_print_structure_team("    Root Team:    ", root->r.r_root_team);
3359         __kmp_print_structure_team("    Hot Team:     ", root->r.r_hot_team);
3360         __kmp_print_structure_thread("    Uber Thread:  ",
3361                                      root->r.r_uber_thread);
3362         __kmp_printf("    Active?:      %2d\n", root->r.r_active);
3363         __kmp_printf("    In Parallel:  %2d\n",
3364                      KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3365         __kmp_printf("\n");
3366         __kmp_print_structure_team_accum(list, root->r.r_root_team);
3367         __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3368       }
3369     }
3370   } else {
3371     __kmp_printf("Ubers array is not allocated.\n");
3372   }
3373 
3374   __kmp_printf("\n------------------------------\nTeams\n----------------------"
3375                "--------\n");
3376   while (list->next != NULL) {
3377     kmp_team_p const *team = list->entry;
3378     int i;
3379     __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3380     __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
3381     __kmp_printf("    Master TID:       %2d\n", team->t.t_master_tid);
3382     __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
3383     __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
3384     __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
3385     for (i = 0; i < team->t.t_nproc; ++i) {
3386       __kmp_printf("    Thread %2d:      ", i);
3387       __kmp_print_structure_thread("", team->t.t_threads[i]);
3388     }
3389     __kmp_print_structure_team("    Next in pool:     ", team->t.t_next_pool);
3390     __kmp_printf("\n");
3391     list = list->next;
3392   }
3393 
3394   // Print out __kmp_thread_pool and __kmp_team_pool.
3395   __kmp_printf("\n------------------------------\nPools\n----------------------"
3396                "--------\n");
3397   __kmp_print_structure_thread("Thread pool:          ",
3398                                CCAST(kmp_info_t *, __kmp_thread_pool));
3399   __kmp_print_structure_team("Team pool:            ",
3400                              CCAST(kmp_team_t *, __kmp_team_pool));
3401   __kmp_printf("\n");
3402 
3403   // Free team list.
3404   while (list != NULL) {
3405     kmp_team_list_item_t *item = list;
3406     list = list->next;
3407     KMP_INTERNAL_FREE(item);
3408   }
3409 }
3410 
3411 #endif
3412 
3413 //---------------------------------------------------------------------------
3414 //  Stuff for per-thread fast random number generator
3415 //  Table of primes
3416 static const unsigned __kmp_primes[] = {
3417     0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3418     0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3419     0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3420     0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3421     0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3422     0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3423     0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3424     0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3425     0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3426     0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3427     0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3428 
3429 //---------------------------------------------------------------------------
3430 //  __kmp_get_random: Get a random number using a linear congruential method.
3431 unsigned short __kmp_get_random(kmp_info_t *thread) {
3432   unsigned x = thread->th.th_x;
3433   unsigned short r = x >> 16;
3434 
3435   thread->th.th_x = x * thread->th.th_a + 1;
3436 
3437   KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3438                 thread->th.th_info.ds.ds_tid, r));
3439 
3440   return r;
3441 }
3442 //--------------------------------------------------------
3443 // __kmp_init_random: Initialize a random number generator
3444 void __kmp_init_random(kmp_info_t *thread) {
3445   unsigned seed = thread->th.th_info.ds.ds_tid;
3446 
3447   thread->th.th_a =
3448       __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3449   thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3450   KA_TRACE(30,
3451            ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3452 }
3453 
3454 #if KMP_OS_WINDOWS
3455 /* reclaim array entries for root threads that are already dead, returns number
3456  * reclaimed */
3457 static int __kmp_reclaim_dead_roots(void) {
3458   int i, r = 0;
3459 
3460   for (i = 0; i < __kmp_threads_capacity; ++i) {
3461     if (KMP_UBER_GTID(i) &&
3462         !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3463         !__kmp_root[i]
3464              ->r.r_active) { // AC: reclaim only roots died in non-active state
3465       r += __kmp_unregister_root_other_thread(i);
3466     }
3467   }
3468   return r;
3469 }
3470 #endif
3471 
3472 /* This function attempts to create free entries in __kmp_threads and
3473    __kmp_root, and returns the number of free entries generated.
3474 
3475    For Windows* OS static library, the first mechanism used is to reclaim array
3476    entries for root threads that are already dead.
3477 
   On all platforms, expansion is attempted on the arrays __kmp_threads and
3479    __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3480    capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3481    threadprivate cache array has been created. Synchronization with
3482    __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3483 
3484    After any dead root reclamation, if the clipping value allows array expansion
3485    to result in the generation of a total of nNeed free slots, the function does
3486    that expansion. If not, nothing is done beyond the possible initial root
3487    thread reclamation.
3488 
3489    If any argument is negative, the behavior is undefined. */
3490 static int __kmp_expand_threads(int nNeed) {
3491   int added = 0;
3492   int minimumRequiredCapacity;
3493   int newCapacity;
3494   kmp_info_t **newThreads;
3495   kmp_root_t **newRoot;
3496 
3497 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3498 // resizing __kmp_threads does not need additional protection if foreign
3499 // threads are present
3500 
3501 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3502   /* only for Windows static library */
3503   /* reclaim array entries for root threads that are already dead */
3504   added = __kmp_reclaim_dead_roots();
3505 
3506   if (nNeed) {
3507     nNeed -= added;
3508     if (nNeed < 0)
3509       nNeed = 0;
3510   }
3511 #endif
3512   if (nNeed <= 0)
3513     return added;
3514 
3515   // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3516   // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3517   // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3518   // > __kmp_max_nth in one of two ways:
3519   //
3520   // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
3521   //    may not be reused by another thread, so we may need to increase
3522   //    __kmp_threads_capacity to __kmp_max_nth + 1.
3523   //
3524   // 2) New foreign root(s) are encountered.  We always register new foreign
3525   //    roots. This may cause a smaller # of threads to be allocated at
3526   //    subsequent parallel regions, but the worker threads hang around (and
3527   //    eventually go to sleep) and need slots in the __kmp_threads[] array.
3528   //
3529   // Anyway, that is the reason for moving the check to see if
3530   // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3531   // instead of having it performed here. -BB
3532 
3533   KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3534 
3535   /* compute expansion headroom to check if we can expand */
3536   if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3537     /* possible expansion too small -- give up */
3538     return added;
3539   }
3540   minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3541 
3542   newCapacity = __kmp_threads_capacity;
3543   do {
3544     newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3545                                                           : __kmp_sys_max_nth;
3546   } while (newCapacity < minimumRequiredCapacity);
3547   newThreads = (kmp_info_t **)__kmp_allocate(
3548       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3549   newRoot =
3550       (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3551   KMP_MEMCPY(newThreads, __kmp_threads,
3552              __kmp_threads_capacity * sizeof(kmp_info_t *));
3553   KMP_MEMCPY(newRoot, __kmp_root,
3554              __kmp_threads_capacity * sizeof(kmp_root_t *));
3555 
3556   kmp_info_t **temp_threads = __kmp_threads;
3557   *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3558   *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3559   __kmp_free(temp_threads);
3560   added += newCapacity - __kmp_threads_capacity;
3561   *(volatile int *)&__kmp_threads_capacity = newCapacity;
3562 
3563   if (newCapacity > __kmp_tp_capacity) {
3564     __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3565     if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3566       __kmp_threadprivate_resize_cache(newCapacity);
3567     } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3568       *(volatile int *)&__kmp_tp_capacity = newCapacity;
3569     }
3570     __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3571   }
3572 
3573   return added;
3574 }
3575 
3576 /* Register the current thread as a root thread and obtain our gtid. We must
   have the __kmp_initz_lock held at this point. Argument TRUE only if we are
   the thread that calls from __kmp_do_serial_initialize(). */
3579 int __kmp_register_root(int initial_thread) {
3580   kmp_info_t *root_thread;
3581   kmp_root_t *root;
3582   int gtid;
3583   int capacity;
3584   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3585   KA_TRACE(20, ("__kmp_register_root: entered\n"));
3586   KMP_MB();
3587 
3588   /* 2007-03-02:
3589      If initial thread did not invoke OpenMP RTL yet, and this thread is not an
3590      initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not
3591      work as expected -- it may return false (that means there is at least one
3592      empty slot in __kmp_threads array), but it is possible the only free slot
3593      is #0, which is reserved for initial thread and so cannot be used for this
     one. The following code works around this bug.
3595 
     However, the right solution seems to be not to reserve slot #0 for the
     initial thread because:
     (1) there is no magic in slot #0,
     (2) we cannot detect the initial thread reliably (the first thread that
        performs serial initialization may not be the real initial thread).
3601   */
3602   capacity = __kmp_threads_capacity;
3603   if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3604     --capacity;
3605   }
3606 
3607   /* see if there are too many threads */
3608   if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3609     if (__kmp_tp_cached) {
3610       __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3611                   KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3612                   KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3613     } else {
3614       __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3615                   __kmp_msg_null);
3616     }
3617   }
3618 
3619   /* find an available thread slot */
3620   /* Don't reassign the zero slot since we need that to only be used by initial
3621      thread */
3622   for (gtid = (initial_thread ? 0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL;
3623        gtid++)
3624     ;
3625   KA_TRACE(1,
3626            ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3627   KMP_ASSERT(gtid < __kmp_threads_capacity);
3628 
3629   /* update global accounting */
3630   __kmp_all_nth++;
3631   TCW_4(__kmp_nth, __kmp_nth + 1);
3632 
3633   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3634   // numbers of procs, and method #2 (keyed API call) for higher numbers.
3635   if (__kmp_adjust_gtid_mode) {
3636     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3637       if (TCR_4(__kmp_gtid_mode) != 2) {
3638         TCW_4(__kmp_gtid_mode, 2);
3639       }
3640     } else {
3641       if (TCR_4(__kmp_gtid_mode) != 1) {
3642         TCW_4(__kmp_gtid_mode, 1);
3643       }
3644     }
3645   }
3646 
3647 #ifdef KMP_ADJUST_BLOCKTIME
3648   /* Adjust blocktime to zero if necessary            */
3649   /* Middle initialization might not have occurred yet */
3650   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3651     if (__kmp_nth > __kmp_avail_proc) {
3652       __kmp_zero_bt = TRUE;
3653     }
3654   }
3655 #endif /* KMP_ADJUST_BLOCKTIME */
3656 
3657   /* setup this new hierarchy */
3658   if (!(root = __kmp_root[gtid])) {
3659     root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3660     KMP_DEBUG_ASSERT(!root->r.r_root_team);
3661   }
3662 
3663 #if KMP_STATS_ENABLED
3664   // Initialize stats as soon as possible (right after gtid assignment).
3665   __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3666   __kmp_stats_thread_ptr->startLife();
3667   KMP_SET_THREAD_STATE(SERIAL_REGION);
3668   KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3669 #endif
3670   __kmp_initialize_root(root);
3671 
3672   /* setup new root thread structure */
3673   if (root->r.r_uber_thread) {
3674     root_thread = root->r.r_uber_thread;
3675   } else {
3676     root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3677     if (__kmp_storage_map) {
3678       __kmp_print_thread_storage_map(root_thread, gtid);
3679     }
3680     root_thread->th.th_info.ds.ds_gtid = gtid;
3681 #if OMPT_SUPPORT
3682     root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3683 #endif
3684     root_thread->th.th_root = root;
3685     if (__kmp_env_consistency_check) {
3686       root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3687     }
3688 #if USE_FAST_MEMORY
3689     __kmp_initialize_fast_memory(root_thread);
3690 #endif /* USE_FAST_MEMORY */
3691 
3692 #if KMP_USE_BGET
3693     KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3694     __kmp_initialize_bget(root_thread);
3695 #endif
3696     __kmp_init_random(root_thread); // Initialize random number generator
3697   }
3698 
3699   /* setup the serial team held in reserve by the root thread */
3700   if (!root_thread->th.th_serial_team) {
3701     kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3702     KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3703     root_thread->th.th_serial_team = __kmp_allocate_team(
3704         root, 1, 1,
3705 #if OMPT_SUPPORT
3706         ompt_data_none, // root parallel id
3707 #endif
3708         proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3709   }
3710   KMP_ASSERT(root_thread->th.th_serial_team);
3711   KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3712                 root_thread->th.th_serial_team));
3713 
3714   /* drop root_thread into place */
3715   TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3716 
3717   root->r.r_root_team->t.t_threads[0] = root_thread;
3718   root->r.r_hot_team->t.t_threads[0] = root_thread;
3719   root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3720   // AC: the team created in reserve, not for execution (it is unused for now).
3721   root_thread->th.th_serial_team->t.t_serialized = 0;
3722   root->r.r_uber_thread = root_thread;
3723 
3724   /* initialize the thread, get it ready to go */
3725   __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3726   TCW_4(__kmp_init_gtid, TRUE);
3727 
3728   /* prepare the master thread for get_gtid() */
3729   __kmp_gtid_set_specific(gtid);
3730 
3731 #if USE_ITT_BUILD
3732   __kmp_itt_thread_name(gtid);
3733 #endif /* USE_ITT_BUILD */
3734 
3735 #ifdef KMP_TDATA_GTID
3736   __kmp_gtid = gtid;
3737 #endif
3738   __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3739   KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3740 
3741   KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3742                 "plain=%u\n",
3743                 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3744                 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3745                 KMP_INIT_BARRIER_STATE));
3746   { // Initialize barrier data.
3747     int b;
3748     for (b = 0; b < bs_last_barrier; ++b) {
3749       root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3750 #if USE_DEBUGGER
3751       root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3752 #endif
3753     }
3754   }
3755   KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3756                    KMP_INIT_BARRIER_STATE);
3757 
3758 #if KMP_AFFINITY_SUPPORTED
3759   root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3760   root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3761   root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3762   root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3763   if (TCR_4(__kmp_init_middle)) {
3764     __kmp_affinity_set_init_mask(gtid, TRUE);
3765   }
3766 #endif /* KMP_AFFINITY_SUPPORTED */
3767   root_thread->th.th_def_allocator = __kmp_def_allocator;
3768   root_thread->th.th_prev_level = 0;
3769   root_thread->th.th_prev_num_threads = 1;
3770 
3771   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3772   tmp->cg_root = root_thread;
3773   tmp->cg_thread_limit = __kmp_cg_max_nth;
3774   tmp->cg_nthreads = 1;
3775   KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3776                  " cg_nthreads init to 1\n",
3777                  root_thread, tmp));
3778   tmp->up = NULL;
3779   root_thread->th.th_cg_roots = tmp;
3780 
3781   __kmp_root_counter++;
3782 
3783 #if OMPT_SUPPORT
3784   if (!initial_thread && ompt_enabled.enabled) {
3785 
3786     kmp_info_t *root_thread = ompt_get_thread();
3787 
3788     ompt_set_thread_state(root_thread, ompt_state_overhead);
3789 
3790     if (ompt_enabled.ompt_callback_thread_begin) {
3791       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3792           ompt_thread_initial, __ompt_get_thread_data_internal());
3793     }
3794     ompt_data_t *task_data;
3795     ompt_data_t *parallel_data;
3796     __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
3797     if (ompt_enabled.ompt_callback_implicit_task) {
3798       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3799           ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3800     }
3801 
3802     ompt_set_thread_state(root_thread, ompt_state_work_serial);
3803   }
3804 #endif
3805 
3806   KMP_MB();
3807   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3808 
3809   return gtid;
3810 }
3811 
3812 #if KMP_NESTED_HOT_TEAMS
3813 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3814                                 const int max_level) {
3815   int i, n, nth;
3816   kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3817   if (!hot_teams || !hot_teams[level].hot_team) {
3818     return 0;
3819   }
3820   KMP_DEBUG_ASSERT(level < max_level);
3821   kmp_team_t *team = hot_teams[level].hot_team;
3822   nth = hot_teams[level].hot_team_nth;
3823   n = nth - 1; // master is not freed
3824   if (level < max_level - 1) {
3825     for (i = 0; i < nth; ++i) {
3826       kmp_info_t *th = team->t.t_threads[i];
3827       n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3828       if (i > 0 && th->th.th_hot_teams) {
3829         __kmp_free(th->th.th_hot_teams);
3830         th->th.th_hot_teams = NULL;
3831       }
3832     }
3833   }
3834   __kmp_free_team(root, team, NULL);
3835   return n;
3836 }
3837 #endif
3838 
// Resets a root thread and clears its root and hot teams.
3840 // Returns the number of __kmp_threads entries directly and indirectly freed.
3841 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3842   kmp_team_t *root_team = root->r.r_root_team;
3843   kmp_team_t *hot_team = root->r.r_hot_team;
3844   int n = hot_team->t.t_nproc;
3845   int i;
3846 
3847   KMP_DEBUG_ASSERT(!root->r.r_active);
3848 
3849   root->r.r_root_team = NULL;
3850   root->r.r_hot_team = NULL;
3851   // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3852   // before call to __kmp_free_team().
3853   __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3854 #if KMP_NESTED_HOT_TEAMS
3855   if (__kmp_hot_teams_max_level >
3856       0) { // need to free nested hot teams and their threads if any
3857     for (i = 0; i < hot_team->t.t_nproc; ++i) {
3858       kmp_info_t *th = hot_team->t.t_threads[i];
3859       if (__kmp_hot_teams_max_level > 1) {
3860         n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3861       }
3862       if (th->th.th_hot_teams) {
3863         __kmp_free(th->th.th_hot_teams);
3864         th->th.th_hot_teams = NULL;
3865       }
3866     }
3867   }
3868 #endif
3869   __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3870 
3871   // Before we can reap the thread, we need to make certain that all other
3872   // threads in the teams that had this root as ancestor have stopped trying to
3873   // steal tasks.
3874   if (__kmp_tasking_mode != tskm_immediate_exec) {
3875     __kmp_wait_to_unref_task_teams();
3876   }
3877 
3878 #if KMP_OS_WINDOWS
3879   /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3880   KA_TRACE(
3881       10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3882            "\n",
3883            (LPVOID) & (root->r.r_uber_thread->th),
3884            root->r.r_uber_thread->th.th_info.ds.ds_thread));
3885   __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3886 #endif /* KMP_OS_WINDOWS */
3887 
3888 #if OMPT_SUPPORT
3889   ompt_data_t *task_data;
3890   ompt_data_t *parallel_data;
3891   __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
3892   if (ompt_enabled.ompt_callback_implicit_task) {
3893     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3894         ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
3895   }
3896   if (ompt_enabled.ompt_callback_thread_end) {
3897     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3898         &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3899   }
3900 #endif
3901 
3902   TCW_4(__kmp_nth,
3903         __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3904   i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
3905   KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
3906                  " to %d\n",
3907                  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
3908                  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
3909   if (i == 1) {
3910     // need to free contention group structure
3911     KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
3912                      root->r.r_uber_thread->th.th_cg_roots->cg_root);
3913     KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
3914     __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
3915     root->r.r_uber_thread->th.th_cg_roots = NULL;
3916   }
3917   __kmp_reap_thread(root->r.r_uber_thread, 1);
3918 
  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
  // it instead of freeing it.
3921   root->r.r_uber_thread = NULL;
3922   /* mark root as no longer in use */
3923   root->r.r_begin = FALSE;
3924 
3925   return n;
3926 }
3927 
3928 void __kmp_unregister_root_current_thread(int gtid) {
3929   KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
3930   /* this lock should be ok, since unregister_root_current_thread is never
3931      called during an abort, only during a normal close. furthermore, if you
3932      have the forkjoin lock, you should never try to get the initz lock */
3933   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3934   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
3935     KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
3936                   "exiting T#%d\n",
3937                   gtid));
3938     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3939     return;
3940   }
3941   kmp_root_t *root = __kmp_root[gtid];
3942 
3943   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3944   KMP_ASSERT(KMP_UBER_GTID(gtid));
3945   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3946   KMP_ASSERT(root->r.r_active == FALSE);
3947 
3948   KMP_MB();
3949 
3950   kmp_info_t *thread = __kmp_threads[gtid];
3951   kmp_team_t *team = thread->th.th_team;
3952   kmp_task_team_t *task_team = thread->th.th_task_team;
3953 
3954   // we need to wait for the proxy tasks before finishing the thread
3955   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
3956 #if OMPT_SUPPORT
3957     // the runtime is shutting down so we won't report any events
3958     thread->th.ompt_thread_info.state = ompt_state_undefined;
3959 #endif
3960     __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
3961   }
3962 
3963   __kmp_reset_root(gtid, root);
3964 
3965   /* free up this thread slot */
3966   __kmp_gtid_set_specific(KMP_GTID_DNE);
3967 #ifdef KMP_TDATA_GTID
3968   __kmp_gtid = KMP_GTID_DNE;
3969 #endif
3970 
3971   KMP_MB();
3972   KC_TRACE(10,
3973            ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
3974 
3975   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3976 }
3977 
3978 #if KMP_OS_WINDOWS
3979 /* __kmp_forkjoin_lock must be already held
3980    Unregisters a root thread that is not the current thread.  Returns the number
3981    of __kmp_threads entries freed as a result. */
3982 static int __kmp_unregister_root_other_thread(int gtid) {
3983   kmp_root_t *root = __kmp_root[gtid];
3984   int r;
3985 
3986   KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
3987   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3988   KMP_ASSERT(KMP_UBER_GTID(gtid));
3989   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3990   KMP_ASSERT(root->r.r_active == FALSE);
3991 
3992   r = __kmp_reset_root(gtid, root);
3993   KC_TRACE(10,
3994            ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
3995   return r;
3996 }
3997 #endif
3998 
3999 #if KMP_DEBUG
4000 void __kmp_task_info() {
4001 
4002   kmp_int32 gtid = __kmp_entry_gtid();
4003   kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4004   kmp_info_t *this_thr = __kmp_threads[gtid];
4005   kmp_team_t *steam = this_thr->th.th_serial_team;
4006   kmp_team_t *team = this_thr->th.th_team;
4007 
4008   __kmp_printf(
4009       "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4010       "ptask=%p\n",
4011       gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4012       team->t.t_implicit_task_taskdata[tid].td_parent);
4013 }
4014 #endif // KMP_DEBUG
4015 
4016 /* TODO optimize with one big memclr, take out what isn't needed, split
4017    responsibility to workers as much as possible, and delay initialization of
4018    features as much as possible  */
4019 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4020                                   int tid, int gtid) {
4021   /* this_thr->th.th_info.ds.ds_gtid is setup in
4022      kmp_allocate_thread/create_worker.
4023      this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4024   kmp_info_t *master = team->t.t_threads[0];
4025   KMP_DEBUG_ASSERT(this_thr != NULL);
4026   KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4027   KMP_DEBUG_ASSERT(team);
4028   KMP_DEBUG_ASSERT(team->t.t_threads);
4029   KMP_DEBUG_ASSERT(team->t.t_dispatch);
4030   KMP_DEBUG_ASSERT(master);
4031   KMP_DEBUG_ASSERT(master->th.th_root);
4032 
4033   KMP_MB();
4034 
4035   TCW_SYNC_PTR(this_thr->th.th_team, team);
4036 
4037   this_thr->th.th_info.ds.ds_tid = tid;
4038   this_thr->th.th_set_nproc = 0;
4039   if (__kmp_tasking_mode != tskm_immediate_exec)
4040     // When tasking is possible, threads are not safe to reap until they are
4041     // done tasking; this will be set when tasking code is exited in wait
4042     this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4043   else // no tasking --> always safe to reap
4044     this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4045   this_thr->th.th_set_proc_bind = proc_bind_default;
4046 #if KMP_AFFINITY_SUPPORTED
4047   this_thr->th.th_new_place = this_thr->th.th_current_place;
4048 #endif
4049   this_thr->th.th_root = master->th.th_root;
4050 
4051   /* setup the thread's cache of the team structure */
4052   this_thr->th.th_team_nproc = team->t.t_nproc;
4053   this_thr->th.th_team_master = master;
4054   this_thr->th.th_team_serialized = team->t.t_serialized;
4055   TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4056 
4057   KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4058 
4059   KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4060                 tid, gtid, this_thr, this_thr->th.th_current_task));
4061 
4062   __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4063                            team, tid, TRUE);
4064 
4065   KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4066                 tid, gtid, this_thr, this_thr->th.th_current_task));
4067   // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4068   // __kmp_initialize_team()?
4069 
4070   /* TODO no worksharing in speculative threads */
4071   this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4072 
4073   this_thr->th.th_local.this_construct = 0;
4074 
4075   if (!this_thr->th.th_pri_common) {
4076     this_thr->th.th_pri_common =
4077         (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4078     if (__kmp_storage_map) {
4079       __kmp_print_storage_map_gtid(
4080           gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4081           sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4082     }
4083     this_thr->th.th_pri_head = NULL;
4084   }
4085 
4086   if (this_thr != master && // Master's CG root is initialized elsewhere
4087       this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4088     // Make new thread's CG root same as master's
4089     KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4090     kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4091     if (tmp) {
4092       // worker changes CG, need to check if old CG should be freed
4093       int i = tmp->cg_nthreads--;
4094       KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4095                      " on node %p of thread %p to %d\n",
4096                      this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4097       if (i == 1) {
4098         __kmp_free(tmp); // last thread left CG --> free it
4099       }
4100     }
4101     this_thr->th.th_cg_roots = master->th.th_cg_roots;
4102     // Increment new thread's CG root's counter to add the new thread
4103     this_thr->th.th_cg_roots->cg_nthreads++;
4104     KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4105                    " node %p of thread %p to %d\n",
4106                    this_thr, this_thr->th.th_cg_roots,
4107                    this_thr->th.th_cg_roots->cg_root,
4108                    this_thr->th.th_cg_roots->cg_nthreads));
4109     this_thr->th.th_current_task->td_icvs.thread_limit =
4110         this_thr->th.th_cg_roots->cg_thread_limit;
4111   }
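  // Descriptive note (added for clarity): th_cg_roots nodes are
  // reference-counted via cg_nthreads. The block above moves this worker from
  // its previous contention group to the master's, frees the old node when the
  // last member leaves it, and refreshes the thread-limit ICV from the new
  // group's cg_thread_limit.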
4112 
4113   /* Initialize dynamic dispatch */
4114   {
4115     volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4116     // Use team max_nproc since this will never change for the team.
4117     size_t disp_size =
4118         sizeof(dispatch_private_info_t) *
4119         (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
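    // Illustrative note (added): a multi-threaded team keeps
    // __kmp_dispatch_num_buffers private-info entries per thread so that
    // several dynamically scheduled loops can be in flight before a buffer is
    // recycled; a serialized team (t_max_nproc == 1) only ever needs one.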
4120     KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4121                   team->t.t_max_nproc));
4122     KMP_ASSERT(dispatch);
4123     KMP_DEBUG_ASSERT(team->t.t_dispatch);
4124     KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4125 
4126     dispatch->th_disp_index = 0;
4127     dispatch->th_doacross_buf_idx = 0;
4128     if (!dispatch->th_disp_buffer) {
4129       dispatch->th_disp_buffer =
4130           (dispatch_private_info_t *)__kmp_allocate(disp_size);
4131 
4132       if (__kmp_storage_map) {
4133         __kmp_print_storage_map_gtid(
4134             gtid, &dispatch->th_disp_buffer[0],
4135             &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4136                                           ? 1
4137                                           : __kmp_dispatch_num_buffers],
4138             disp_size, "th_%d.th_dispatch.th_disp_buffer "
4139                        "(team_%d.t_dispatch[%d].th_disp_buffer)",
4140             gtid, team->t.t_id, gtid);
4141       }
4142     } else {
4143       memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4144     }
4145 
4146     dispatch->th_dispatch_pr_current = 0;
4147     dispatch->th_dispatch_sh_current = 0;
4148 
4149     dispatch->th_deo_fcn = 0; /* ORDERED     */
4150     dispatch->th_dxo_fcn = 0; /* END ORDERED */
4151   }
4152 
4153   this_thr->th.th_next_pool = NULL;
4154 
4155   if (!this_thr->th.th_task_state_memo_stack) {
4156     size_t i;
4157     this_thr->th.th_task_state_memo_stack =
4158         (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4159     this_thr->th.th_task_state_top = 0;
4160     this_thr->th.th_task_state_stack_sz = 4;
4161     for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4162          ++i) // zero init the stack
4163       this_thr->th.th_task_state_memo_stack[i] = 0;
4164   }
4165 
4166   KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4167   KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4168 
4169   KMP_MB();
4170 }
4171 
4172 /* allocate a new thread for the requesting team. this is only called from
4173    within a forkjoin critical section. we will first try to get an available
4174    thread from the thread pool. if none is available, we will fork a new one,
4175    assuming we are able to create one; this should be assured, as the caller
4176    should have checked on this first. */
4177 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4178                                   int new_tid) {
4179   kmp_team_t *serial_team;
4180   kmp_info_t *new_thr;
4181   int new_gtid;
4182 
4183   KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4184   KMP_DEBUG_ASSERT(root && team);
4185 #if !KMP_NESTED_HOT_TEAMS
4186   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4187 #endif
4188   KMP_MB();
4189 
4190   /* first, try to get one from the thread pool */
4191   if (__kmp_thread_pool) {
4192     new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4193     __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4194     if (new_thr == __kmp_thread_pool_insert_pt) {
4195       __kmp_thread_pool_insert_pt = NULL;
4196     }
4197     TCW_4(new_thr->th.th_in_pool, FALSE);
4198     __kmp_suspend_initialize_thread(new_thr);
4199     __kmp_lock_suspend_mx(new_thr);
4200     if (new_thr->th.th_active_in_pool == TRUE) {
4201       KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4202       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4203       new_thr->th.th_active_in_pool = FALSE;
4204     }
4205     __kmp_unlock_suspend_mx(new_thr);
4206 
4207     KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4208                   __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4209     KMP_ASSERT(!new_thr->th.th_team);
4210     KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4211 
4212     /* setup the thread structure */
4213     __kmp_initialize_info(new_thr, team, new_tid,
4214                           new_thr->th.th_info.ds.ds_gtid);
4215     KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4216 
4217     TCW_4(__kmp_nth, __kmp_nth + 1);
4218 
4219     new_thr->th.th_task_state = 0;
4220     new_thr->th.th_task_state_top = 0;
4221     new_thr->th.th_task_state_stack_sz = 4;
4222 
4223 #ifdef KMP_ADJUST_BLOCKTIME
4224     /* Adjust blocktime back to zero if necessary */
4225     /* Middle initialization might not have occurred yet */
4226     if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4227       if (__kmp_nth > __kmp_avail_proc) {
4228         __kmp_zero_bt = TRUE;
4229       }
4230     }
4231 #endif /* KMP_ADJUST_BLOCKTIME */
4232 
4233 #if KMP_DEBUG
4234     // If the thread entered the pool via __kmp_free_thread, wait_flag should
4235     // not equal KMP_BARRIER_PARENT_FLAG.
4236     int b;
4237     kmp_balign_t *balign = new_thr->th.th_bar;
4238     for (b = 0; b < bs_last_barrier; ++b)
4239       KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4240 #endif
4241 
4242     KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4243                   __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4244 
4245     KMP_MB();
4246     return new_thr;
4247   }
4248 
4249   /* no, we'll fork a new one */
4250   KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4251   KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4252 
4253 #if KMP_USE_MONITOR
4254   // If this is the first worker thread the RTL is creating, then also
4255   // launch the monitor thread.  We try to do this as early as possible.
4256   if (!TCR_4(__kmp_init_monitor)) {
4257     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4258     if (!TCR_4(__kmp_init_monitor)) {
4259       KF_TRACE(10, ("before __kmp_create_monitor\n"));
4260       TCW_4(__kmp_init_monitor, 1);
4261       __kmp_create_monitor(&__kmp_monitor);
4262       KF_TRACE(10, ("after __kmp_create_monitor\n"));
4263 #if KMP_OS_WINDOWS
4264       // AC: wait until the monitor has started. This is a fix for CQ232808.
4265       // If the library is loaded/unloaded in a loop with small (parallel)
4266       // work in between, there is a high probability that the monitor thread
4267       // starts only after the library has shut down. At shutdown it is too
4268       // late to cope with the problem, because when the master is in DllMain
4269       // (process detach) the monitor has no chance to start (it is blocked),
4270       // and the master has no means to inform the monitor that the library
4271       // has gone, because all the memory the monitor could access is about to
4272       // be released/reset.
4273       while (TCR_4(__kmp_init_monitor) < 2) {
4274         KMP_YIELD(TRUE);
4275       }
4276       KF_TRACE(10, ("after monitor thread has started\n"));
4277 #endif
4278     }
4279     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4280   }
4281 #endif
4282 
4283   KMP_MB();
4284   for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
4285     KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4286   }
4287 
4288   /* allocate space for it. */
4289   new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4290 
4291   TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4292 
4293 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4294   // Suppress race-condition detection on synchronization flags in debug mode;
4295   // this helps to analyze library internals by eliminating false positives.
4296   __itt_suppress_mark_range(
4297       __itt_suppress_range, __itt_suppress_threading_errors,
4298       &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4299   __itt_suppress_mark_range(
4300       __itt_suppress_range, __itt_suppress_threading_errors,
4301       &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4302 #if KMP_OS_WINDOWS
4303   __itt_suppress_mark_range(
4304       __itt_suppress_range, __itt_suppress_threading_errors,
4305       &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4306 #else
4307   __itt_suppress_mark_range(__itt_suppress_range,
4308                             __itt_suppress_threading_errors,
4309                             &new_thr->th.th_suspend_init_count,
4310                             sizeof(new_thr->th.th_suspend_init_count));
4311 #endif
4312   // TODO: check if we need to also suppress b_arrived flags
4313   __itt_suppress_mark_range(__itt_suppress_range,
4314                             __itt_suppress_threading_errors,
4315                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4316                             sizeof(new_thr->th.th_bar[0].bb.b_go));
4317   __itt_suppress_mark_range(__itt_suppress_range,
4318                             __itt_suppress_threading_errors,
4319                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4320                             sizeof(new_thr->th.th_bar[1].bb.b_go));
4321   __itt_suppress_mark_range(__itt_suppress_range,
4322                             __itt_suppress_threading_errors,
4323                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4324                             sizeof(new_thr->th.th_bar[2].bb.b_go));
4325 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4326   if (__kmp_storage_map) {
4327     __kmp_print_thread_storage_map(new_thr, new_gtid);
4328   }
4329 
4330   // add the reserve serialized team, initialized from the team's master thread
4331   {
4332     kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4333     KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4334     new_thr->th.th_serial_team = serial_team =
4335         (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4336 #if OMPT_SUPPORT
4337                                           ompt_data_none, // root parallel id
4338 #endif
4339                                           proc_bind_default, &r_icvs,
4340                                           0 USE_NESTED_HOT_ARG(NULL));
4341   }
4342   KMP_ASSERT(serial_team);
4343   serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not
4344   // for execution (it is unused for now).
4345   serial_team->t.t_threads[0] = new_thr;
4346   KF_TRACE(10,
4347            ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4348             new_thr));
4349 
4350   /* setup the thread structures */
4351   __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4352 
4353 #if USE_FAST_MEMORY
4354   __kmp_initialize_fast_memory(new_thr);
4355 #endif /* USE_FAST_MEMORY */
4356 
4357 #if KMP_USE_BGET
4358   KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4359   __kmp_initialize_bget(new_thr);
4360 #endif
4361 
4362   __kmp_init_random(new_thr); // Initialize random number generator
4363 
4364   /* Initialize these only once when thread is grabbed for a team allocation */
4365   KA_TRACE(20,
4366            ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4367             __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4368 
4369   int b;
4370   kmp_balign_t *balign = new_thr->th.th_bar;
4371   for (b = 0; b < bs_last_barrier; ++b) {
4372     balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4373     balign[b].bb.team = NULL;
4374     balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4375     balign[b].bb.use_oncore_barrier = 0;
4376   }
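  // Descriptive note (added): b_go starts at KMP_INIT_BARRIER_STATE, so after
  // __kmp_create_worker below the new thread parks in the fork barrier,
  // waiting on its b_go flag until the master releases it for its first
  // parallel region.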
4377 
4378   new_thr->th.th_spin_here = FALSE;
4379   new_thr->th.th_next_waiting = 0;
4380 #if KMP_OS_UNIX
4381   new_thr->th.th_blocking = false;
4382 #endif
4383 
4384 #if KMP_AFFINITY_SUPPORTED
4385   new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4386   new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4387   new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4388   new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4389 #endif
4390   new_thr->th.th_def_allocator = __kmp_def_allocator;
4391   new_thr->th.th_prev_level = 0;
4392   new_thr->th.th_prev_num_threads = 1;
4393 
4394   TCW_4(new_thr->th.th_in_pool, FALSE);
4395   new_thr->th.th_active_in_pool = FALSE;
4396   TCW_4(new_thr->th.th_active, TRUE);
4397 
4398   /* adjust the global counters */
4399   __kmp_all_nth++;
4400   __kmp_nth++;
4401 
4402   // If __kmp_adjust_gtid_mode is set, we use method #1 (stack-pointer search)
4403   // for low thread counts and method #2 (keyed API call) for higher counts.
4404   if (__kmp_adjust_gtid_mode) {
4405     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4406       if (TCR_4(__kmp_gtid_mode) != 2) {
4407         TCW_4(__kmp_gtid_mode, 2);
4408       }
4409     } else {
4410       if (TCR_4(__kmp_gtid_mode) != 1) {
4411         TCW_4(__kmp_gtid_mode, 1);
4412       }
4413     }
4414   }
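  // Illustrative example (the threshold value is made up, not the configured
  // default): if __kmp_tls_gtid_min were 5, this block would keep gtid_mode 1
  // (stack-address search) while fewer than 5 threads exist in total and
  // switch to gtid_mode 2 (keyed API lookup) once __kmp_all_nth reaches 5.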
4415 
4416 #ifdef KMP_ADJUST_BLOCKTIME
4417   /* Adjust blocktime back to zero if necessary       */
4418   /* Middle initialization might not have occurred yet */
4419   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4420     if (__kmp_nth > __kmp_avail_proc) {
4421       __kmp_zero_bt = TRUE;
4422     }
4423   }
4424 #endif /* KMP_ADJUST_BLOCKTIME */
4425 
4426   /* actually fork it and create the new worker thread */
4427   KF_TRACE(
4428       10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4429   __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4430   KF_TRACE(10,
4431            ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4432 
4433   KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4434                 new_gtid));
4435   KMP_MB();
4436   return new_thr;
4437 }
4438 
4439 /* Reinitialize team for reuse.
4440    The hot team code calls this routine at every fork barrier, so EPCC barrier
4441    tests are extremely sensitive to changes in it, esp. writes to the team
4442    struct, which cause a cache invalidation in all threads.
4443    IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4444 static void __kmp_reinitialize_team(kmp_team_t *team,
4445                                     kmp_internal_control_t *new_icvs,
4446                                     ident_t *loc) {
4447   KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4448                 team->t.t_threads[0], team));
4449   KMP_DEBUG_ASSERT(team && new_icvs);
4450   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4451   KMP_CHECK_UPDATE(team->t.t_ident, loc);
4452 
4453   KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4454   // Copy ICVs to the master thread's implicit taskdata
4455   __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4456   copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4457 
4458   KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4459                 team->t.t_threads[0], team));
4460 }
4461 
4462 /* Initialize the team data structure.
4463    This assumes the t_threads and t_max_nproc are already set.
4464    Also, we don't touch the arguments */
4465 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4466                                   kmp_internal_control_t *new_icvs,
4467                                   ident_t *loc) {
4468   KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4469 
4470   /* verify */
4471   KMP_DEBUG_ASSERT(team);
4472   KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4473   KMP_DEBUG_ASSERT(team->t.t_threads);
4474   KMP_MB();
4475 
4476   team->t.t_master_tid = 0; /* not needed */
4477   /* team->t.t_master_bar;        not needed */
4478   team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4479   team->t.t_nproc = new_nproc;
4480 
4481   /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
4482   team->t.t_next_pool = NULL;
4483   /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4484    * up hot team */
4485 
4486   TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4487   team->t.t_invoke = NULL; /* not needed */
4488 
4489   // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
4490   team->t.t_sched.sched = new_icvs->sched.sched;
4491 
4492 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4493   team->t.t_fp_control_saved = FALSE; /* not needed */
4494   team->t.t_x87_fpu_control_word = 0; /* not needed */
4495   team->t.t_mxcsr = 0; /* not needed */
4496 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4497 
4498   team->t.t_construct = 0;
4499 
4500   team->t.t_ordered.dt.t_value = 0;
4501   team->t.t_master_active = FALSE;
4502 
4503 #ifdef KMP_DEBUG
4504   team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4505 #endif
4506 #if KMP_OS_WINDOWS
4507   team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4508 #endif
4509 
4510   team->t.t_control_stack_top = NULL;
4511 
4512   __kmp_reinitialize_team(team, new_icvs, loc);
4513 
4514   KMP_MB();
4515   KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4516 }
4517 
4518 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
4519 /* Sets the full mask for the calling thread and, if old_mask is non-NULL,
   stores the previous mask there; no changes to internal structures. */
4520 static void
4521 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4522   if (KMP_AFFINITY_CAPABLE()) {
4523     int status;
4524     if (old_mask != NULL) {
4525       status = __kmp_get_system_affinity(old_mask, TRUE);
4526       int error = errno;
4527       if (status != 0) {
4528         __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4529                     __kmp_msg_null);
4530       }
4531     }
4532     __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4533   }
4534 }
4535 #endif
4536 
4537 #if KMP_AFFINITY_SUPPORTED
4538 
4539 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4540 // It calculates the worker + master thread's partition based upon the parent
4541 // thread's partition, and binds each worker to a place in its partition.
4542 // The master thread's partition should already include its current binding.
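// Illustrative example (assumed configuration, added for explanation only):
// with a master partition of places [0,7] and a team of 4 threads,
// proc_bind_master binds every worker to the master's place, proc_bind_close
// walks the workers onto consecutive places after the master's (wrapping
// within [0,7]), and proc_bind_spread carves [0,7] into 4 sub-partitions of
// roughly 2 places each, pinning each thread to the first place of its
// sub-partition.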
4543 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4544   // Copy the master thread's place partition to the team struct
4545   kmp_info_t *master_th = team->t.t_threads[0];
4546   KMP_DEBUG_ASSERT(master_th != NULL);
4547   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4548   int first_place = master_th->th.th_first_place;
4549   int last_place = master_th->th.th_last_place;
4550   int masters_place = master_th->th.th_current_place;
4551   team->t.t_first_place = first_place;
4552   team->t.t_last_place = last_place;
4553 
4554   KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4555                 "bound to place %d partition = [%d,%d]\n",
4556                 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4557                 team->t.t_id, masters_place, first_place, last_place));
4558 
4559   switch (proc_bind) {
4560 
4561   case proc_bind_default:
4562     // serial teams might have the proc_bind policy set to proc_bind_default. It
4563     // doesn't matter, as we don't rebind master thread for any proc_bind policy
4564     KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4565     break;
4566 
4567   case proc_bind_master: {
4568     int f;
4569     int n_th = team->t.t_nproc;
4570     for (f = 1; f < n_th; f++) {
4571       kmp_info_t *th = team->t.t_threads[f];
4572       KMP_DEBUG_ASSERT(th != NULL);
4573       th->th.th_first_place = first_place;
4574       th->th.th_last_place = last_place;
4575       th->th.th_new_place = masters_place;
4576       if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4577           team->t.t_display_affinity != 1) {
4578         team->t.t_display_affinity = 1;
4579       }
4580 
4581       KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
4582                      "partition = [%d,%d]\n",
4583                      __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4584                      f, masters_place, first_place, last_place));
4585     }
4586   } break;
4587 
4588   case proc_bind_close: {
4589     int f;
4590     int n_th = team->t.t_nproc;
4591     int n_places;
4592     if (first_place <= last_place) {
4593       n_places = last_place - first_place + 1;
4594     } else {
4595       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4596     }
4597     if (n_th <= n_places) {
4598       int place = masters_place;
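      // For illustration (assumed numbers): with 3 workers and a partition
      // [2,5] whose master sits on place 2, the loop below binds the workers
      // to places 3, 4 and 5 while the master stays on place 2.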
4599       for (f = 1; f < n_th; f++) {
4600         kmp_info_t *th = team->t.t_threads[f];
4601         KMP_DEBUG_ASSERT(th != NULL);
4602 
4603         if (place == last_place) {
4604           place = first_place;
4605         } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4606           place = 0;
4607         } else {
4608           place++;
4609         }
4610         th->th.th_first_place = first_place;
4611         th->th.th_last_place = last_place;
4612         th->th.th_new_place = place;
4613         if (__kmp_display_affinity && place != th->th.th_current_place &&
4614             team->t.t_display_affinity != 1) {
4615           team->t.t_display_affinity = 1;
4616         }
4617 
4618         KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4619                        "partition = [%d,%d]\n",
4620                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4621                        team->t.t_id, f, place, first_place, last_place));
4622       }
4623     } else {
4624       int S, rem, gap, s_count;
4625       S = n_th / n_places;
4626       s_count = 0;
4627       rem = n_th - (S * n_places);
4628       gap = rem > 0 ? n_places / rem : n_places;
4629       int place = masters_place;
4630       int gap_ct = gap;
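      // Worked example (illustrative numbers only): with n_th = 10 and
      // n_places = 4, S = 2, rem = 2 and gap = 2, so the loop below assigns
      // 3, 2, 3, 2 threads to the four places in order, starting at the
      // master's place; every gap-th place absorbs one of the rem extra
      // threads.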
4631       for (f = 0; f < n_th; f++) {
4632         kmp_info_t *th = team->t.t_threads[f];
4633         KMP_DEBUG_ASSERT(th != NULL);
4634 
4635         th->th.th_first_place = first_place;
4636         th->th.th_last_place = last_place;
4637         th->th.th_new_place = place;
4638         if (__kmp_display_affinity && place != th->th.th_current_place &&
4639             team->t.t_display_affinity != 1) {
4640           team->t.t_display_affinity = 1;
4641         }
4642         s_count++;
4643 
4644         if ((s_count == S) && rem && (gap_ct == gap)) {
4645           // do nothing, add an extra thread to place on next iteration
4646         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4647           // we added an extra thread to this place; move to next place
4648           if (place == last_place) {
4649             place = first_place;
4650           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4651             place = 0;
4652           } else {
4653             place++;
4654           }
4655           s_count = 0;
4656           gap_ct = 1;
4657           rem--;
4658         } else if (s_count == S) { // place full; don't add extra
4659           if (place == last_place) {
4660             place = first_place;
4661           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4662             place = 0;
4663           } else {
4664             place++;
4665           }
4666           gap_ct++;
4667           s_count = 0;
4668         }
4669 
4670         KA_TRACE(100,
4671                  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4672                   "partition = [%d,%d]\n",
4673                   __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4674                   th->th.th_new_place, first_place, last_place));
4675       }
4676       KMP_DEBUG_ASSERT(place == masters_place);
4677     }
4678   } break;
4679 
4680   case proc_bind_spread: {
4681     int f;
4682     int n_th = team->t.t_nproc;
4683     int n_places;
4684     int thidx;
4685     if (first_place <= last_place) {
4686       n_places = last_place - first_place + 1;
4687     } else {
4688       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4689     }
4690     if (n_th <= n_places) {
4691       int place = -1;
4692 
4693       if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4694         int S = n_places / n_th;
4695         int s_count, rem, gap, gap_ct;
4696 
4697         place = masters_place;
4698         rem = n_places - n_th * S;
4699         gap = rem ? n_th / rem : 1;
4700         gap_ct = gap;
4701         thidx = n_th;
4702         if (update_master_only == 1)
4703           thidx = 1;
4704         for (f = 0; f < thidx; f++) {
4705           kmp_info_t *th = team->t.t_threads[f];
4706           KMP_DEBUG_ASSERT(th != NULL);
4707 
4708           th->th.th_first_place = place;
4709           th->th.th_new_place = place;
4710           if (__kmp_display_affinity && place != th->th.th_current_place &&
4711               team->t.t_display_affinity != 1) {
4712             team->t.t_display_affinity = 1;
4713           }
4714           s_count = 1;
4715           while (s_count < S) {
4716             if (place == last_place) {
4717               place = first_place;
4718             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4719               place = 0;
4720             } else {
4721               place++;
4722             }
4723             s_count++;
4724           }
4725           if (rem && (gap_ct == gap)) {
4726             if (place == last_place) {
4727               place = first_place;
4728             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4729               place = 0;
4730             } else {
4731               place++;
4732             }
4733             rem--;
4734             gap_ct = 0;
4735           }
4736           th->th.th_last_place = place;
4737           gap_ct++;
4738 
4739           if (place == last_place) {
4740             place = first_place;
4741           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4742             place = 0;
4743           } else {
4744             place++;
4745           }
4746 
4747           KA_TRACE(100,
4748                    ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4749                     "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4750                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4751                     f, th->th.th_new_place, th->th.th_first_place,
4752                     th->th.th_last_place, __kmp_affinity_num_masks));
4753         }
4754       } else {
4755         /* Given a uniform space of available computation places, we create
4756            T partitions of roughly P/T places each and put each thread into
4757            the first place of its partition. */
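        /* Worked example (illustrative numbers only): with n_places = 8
           covering the full mask, n_th = 3 and the master on place 0,
           spacing = (8 + 1) / 3 = 3.0, so the loop below hands out the
           partitions [0,2], [3,5] and [6,7] (the last one clamped to the
           final place), binding each thread to the first place of its
           partition. */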
4758         double current = static_cast<double>(masters_place);
4759         double spacing =
4760             (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
4761         int first, last;
4762         kmp_info_t *th;
4763 
4764         thidx = n_th + 1;
4765         if (update_master_only == 1)
4766           thidx = 1;
4767         for (f = 0; f < thidx; f++) {
4768           first = static_cast<int>(current);
4769           last = static_cast<int>(current + spacing) - 1;
4770           KMP_DEBUG_ASSERT(last >= first);
4771           if (first >= n_places) {
4772             if (masters_place) {
4773               first -= n_places;
4774               last -= n_places;
4775               if (first == (masters_place + 1)) {
4776                 KMP_DEBUG_ASSERT(f == n_th);
4777                 first--;
4778               }
4779               if (last == masters_place) {
4780                 KMP_DEBUG_ASSERT(f == (n_th - 1));
4781                 last--;
4782               }
4783             } else {
4784               KMP_DEBUG_ASSERT(f == n_th);
4785               first = 0;
4786               last = 0;
4787             }
4788           }
4789           if (last >= n_places) {
4790             last = (n_places - 1);
4791           }
4792           place = first;
4793           current += spacing;
4794           if (f < n_th) {
4795             KMP_DEBUG_ASSERT(0 <= first);
4796             KMP_DEBUG_ASSERT(n_places > first);
4797             KMP_DEBUG_ASSERT(0 <= last);
4798             KMP_DEBUG_ASSERT(n_places > last);
4799             KMP_DEBUG_ASSERT(last_place >= first_place);
4800             th = team->t.t_threads[f];
4801             KMP_DEBUG_ASSERT(th);
4802             th->th.th_first_place = first;
4803             th->th.th_new_place = place;
4804             th->th.th_last_place = last;
4805             if (__kmp_display_affinity && place != th->th.th_current_place &&
4806                 team->t.t_display_affinity != 1) {
4807               team->t.t_display_affinity = 1;
4808             }
4809             KA_TRACE(100,
4810                      ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4811                       "partition = [%d,%d], spacing = %.4f\n",
4812                       __kmp_gtid_from_thread(team->t.t_threads[f]),
4813                       team->t.t_id, f, th->th.th_new_place,
4814                       th->th.th_first_place, th->th.th_last_place, spacing));
4815           }
4816         }
4817       }
4818       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4819     } else {
4820       int S, rem, gap, s_count;
4821       S = n_th / n_places;
4822       s_count = 0;
4823       rem = n_th - (S * n_places);
4824       gap = rem > 0 ? n_places / rem : n_places;
4825       int place = masters_place;
4826       int gap_ct = gap;
4827       thidx = n_th;
4828       if (update_master_only == 1)
4829         thidx = 1;
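      // The loop below reuses the S/rem/gap distribution of the oversubscribed
      // proc_bind_close case above, except that each thread's partition is
      // narrowed to the single place it is bound to.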
4830       for (f = 0; f < thidx; f++) {
4831         kmp_info_t *th = team->t.t_threads[f];
4832         KMP_DEBUG_ASSERT(th != NULL);
4833 
4834         th->th.th_first_place = place;
4835         th->th.th_last_place = place;
4836         th->th.th_new_place = place;
4837         if (__kmp_display_affinity && place != th->th.th_current_place &&
4838             team->t.t_display_affinity != 1) {
4839           team->t.t_display_affinity = 1;
4840         }
4841         s_count++;
4842 
4843         if ((s_count == S) && rem && (gap_ct == gap)) {
4844           // do nothing, add an extra thread to place on next iteration
4845         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4846           // we added an extra thread to this place; move on to next place
4847           if (place == last_place) {
4848             place = first_place;
4849           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4850             place = 0;
4851           } else {
4852             place++;
4853           }
4854           s_count = 0;
4855           gap_ct = 1;
4856           rem--;
4857         } else if (s_count == S) { // place is full; don't add extra thread
4858           if (place == last_place) {
4859             place = first_place;
4860           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4861             place = 0;
4862           } else {
4863             place++;
4864           }
4865           gap_ct++;
4866           s_count = 0;
4867         }
4868 
4869         KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4870                        "partition = [%d,%d]\n",
4871                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4872                        team->t.t_id, f, th->th.th_new_place,
4873                        th->th.th_first_place, th->th.th_last_place));
4874       }
4875       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4876     }
4877   } break;
4878 
4879   default:
4880     break;
4881   }
4882 
4883   KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4884 }
4885 
4886 #endif // KMP_AFFINITY_SUPPORTED
4887 
4888 /* allocate a new team data structure to use.  take one off of the free pool if
4889    available */
4890 kmp_team_t *
4891 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4892 #if OMPT_SUPPORT
4893                     ompt_data_t ompt_parallel_data,
4894 #endif
4895                     kmp_proc_bind_t new_proc_bind,
4896                     kmp_internal_control_t *new_icvs,
4897                     int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4898   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4899   int f;
4900   kmp_team_t *team;
4901   int use_hot_team = !root->r.r_active;
4902   int level = 0;
4903 
4904   KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4905   KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4906   KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4907   KMP_MB();
4908 
4909 #if KMP_NESTED_HOT_TEAMS
4910   kmp_hot_team_ptr_t *hot_teams;
4911   if (master) {
4912     team = master->th.th_team;
4913     level = team->t.t_active_level;
4914     if (master->th.th_teams_microtask) { // in teams construct?
4915       if (master->th.th_teams_size.nteams > 1 &&
4916           ( // #teams > 1
4917               team->t.t_pkfn ==
4918                   (microtask_t)__kmp_teams_master || // inner fork of the teams
4919               master->th.th_teams_level <
4920                   team->t.t_level)) { // or nested parallel inside the teams
4921         ++level; // not increment if #teams==1, or for outer fork of the teams;
4922         // increment otherwise
4923       }
4924     }
4925     hot_teams = master->th.th_hot_teams;
4926     if (level < __kmp_hot_teams_max_level && hot_teams &&
4927         hot_teams[level].hot_team) {
4928       // hot team has already been allocated for given level
4929       use_hot_team = 1;
4930     } else {
4931       use_hot_team = 0;
4932     }
4933   } else {
4934     // check we won't access uninitialized hot_teams, just in case
4935     KMP_DEBUG_ASSERT(new_nproc == 1);
4936   }
4937 #endif
4938   // Optimization to use a "hot" team
4939   if (use_hot_team && new_nproc > 1) {
4940     KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
4941 #if KMP_NESTED_HOT_TEAMS
4942     team = hot_teams[level].hot_team;
4943 #else
4944     team = root->r.r_hot_team;
4945 #endif
4946 #if KMP_DEBUG
4947     if (__kmp_tasking_mode != tskm_immediate_exec) {
4948       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
4949                     "task_team[1] = %p before reinit\n",
4950                     team->t.t_task_team[0], team->t.t_task_team[1]));
4951     }
4952 #endif
4953 
4954     // Has the number of threads changed?
4955     /* Let's assume the most common case is that the number of threads is
4956        unchanged, and put that case first. */
4957     if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4958       KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
4959       // This case can mean that omp_set_num_threads() was called and the hot
4960       // team size was already reduced, so we check the special flag
4961       if (team->t.t_size_changed == -1) {
4962         team->t.t_size_changed = 1;
4963       } else {
4964         KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
4965       }
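      // (t_size_changed values as used here: -1 means the hot team size was
      // already reduced elsewhere, 1 means the size changed for this fork,
      // 0 means it is unchanged.)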
4966 
4967       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4968       kmp_r_sched_t new_sched = new_icvs->sched;
4969       // set master's schedule as new run-time schedule
4970       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
4971 
4972       __kmp_reinitialize_team(team, new_icvs,
4973                               root->r.r_uber_thread->th.th_ident);
4974 
4975       KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
4976                     team->t.t_threads[0], team));
4977       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
4978 
4979 #if KMP_AFFINITY_SUPPORTED
4980       if ((team->t.t_size_changed == 0) &&
4981           (team->t.t_proc_bind == new_proc_bind)) {
4982         if (new_proc_bind == proc_bind_spread) {
4983           __kmp_partition_places(
4984               team, 1); // add flag to update only master for spread
4985         }
4986         KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
4987                        "proc_bind = %d, partition = [%d,%d]\n",
4988                        team->t.t_id, new_proc_bind, team->t.t_first_place,
4989                        team->t.t_last_place));
4990       } else {
4991         KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4992         __kmp_partition_places(team);
4993       }
4994 #else
4995       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4996 #endif /* KMP_AFFINITY_SUPPORTED */
4997     } else if (team->t.t_nproc > new_nproc) {
4998       KA_TRACE(20,
4999                ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5000                 new_nproc));
5001 
5002       team->t.t_size_changed = 1;
5003 #if KMP_NESTED_HOT_TEAMS
5004       if (__kmp_hot_teams_mode == 0) {
5005         // AC: in this mode the saved thread count should match the team's
5006         // value; it can be bigger in mode 1, when the hot team has threads in reserve
5007         KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5008         hot_teams[level].hot_team_nth = new_nproc;
5009 #endif // KMP_NESTED_HOT_TEAMS
5010         /* release the extra threads we don't need any more */
5011         for (f = new_nproc; f < team->t.t_nproc; f++) {
5012           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5013           if (__kmp_tasking_mode != tskm_immediate_exec) {
5014             // When decreasing team size, threads no longer in the team should
5015             // unref task team.
5016             team->t.t_threads[f]->th.th_task_team = NULL;
5017           }
5018           __kmp_free_thread(team->t.t_threads[f]);
5019           team->t.t_threads[f] = NULL;
5020         }
5021 #if KMP_NESTED_HOT_TEAMS
5022       } // (__kmp_hot_teams_mode == 0)
5023       else {
5024         // When keeping extra threads in team, switch threads to wait on own
5025         // b_go flag
5026         for (f = new_nproc; f < team->t.t_nproc; ++f) {
5027           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5028           kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5029           for (int b = 0; b < bs_last_barrier; ++b) {
5030             if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5031               balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5032             }
5033             KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5034           }
5035         }
5036       }
5037 #endif // KMP_NESTED_HOT_TEAMS
5038       team->t.t_nproc = new_nproc;
5039       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5040       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5041       __kmp_reinitialize_team(team, new_icvs,
5042                               root->r.r_uber_thread->th.th_ident);
5043 
5044       // Update remaining threads
5045       for (f = 0; f < new_nproc; ++f) {
5046         team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5047       }
5048 
5049       // restore the current task state of the master thread: should be the
5050       // implicit task
5051       KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5052                     team->t.t_threads[0], team));
5053 
5054       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5055 
5056 #ifdef KMP_DEBUG
5057       for (f = 0; f < team->t.t_nproc; f++) {
5058         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5059                          team->t.t_threads[f]->th.th_team_nproc ==
5060                              team->t.t_nproc);
5061       }
5062 #endif
5063 
5064       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5065 #if KMP_AFFINITY_SUPPORTED
5066       __kmp_partition_places(team);
5067 #endif
5068     } else { // team->t.t_nproc < new_nproc
5069 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5070       kmp_affin_mask_t *old_mask;
5071       if (KMP_AFFINITY_CAPABLE()) {
5072         KMP_CPU_ALLOC(old_mask);
5073       }
5074 #endif
5075 
5076       KA_TRACE(20,
5077                ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5078                 new_nproc));
5079 
5080       team->t.t_size_changed = 1;
5081 
5082 #if KMP_NESTED_HOT_TEAMS
5083       int avail_threads = hot_teams[level].hot_team_nth;
5084       if (new_nproc < avail_threads)
5085         avail_threads = new_nproc;
5086       kmp_info_t **other_threads = team->t.t_threads;
5087       for (f = team->t.t_nproc; f < avail_threads; ++f) {
5088         // Adjust barrier data of reserved threads (if any) of the team
5089         // Other data will be set in __kmp_initialize_info() below.
5090         int b;
5091         kmp_balign_t *balign = other_threads[f]->th.th_bar;
5092         for (b = 0; b < bs_last_barrier; ++b) {
5093           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5094           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5095 #if USE_DEBUGGER
5096           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5097 #endif
5098         }
5099       }
5100       if (hot_teams[level].hot_team_nth >= new_nproc) {
5101         // we have all needed threads in reserve, no need to allocate any;
5102         // this is only possible in mode 1, as mode 0 cannot have reserved threads
5103         KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5104         team->t.t_nproc = new_nproc; // just get reserved threads involved
5105       } else {
5106         // we may have some threads in reserve, but not enough
5107         team->t.t_nproc =
5108             hot_teams[level]
5109                 .hot_team_nth; // get reserved threads involved if any
5110         hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5111 #endif // KMP_NESTED_HOT_TEAMS
5112         if (team->t.t_max_nproc < new_nproc) {
5113           /* reallocate larger arrays */
5114           __kmp_reallocate_team_arrays(team, new_nproc);
5115           __kmp_reinitialize_team(team, new_icvs, NULL);
5116         }
5117 
5118 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5119         /* Temporarily set full mask for master thread before creation of
5120            workers. The reason is that workers inherit the affinity from master,
5121            so if a lot of workers are created on a single core quickly, they
5122            don't get a chance to set their own affinity for a long time. */
5123         __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5124 #endif
5125 
5126         /* allocate new threads for the hot team */
5127         for (f = team->t.t_nproc; f < new_nproc; f++) {
5128           kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5129           KMP_DEBUG_ASSERT(new_worker);
5130           team->t.t_threads[f] = new_worker;
5131 
5132           KA_TRACE(20,
5133                    ("__kmp_allocate_team: team %d init T#%d(%d:%d) arrived: "
5134                     "join=%llu, plain=%llu\n",
5135                     team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5136                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5137                     team->t.t_bar[bs_plain_barrier].b_arrived));
5138 
5139           { // Initialize barrier data for new threads.
5140             int b;
5141             kmp_balign_t *balign = new_worker->th.th_bar;
5142             for (b = 0; b < bs_last_barrier; ++b) {
5143               balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5144               KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5145                                KMP_BARRIER_PARENT_FLAG);
5146 #if USE_DEBUGGER
5147               balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5148 #endif
5149             }
5150           }
5151         }
5152 
5153 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5154         if (KMP_AFFINITY_CAPABLE()) {
5155           /* Restore initial master thread's affinity mask */
5156           __kmp_set_system_affinity(old_mask, TRUE);
5157           KMP_CPU_FREE(old_mask);
5158         }
5159 #endif
5160 #if KMP_NESTED_HOT_TEAMS
5161       } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5162 #endif // KMP_NESTED_HOT_TEAMS
5163       /* make sure everyone is synchronized */
5164       int old_nproc = team->t.t_nproc; // save old value and use to update only
5165       // new threads below
5166       __kmp_initialize_team(team, new_nproc, new_icvs,
5167                             root->r.r_uber_thread->th.th_ident);
5168 
5169       /* reinitialize the threads */
5170       KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5171       for (f = 0; f < team->t.t_nproc; ++f)
5172         __kmp_initialize_info(team->t.t_threads[f], team, f,
5173                               __kmp_gtid_from_tid(f, team));
5174 
5175       if (level) { // set th_task_state for new threads in nested hot team
5176         // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5177         // only need to set the th_task_state for the new threads. th_task_state
5178         // for master thread will not be accurate until after this in
5179         // __kmp_fork_call(), so we look to the master's memo_stack to get the
5180         // correct value.
5181         for (f = old_nproc; f < team->t.t_nproc; ++f)
5182           team->t.t_threads[f]->th.th_task_state =
5183               team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5184       } else { // set th_task_state for new threads in non-nested hot team
5185         int old_state =
5186             team->t.t_threads[0]->th.th_task_state; // copy master's state
5187         for (f = old_nproc; f < team->t.t_nproc; ++f)
5188           team->t.t_threads[f]->th.th_task_state = old_state;
5189       }
5190 
5191 #ifdef KMP_DEBUG
5192       for (f = 0; f < team->t.t_nproc; ++f) {
5193         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5194                          team->t.t_threads[f]->th.th_team_nproc ==
5195                              team->t.t_nproc);
5196       }
5197 #endif
5198 
5199       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5200 #if KMP_AFFINITY_SUPPORTED
5201       __kmp_partition_places(team);
5202 #endif
5203     } // Check changes in number of threads
5204 
5205     kmp_info_t *master = team->t.t_threads[0];
5206     if (master->th.th_teams_microtask) {
5207       for (f = 1; f < new_nproc; ++f) {
5208         // propagate teams construct specific info to workers
5209         kmp_info_t *thr = team->t.t_threads[f];
5210         thr->th.th_teams_microtask = master->th.th_teams_microtask;
5211         thr->th.th_teams_level = master->th.th_teams_level;
5212         thr->th.th_teams_size = master->th.th_teams_size;
5213       }
5214     }
5215 #if KMP_NESTED_HOT_TEAMS
5216     if (level) {
5217       // Sync barrier state for nested hot teams, not needed for outermost hot
5218       // team.
5219       for (f = 1; f < new_nproc; ++f) {
5220         kmp_info_t *thr = team->t.t_threads[f];
5221         int b;
5222         kmp_balign_t *balign = thr->th.th_bar;
5223         for (b = 0; b < bs_last_barrier; ++b) {
5224           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5225           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5226 #if USE_DEBUGGER
5227           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5228 #endif
5229         }
5230       }
5231     }
5232 #endif // KMP_NESTED_HOT_TEAMS
5233 
5234     /* reallocate space for arguments if necessary */
5235     __kmp_alloc_argv_entries(argc, team, TRUE);
5236     KMP_CHECK_UPDATE(team->t.t_argc, argc);
5237     // The hot team re-uses the previous task team,
5238     // if untouched during the previous release->gather phase.
5239 
5240     KF_TRACE(10, (" hot_team = %p\n", team));
5241 
5242 #if KMP_DEBUG
5243     if (__kmp_tasking_mode != tskm_immediate_exec) {
5244       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5245                     "task_team[1] = %p after reinit\n",
5246                     team->t.t_task_team[0], team->t.t_task_team[1]));
5247     }
5248 #endif
5249 
5250 #if OMPT_SUPPORT
5251     __ompt_team_assign_id(team, ompt_parallel_data);
5252 #endif
5253 
5254     KMP_MB();
5255 
5256     return team;
5257   }
5258 
5259   /* next, let's try to take one from the team pool */
5260   KMP_MB();
5261   for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5262     /* TODO: consider resizing undersized teams instead of reaping them, now
5263        that we have a resizing mechanism */
5264     if (team->t.t_max_nproc >= max_nproc) {
5265       /* take this team from the team pool */
5266       __kmp_team_pool = team->t.t_next_pool;
5267 
5268       /* setup the team for fresh use */
5269       __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5270 
5271       KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5272                     "task_team[1] %p to NULL\n",
5273                     &team->t.t_task_team[0], &team->t.t_task_team[1]));
5274       team->t.t_task_team[0] = NULL;
5275       team->t.t_task_team[1] = NULL;
5276 
5277       /* reallocate space for arguments if necessary */
5278       __kmp_alloc_argv_entries(argc, team, TRUE);
5279       KMP_CHECK_UPDATE(team->t.t_argc, argc);
5280 
5281       KA_TRACE(
5282           20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5283                team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5284       { // Initialize barrier data.
5285         int b;
5286         for (b = 0; b < bs_last_barrier; ++b) {
5287           team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5288 #if USE_DEBUGGER
5289           team->t.t_bar[b].b_master_arrived = 0;
5290           team->t.t_bar[b].b_team_arrived = 0;
5291 #endif
5292         }
5293       }
5294 
5295       team->t.t_proc_bind = new_proc_bind;
5296 
5297       KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5298                     team->t.t_id));
5299 
5300 #if OMPT_SUPPORT
5301       __ompt_team_assign_id(team, ompt_parallel_data);
5302 #endif
5303 
5304       KMP_MB();
5305 
5306       return team;
5307     }
5308 
5309     /* reap team if it is too small, then loop back and check the next one */
5310     // not sure if this is wise, but this will be redone during the hot-teams
5311     // rewrite.
5312     /* TODO: Use technique to find the right size hot-team, don't reap them */
5313     team = __kmp_reap_team(team);
5314     __kmp_team_pool = team;
5315   }
5316 
5317   /* nothing available in the pool, no matter, make a new team! */
5318   KMP_MB();
5319   team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5320 
5321   /* and set it up */
5322   team->t.t_max_nproc = max_nproc;
5323   /* NOTE well: for some reason, allocating one big buffer and dividing it up
5324      seems to really hurt performance a lot on the P4, so we don't do that. */
5325   __kmp_allocate_team_arrays(team, max_nproc);
5326 
5327   KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5328   __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5329 
5330   KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5331                 "%p to NULL\n",
5332                 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5333   team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5334   // memory, no need to duplicate
5335   team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5336   // memory, no need to duplicate
5337 
5338   if (__kmp_storage_map) {
5339     __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5340   }
5341 
5342   /* allocate space for arguments */
5343   __kmp_alloc_argv_entries(argc, team, FALSE);
5344   team->t.t_argc = argc;
5345 
5346   KA_TRACE(20,
5347            ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5348             team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5349   { // Initialize barrier data.
5350     int b;
5351     for (b = 0; b < bs_last_barrier; ++b) {
5352       team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5353 #if USE_DEBUGGER
5354       team->t.t_bar[b].b_master_arrived = 0;
5355       team->t.t_bar[b].b_team_arrived = 0;
5356 #endif
5357     }
5358   }
5359 
5360   team->t.t_proc_bind = new_proc_bind;
5361 
5362 #if OMPT_SUPPORT
5363   __ompt_team_assign_id(team, ompt_parallel_data);
5364   team->t.ompt_serialized_team_info = NULL;
5365 #endif
5366 
5367   KMP_MB();
5368 
5369   KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5370                 team->t.t_id));
5371 
5372   return team;
5373 }
5374 
5375 /* TODO implement hot-teams at all levels */
5376 /* TODO implement lazy thread release on demand (disband request) */
5377 
5378 /* free the team.  return it to the team pool.  release all the threads
5379  * associated with it */
5380 void __kmp_free_team(kmp_root_t *root,
5381                      kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5382   int f;
5383   KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5384                 team->t.t_id));
5385 
5386   /* verify state */
5387   KMP_DEBUG_ASSERT(root);
5388   KMP_DEBUG_ASSERT(team);
5389   KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5390   KMP_DEBUG_ASSERT(team->t.t_threads);
5391 
5392   int use_hot_team = team == root->r.r_hot_team;
5393 #if KMP_NESTED_HOT_TEAMS
5394   int level;
5395   kmp_hot_team_ptr_t *hot_teams;
5396   if (master) {
5397     level = team->t.t_active_level - 1;
5398     if (master->th.th_teams_microtask) { // in teams construct?
5399       if (master->th.th_teams_size.nteams > 1) {
5400         ++level; // level was not increased in teams construct for
5401         // team_of_masters
5402       }
5403       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5404           master->th.th_teams_level == team->t.t_level) {
5405         ++level; // level was not increased in teams construct for
5406         // team_of_workers before the parallel
5407       } // team->t.t_level will be increased inside parallel
5408     }
5409     hot_teams = master->th.th_hot_teams;
5410     if (level < __kmp_hot_teams_max_level) {
5411       KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5412       use_hot_team = 1;
5413     }
5414   }
5415 #endif // KMP_NESTED_HOT_TEAMS
5416 
5417   /* team is done working */
5418   TCW_SYNC_PTR(team->t.t_pkfn,
5419                NULL); // Important for Debugging Support Library.
5420 #if KMP_OS_WINDOWS
5421   team->t.t_copyin_counter = 0; // init counter for possible reuse
5422 #endif
5423   // Do not reset pointer to parent team to NULL for hot teams.
5424 
  /* if we are a non-hot team, release our threads */
5426   if (!use_hot_team) {
5427     if (__kmp_tasking_mode != tskm_immediate_exec) {
5428       // Wait for threads to reach reapable state
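      // A worker may still be spinning in the fork/join barrier or sleeping on
      // its own b_go flag; sleeping workers are resumed below so they can
      // reach KMP_SAFE_TO_REAP.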
5429       for (f = 1; f < team->t.t_nproc; ++f) {
5430         KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5431         kmp_info_t *th = team->t.t_threads[f];
5432         volatile kmp_uint32 *state = &th->th.th_reap_state;
5433         while (*state != KMP_SAFE_TO_REAP) {
5434 #if KMP_OS_WINDOWS
          // On Windows a thread can be killed at any time; check for that here
5436           DWORD ecode;
5437           if (!__kmp_is_thread_alive(th, &ecode)) {
5438             *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5439             break;
5440           }
5441 #endif
5442           // first check if thread is sleeping
5443           kmp_flag_64 fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5444           if (fl.is_sleeping())
5445             fl.resume(__kmp_gtid_from_thread(th));
5446           KMP_CPU_PAUSE();
5447         }
5448       }
5449 
5450       // Delete task teams
5451       int tt_idx;
5452       for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5453         kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5454         if (task_team != NULL) {
5455           for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5456             KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5457             team->t.t_threads[f]->th.th_task_team = NULL;
5458           }
5459           KA_TRACE(
5460               20,
5461               ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5462                __kmp_get_gtid(), task_team, team->t.t_id));
5463 #if KMP_NESTED_HOT_TEAMS
5464           __kmp_free_task_team(master, task_team);
5465 #endif
5466           team->t.t_task_team[tt_idx] = NULL;
5467         }
5468       }
5469     }
5470 
5471     // Reset pointer to parent team only for non-hot teams.
5472     team->t.t_parent = NULL;
5473     team->t.t_level = 0;
5474     team->t.t_active_level = 0;
5475 
5476     /* free the worker threads */
5477     for (f = 1; f < team->t.t_nproc; ++f) {
5478       KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5479       __kmp_free_thread(team->t.t_threads[f]);
5480       team->t.t_threads[f] = NULL;
5481     }
5482 
5483     /* put the team back in the team pool */
5484     /* TODO limit size of team pool, call reap_team if pool too large */
5485     team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5486     __kmp_team_pool = (volatile kmp_team_t *)team;
5487   } else { // Check if team was created for the masters in a teams construct
5488     // See if first worker is a CG root
5489     KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5490                      team->t.t_threads[1]->th.th_cg_roots);
5491     if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5492       // Clean up the CG root nodes on workers so that this team can be re-used
5493       for (f = 1; f < team->t.t_nproc; ++f) {
5494         kmp_info_t *thr = team->t.t_threads[f];
5495         KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5496                          thr->th.th_cg_roots->cg_root == thr);
5497         // Pop current CG root off list
5498         kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5499         thr->th.th_cg_roots = tmp->up;
5500         KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5501                        " up to node %p. cg_nthreads was %d\n",
5502                        thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
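        // Note: the post-decrement captures the value of cg_nthreads before
        // the decrement, so i == 1 means this thread was the last member of
        // the contention group.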
5503         int i = tmp->cg_nthreads--;
5504         if (i == 1) {
5505           __kmp_free(tmp); // free CG if we are the last thread in it
5506         }
5507         // Restore current task's thread_limit from CG root
5508         if (thr->th.th_cg_roots)
5509           thr->th.th_current_task->td_icvs.thread_limit =
5510               thr->th.th_cg_roots->cg_thread_limit;
5511       }
5512     }
5513   }
5514 
5515   KMP_MB();
5516 }
5517 
5518 /* reap the team.  destroy it, reclaim all its resources and free its memory */
5519 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5520   kmp_team_t *next_pool = team->t.t_next_pool;
5521 
5522   KMP_DEBUG_ASSERT(team);
5523   KMP_DEBUG_ASSERT(team->t.t_dispatch);
5524   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5525   KMP_DEBUG_ASSERT(team->t.t_threads);
5526   KMP_DEBUG_ASSERT(team->t.t_argv);
5527 
5528   /* TODO clean the threads that are a part of this? */
5529 
5530   /* free stuff */
5531   __kmp_free_team_arrays(team);
5532   if (team->t.t_argv != &team->t.t_inline_argv[0])
5533     __kmp_free((void *)team->t.t_argv);
5534   __kmp_free(team);
5535 
5536   KMP_MB();
5537   return next_pool;
5538 }
5539 
5540 // Free the thread.  Don't reap it, just place it on the pool of available
5541 // threads.
5542 //
5543 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5544 // binding for the affinity mechanism to be useful.
5545 //
5546 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5547 // However, we want to avoid a potential performance problem by always
5548 // scanning through the list to find the correct point at which to insert
5549 // the thread (potential N**2 behavior).  To do this we keep track of the
5550 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5551 // With single-level parallelism, threads will always be added to the tail
5552 // of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
5553 // parallelism, all bets are off and we may need to scan through the entire
5554 // free list.
5555 //
5556 // This change also has a potentially large performance benefit, for some
5557 // applications.  Previously, as threads were freed from the hot team, they
5558 // would be placed back on the free list in inverse order.  If the hot team
// grew back to its original size, then the freed thread would be placed
5560 // back on the hot team in reverse order.  This could cause bad cache
5561 // locality problems on programs where the size of the hot team regularly
5562 // grew and shrunk.
5563 //
5564 // Now, for single-level parallelism, the OMP tid is always == gtid.
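//
// Illustrative example (hypothetical gtids): if the pool currently holds
// threads with gtids 2 -> 5 -> 7 and the thread with gtid 6 is freed, the
// scan below starts at __kmp_thread_pool_insert_pt (or at the head of
// __kmp_thread_pool if the insert point is already past gtid 6), stops at
// the link that points to gtid 7, and splices the freed thread in to give
// 2 -> 5 -> 6 -> 7, keeping the pool sorted by gtid.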
5565 void __kmp_free_thread(kmp_info_t *this_th) {
5566   int gtid;
5567   kmp_info_t **scan;
5568 
5569   KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5570                 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5571 
5572   KMP_DEBUG_ASSERT(this_th);
5573 
  // When moving a thread to the pool, switch it to wait on its own b_go flag
  // and clear its team pointer (uninitialized, NULL team).
5576   int b;
5577   kmp_balign_t *balign = this_th->th.th_bar;
5578   for (b = 0; b < bs_last_barrier; ++b) {
5579     if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5580       balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5581     balign[b].bb.team = NULL;
5582     balign[b].bb.leaf_kids = 0;
5583   }
5584   this_th->th.th_task_state = 0;
5585   this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5586 
5587   /* put thread back on the free pool */
5588   TCW_PTR(this_th->th.th_team, NULL);
5589   TCW_PTR(this_th->th.th_root, NULL);
5590   TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5591 
5592   while (this_th->th.th_cg_roots) {
5593     this_th->th.th_cg_roots->cg_nthreads--;
5594     KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5595                    " %p of thread  %p to %d\n",
5596                    this_th, this_th->th.th_cg_roots,
5597                    this_th->th.th_cg_roots->cg_root,
5598                    this_th->th.th_cg_roots->cg_nthreads));
5599     kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5600     if (tmp->cg_root == this_th) { // Thread is a cg_root
5601       KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5602       KA_TRACE(
5603           5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5604       this_th->th.th_cg_roots = tmp->up;
5605       __kmp_free(tmp);
5606     } else { // Worker thread
5607       if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5608         __kmp_free(tmp);
5609       }
5610       this_th->th.th_cg_roots = NULL;
5611       break;
5612     }
5613   }
5614 
  /* If the implicit task assigned to this thread can be used by other threads,
   * multiple threads may share the task data and try to free it in
   * __kmp_reap_thread at exit. This duplicate use of the task data can happen
   * with higher probability when the hot team is disabled, but can occur even
   * when the hot team is enabled */
5620   __kmp_free_implicit_task(this_th);
5621   this_th->th.th_current_task = NULL;
5622 
5623   // If the __kmp_thread_pool_insert_pt is already past the new insert
5624   // point, then we need to re-scan the entire list.
5625   gtid = this_th->th.th_info.ds.ds_gtid;
5626   if (__kmp_thread_pool_insert_pt != NULL) {
5627     KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5628     if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5629       __kmp_thread_pool_insert_pt = NULL;
5630     }
5631   }
5632 
5633   // Scan down the list to find the place to insert the thread.
5634   // scan is the address of a link in the list, possibly the address of
5635   // __kmp_thread_pool itself.
5636   //
5637   // In the absence of nested parallelism, the for loop will have 0 iterations.
5638   if (__kmp_thread_pool_insert_pt != NULL) {
5639     scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5640   } else {
5641     scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5642   }
5643   for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5644        scan = &((*scan)->th.th_next_pool))
5645     ;
5646 
5647   // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5648   // to its address.
5649   TCW_PTR(this_th->th.th_next_pool, *scan);
5650   __kmp_thread_pool_insert_pt = *scan = this_th;
5651   KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5652                    (this_th->th.th_info.ds.ds_gtid <
5653                     this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5654   TCW_4(this_th->th.th_in_pool, TRUE);
5655   __kmp_suspend_initialize_thread(this_th);
5656   __kmp_lock_suspend_mx(this_th);
5657   if (this_th->th.th_active == TRUE) {
5658     KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5659     this_th->th.th_active_in_pool = TRUE;
5660   }
5661 #if KMP_DEBUG
5662   else {
5663     KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5664   }
5665 #endif
5666   __kmp_unlock_suspend_mx(this_th);
5667 
5668   TCW_4(__kmp_nth, __kmp_nth - 1);
5669 
5670 #ifdef KMP_ADJUST_BLOCKTIME
5671   /* Adjust blocktime back to user setting or default if necessary */
5672   /* Middle initialization might never have occurred                */
5673   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5674     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5675     if (__kmp_nth <= __kmp_avail_proc) {
5676       __kmp_zero_bt = FALSE;
5677     }
5678   }
5679 #endif /* KMP_ADJUST_BLOCKTIME */
5680 
5681   KMP_MB();
5682 }
5683 
5684 /* ------------------------------------------------------------------------ */
5685 
5686 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5687   int gtid = this_thr->th.th_info.ds.ds_gtid;
5688   /*    void                 *stack_data;*/
5689   kmp_team_t **volatile pteam;
5690 
5691   KMP_MB();
5692   KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5693 
5694   if (__kmp_env_consistency_check) {
5695     this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5696   }
5697 
5698 #if OMPT_SUPPORT
5699   ompt_data_t *thread_data;
5700   if (ompt_enabled.enabled) {
5701     thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5702     *thread_data = ompt_data_none;
5703 
5704     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5705     this_thr->th.ompt_thread_info.wait_id = 0;
5706     this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5707     this_thr->th.ompt_thread_info.parallel_flags = 0;
5708     if (ompt_enabled.ompt_callback_thread_begin) {
5709       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5710           ompt_thread_worker, thread_data);
5711     }
5712     this_thr->th.ompt_thread_info.state = ompt_state_idle;
5713   }
5714 #endif
5715 
5716   /* This is the place where threads wait for work */
5717   while (!TCR_4(__kmp_global.g.g_done)) {
5718     KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5719     KMP_MB();
5720 
5721     /* wait for work to do */
5722     KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5723 
5724     /* No tid yet since not part of a team */
5725     __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5726 
5727 #if OMPT_SUPPORT
5728     if (ompt_enabled.enabled) {
5729       this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5730     }
5731 #endif
5732 
5733     pteam = &this_thr->th.th_team;
5734 
5735     /* have we been allocated? */
5736     if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5737       /* we were just woken up, so run our new task */
5738       if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5739         int rc;
5740         KA_TRACE(20,
5741                  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5742                   gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5743                   (*pteam)->t.t_pkfn));
5744 
5745         updateHWFPControl(*pteam);
5746 
5747 #if OMPT_SUPPORT
5748         if (ompt_enabled.enabled) {
5749           this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5750         }
5751 #endif
5752 
5753         rc = (*pteam)->t.t_invoke(gtid);
5754         KMP_ASSERT(rc);
5755 
5756         KMP_MB();
5757         KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5758                       gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5759                       (*pteam)->t.t_pkfn));
5760       }
5761 #if OMPT_SUPPORT
5762       if (ompt_enabled.enabled) {
5763         /* no frame set while outside task */
5764         __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
5765 
5766         this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5767       }
5768 #endif
5769       /* join barrier after parallel region */
5770       __kmp_join_barrier(gtid);
5771     }
5772   }
5773   TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5774 
5775 #if OMPT_SUPPORT
5776   if (ompt_enabled.ompt_callback_thread_end) {
5777     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5778   }
5779 #endif
5780 
5781   this_thr->th.th_task_team = NULL;
5782   /* run the destructors for the threadprivate data for this thread */
5783   __kmp_common_destroy_gtid(gtid);
5784 
5785   KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5786   KMP_MB();
5787   return this_thr;
5788 }
5789 
5790 /* ------------------------------------------------------------------------ */
5791 
5792 void __kmp_internal_end_dest(void *specific_gtid) {
5793 #if KMP_COMPILER_ICC
5794 #pragma warning(push)
5795 #pragma warning(disable : 810) // conversion from "void *" to "int" may lose
5796 // significant bits
5797 #endif
5798   // Make sure no significant bits are lost
5799   int gtid = (kmp_intptr_t)specific_gtid - 1;
5800 #if KMP_COMPILER_ICC
5801 #pragma warning(pop)
5802 #endif
5803 
5804   KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
  /* NOTE: the gtid is stored as gtid+1 in the thread-local-storage
   * because 0 is reserved for the nothing-stored case */
5807 
5808   /* josh: One reason for setting the gtid specific data even when it is being
5809      destroyed by pthread is to allow gtid lookup through thread specific data
5810      (__kmp_gtid_get_specific).  Some of the code, especially stat code,
5811      that gets executed in the call to __kmp_internal_end_thread, actually
5812      gets the gtid through the thread specific data.  Setting it here seems
5813      rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread
5814      to run smoothly.
5815      todo: get rid of this after we remove the dependence on
5816      __kmp_gtid_get_specific  */
5817   if (gtid >= 0 && KMP_UBER_GTID(gtid))
5818     __kmp_gtid_set_specific(gtid);
5819 #ifdef KMP_TDATA_GTID
5820   __kmp_gtid = gtid;
5821 #endif
5822   __kmp_internal_end_thread(gtid);
5823 }
5824 
5825 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5826 
5827 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5828   __kmp_internal_end_atexit();
5829 }
5830 
5831 #endif
5832 
5833 /* [Windows] josh: when the atexit handler is called, there may still be more
5834    than one thread alive */
5835 void __kmp_internal_end_atexit(void) {
5836   KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5837   /* [Windows]
5838      josh: ideally, we want to completely shutdown the library in this atexit
5839      handler, but stat code that depends on thread specific data for gtid fails
5840      because that data becomes unavailable at some point during the shutdown, so
5841      we call __kmp_internal_end_thread instead. We should eventually remove the
5842      dependency on __kmp_get_specific_gtid in the stat code and use
5843      __kmp_internal_end_library to cleanly shutdown the library.
5844 
5845      // TODO: Can some of this comment about GVS be removed?
5846      I suspect that the offending stat code is executed when the calling thread
5847      tries to clean up a dead root thread's data structures, resulting in GVS
5848      code trying to close the GVS structures for that thread, but since the stat
5849      code uses __kmp_get_specific_gtid to get the gtid with the assumption that
     the calling thread is cleaning up itself instead of another thread, it gets
     confused. This happens because allowing a thread to unregister and clean up
5852      another thread is a recent modification for addressing an issue.
5853      Based on the current design (20050722), a thread may end up
5854      trying to unregister another thread only if thread death does not trigger
5855      the calling of __kmp_internal_end_thread.  For Linux* OS, there is the
5856      thread specific data destructor function to detect thread death. For
5857      Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5858      is nothing.  Thus, the workaround is applicable only for Windows static
5859      stat library. */
5860   __kmp_internal_end_library(-1);
5861 #if KMP_OS_WINDOWS
5862   __kmp_close_console();
5863 #endif
5864 }
5865 
5866 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5867   // It is assumed __kmp_forkjoin_lock is acquired.
5868 
5869   int gtid;
5870 
5871   KMP_DEBUG_ASSERT(thread != NULL);
5872 
5873   gtid = thread->th.th_info.ds.ds_gtid;
5874 
5875   if (!is_root) {
5876     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5877       /* Assume the threads are at the fork barrier here */
5878       KA_TRACE(
5879           20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5880                gtid));
5881       /* Need release fence here to prevent seg faults for tree forkjoin barrier
5882        * (GEH) */
5883       ANNOTATE_HAPPENS_BEFORE(thread);
5884       kmp_flag_64 flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
5885       __kmp_release_64(&flag);
5886     }
5887 
5888     // Terminate OS thread.
5889     __kmp_reap_worker(thread);
5890 
5891     // The thread was killed asynchronously.  If it was actively
5892     // spinning in the thread pool, decrement the global count.
5893     //
5894     // There is a small timing hole here - if the worker thread was just waking
    // up after sleeping in the pool, had reset its th_active_in_pool flag but
5896     // not decremented the global counter __kmp_thread_pool_active_nth yet, then
5897     // the global counter might not get updated.
5898     //
5899     // Currently, this can only happen as the library is unloaded,
5900     // so there are no harmful side effects.
5901     if (thread->th.th_active_in_pool) {
5902       thread->th.th_active_in_pool = FALSE;
5903       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
5904       KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
5905     }
5906   }
5907 
5908   __kmp_free_implicit_task(thread);
5909 
5910 // Free the fast memory for tasking
5911 #if USE_FAST_MEMORY
5912   __kmp_free_fast_memory(thread);
5913 #endif /* USE_FAST_MEMORY */
5914 
5915   __kmp_suspend_uninitialize_thread(thread);
5916 
5917   KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5918   TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5919 
5920   --__kmp_all_nth;
5921 // __kmp_nth was decremented when thread is added to the pool.
5922 
5923 #ifdef KMP_ADJUST_BLOCKTIME
5924   /* Adjust blocktime back to user setting or default if necessary */
5925   /* Middle initialization might never have occurred                */
5926   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5927     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5928     if (__kmp_nth <= __kmp_avail_proc) {
5929       __kmp_zero_bt = FALSE;
5930     }
5931   }
5932 #endif /* KMP_ADJUST_BLOCKTIME */
5933 
5934   /* free the memory being used */
5935   if (__kmp_env_consistency_check) {
5936     if (thread->th.th_cons) {
5937       __kmp_free_cons_stack(thread->th.th_cons);
5938       thread->th.th_cons = NULL;
5939     }
5940   }
5941 
5942   if (thread->th.th_pri_common != NULL) {
5943     __kmp_free(thread->th.th_pri_common);
5944     thread->th.th_pri_common = NULL;
5945   }
5946 
5947   if (thread->th.th_task_state_memo_stack != NULL) {
5948     __kmp_free(thread->th.th_task_state_memo_stack);
5949     thread->th.th_task_state_memo_stack = NULL;
5950   }
5951 
5952 #if KMP_USE_BGET
5953   if (thread->th.th_local.bget_data != NULL) {
5954     __kmp_finalize_bget(thread);
5955   }
5956 #endif
5957 
5958 #if KMP_AFFINITY_SUPPORTED
5959   if (thread->th.th_affin_mask != NULL) {
5960     KMP_CPU_FREE(thread->th.th_affin_mask);
5961     thread->th.th_affin_mask = NULL;
5962   }
5963 #endif /* KMP_AFFINITY_SUPPORTED */
5964 
5965 #if KMP_USE_HIER_SCHED
5966   if (thread->th.th_hier_bar_data != NULL) {
5967     __kmp_free(thread->th.th_hier_bar_data);
5968     thread->th.th_hier_bar_data = NULL;
5969   }
5970 #endif
5971 
5972   __kmp_reap_team(thread->th.th_serial_team);
5973   thread->th.th_serial_team = NULL;
5974   __kmp_free(thread);
5975 
5976   KMP_MB();
5977 
5978 } // __kmp_reap_thread
5979 
5980 static void __kmp_internal_end(void) {
5981   int i;
5982 
5983   /* First, unregister the library */
5984   __kmp_unregister_library();
5985 
5986 #if KMP_OS_WINDOWS
5987   /* In Win static library, we can't tell when a root actually dies, so we
5988      reclaim the data structures for any root threads that have died but not
5989      unregistered themselves, in order to shut down cleanly.
5990      In Win dynamic library we also can't tell when a thread dies.  */
5991   __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
5992 // dead roots
5993 #endif
5994 
5995   for (i = 0; i < __kmp_threads_capacity; i++)
5996     if (__kmp_root[i])
5997       if (__kmp_root[i]->r.r_active)
5998         break;
5999   KMP_MB(); /* Flush all pending memory write invalidates.  */
6000   TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6001 
6002   if (i < __kmp_threads_capacity) {
6003 #if KMP_USE_MONITOR
6004     // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6005     KMP_MB(); /* Flush all pending memory write invalidates.  */
6006 
6007     // Need to check that monitor was initialized before reaping it. If we are
    // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6009     // __kmp_monitor will appear to contain valid data, but it is only valid in
6010     // the parent process, not the child.
6011     // New behavior (201008): instead of keying off of the flag
6012     // __kmp_init_parallel, the monitor thread creation is keyed off
6013     // of the new flag __kmp_init_monitor.
6014     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6015     if (TCR_4(__kmp_init_monitor)) {
6016       __kmp_reap_monitor(&__kmp_monitor);
6017       TCW_4(__kmp_init_monitor, 0);
6018     }
6019     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6020     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6021 #endif // KMP_USE_MONITOR
6022   } else {
6023 /* TODO move this to cleanup code */
6024 #ifdef KMP_DEBUG
6025     /* make sure that everything has properly ended */
6026     for (i = 0; i < __kmp_threads_capacity; i++) {
6027       if (__kmp_root[i]) {
6028         //                    KMP_ASSERT( ! KMP_UBER_GTID( i ) );         // AC:
6029         //                    there can be uber threads alive here
6030         KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6031       }
6032     }
6033 #endif
6034 
6035     KMP_MB();
6036 
6037     // Reap the worker threads.
6038     // This is valid for now, but be careful if threads are reaped sooner.
    while (__kmp_thread_pool != NULL) { // Loop through all threads in the pool.
6040       // Get the next thread from the pool.
6041       kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6042       __kmp_thread_pool = thread->th.th_next_pool;
6043       // Reap it.
6044       KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6045       thread->th.th_next_pool = NULL;
6046       thread->th.th_in_pool = FALSE;
6047       __kmp_reap_thread(thread, 0);
6048     }
6049     __kmp_thread_pool_insert_pt = NULL;
6050 
6051     // Reap teams.
    while (__kmp_team_pool != NULL) { // Loop through all teams in the pool.
6053       // Get the next team from the pool.
6054       kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6055       __kmp_team_pool = team->t.t_next_pool;
6056       // Reap it.
6057       team->t.t_next_pool = NULL;
6058       __kmp_reap_team(team);
6059     }
6060 
6061     __kmp_reap_task_teams();
6062 
6063 #if KMP_OS_UNIX
6064     // Threads that are not reaped should not access any resources since they
6065     // are going to be deallocated soon, so the shutdown sequence should wait
6066     // until all threads either exit the final spin-waiting loop or begin
6067     // sleeping after the given blocktime.
6068     for (i = 0; i < __kmp_threads_capacity; i++) {
6069       kmp_info_t *thr = __kmp_threads[i];
6070       while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6071         KMP_CPU_PAUSE();
6072     }
6073 #endif
6074 
6075     for (i = 0; i < __kmp_threads_capacity; ++i) {
6076       // TBD: Add some checking...
6077       // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6078     }
6079 
6080     /* Make sure all threadprivate destructors get run by joining with all
6081        worker threads before resetting this flag */
6082     TCW_SYNC_4(__kmp_init_common, FALSE);
6083 
6084     KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6085     KMP_MB();
6086 
6087 #if KMP_USE_MONITOR
6088     // See note above: One of the possible fixes for CQ138434 / CQ140126
6089     //
6090     // FIXME: push both code fragments down and CSE them?
6091     // push them into __kmp_cleanup() ?
6092     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6093     if (TCR_4(__kmp_init_monitor)) {
6094       __kmp_reap_monitor(&__kmp_monitor);
6095       TCW_4(__kmp_init_monitor, 0);
6096     }
6097     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6098     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6099 #endif
6100   } /* else !__kmp_global.t_active */
6101   TCW_4(__kmp_init_gtid, FALSE);
6102   KMP_MB(); /* Flush all pending memory write invalidates.  */
6103 
6104   __kmp_cleanup();
6105 #if OMPT_SUPPORT
6106   ompt_fini();
6107 #endif
6108 }
6109 
6110 void __kmp_internal_end_library(int gtid_req) {
6111   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6112   /* this shouldn't be a race condition because __kmp_internal_end() is the
6113      only place to clear __kmp_serial_init */
6114   /* we'll check this later too, after we get the lock */
6115   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6116   // redundant, because the next check will work in any case.
6117   if (__kmp_global.g.g_abort) {
6118     KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6119     /* TODO abort? */
6120     return;
6121   }
6122   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6123     KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6124     return;
6125   }
6126 
6127   KMP_MB(); /* Flush all pending memory write invalidates.  */
6128 
6129   /* find out who we are and what we should do */
6130   {
6131     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6132     KA_TRACE(
6133         10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req));
6134     if (gtid == KMP_GTID_SHUTDOWN) {
6135       KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6136                     "already shutdown\n"));
6137       return;
6138     } else if (gtid == KMP_GTID_MONITOR) {
6139       KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6140                     "registered, or system shutdown\n"));
6141       return;
6142     } else if (gtid == KMP_GTID_DNE) {
6143       KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6144                     "shutdown\n"));
6145       /* we don't know who we are, but we may still shutdown the library */
6146     } else if (KMP_UBER_GTID(gtid)) {
6147       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6148       if (__kmp_root[gtid]->r.r_active) {
6149         __kmp_global.g.g_abort = -1;
6150         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6151         KA_TRACE(10,
6152                  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6153                   gtid));
6154         return;
6155       } else {
6156         KA_TRACE(
6157             10,
6158             ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6159         __kmp_unregister_root_current_thread(gtid);
6160       }
6161     } else {
6162 /* worker threads may call this function through the atexit handler, if they
6163  * call exit() */
6164 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6165    TODO: do a thorough shutdown instead */
6166 #ifdef DUMP_DEBUG_ON_EXIT
6167       if (__kmp_debug_buf)
6168         __kmp_dump_debug_buffer();
6169 #endif
6170       return;
6171     }
6172   }
6173   /* synchronize the termination process */
6174   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6175 
6176   /* have we already finished */
6177   if (__kmp_global.g.g_abort) {
6178     KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6179     /* TODO abort? */
6180     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6181     return;
6182   }
6183   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6184     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6185     return;
6186   }
6187 
  /* We need this lock to enforce mutual exclusion between this reading of
6189      __kmp_threads_capacity and the writing by __kmp_register_root.
6190      Alternatively, we can use a counter of roots that is atomically updated by
6191      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6192      __kmp_internal_end_*.  */
6193   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6194 
6195   /* now we can safely conduct the actual termination */
6196   __kmp_internal_end();
6197 
6198   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6199   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6200 
6201   KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6202 
6203 #ifdef DUMP_DEBUG_ON_EXIT
6204   if (__kmp_debug_buf)
6205     __kmp_dump_debug_buffer();
6206 #endif
6207 
6208 #if KMP_OS_WINDOWS
6209   __kmp_close_console();
6210 #endif
6211 
6212   __kmp_fini_allocator();
6213 
6214 } // __kmp_internal_end_library
6215 
6216 void __kmp_internal_end_thread(int gtid_req) {
6217   int i;
6218 
6219   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6220   /* this shouldn't be a race condition because __kmp_internal_end() is the
6221    * only place to clear __kmp_serial_init */
6222   /* we'll check this later too, after we get the lock */
6223   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6224   // redundant, because the next check will work in any case.
6225   if (__kmp_global.g.g_abort) {
6226     KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6227     /* TODO abort? */
6228     return;
6229   }
6230   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6231     KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6232     return;
6233   }
6234 
6235   KMP_MB(); /* Flush all pending memory write invalidates.  */
6236 
6237   /* find out who we are and what we should do */
6238   {
6239     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6240     KA_TRACE(10,
6241              ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req));
6242     if (gtid == KMP_GTID_SHUTDOWN) {
6243       KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6244                     "already shutdown\n"));
6245       return;
6246     } else if (gtid == KMP_GTID_MONITOR) {
6247       KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6248                     "registered, or system shutdown\n"));
6249       return;
6250     } else if (gtid == KMP_GTID_DNE) {
6251       KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6252                     "shutdown\n"));
6253       return;
6254       /* we don't know who we are */
6255     } else if (KMP_UBER_GTID(gtid)) {
6256       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6257       if (__kmp_root[gtid]->r.r_active) {
6258         __kmp_global.g.g_abort = -1;
6259         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6260         KA_TRACE(10,
6261                  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6262                   gtid));
6263         return;
6264       } else {
6265         KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6266                       gtid));
6267         __kmp_unregister_root_current_thread(gtid);
6268       }
6269     } else {
6270       /* just a worker thread, let's leave */
6271       KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6272 
6273       if (gtid >= 0) {
6274         __kmp_threads[gtid]->th.th_task_team = NULL;
6275       }
6276 
6277       KA_TRACE(10,
6278                ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6279                 gtid));
6280       return;
6281     }
6282   }
6283 #if KMP_DYNAMIC_LIB
6284   if (__kmp_pause_status != kmp_hard_paused)
  // AC: let's not shut down the dynamic library at the exit of an uber thread;
  // it is better to shut down later, in the library destructor.
6287   {
6288     KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6289     return;
6290   }
6291 #endif
6292   /* synchronize the termination process */
6293   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6294 
6295   /* have we already finished */
6296   if (__kmp_global.g.g_abort) {
6297     KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6298     /* TODO abort? */
6299     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6300     return;
6301   }
6302   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6303     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6304     return;
6305   }
6306 
  /* We need this lock to enforce mutual exclusion between this reading of
6308      __kmp_threads_capacity and the writing by __kmp_register_root.
6309      Alternatively, we can use a counter of roots that is atomically updated by
6310      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6311      __kmp_internal_end_*.  */
6312 
6313   /* should we finish the run-time?  are all siblings done? */
6314   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6315 
6316   for (i = 0; i < __kmp_threads_capacity; ++i) {
6317     if (KMP_UBER_GTID(i)) {
6318       KA_TRACE(
6319           10,
6320           ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6321       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6322       __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6323       return;
6324     }
6325   }
6326 
6327   /* now we can safely conduct the actual termination */
6328 
6329   __kmp_internal_end();
6330 
6331   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6332   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6333 
6334   KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6335 
6336 #ifdef DUMP_DEBUG_ON_EXIT
6337   if (__kmp_debug_buf)
6338     __kmp_dump_debug_buffer();
6339 #endif
6340 } // __kmp_internal_end_thread
6341 
6342 // -----------------------------------------------------------------------------
6343 // Library registration stuff.
6344 
6345 static long __kmp_registration_flag = 0;
6346 // Random value used to indicate library initialization.
6347 static char *__kmp_registration_str = NULL;
6348 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6349 
6350 static inline char *__kmp_reg_status_name() {
6351   /* On RHEL 3u5 if linked statically, getpid() returns different values in
6352      each thread. If registration and unregistration go in different threads
     (omp_misc_other_root_exit.cpp test case), the registration env var cannot
     be found, because its name will contain a different pid. */
6355   return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
} // __kmp_reg_status_name
6357 
6358 void __kmp_register_library_startup(void) {
6359 
6360   char *name = __kmp_reg_status_name(); // Name of the environment variable.
6361   int done = 0;
6362   union {
6363     double dtime;
6364     long ltime;
6365   } time;
6366 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6367   __kmp_initialize_system_tick();
6368 #endif
6369   __kmp_read_system_time(&time.dtime);
6370   __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6371   __kmp_registration_str =
6372       __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6373                        __kmp_registration_flag, KMP_LIBRARY_FILE);
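  // The registration value combines the address of __kmp_registration_flag,
  // the flag itself (the 0xCAFE0000 magic ORed with the low 16 bits of the
  // current time), and the library file name, e.g. (illustrative values only)
  // "0x7f1a2b3c4d50-cafe1234-libomp.so". The parsing code below splits it on
  // '-' to recover these three fields when another copy of the runtime is
  // detected.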
6374 
6375   KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6376                 __kmp_registration_str));
6377 
6378   while (!done) {
6379 
6380     char *value = NULL; // Actual value of the environment variable.
6381 
    // Set the environment variable, but do not overwrite it if it exists.
6383     __kmp_env_set(name, __kmp_registration_str, 0);
    // Check that the variable was actually written.
6385     value = __kmp_env_get(name);
6386     if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6387 
6388       done = 1; // Ok, environment variable set successfully, exit the loop.
6389 
6390     } else {
6391 
6392       // Oops. Write failed. Another copy of OpenMP RTL is in memory.
      // Check whether it is alive or dead.
6394       int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6395       char *tail = value;
6396       char *flag_addr_str = NULL;
6397       char *flag_val_str = NULL;
6398       char const *file_name = NULL;
6399       __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6400       __kmp_str_split(tail, '-', &flag_val_str, &tail);
6401       file_name = tail;
6402       if (tail != NULL) {
6403         long *flag_addr = 0;
6404         long flag_val = 0;
6405         KMP_SSCANF(flag_addr_str, "%p", RCAST(void**, &flag_addr));
6406         KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6407         if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6408           // First, check whether environment-encoded address is mapped into
6409           // addr space.
6410           // If so, dereference it to see if it still has the right value.
6411           if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6412             neighbor = 1;
6413           } else {
6414             // If not, then we know the other copy of the library is no longer
6415             // running.
6416             neighbor = 2;
6417           }
6418         }
6419       }
6420       switch (neighbor) {
6421       case 0: // Cannot parse environment variable -- neighbor status unknown.
        // Assume it is an incompatible format from a future version of the
        // library, and assume the other library is alive.
6424         // WARN( ... ); // TODO: Issue a warning.
6425         file_name = "unknown library";
6426         KMP_FALLTHROUGH();
      // Attention! Falling through to the next case is intentional.
6428       case 1: { // Neighbor is alive.
        // Check whether a duplicate library is allowed.
6430         char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6431         if (!__kmp_str_match_true(duplicate_ok)) {
6432           // That's not allowed. Issue fatal error.
6433           __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6434                       KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6435         }
6436         KMP_INTERNAL_FREE(duplicate_ok);
6437         __kmp_duplicate_library_ok = 1;
6438         done = 1; // Exit the loop.
6439       } break;
6440       case 2: { // Neighbor is dead.
6441         // Clear the variable and try to register library again.
6442         __kmp_env_unset(name);
6443       } break;
6444       default: { KMP_DEBUG_ASSERT(0); } break;
6445       }
6446     }
6447     KMP_INTERNAL_FREE((void *)value);
6448   }
6449   KMP_INTERNAL_FREE((void *)name);
6450 
6451 } // func __kmp_register_library_startup
6452 
6453 void __kmp_unregister_library(void) {
6454 
6455   char *name = __kmp_reg_status_name();
6456   char *value = __kmp_env_get(name);
6457 
6458   KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6459   KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6460   if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6461     // Ok, this is our variable. Delete it.
6462     __kmp_env_unset(name);
6463   }
6464 
6465   KMP_INTERNAL_FREE(__kmp_registration_str);
6466   KMP_INTERNAL_FREE(value);
6467   KMP_INTERNAL_FREE(name);
6468 
6469   __kmp_registration_flag = 0;
6470   __kmp_registration_str = NULL;
6471 
6472 } // __kmp_unregister_library
6473 
6474 // End of Library registration stuff.
6475 // -----------------------------------------------------------------------------
6476 
6477 #if KMP_MIC_SUPPORTED
6478 
6479 static void __kmp_check_mic_type() {
6480   kmp_cpuid_t cpuid_state = {0};
6481   kmp_cpuid_t *cs_p = &cpuid_state;
6482   __kmp_x86_cpuid(1, 0, cs_p);
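  // cpuid leaf 1: EAX holds the processor signature; the masks below select
  // the family and (extended) model fields used to tell KNC (mic2) apart
  // from KNL (mic3).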
6483   // We don't support mic1 at the moment
6484   if ((cs_p->eax & 0xff0) == 0xB10) {
6485     __kmp_mic_type = mic2;
6486   } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6487     __kmp_mic_type = mic3;
6488   } else {
6489     __kmp_mic_type = non_mic;
6490   }
6491 }
6492 
6493 #endif /* KMP_MIC_SUPPORTED */
6494 
6495 static void __kmp_do_serial_initialize(void) {
6496   int i, gtid;
6497   int size;
6498 
6499   KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6500 
6501   KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6502   KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6503   KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6504   KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6505   KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6506 
6507 #if OMPT_SUPPORT
6508   ompt_pre_init();
6509 #endif
6510 
6511   __kmp_validate_locks();
6512 
6513   /* Initialize internal memory allocator */
6514   __kmp_init_allocator();
6515 
6516   /* Register the library startup via an environment variable and check to see
6517      whether another copy of the library is already registered. */
6518 
6519   __kmp_register_library_startup();
6520 
6521   /* TODO reinitialization of library */
6522   if (TCR_4(__kmp_global.g.g_done)) {
6523     KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6524   }
6525 
6526   __kmp_global.g.g_abort = 0;
6527   TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6528 
6529 /* initialize the locks */
6530 #if KMP_USE_ADAPTIVE_LOCKS
6531 #if KMP_DEBUG_ADAPTIVE_LOCKS
6532   __kmp_init_speculative_stats();
6533 #endif
6534 #endif
6535 #if KMP_STATS_ENABLED
6536   __kmp_stats_init();
6537 #endif
6538   __kmp_init_lock(&__kmp_global_lock);
6539   __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6540   __kmp_init_lock(&__kmp_debug_lock);
6541   __kmp_init_atomic_lock(&__kmp_atomic_lock);
6542   __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6543   __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6544   __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6545   __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6546   __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6547   __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6548   __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6549   __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6550   __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6551   __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6552   __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6553   __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6554   __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6555   __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6556 #if KMP_USE_MONITOR
6557   __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6558 #endif
6559   __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6560 
6561   /* conduct initialization and initial setup of configuration */
6562 
6563   __kmp_runtime_initialize();
6564 
6565 #if KMP_MIC_SUPPORTED
6566   __kmp_check_mic_type();
6567 #endif
6568 
6569 // Some global variable initialization moved here from kmp_env_initialize()
6570 #ifdef KMP_DEBUG
6571   kmp_diag = 0;
6572 #endif
6573   __kmp_abort_delay = 0;
6574 
6575   // From __kmp_init_dflt_team_nth()
6576   /* assume the entire machine will be used */
6577   __kmp_dflt_team_nth_ub = __kmp_xproc;
6578   if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6579     __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6580   }
6581   if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6582     __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6583   }
6584   __kmp_max_nth = __kmp_sys_max_nth;
6585   __kmp_cg_max_nth = __kmp_sys_max_nth;
6586   __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6587   if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6588     __kmp_teams_max_nth = __kmp_sys_max_nth;
6589   }
6590 
6591   // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6592   // part
6593   __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6594 #if KMP_USE_MONITOR
6595   __kmp_monitor_wakeups =
6596       KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6597   __kmp_bt_intervals =
6598       KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6599 #endif
6600   // From "KMP_LIBRARY" part of __kmp_env_initialize()
6601   __kmp_library = library_throughput;
6602   // From KMP_SCHEDULE initialization
6603   __kmp_static = kmp_sch_static_balanced;
// AC: do not use analytical here, because it is non-monotonic
6605 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6606 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6607 // need to repeat assignment
6608 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6609 // bit control and barrier method control parts
6610 #if KMP_FAST_REDUCTION_BARRIER
6611 #define kmp_reduction_barrier_gather_bb ((int)1)
6612 #define kmp_reduction_barrier_release_bb ((int)1)
6613 #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6614 #define kmp_reduction_barrier_release_pat bp_hyper_bar
6615 #endif // KMP_FAST_REDUCTION_BARRIER
6616   for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6617     __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6618     __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6619     __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6620     __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6621 #if KMP_FAST_REDUCTION_BARRIER
6622     if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
6623       // lin_64 ): hyper,1
6624       __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6625       __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6626       __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6627       __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6628     }
6629 #endif // KMP_FAST_REDUCTION_BARRIER
6630   }
6631 #if KMP_FAST_REDUCTION_BARRIER
6632 #undef kmp_reduction_barrier_release_pat
6633 #undef kmp_reduction_barrier_gather_pat
6634 #undef kmp_reduction_barrier_release_bb
6635 #undef kmp_reduction_barrier_gather_bb
6636 #endif // KMP_FAST_REDUCTION_BARRIER
6637 #if KMP_MIC_SUPPORTED
6638   if (__kmp_mic_type == mic2) { // KNC
6639     // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6640     __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6641     __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6642         1; // forkjoin release
6643     __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6644     __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6645   }
6646 #if KMP_FAST_REDUCTION_BARRIER
6647   if (__kmp_mic_type == mic2) { // KNC
6648     __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6649     __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6650   }
6651 #endif // KMP_FAST_REDUCTION_BARRIER
6652 #endif // KMP_MIC_SUPPORTED
6653 
6654 // From KMP_CHECKS initialization
6655 #ifdef KMP_DEBUG
6656   __kmp_env_checks = TRUE; /* development versions have the extra checks */
6657 #else
6658   __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6659 #endif
6660 
6661   // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6662   __kmp_foreign_tp = TRUE;
6663 
6664   __kmp_global.g.g_dynamic = FALSE;
6665   __kmp_global.g.g_dynamic_mode = dynamic_default;
6666 
6667   __kmp_env_initialize(NULL);
6668 
6669 // Print all messages in message catalog for testing purposes.
6670 #ifdef KMP_DEBUG
6671   char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6672   if (__kmp_str_match_true(val)) {
6673     kmp_str_buf_t buffer;
6674     __kmp_str_buf_init(&buffer);
6675     __kmp_i18n_dump_catalog(&buffer);
6676     __kmp_printf("%s", buffer.str);
6677     __kmp_str_buf_free(&buffer);
6678   }
6679   __kmp_env_free(&val);
6680 #endif
6681 
6682   __kmp_threads_capacity =
6683       __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6684   // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6685   __kmp_tp_capacity = __kmp_default_tp_capacity(
6686       __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6687 
6688   // If the library is shut down properly, both pools must be NULL. Just in
6689   // case, set them to NULL -- some memory may leak, but subsequent code will
6690   // work even if pools are not freed.
6691   KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6692   KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6693   KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6694   __kmp_thread_pool = NULL;
6695   __kmp_thread_pool_insert_pt = NULL;
6696   __kmp_team_pool = NULL;
6697 
6698   /* Allocate all of the variable sized records */
6699   /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6700    * expandable */
6701   /* Since allocation is cache-aligned, just add extra padding at the end */
6702   size =
6703       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6704       CACHE_LINE;
6705   __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6706   __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6707                                sizeof(kmp_info_t *) * __kmp_threads_capacity);
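  // __kmp_threads and __kmp_root share a single cache-aligned allocation:
  // __kmp_root begins immediately after the __kmp_threads_capacity
  // kmp_info_t* entries.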
6708 
6709   /* init thread counts */
6710   KMP_DEBUG_ASSERT(__kmp_all_nth ==
6711                    0); // Asserts fail if the library is reinitializing and
6712   KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
6713   __kmp_all_nth = 0;
6714   __kmp_nth = 0;
6715 
6716   /* setup the uber master thread and hierarchy */
6717   gtid = __kmp_register_root(TRUE);
6718   KA_TRACE(10, ("__kmp_do_serial_initialize  T#%d\n", gtid));
6719   KMP_ASSERT(KMP_UBER_GTID(gtid));
6720   KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6721 
6722   KMP_MB(); /* Flush all pending memory write invalidates.  */
6723 
6724   __kmp_common_initialize();
6725 
6726 #if KMP_OS_UNIX
6727   /* invoke the child fork handler */
6728   __kmp_register_atfork();
6729 #endif
6730 
6731 #if !KMP_DYNAMIC_LIB
6732   {
6733     /* Invoke the exit handler when the program finishes, only for static
6734        library. For dynamic library, we already have _fini and DllMain. */
6735     int rc = atexit(__kmp_internal_end_atexit);
6736     if (rc != 0) {
6737       __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6738                   __kmp_msg_null);
6739     }
6740   }
6741 #endif
6742 
6743 #if KMP_HANDLE_SIGNALS
6744 #if KMP_OS_UNIX
6745   /* NOTE: make sure that this is called before the user installs their own
     signal handlers so that the user handlers are called first. This way they
6747      can return false, not call our handler, avoid terminating the library, and
6748      continue execution where they left off. */
6749   __kmp_install_signals(FALSE);
6750 #endif /* KMP_OS_UNIX */
6751 #if KMP_OS_WINDOWS
6752   __kmp_install_signals(TRUE);
6753 #endif /* KMP_OS_WINDOWS */
6754 #endif
6755 
6756   /* we have finished the serial initialization */
6757   __kmp_init_counter++;
6758 
6759   __kmp_init_serial = TRUE;
6760 
6761   if (__kmp_settings) {
6762     __kmp_env_print();
6763   }
6764 
6765   if (__kmp_display_env || __kmp_display_env_verbose) {
6766     __kmp_env_print_2();
6767   }
6768 
6769 #if OMPT_SUPPORT
6770   ompt_post_init();
6771 #endif
6772 
6773   KMP_MB();
6774 
6775   KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
6776 }
6777 
6778 void __kmp_serial_initialize(void) {
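  // Double-checked initialization: test the flag without the lock, acquire
  // __kmp_initz_lock, re-test under the lock, and only then do the actual
  // work. __kmp_middle_initialize and __kmp_parallel_initialize below follow
  // the same pattern.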
6779   if (__kmp_init_serial) {
6780     return;
6781   }
6782   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6783   if (__kmp_init_serial) {
6784     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6785     return;
6786   }
6787   __kmp_do_serial_initialize();
6788   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6789 }
6790 
6791 static void __kmp_do_middle_initialize(void) {
6792   int i, j;
6793   int prev_dflt_team_nth;
6794 
6795   if (!__kmp_init_serial) {
6796     __kmp_do_serial_initialize();
6797   }
6798 
6799   KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
6800 
6801   // Save the previous value for the __kmp_dflt_team_nth so that
6802   // we can avoid some reinitialization if it hasn't changed.
6803   prev_dflt_team_nth = __kmp_dflt_team_nth;
6804 
6805 #if KMP_AFFINITY_SUPPORTED
6806   // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6807   // number of cores on the machine.
6808   __kmp_affinity_initialize();
6809 
6810   // Run through the __kmp_threads array and set the affinity mask
6811   // for each root thread that is currently registered with the RTL.
6812   for (i = 0; i < __kmp_threads_capacity; i++) {
6813     if (TCR_PTR(__kmp_threads[i]) != NULL) {
6814       __kmp_affinity_set_init_mask(i, TRUE);
6815     }
6816   }
6817 #endif /* KMP_AFFINITY_SUPPORTED */
6818 
6819   KMP_ASSERT(__kmp_xproc > 0);
6820   if (__kmp_avail_proc == 0) {
6821     __kmp_avail_proc = __kmp_xproc;
6822   }
6823 
6824   // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
6825   // correct them now
6826   j = 0;
6827   while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
6828     __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
6829         __kmp_avail_proc;
6830     j++;
6831   }
6832 
6833   if (__kmp_dflt_team_nth == 0) {
6834 #ifdef KMP_DFLT_NTH_CORES
6835     // Default #threads = #cores
6836     __kmp_dflt_team_nth = __kmp_ncores;
6837     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6838                   "__kmp_ncores (%d)\n",
6839                   __kmp_dflt_team_nth));
6840 #else
6841     // Default #threads = #available OS procs
6842     __kmp_dflt_team_nth = __kmp_avail_proc;
6843     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6844                   "__kmp_avail_proc(%d)\n",
6845                   __kmp_dflt_team_nth));
6846 #endif /* KMP_DFLT_NTH_CORES */
6847   }
6848 
6849   if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
6850     __kmp_dflt_team_nth = KMP_MIN_NTH;
6851   }
6852   if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
6853     __kmp_dflt_team_nth = __kmp_sys_max_nth;
6854   }
6855 
6856   // There's no harm in continuing if the following check fails,
6857   // but it indicates an error in the previous logic.
6858   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
6859 
6860   if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
6861     // Run through the __kmp_threads array and set the num threads icv for each
6862     // root thread that is currently registered with the RTL (which has not
6863     // already explicitly set its nthreads-var with a call to
6864     // omp_set_num_threads()).
6865     for (i = 0; i < __kmp_threads_capacity; i++) {
6866       kmp_info_t *thread = __kmp_threads[i];
6867       if (thread == NULL)
6868         continue;
6869       if (thread->th.th_current_task->td_icvs.nproc != 0)
6870         continue;
6871 
6872       set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
6873     }
6874   }
6875   KA_TRACE(
6876       20,
6877       ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
6878        __kmp_dflt_team_nth));
6879 
6880 #ifdef KMP_ADJUST_BLOCKTIME
6881   /* Adjust blocktime to zero if necessary  now that __kmp_avail_proc is set */
6882   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6883     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6884     if (__kmp_nth > __kmp_avail_proc) {
6885       __kmp_zero_bt = TRUE;
6886     }
6887   }
6888 #endif /* KMP_ADJUST_BLOCKTIME */
6889 
6890   /* we have finished middle initialization */
6891   TCW_SYNC_4(__kmp_init_middle, TRUE);
6892 
6893   KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
6894 }
6895 
6896 void __kmp_middle_initialize(void) {
6897   if (__kmp_init_middle) {
6898     return;
6899   }
6900   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6901   if (__kmp_init_middle) {
6902     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6903     return;
6904   }
6905   __kmp_do_middle_initialize();
6906   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6907 }
6908 
6909 void __kmp_parallel_initialize(void) {
6910   int gtid = __kmp_entry_gtid(); // this might be a new root
6911 
  /* synchronize parallel initialization (for sibling threads) */
6913   if (TCR_4(__kmp_init_parallel))
6914     return;
6915   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6916   if (TCR_4(__kmp_init_parallel)) {
6917     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6918     return;
6919   }
6920 
6921   /* TODO reinitialization after we have already shut down */
6922   if (TCR_4(__kmp_global.g.g_done)) {
6923     KA_TRACE(
6924         10,
6925         ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
6926     __kmp_infinite_loop();
6927   }
6928 
  /* jc: The lock __kmp_initz_lock is already held, so calling
     __kmp_serial_initialize or __kmp_middle_initialize here would deadlock.
     Instead, call __kmp_do_middle_initialize directly; it performs serial
     initialization first if that has not been done yet. */
6932   if (!__kmp_init_middle) {
6933     __kmp_do_middle_initialize();
6934   }
6935   __kmp_resume_if_hard_paused();
6936 
6937   /* begin initialization */
6938   KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
6939   KMP_ASSERT(KMP_UBER_GTID(gtid));
6940 
6941 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6942   // Save the FP control regs.
6943   // Worker threads will set theirs to these values at thread startup.
6944   __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
6945   __kmp_store_mxcsr(&__kmp_init_mxcsr);
6946   __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
6947 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
6948 
6949 #if KMP_OS_UNIX
6950 #if KMP_HANDLE_SIGNALS
6951   /*  must be after __kmp_serial_initialize  */
6952   __kmp_install_signals(TRUE);
6953 #endif
6954 #endif
6955 
6956   __kmp_suspend_initialize();
6957 
6958 #if defined(USE_LOAD_BALANCE)
6959   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6960     __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
6961   }
6962 #else
6963   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6964     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
6965   }
6966 #endif
6967 
6968   if (__kmp_version) {
6969     __kmp_print_version_2();
6970   }
6971 
6972   /* we have finished parallel initialization */
6973   TCW_SYNC_4(__kmp_init_parallel, TRUE);
6974 
6975   KMP_MB();
6976   KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
6977 
6978   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6979 }
6980 
6981 /* ------------------------------------------------------------------------ */
6982 
6983 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6984                                    kmp_team_t *team) {
6985   kmp_disp_t *dispatch;
6986 
6987   KMP_MB();
6988 
6989   /* none of the threads have encountered any constructs, yet. */
6990   this_thr->th.th_local.this_construct = 0;
6991 #if KMP_CACHE_MANAGE
6992   KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
6993 #endif /* KMP_CACHE_MANAGE */
6994   dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
6995   KMP_DEBUG_ASSERT(dispatch);
6996   KMP_DEBUG_ASSERT(team->t.t_dispatch);
6997   // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
6998   // this_thr->th.th_info.ds.ds_tid ] );
6999 
7000   dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7001   dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7002   if (__kmp_env_consistency_check)
7003     __kmp_push_parallel(gtid, team->t.t_ident);
7004 
7005   KMP_MB(); /* Flush all pending memory write invalidates.  */
7006 }
7007 
7008 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7009                                   kmp_team_t *team) {
7010   if (__kmp_env_consistency_check)
7011     __kmp_pop_parallel(gtid, team->t.t_ident);
7012 
7013   __kmp_finish_implicit_task(this_thr);
7014 }
7015 
7016 int __kmp_invoke_task_func(int gtid) {
7017   int rc;
7018   int tid = __kmp_tid_from_gtid(gtid);
7019   kmp_info_t *this_thr = __kmp_threads[gtid];
7020   kmp_team_t *team = this_thr->th.th_team;
7021 
7022   __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7023 #if USE_ITT_BUILD
7024   if (__itt_stack_caller_create_ptr) {
7025     __kmp_itt_stack_callee_enter(
7026         (__itt_caller)
7027             team->t.t_stack_id); // inform ittnotify about entering user's code
7028   }
7029 #endif /* USE_ITT_BUILD */
7030 #if INCLUDE_SSC_MARKS
7031   SSC_MARK_INVOKING();
7032 #endif
7033 
7034 #if OMPT_SUPPORT
7035   void *dummy;
7036   void **exit_frame_p;
7037   ompt_data_t *my_task_data;
7038   ompt_data_t *my_parallel_data;
7039   int ompt_team_size;
7040 
7041   if (ompt_enabled.enabled) {
7042     exit_frame_p = &(
7043         team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_frame.ptr);
7044   } else {
7045     exit_frame_p = &dummy;
7046   }
7047 
7048   my_task_data =
7049       &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7050   my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7051   if (ompt_enabled.ompt_callback_implicit_task) {
7052     ompt_team_size = team->t.t_nproc;
7053     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7054         ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7055         __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7056     OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7057   }
7058 #endif
7059 
7060 #if KMP_STATS_ENABLED
7061   stats_state_e previous_state = KMP_GET_THREAD_STATE();
7062   if (previous_state == stats_state_e::TEAMS_REGION) {
7063     KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7064   } else {
7065     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7066   }
7067   KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7068 #endif
7069 
7070   rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7071                               tid, (int)team->t.t_argc, (void **)team->t.t_argv
7072 #if OMPT_SUPPORT
7073                               ,
7074                               exit_frame_p
7075 #endif
7076                               );
7077 #if OMPT_SUPPORT
7078   *exit_frame_p = NULL;
  this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7080 #endif
7081 
7082 #if KMP_STATS_ENABLED
7083   if (previous_state == stats_state_e::TEAMS_REGION) {
7084     KMP_SET_THREAD_STATE(previous_state);
7085   }
7086   KMP_POP_PARTITIONED_TIMER();
7087 #endif
7088 
7089 #if USE_ITT_BUILD
7090   if (__itt_stack_caller_create_ptr) {
7091     __kmp_itt_stack_callee_leave(
7092         (__itt_caller)
7093             team->t.t_stack_id); // inform ittnotify about leaving user's code
7094   }
7095 #endif /* USE_ITT_BUILD */
7096   __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7097 
7098   return rc;
7099 }
7100 
7101 void __kmp_teams_master(int gtid) {
  // This routine is called by all master threads in the teams construct
7103   kmp_info_t *thr = __kmp_threads[gtid];
7104   kmp_team_t *team = thr->th.th_team;
7105   ident_t *loc = team->t.t_ident;
7106   thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7107   KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7108   KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7109   KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7110                 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7111 
7112   // This thread is a new CG root.  Set up the proper variables.
7113   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7114   tmp->cg_root = thr; // Make thr the CG root
7115   // Init to thread limit that was stored when league masters were forked
7116   tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7117   tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7118   KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7119                  " cg_nthreads to 1\n",
7120                  thr, tmp));
7121   tmp->up = thr->th.th_cg_roots;
7122   thr->th.th_cg_roots = tmp;
7123 
// Launch the league of teams now, but do not let the workers execute yet
// (they hang at the fork barrier until the next parallel region)
7126 #if INCLUDE_SSC_MARKS
7127   SSC_MARK_FORKING();
7128 #endif
7129   __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7130                   (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7131                   VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7132 #if INCLUDE_SSC_MARKS
7133   SSC_MARK_JOINING();
7134 #endif
7135   // If the team size was reduced from the limit, set it to the new size
7136   if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7137     thr->th.th_teams_size.nth = thr->th.th_team_nproc;
  // AC: the last parameter "1" eliminates the join barrier, which cannot be
  // used here because the worker threads are still in the fork barrier
  // waiting for more parallel regions
7140   __kmp_join_call(loc, gtid
7141 #if OMPT_SUPPORT
7142                   ,
7143                   fork_context_intel
7144 #endif
7145                   ,
7146                   1);
7147 }
7148 
7149 int __kmp_invoke_teams_master(int gtid) {
7150   kmp_info_t *this_thr = __kmp_threads[gtid];
7151   kmp_team_t *team = this_thr->th.th_team;
7152 #if KMP_DEBUG
7153   if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7154     KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7155                      (void *)__kmp_teams_master);
7156 #endif
7157   __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7158 #if OMPT_SUPPORT
7159   int tid = __kmp_tid_from_gtid(gtid);
7160   ompt_data_t *task_data =
7161       &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7162   ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7163   if (ompt_enabled.ompt_callback_implicit_task) {
7164     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7165         ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7166         ompt_task_initial);
7167     OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7168   }
7169 #endif
7170   __kmp_teams_master(gtid);
7171 #if OMPT_SUPPORT
7172   this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7173 #endif
7174   __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7175   return 1;
7176 }
7177 
/* This sets the requested number of threads for the next parallel region
   encountered by this team. Since this should be enclosed in the fork/join
   critical section, it should avoid race conditions with asymmetrical nested
   parallelism. */
7182 
7183 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7184   kmp_info_t *thr = __kmp_threads[gtid];
7185 
7186   if (num_threads > 0)
7187     thr->th.th_set_nproc = num_threads;
7188 }
7189 
/* This sets the requested number of teams for the teams region and/or
   the number of threads for the next parallel region encountered. */
7192 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7193                           int num_threads) {
7194   kmp_info_t *thr = __kmp_threads[gtid];
7195   KMP_DEBUG_ASSERT(num_teams >= 0);
7196   KMP_DEBUG_ASSERT(num_threads >= 0);
7197 
7198   if (num_teams == 0)
7199     num_teams = 1; // default number of teams is 1.
7200   if (num_teams > __kmp_teams_max_nth) { // if too many teams requested?
7201     if (!__kmp_reserve_warn) {
7202       __kmp_reserve_warn = 1;
7203       __kmp_msg(kmp_ms_warning,
7204                 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7205                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7206     }
7207     num_teams = __kmp_teams_max_nth;
7208   }
7209   // Set number of teams (number of threads in the outer "parallel" of the
7210   // teams)
7211   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7212 
7213   // Remember the number of threads for inner parallel regions
7214   if (!TCR_4(__kmp_init_middle))
7215     __kmp_middle_initialize(); // get internal globals calculated
7216   KMP_DEBUG_ASSERT(__kmp_avail_proc);
7217   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7218   if (num_threads == 0) {
7219     num_threads = __kmp_avail_proc / num_teams;
7220     // adjust num_threads w/o warning as it is not user setting
7221     // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7222     // no thread_limit clause specified -  do not change thread-limit-var ICV
7223     if (num_threads > __kmp_dflt_team_nth) {
7224       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7225     }
7226     if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7227       num_threads = thr->th.th_current_task->td_icvs.thread_limit;
    } // prevent the team size from exceeding thread-limit-var
7229     if (num_teams * num_threads > __kmp_teams_max_nth) {
7230       num_threads = __kmp_teams_max_nth / num_teams;
7231     }
7232   } else {
7233     // This thread will be the master of the league masters
7234     // Store new thread limit; old limit is saved in th_cg_roots list
7235     thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7236     // num_threads = min(num_threads, nthreads-var)
7237     if (num_threads > __kmp_dflt_team_nth) {
7238       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7239     }
7240     if (num_teams * num_threads > __kmp_teams_max_nth) {
7241       int new_threads = __kmp_teams_max_nth / num_teams;
7242       if (!__kmp_reserve_warn) { // user asked for too many threads
7243         __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7244         __kmp_msg(kmp_ms_warning,
7245                   KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7246                   KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7247       }
7248       num_threads = new_threads;
7249     }
7250   }
7251   thr->th.th_teams_size.nth = num_threads;
7252 }
7253 
7254 // Set the proc_bind var to use in the following parallel region.
7255 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7256   kmp_info_t *thr = __kmp_threads[gtid];
7257   thr->th.th_set_proc_bind = proc_bind;
7258 }
7259 
7260 /* Launch the worker threads into the microtask. */
7261 
7262 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7263   kmp_info_t *this_thr = __kmp_threads[gtid];
7264 
7265 #ifdef KMP_DEBUG
7266   int f;
7267 #endif /* KMP_DEBUG */
7268 
7269   KMP_DEBUG_ASSERT(team);
7270   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7271   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7272   KMP_MB(); /* Flush all pending memory write invalidates.  */
7273 
7274   team->t.t_construct = 0; /* no single directives seen yet */
7275   team->t.t_ordered.dt.t_value =
7276       0; /* thread 0 enters the ordered section first */
7277 
7278   /* Reset the identifiers on the dispatch buffer */
7279   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7280   if (team->t.t_max_nproc > 1) {
7281     int i;
7282     for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7283       team->t.t_disp_buffer[i].buffer_index = i;
7284       team->t.t_disp_buffer[i].doacross_buf_idx = i;
7285     }
7286   } else {
7287     team->t.t_disp_buffer[0].buffer_index = 0;
7288     team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7289   }
7290 
7291   KMP_MB(); /* Flush all pending memory write invalidates.  */
7292   KMP_ASSERT(this_thr->th.th_team == team);
7293 
7294 #ifdef KMP_DEBUG
7295   for (f = 0; f < team->t.t_nproc; f++) {
7296     KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7297                      team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7298   }
7299 #endif /* KMP_DEBUG */
7300 
7301   /* release the worker threads so they may begin working */
7302   __kmp_fork_barrier(gtid, 0);
7303 }
7304 
7305 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7306   kmp_info_t *this_thr = __kmp_threads[gtid];
7307 
7308   KMP_DEBUG_ASSERT(team);
7309   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7310   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7311   KMP_MB(); /* Flush all pending memory write invalidates.  */
7312 
7313 /* Join barrier after fork */
7314 
7315 #ifdef KMP_DEBUG
7316   if (__kmp_threads[gtid] &&
7317       __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7318     __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7319                  __kmp_threads[gtid]);
7320     __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7321                  "team->t.t_nproc=%d\n",
7322                  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7323                  team->t.t_nproc);
7324     __kmp_print_structure();
7325   }
7326   KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7327                    __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7328 #endif /* KMP_DEBUG */
7329 
7330   __kmp_join_barrier(gtid); /* wait for everyone */
7331 #if OMPT_SUPPORT
7332   if (ompt_enabled.enabled &&
7333       this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7334     int ds_tid = this_thr->th.th_info.ds.ds_tid;
7335     ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7336     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7337 #if OMPT_OPTIONAL
7338     void *codeptr = NULL;
7339     if (KMP_MASTER_TID(ds_tid) &&
7340         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7341          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7342       codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7343 
7344     if (ompt_enabled.ompt_callback_sync_region_wait) {
7345       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7346           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7347           codeptr);
7348     }
7349     if (ompt_enabled.ompt_callback_sync_region) {
7350       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7351           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7352           codeptr);
7353     }
7354 #endif
7355     if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7356       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7357           ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7358     }
7359   }
7360 #endif
7361 
7362   KMP_MB(); /* Flush all pending memory write invalidates.  */
7363   KMP_ASSERT(this_thr->th.th_team == team);
7364 }
7365 
7366 /* ------------------------------------------------------------------------ */
7367 
7368 #ifdef USE_LOAD_BALANCE
7369 
// Return the number of worker threads actively spinning in the hot team, if
// we are at the outermost level of parallelism.  Otherwise, return 0.
7372 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7373   int i;
7374   int retval;
7375   kmp_team_t *hot_team;
7376 
7377   if (root->r.r_active) {
7378     return 0;
7379   }
7380   hot_team = root->r.r_hot_team;
7381   if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7382     return hot_team->t.t_nproc - 1; // Don't count master thread
7383   }
7384 
7385   // Skip the master thread - it is accounted for elsewhere.
7386   retval = 0;
7387   for (i = 1; i < hot_team->t.t_nproc; i++) {
7388     if (hot_team->t.t_threads[i]->th.th_active) {
7389       retval++;
7390     }
7391   }
7392   return retval;
7393 }
7394 
7395 // Perform an automatic adjustment to the number of
7396 // threads used by the next parallel region.
7397 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7398   int retval;
7399   int pool_active;
7400   int hot_team_active;
7401   int team_curr_active;
7402   int system_active;
7403 
7404   KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7405                 set_nproc));
7406   KMP_DEBUG_ASSERT(root);
7407   KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7408                        ->th.th_current_task->td_icvs.dynamic == TRUE);
7409   KMP_DEBUG_ASSERT(set_nproc > 1);
7410 
7411   if (set_nproc == 1) {
7412     KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7413     return 1;
7414   }
7415 
7416   // Threads that are active in the thread pool, active in the hot team for this
7417   // particular root (if we are at the outer par level), and the currently
7418   // executing thread (to become the master) are available to add to the new
7419   // team, but are currently contributing to the system load, and must be
7420   // accounted for.
7421   pool_active = __kmp_thread_pool_active_nth;
7422   hot_team_active = __kmp_active_hot_team_nproc(root);
7423   team_curr_active = pool_active + hot_team_active + 1;
7424 
7425   // Check the system load.
7426   system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7427   KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7428                 "hot team active = %d\n",
7429                 system_active, pool_active, hot_team_active));
7430 
7431   if (system_active < 0) {
7432     // There was an error reading the necessary info from /proc, so use the
7433     // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7434     // = dynamic_thread_limit, we shouldn't wind up getting back here.
7435     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7436     KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7437 
7438     // Make this call behave like the thread limit algorithm.
7439     retval = __kmp_avail_proc - __kmp_nth +
7440              (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7441     if (retval > set_nproc) {
7442       retval = set_nproc;
7443     }
7444     if (retval < KMP_MIN_NTH) {
7445       retval = KMP_MIN_NTH;
7446     }
7447 
7448     KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7449                   retval));
7450     return retval;
7451   }
7452 
  // There is a slight delay in the load balance algorithm in detecting new
  // running procs. The real system load at this instant should be at least as
  // large as the number of active OpenMP threads available to add to the team.
7456   if (system_active < team_curr_active) {
7457     system_active = team_curr_active;
7458   }
7459   retval = __kmp_avail_proc - system_active + team_curr_active;
7460   if (retval > set_nproc) {
7461     retval = set_nproc;
7462   }
7463   if (retval < KMP_MIN_NTH) {
7464     retval = KMP_MIN_NTH;
7465   }
7466 
7467   KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7468   return retval;
7469 } // __kmp_load_balance_nproc()
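
/* Worked example (illustrative numbers only): with __kmp_avail_proc = 8,
   pool_active = 2 and hot_team_active = 3, team_curr_active = 2 + 3 + 1 = 6.
   If __kmp_get_load_balance() reports system_active = 7, then
     retval = 8 - 7 + 6 = 7,
   which is then clamped to the range [KMP_MIN_NTH, set_nproc].  A negative
   system_active (i.e., /proc could not be read) instead switches the runtime
   to the thread-limit fallback shown above. */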
7470 
7471 #endif /* USE_LOAD_BALANCE */
7472 
7473 /* ------------------------------------------------------------------------ */
7474 
7475 /* NOTE: this is called with the __kmp_init_lock held */
7476 void __kmp_cleanup(void) {
7477   int f;
7478 
7479   KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7480 
7481   if (TCR_4(__kmp_init_parallel)) {
7482 #if KMP_HANDLE_SIGNALS
7483     __kmp_remove_signals();
7484 #endif
7485     TCW_4(__kmp_init_parallel, FALSE);
7486   }
7487 
7488   if (TCR_4(__kmp_init_middle)) {
7489 #if KMP_AFFINITY_SUPPORTED
7490     __kmp_affinity_uninitialize();
7491 #endif /* KMP_AFFINITY_SUPPORTED */
7492     __kmp_cleanup_hierarchy();
7493     TCW_4(__kmp_init_middle, FALSE);
7494   }
7495 
7496   KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7497 
7498   if (__kmp_init_serial) {
7499     __kmp_runtime_destroy();
7500     __kmp_init_serial = FALSE;
7501   }
7502 
7503   __kmp_cleanup_threadprivate_caches();
7504 
7505   for (f = 0; f < __kmp_threads_capacity; f++) {
7506     if (__kmp_root[f] != NULL) {
7507       __kmp_free(__kmp_root[f]);
7508       __kmp_root[f] = NULL;
7509     }
7510   }
7511   __kmp_free(__kmp_threads);
  // __kmp_threads and __kmp_root were allocated at once, as a single block, so
  // there is no need to free __kmp_root separately.
7514   __kmp_threads = NULL;
7515   __kmp_root = NULL;
7516   __kmp_threads_capacity = 0;
7517 
7518 #if KMP_USE_DYNAMIC_LOCK
7519   __kmp_cleanup_indirect_user_locks();
7520 #else
7521   __kmp_cleanup_user_locks();
7522 #endif
7523 
7524 #if KMP_AFFINITY_SUPPORTED
7525   KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7526   __kmp_cpuinfo_file = NULL;
7527 #endif /* KMP_AFFINITY_SUPPORTED */
7528 
7529 #if KMP_USE_ADAPTIVE_LOCKS
7530 #if KMP_DEBUG_ADAPTIVE_LOCKS
7531   __kmp_print_speculative_stats();
7532 #endif
7533 #endif
7534   KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7535   __kmp_nested_nth.nth = NULL;
7536   __kmp_nested_nth.size = 0;
7537   __kmp_nested_nth.used = 0;
7538   KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7539   __kmp_nested_proc_bind.bind_types = NULL;
7540   __kmp_nested_proc_bind.size = 0;
7541   __kmp_nested_proc_bind.used = 0;
7542   if (__kmp_affinity_format) {
7543     KMP_INTERNAL_FREE(__kmp_affinity_format);
7544     __kmp_affinity_format = NULL;
7545   }
7546 
7547   __kmp_i18n_catclose();
7548 
7549 #if KMP_USE_HIER_SCHED
7550   __kmp_hier_scheds.deallocate();
7551 #endif
7552 
7553 #if KMP_STATS_ENABLED
7554   __kmp_stats_fini();
7555 #endif
7556 
7557   KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7558 }
7559 
7560 /* ------------------------------------------------------------------------ */
7561 
7562 int __kmp_ignore_mppbeg(void) {
7563   char *env;
7564 
7565   if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7566     if (__kmp_str_match_false(env))
7567       return FALSE;
7568   }
  // By default __kmpc_begin() is a no-op.
7570   return TRUE;
7571 }
7572 
7573 int __kmp_ignore_mppend(void) {
7574   char *env;
7575 
7576   if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7577     if (__kmp_str_match_false(env))
7578       return FALSE;
7579   }
  // By default __kmpc_end() is a no-op.
7581   return TRUE;
7582 }
7583 
7584 void __kmp_internal_begin(void) {
7585   int gtid;
7586   kmp_root_t *root;
7587 
  /* this is a very important step as it registers new sibling threads
     and assigns these new uber threads a new gtid */
7590   gtid = __kmp_entry_gtid();
7591   root = __kmp_threads[gtid]->th.th_root;
7592   KMP_ASSERT(KMP_UBER_GTID(gtid));
7593 
7594   if (root->r.r_begin)
7595     return;
7596   __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7597   if (root->r.r_begin) {
7598     __kmp_release_lock(&root->r.r_begin_lock, gtid);
7599     return;
7600   }
7601 
7602   root->r.r_begin = TRUE;
7603 
7604   __kmp_release_lock(&root->r.r_begin_lock, gtid);
7605 }
7606 
7607 /* ------------------------------------------------------------------------ */
7608 
7609 void __kmp_user_set_library(enum library_type arg) {
7610   int gtid;
7611   kmp_root_t *root;
7612   kmp_info_t *thread;
7613 
7614   /* first, make sure we are initialized so we can get our gtid */
7615 
7616   gtid = __kmp_entry_gtid();
7617   thread = __kmp_threads[gtid];
7618 
7619   root = thread->th.th_root;
7620 
7621   KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7622                 library_serial));
7623   if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7624                                   thread */
7625     KMP_WARNING(SetLibraryIncorrectCall);
7626     return;
7627   }
7628 
7629   switch (arg) {
7630   case library_serial:
7631     thread->th.th_set_nproc = 0;
7632     set__nproc(thread, 1);
7633     break;
7634   case library_turnaround:
7635     thread->th.th_set_nproc = 0;
7636     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7637                                            : __kmp_dflt_team_nth_ub);
7638     break;
7639   case library_throughput:
7640     thread->th.th_set_nproc = 0;
7641     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7642                                            : __kmp_dflt_team_nth_ub);
7643     break;
7644   default:
7645     KMP_FATAL(UnknownLibraryType, arg);
7646   }
7647 
7648   __kmp_aux_set_library(arg);
7649 }
7650 
7651 void __kmp_aux_set_stacksize(size_t arg) {
7652   if (!__kmp_init_serial)
7653     __kmp_serial_initialize();
7654 
7655 #if KMP_OS_DARWIN
7656   if (arg & (0x1000 - 1)) {
7657     arg &= ~(0x1000 - 1);
7658     if (arg + 0x1000) /* check for overflow if we round up */
7659       arg += 0x1000;
7660   }
7661 #endif
7662   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7663 
7664   /* only change the default stacksize before the first parallel region */
7665   if (!TCR_4(__kmp_init_parallel)) {
7666     size_t value = arg; /* argument is in bytes */
7667 
7668     if (value < __kmp_sys_min_stksize)
7669       value = __kmp_sys_min_stksize;
7670     else if (value > KMP_MAX_STKSIZE)
7671       value = KMP_MAX_STKSIZE;
7672 
7673     __kmp_stksize = value;
7674 
    __kmp_env_stksize = TRUE; /* behave as if KMP_STACKSIZE was specified */
7676   }
7677 
7678   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7679 }
7680 
7681 /* set the behaviour of the runtime library */
7682 /* TODO this can cause some odd behaviour with sibling parallelism... */
7683 void __kmp_aux_set_library(enum library_type arg) {
7684   __kmp_library = arg;
7685 
7686   switch (__kmp_library) {
7687   case library_serial: {
7688     KMP_INFORM(LibraryIsSerial);
7689   } break;
7690   case library_turnaround:
7691     if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
7692       __kmp_use_yield = 2; // only yield when oversubscribed
7693     break;
7694   case library_throughput:
7695     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
7696       __kmp_dflt_blocktime = 200;
7697     break;
7698   default:
7699     KMP_FATAL(UnknownLibraryType, arg);
7700   }
7701 }
7702 
/* Get team information common to all teams-construct API routines */
// Returns NULL if not in a teams construct
7705 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
7706   kmp_info_t *thr = __kmp_entry_thread();
7707   teams_serialized = 0;
7708   if (thr->th.th_teams_microtask) {
7709     kmp_team_t *team = thr->th.th_team;
7710     int tlevel = thr->th.th_teams_level; // the level of the teams construct
7711     int ii = team->t.t_level;
7712     teams_serialized = team->t.t_serialized;
7713     int level = tlevel + 1;
7714     KMP_DEBUG_ASSERT(ii >= tlevel);
7715     while (ii > level) {
7716       for (teams_serialized = team->t.t_serialized;
7717            (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
7718       }
7719       if (team->t.t_serialized && (!teams_serialized)) {
7720         team = team->t.t_parent;
7721         continue;
7722       }
7723       if (ii > level) {
7724         team = team->t.t_parent;
7725         ii--;
7726       }
7727     }
7728     return team;
7729   }
7730   return NULL;
7731 }
7732 
7733 int __kmp_aux_get_team_num() {
7734   int serialized;
7735   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7736   if (team) {
7737     if (serialized > 1) {
7738       return 0; // teams region is serialized ( 1 team of 1 thread ).
7739     } else {
7740       return team->t.t_master_tid;
7741     }
7742   }
7743   return 0;
7744 }
7745 
7746 int __kmp_aux_get_num_teams() {
7747   int serialized;
7748   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7749   if (team) {
7750     if (serialized > 1) {
7751       return 1;
7752     } else {
7753       return team->t.t_parent->t.t_nproc;
7754     }
7755   }
7756   return 1;
7757 }
7758 
7759 /* ------------------------------------------------------------------------ */
7760 
7761 /*
7762  * Affinity Format Parser
7763  *
7764  * Field is in form of: %[[[0].]size]type
7765  * % and type are required (%% means print a literal '%')
7766  * type is either single char or long name surrounded by {},
7767  * e.g., N or {num_threads}
7768  * 0 => leading zeros
7769  * . => right justified when size is specified
7770  * by default output is left justified
7771  * size is the *minimum* field length
7772  * All other characters are printed as is
7773  *
7774  * Available field types:
7775  * L {thread_level}      - omp_get_level()
7776  * n {thread_num}        - omp_get_thread_num()
7777  * h {host}              - name of host machine
7778  * P {process_id}        - process id (integer)
7779  * T {thread_identifier} - native thread identifier (integer)
7780  * N {num_threads}       - omp_get_num_threads()
7781  * A {ancestor_tnum}     - omp_get_ancestor_thread_num(omp_get_level()-1)
7782  * a {thread_affinity}   - comma separated list of integers or integer ranges
7783  *                         (values of affinity mask)
7784  *
7785  * Implementation-specific field types can be added
7786  * If a type is unknown, print "undefined"
7787 */
7788 
7789 // Structure holding the short name, long name, and corresponding data type
7790 // for snprintf.  A table of these will represent the entire valid keyword
7791 // field types.
7792 typedef struct kmp_affinity_format_field_t {
  char short_name; // from spec, e.g., 'L' for the nesting level
  const char *long_name; // from spec, e.g., "nesting_level"
7795   char field_format; // data type for snprintf (typically 'd' or 's'
7796   // for integer or string)
7797 } kmp_affinity_format_field_t;
7798 
7799 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
7800 #if KMP_AFFINITY_SUPPORTED
7801     {'A', "thread_affinity", 's'},
7802 #endif
7803     {'t', "team_num", 'd'},
7804     {'T', "num_teams", 'd'},
7805     {'L', "nesting_level", 'd'},
7806     {'n', "thread_num", 'd'},
7807     {'N', "num_threads", 'd'},
7808     {'a', "ancestor_tnum", 'd'},
7809     {'H', "host", 's'},
7810     {'P', "process_id", 'd'},
7811     {'i', "native_thread_id", 'd'}};
7812 
// Return the number of characters it takes to hold the field
7814 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
7815                                             const char **ptr,
7816                                             kmp_str_buf_t *field_buffer) {
7817   int rc, format_index, field_value;
7818   const char *width_left, *width_right;
7819   bool pad_zeros, right_justify, parse_long_name, found_valid_name;
7820   static const int FORMAT_SIZE = 20;
7821   char format[FORMAT_SIZE] = {0};
7822   char absolute_short_name = 0;
7823 
7824   KMP_DEBUG_ASSERT(gtid >= 0);
7825   KMP_DEBUG_ASSERT(th);
7826   KMP_DEBUG_ASSERT(**ptr == '%');
7827   KMP_DEBUG_ASSERT(field_buffer);
7828 
7829   __kmp_str_buf_clear(field_buffer);
7830 
7831   // Skip the initial %
7832   (*ptr)++;
7833 
7834   // Check for %% first
7835   if (**ptr == '%') {
7836     __kmp_str_buf_cat(field_buffer, "%", 1);
7837     (*ptr)++; // skip over the second %
7838     return 1;
7839   }
7840 
7841   // Parse field modifiers if they are present
7842   pad_zeros = false;
7843   if (**ptr == '0') {
7844     pad_zeros = true;
7845     (*ptr)++; // skip over 0
7846   }
7847   right_justify = false;
7848   if (**ptr == '.') {
7849     right_justify = true;
7850     (*ptr)++; // skip over .
7851   }
7852   // Parse width of field: [width_left, width_right)
7853   width_left = width_right = NULL;
7854   if (**ptr >= '0' && **ptr <= '9') {
7855     width_left = *ptr;
7856     SKIP_DIGITS(*ptr);
7857     width_right = *ptr;
7858   }
7859 
7860   // Create the format for KMP_SNPRINTF based on flags parsed above
7861   format_index = 0;
7862   format[format_index++] = '%';
7863   if (!right_justify)
7864     format[format_index++] = '-';
7865   if (pad_zeros)
7866     format[format_index++] = '0';
7867   if (width_left && width_right) {
7868     int i = 0;
    // Only allow 8-digit number widths.
    // This also prevents overflowing the format variable.
7871     while (i < 8 && width_left < width_right) {
7872       format[format_index++] = *width_left;
7873       width_left++;
7874       i++;
7875     }
7876   }
7877 
7878   // Parse a name (long or short)
7879   // Canonicalize the name into absolute_short_name
7880   found_valid_name = false;
7881   parse_long_name = (**ptr == '{');
7882   if (parse_long_name)
7883     (*ptr)++; // skip initial left brace
7884   for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
7885                              sizeof(__kmp_affinity_format_table[0]);
7886        ++i) {
7887     char short_name = __kmp_affinity_format_table[i].short_name;
7888     const char *long_name = __kmp_affinity_format_table[i].long_name;
7889     char field_format = __kmp_affinity_format_table[i].field_format;
7890     if (parse_long_name) {
7891       int length = KMP_STRLEN(long_name);
7892       if (strncmp(*ptr, long_name, length) == 0) {
7893         found_valid_name = true;
7894         (*ptr) += length; // skip the long name
7895       }
7896     } else if (**ptr == short_name) {
7897       found_valid_name = true;
7898       (*ptr)++; // skip the short name
7899     }
7900     if (found_valid_name) {
7901       format[format_index++] = field_format;
7902       format[format_index++] = '\0';
7903       absolute_short_name = short_name;
7904       break;
7905     }
7906   }
7907   if (parse_long_name) {
7908     if (**ptr != '}') {
7909       absolute_short_name = 0;
7910     } else {
7911       (*ptr)++; // skip over the right brace
7912     }
7913   }
7914 
7915   // Attempt to fill the buffer with the requested
7916   // value using snprintf within __kmp_str_buf_print()
7917   switch (absolute_short_name) {
7918   case 't':
7919     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
7920     break;
7921   case 'T':
7922     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
7923     break;
7924   case 'L':
7925     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
7926     break;
7927   case 'n':
7928     rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
7929     break;
7930   case 'H': {
7931     static const int BUFFER_SIZE = 256;
7932     char buf[BUFFER_SIZE];
7933     __kmp_expand_host_name(buf, BUFFER_SIZE);
7934     rc = __kmp_str_buf_print(field_buffer, format, buf);
7935   } break;
7936   case 'P':
7937     rc = __kmp_str_buf_print(field_buffer, format, getpid());
7938     break;
7939   case 'i':
7940     rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
7941     break;
7942   case 'N':
7943     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
7944     break;
7945   case 'a':
7946     field_value =
7947         __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
7948     rc = __kmp_str_buf_print(field_buffer, format, field_value);
7949     break;
7950 #if KMP_AFFINITY_SUPPORTED
7951   case 'A': {
7952     kmp_str_buf_t buf;
7953     __kmp_str_buf_init(&buf);
7954     __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
7955     rc = __kmp_str_buf_print(field_buffer, format, buf.str);
7956     __kmp_str_buf_free(&buf);
7957   } break;
7958 #endif
7959   default:
    // According to the spec, if an implementation does not have info for the
    // field type, then "undefined" is printed
7962     rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
7963     // Skip the field
7964     if (parse_long_name) {
7965       SKIP_TOKEN(*ptr);
7966       if (**ptr == '}')
7967         (*ptr)++;
7968     } else {
7969       (*ptr)++;
7970     }
7971   }
7972 
7973   KMP_ASSERT(format_index <= FORMAT_SIZE);
7974   return rc;
7975 }
7976 
7977 /*
7978  * Return number of characters needed to hold the affinity string
7979  * (not including null byte character)
7980  * The resultant string is printed to buffer, which the caller can then
7981  * handle afterwards
7982 */
7983 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
7984                                   kmp_str_buf_t *buffer) {
7985   const char *parse_ptr;
7986   size_t retval;
7987   const kmp_info_t *th;
7988   kmp_str_buf_t field;
7989 
7990   KMP_DEBUG_ASSERT(buffer);
7991   KMP_DEBUG_ASSERT(gtid >= 0);
7992 
7993   __kmp_str_buf_init(&field);
7994   __kmp_str_buf_clear(buffer);
7995 
7996   th = __kmp_threads[gtid];
7997   retval = 0;
7998 
7999   // If format is NULL or zero-length string, then we use
8000   // affinity-format-var ICV
8001   parse_ptr = format;
8002   if (parse_ptr == NULL || *parse_ptr == '\0') {
8003     parse_ptr = __kmp_affinity_format;
8004   }
8005   KMP_DEBUG_ASSERT(parse_ptr);
8006 
8007   while (*parse_ptr != '\0') {
8008     // Parse a field
8009     if (*parse_ptr == '%') {
8010       // Put field in the buffer
8011       int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8012       __kmp_str_buf_catbuf(buffer, &field);
8013       retval += rc;
8014     } else {
8015       // Put literal character in buffer
8016       __kmp_str_buf_cat(buffer, parse_ptr, 1);
8017       retval++;
8018       parse_ptr++;
8019     }
8020   }
8021   __kmp_str_buf_free(&field);
8022   return retval;
8023 }
8024 
8025 // Displays the affinity string to stdout
8026 void __kmp_aux_display_affinity(int gtid, const char *format) {
8027   kmp_str_buf_t buf;
8028   __kmp_str_buf_init(&buf);
8029   __kmp_aux_capture_affinity(gtid, format, &buf);
8030   __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8031   __kmp_str_buf_free(&buf);
8032 }
8033 
8034 /* ------------------------------------------------------------------------ */
8035 
8036 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8037   int blocktime = arg; /* argument is in milliseconds */
8038 #if KMP_USE_MONITOR
8039   int bt_intervals;
8040 #endif
8041   int bt_set;
8042 
8043   __kmp_save_internal_controls(thread);
8044 
8045   /* Normalize and set blocktime for the teams */
8046   if (blocktime < KMP_MIN_BLOCKTIME)
8047     blocktime = KMP_MIN_BLOCKTIME;
8048   else if (blocktime > KMP_MAX_BLOCKTIME)
8049     blocktime = KMP_MAX_BLOCKTIME;
8050 
8051   set__blocktime_team(thread->th.th_team, tid, blocktime);
8052   set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8053 
8054 #if KMP_USE_MONITOR
8055   /* Calculate and set blocktime intervals for the teams */
8056   bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8057 
8058   set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8059   set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8060 #endif
8061 
  /* Record that blocktime has been explicitly set */
8063   bt_set = TRUE;
8064 
8065   set__bt_set_team(thread->th.th_team, tid, bt_set);
8066   set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8067 #if KMP_USE_MONITOR
8068   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8069                 "bt_intervals=%d, monitor_updates=%d\n",
8070                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8071                 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8072                 __kmp_monitor_wakeups));
8073 #else
8074   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8075                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8076                 thread->th.th_team->t.t_id, tid, blocktime));
8077 #endif
8078 }
8079 
8080 void __kmp_aux_set_defaults(char const *str, int len) {
8081   if (!__kmp_init_serial) {
8082     __kmp_serial_initialize();
8083   }
8084   __kmp_env_initialize(str);
8085 
8086   if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8087     __kmp_env_print();
8088   }
8089 } // __kmp_aux_set_defaults
8090 
8091 /* ------------------------------------------------------------------------ */
8092 /* internal fast reduction routines */
8093 
8094 PACKED_REDUCTION_METHOD_T
8095 __kmp_determine_reduction_method(
8096     ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8097     void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8098     kmp_critical_name *lck) {
8099 
  // Default reduction method: critical construct (lck != NULL, like in current
  // PAROPT).
  // If (reduce_data != NULL && reduce_func != NULL): the tree-reduction method
  // can be selected by the RTL.
  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
  // can be selected by the RTL.
  // Finally, it is up to the OpenMP RTL to decide which method to select among
  // those generated by PAROPT.
8108 
8109   PACKED_REDUCTION_METHOD_T retval;
8110 
8111   int team_size;
8112 
8113   KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8114   KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8115 
8116 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
8117   ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
8118 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8119 
8120   retval = critical_reduce_block;
8121 
  // another way of getting the team size (with 1 dynamic dereference) is slower
8123   team_size = __kmp_get_team_num_threads(global_tid);
8124   if (team_size == 1) {
8125 
8126     retval = empty_reduce_block;
8127 
8128   } else {
8129 
8130     int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8131 
8132 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 ||                   \
8133     KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64
8134 
8135 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
8136     KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8137 
8138     int teamsize_cutoff = 4;
8139 
8140 #if KMP_MIC_SUPPORTED
8141     if (__kmp_mic_type != non_mic) {
8142       teamsize_cutoff = 8;
8143     }
8144 #endif
8145     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8146     if (tree_available) {
8147       if (team_size <= teamsize_cutoff) {
8148         if (atomic_available) {
8149           retval = atomic_reduce_block;
8150         }
8151       } else {
8152         retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8153       }
8154     } else if (atomic_available) {
8155       retval = atomic_reduce_block;
8156     }
8157 #else
8158 #error "Unknown or unsupported OS"
8159 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8160        // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8161 
8162 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8163 
8164 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD
8165 
8166     // basic tuning
8167 
8168     if (atomic_available) {
8169       if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8170         retval = atomic_reduce_block;
8171       }
8172     } // otherwise: use critical section
8173 
8174 #elif KMP_OS_DARWIN
8175 
8176     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8177     if (atomic_available && (num_vars <= 3)) {
8178       retval = atomic_reduce_block;
8179     } else if (tree_available) {
8180       if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8181           (reduce_size < (2000 * sizeof(kmp_real64)))) {
8182         retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8183       }
8184     } // otherwise: use critical section
8185 
8186 #else
8187 #error "Unknown or unsupported OS"
8188 #endif
8189 
8190 #else
8191 #error "Unknown or unsupported architecture"
8192 #endif
8193   }
8194 
8195   // KMP_FORCE_REDUCTION
8196 
8197   // If the team is serialized (team_size == 1), ignore the forced reduction
8198   // method and stay with the unsynchronized method (empty_reduce_block)
8199   if (__kmp_force_reduction_method != reduction_method_not_defined &&
8200       team_size != 1) {
8201 
8202     PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8203 
8204     int atomic_available, tree_available;
8205 
8206     switch ((forced_retval = __kmp_force_reduction_method)) {
8207     case critical_reduce_block:
8208       KMP_ASSERT(lck); // lck should be != 0
8209       break;
8210 
8211     case atomic_reduce_block:
8212       atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8213       if (!atomic_available) {
8214         KMP_WARNING(RedMethodNotSupported, "atomic");
8215         forced_retval = critical_reduce_block;
8216       }
8217       break;
8218 
8219     case tree_reduce_block:
8220       tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8221       if (!tree_available) {
8222         KMP_WARNING(RedMethodNotSupported, "tree");
8223         forced_retval = critical_reduce_block;
8224       } else {
8225 #if KMP_FAST_REDUCTION_BARRIER
8226         forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8227 #endif
8228       }
8229       break;
8230 
8231     default:
8232       KMP_ASSERT(0); // "unsupported method specified"
8233     }
8234 
8235     retval = forced_retval;
8236   }
8237 
8238   KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8239 
8240 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
8241 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8242 
8243   return (retval);
8244 }
// This function is for testing the set/get/determine reduce method machinery
8246 kmp_int32 __kmp_get_reduce_method(void) {
8247   return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8248 }
8249 
8250 // Soft pause sets up threads to ignore blocktime and just go to sleep.
8251 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
8252 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8253 
8254 // Hard pause shuts down the runtime completely.  Resume happens naturally when
8255 // OpenMP is used subsequently.
8256 void __kmp_hard_pause() {
8257   __kmp_pause_status = kmp_hard_paused;
8258   __kmp_internal_end_thread(-1);
8259 }
8260 
8261 // Soft resume sets __kmp_pause_status, and wakes up all threads.
8262 void __kmp_resume_if_soft_paused() {
8263   if (__kmp_pause_status == kmp_soft_paused) {
8264     __kmp_pause_status = kmp_not_paused;
8265 
8266     for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8267       kmp_info_t *thread = __kmp_threads[gtid];
8268       if (thread) { // Wake it if sleeping
8269         kmp_flag_64 fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
8270         if (fl.is_sleeping())
8271           fl.resume(gtid);
8272         else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8273           __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8274         } else { // thread holds the lock and may sleep soon
8275           do { // until either the thread sleeps, or we can get the lock
8276             if (fl.is_sleeping()) {
8277               fl.resume(gtid);
8278               break;
8279             } else if (__kmp_try_suspend_mx(thread)) {
8280               __kmp_unlock_suspend_mx(thread);
8281               break;
8282             }
8283           } while (1);
8284         }
8285       }
8286     }
8287   }
8288 }
8289 
8290 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
8291 // TODO: add warning messages
8292 int __kmp_pause_resource(kmp_pause_status_t level) {
8293   if (level == kmp_not_paused) { // requesting resume
8294     if (__kmp_pause_status == kmp_not_paused) {
8295       // error message about runtime not being paused, so can't resume
8296       return 1;
8297     } else {
8298       KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
8299                        __kmp_pause_status == kmp_hard_paused);
8300       __kmp_pause_status = kmp_not_paused;
8301       return 0;
8302     }
8303   } else if (level == kmp_soft_paused) { // requesting soft pause
8304     if (__kmp_pause_status != kmp_not_paused) {
8305       // error message about already being paused
8306       return 1;
8307     } else {
8308       __kmp_soft_pause();
8309       return 0;
8310     }
8311   } else if (level == kmp_hard_paused) { // requesting hard pause
8312     if (__kmp_pause_status != kmp_not_paused) {
8313       // error message about already being paused
8314       return 1;
8315     } else {
8316       __kmp_hard_pause();
8317       return 0;
8318     }
8319   } else {
8320     // error message about invalid level
8321     return 1;
8322   }
8323 }
8324 
8325 
8326 void __kmp_omp_display_env(int verbose) {
8327   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8328   if (__kmp_init_serial == 0)
8329     __kmp_do_serial_initialize();
8330   __kmp_display_env_impl(!verbose, verbose);
8331   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8332 }
8333