1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
29 #endif
30 
31 #if OMPT_SUPPORT
32 #include "ompt-specific.h"
33 #endif
34 
35 #if OMPTARGET_PROFILING_SUPPORT
36 #include "llvm/Support/TimeProfiler.h"
37 static char *ProfileTraceFile = nullptr;
38 #endif
39 
40 /* these are temporary issues to be dealt with */
41 #define KMP_USE_PRCTL 0
42 
43 #if KMP_OS_WINDOWS
44 #include <process.h>
45 #endif
46 
47 #include "tsan_annotations.h"
48 
49 #if KMP_OS_WINDOWS
50 // Windows does not need these include files as it doesn't use shared memory
51 #else
52 #include <sys/mman.h>
53 #include <sys/stat.h>
54 #include <fcntl.h>
55 #define SHM_SIZE 1024
56 #endif
57 
58 #if defined(KMP_GOMP_COMPAT)
59 char const __kmp_version_alt_comp[] =
60     KMP_VERSION_PREFIX "alternative compiler support: yes";
61 #endif /* defined(KMP_GOMP_COMPAT) */
62 
63 char const __kmp_version_omp_api[] =
64     KMP_VERSION_PREFIX "API version: 5.0 (201611)";
65 
66 #ifdef KMP_DEBUG
67 char const __kmp_version_lock[] =
68     KMP_VERSION_PREFIX "lock type: run time selectable";
69 #endif /* KMP_DEBUG */
70 
71 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
72 
73 /* ------------------------------------------------------------------------ */
74 
75 #if KMP_USE_MONITOR
76 kmp_info_t __kmp_monitor;
77 #endif
78 
79 /* Forward declarations */
80 
81 void __kmp_cleanup(void);
82 
83 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
84                                   int gtid);
85 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
86                                   kmp_internal_control_t *new_icvs,
87                                   ident_t *loc);
88 #if KMP_AFFINITY_SUPPORTED
89 static void __kmp_partition_places(kmp_team_t *team,
90                                    int update_master_only = 0);
91 #endif
92 static void __kmp_do_serial_initialize(void);
93 void __kmp_fork_barrier(int gtid, int tid);
94 void __kmp_join_barrier(int gtid);
95 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
96                           kmp_internal_control_t *new_icvs, ident_t *loc);
97 
98 #ifdef USE_LOAD_BALANCE
99 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
100 #endif
101 
102 static int __kmp_expand_threads(int nNeed);
103 #if KMP_OS_WINDOWS
104 static int __kmp_unregister_root_other_thread(int gtid);
105 #endif
106 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
107 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
108 
109 /* Calculate the identifier of the current thread */
110 /* A fast (and somewhat portable) way to get a unique identifier for the
111    executing thread. Returns KMP_GTID_DNE if no gtid has been assigned yet. */
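/* Lookup order below: __kmp_gtid (static TLS) when __kmp_gtid_mode >= 3, the
   keyed TLS value when __kmp_gtid_mode >= 2, otherwise a search of the current
   stack address against the recorded stack extents of all registered threads. */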
112 int __kmp_get_global_thread_id() {
113   int i;
114   kmp_info_t **other_threads;
115   size_t stack_data;
116   char *stack_addr;
117   size_t stack_size;
118   char *stack_base;
119 
120   KA_TRACE(
121       1000,
122       ("*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
123        __kmp_nth, __kmp_all_nth));
124 
125 /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
126    a parallel region, this returns KMP_GTID_DNE to force serial_initialize by
127    the caller. KMP_GTID_DNE must be handled at all call sites, or else
128    __kmp_init_gtid must be guaranteed, for this to work. */
129 
130   if (!TCR_4(__kmp_init_gtid))
131     return KMP_GTID_DNE;
132 
133 #ifdef KMP_TDATA_GTID
134   if (TCR_4(__kmp_gtid_mode) >= 3) {
135     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
136     return __kmp_gtid;
137   }
138 #endif
139   if (TCR_4(__kmp_gtid_mode) >= 2) {
140     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
141     return __kmp_gtid_get_specific();
142   }
143   KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
144 
145   stack_addr = (char *)&stack_data;
146   other_threads = __kmp_threads;
147 
148   /* ATT: The code below is a source of potential bugs due to unsynchronized
149      access to __kmp_threads array. For example:
150      1. Current thread loads other_threads[i] to thr and checks it, it is
151         non-NULL.
152      2. Current thread is suspended by OS.
153      3. Another thread unregisters and finishes (debug versions of free()
154         may fill memory with something like 0xEF).
155      4. Current thread is resumed.
156      5. Current thread reads junk from *thr.
157      TODO: Fix it.  --ln  */
158 
159   for (i = 0; i < __kmp_threads_capacity; i++) {
160 
161     kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
162     if (!thr)
163       continue;
164 
165     stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
166     stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
167 
168     /* stack grows down -- search through all of the active threads */
169 
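    /* A thread owns the addresses in [stack_base - stack_size, stack_base], so
       the current address belongs to thread i exactly when
       0 <= stack_base - stack_addr <= stack_size, which is checked below. */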
170     if (stack_addr <= stack_base) {
171       size_t stack_diff = stack_base - stack_addr;
172 
173       if (stack_diff <= stack_size) {
174         /* The only way we can be closer than the allocated stack size is if
175            we are running on this thread. */
176         KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
177         return i;
178       }
179     }
180   }
181 
182   /* use get_specific (TLS) to try to determine our gtid */
183   KA_TRACE(1000,
184            ("*** __kmp_get_global_thread_id: internal alg. failed to find "
185             "thread, using TLS\n"));
186   i = __kmp_gtid_get_specific();
187 
188   /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
189 
190   /* if we haven't been assigned a gtid, return the error code */
191   if (i < 0)
192     return i;
193 
194   /* dynamically update the stack window for uber threads to avoid the
195      get_specific call */
196   if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
197     KMP_FATAL(StackOverflow, i);
198   }
199 
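  /* Refine the recorded stack window so it covers the current address: if we
     are above the recorded base, raise the base to the current address and grow
     the size by the difference; otherwise grow the size downward to reach the
     current address. */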
200   stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
201   if (stack_addr > stack_base) {
202     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
203     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
204             other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
205                 stack_base);
206   } else {
207     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
208             stack_base - stack_addr);
209   }
210 
211   /* Reprint stack bounds for ubermaster since they have been refined */
212   if (__kmp_storage_map) {
213     char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
214     char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
215     __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
216                                  other_threads[i]->th.th_info.ds.ds_stacksize,
217                                  "th_%d stack (refinement)", i);
218   }
219   return i;
220 }
221 
222 int __kmp_get_global_thread_id_reg() {
223   int gtid;
224 
225   if (!__kmp_init_serial) {
226     gtid = KMP_GTID_DNE;
227   } else
228 #ifdef KMP_TDATA_GTID
229       if (TCR_4(__kmp_gtid_mode) >= 3) {
230     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
231     gtid = __kmp_gtid;
232   } else
233 #endif
234       if (TCR_4(__kmp_gtid_mode) >= 2) {
235     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
236     gtid = __kmp_gtid_get_specific();
237   } else {
238     KA_TRACE(1000,
239              ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
240     gtid = __kmp_get_global_thread_id();
241   }
242 
243   /* we must be a new uber master sibling thread */
244   if (gtid == KMP_GTID_DNE) {
245     KA_TRACE(10,
246              ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
247               "Registering a new gtid.\n"));
248     __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
249     if (!__kmp_init_serial) {
250       __kmp_do_serial_initialize();
251       gtid = __kmp_gtid_get_specific();
252     } else {
253       gtid = __kmp_register_root(FALSE);
254     }
255     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
256     /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
257   }
258 
259   KMP_DEBUG_ASSERT(gtid >= 0);
260 
261   return gtid;
262 }
263 
264 /* caller must hold forkjoin_lock */
265 void __kmp_check_stack_overlap(kmp_info_t *th) {
266   int f;
267   char *stack_beg = NULL;
268   char *stack_end = NULL;
269   int gtid;
270 
271   KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
272   if (__kmp_storage_map) {
273     stack_end = (char *)th->th.th_info.ds.ds_stackbase;
274     stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
275 
276     gtid = __kmp_gtid_from_thread(th);
277 
278     if (gtid == KMP_GTID_MONITOR) {
279       __kmp_print_storage_map_gtid(
280           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
281           "th_%s stack (%s)", "mon",
282           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
283     } else {
284       __kmp_print_storage_map_gtid(
285           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
286           "th_%d stack (%s)", gtid,
287           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
288     }
289   }
290 
291   /* No point in checking ubermaster threads since they use refinement and
292    * cannot overlap */
293   gtid = __kmp_gtid_from_thread(th);
294   if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
295     KA_TRACE(10,
296              ("__kmp_check_stack_overlap: performing extensive checking\n"));
297     if (stack_beg == NULL) {
298       stack_end = (char *)th->th.th_info.ds.ds_stackbase;
299       stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
300     }
301 
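    /* Compare this thread's stack range against the recorded stack extent of
       every other registered thread; any overlap is fatal. */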
302     for (f = 0; f < __kmp_threads_capacity; f++) {
303       kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
304 
305       if (f_th && f_th != th) {
306         char *other_stack_end =
307             (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
308         char *other_stack_beg =
309             other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
310         if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
311             (stack_end > other_stack_beg && stack_end < other_stack_end)) {
312 
313           /* Print the other stack values before the abort */
314           if (__kmp_storage_map)
315             __kmp_print_storage_map_gtid(
316                 -1, other_stack_beg, other_stack_end,
317                 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
318                 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
319 
320           __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
321                       __kmp_msg_null);
322         }
323       }
324     }
325   }
326   KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
327 }
328 
329 /* ------------------------------------------------------------------------ */
330 
331 void __kmp_infinite_loop(void) {
332   static int done = FALSE;
333 
334   while (!done) {
335     KMP_YIELD(TRUE);
336   }
337 }
338 
339 #define MAX_MESSAGE 512
340 
341 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
342                                   char const *format, ...) {
343   char buffer[MAX_MESSAGE];
344   va_list ap;
345 
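  /* The caller's format string is embedded (via %s) into a larger format that
     prefixes the two addresses and the block size; the caller's varargs are then
     applied to the combined string by __kmp_vprintf below. */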
346   va_start(ap, format);
347   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
348                p2, (unsigned long)size, format);
349   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
350   __kmp_vprintf(kmp_err, buffer, ap);
351 #if KMP_PRINT_DATA_PLACEMENT
352   int node;
353   if (gtid >= 0) {
354     if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
355       if (__kmp_storage_map_verbose) {
356         node = __kmp_get_host_node(p1);
357         if (node < 0) /* doesn't work, so don't try this next time */
358           __kmp_storage_map_verbose = FALSE;
359         else {
360           char *last;
361           int lastNode;
362           int localProc = __kmp_get_cpu_from_gtid(gtid);
363 
364           const int page_size = KMP_GET_PAGE_SIZE();
365 
366           p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
367           p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
368           if (localProc >= 0)
369             __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
370                                  localProc >> 1);
371           else
372             __kmp_printf_no_lock("  GTID %d\n", gtid);
373 #if KMP_USE_PRCTL
374           /* The more elaborate format is disabled for now because of the prctl
375            * hanging bug. */
376           do {
377             last = p1;
378             lastNode = node;
379             /* This loop collates adjacent pages with the same host node. */
380             do {
381               p1 = (char *)p1 + page_size;
382             } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
383             __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
384                                  lastNode);
385           } while (p1 <= p2);
386 #else
387           __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
388                                (char *)p1 + (page_size - 1),
389                                __kmp_get_host_node(p1));
390           if (p1 < p2) {
391             __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
392                                  (char *)p2 + (page_size - 1),
393                                  __kmp_get_host_node(p2));
394           }
395 #endif
396         }
397       }
398     } else
399       __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
400   }
401 #endif /* KMP_PRINT_DATA_PLACEMENT */
402   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
403 }
404 
405 void __kmp_warn(char const *format, ...) {
406   char buffer[MAX_MESSAGE];
407   va_list ap;
408 
409   if (__kmp_generate_warnings == kmp_warnings_off) {
410     return;
411   }
412 
413   va_start(ap, format);
414 
415   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
416   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
417   __kmp_vprintf(kmp_err, buffer, ap);
418   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
419 
420   va_end(ap);
421 }
422 
423 void __kmp_abort_process() {
424   // Later threads may stall here, but that's ok because abort() will kill them.
425   __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
426 
427   if (__kmp_debug_buf) {
428     __kmp_dump_debug_buffer();
429   }
430 
431   if (KMP_OS_WINDOWS) {
432     // Let other threads know of abnormal termination and prevent deadlock
433     // if abort happened during library initialization or shutdown
434     __kmp_global.g.g_abort = SIGABRT;
435 
436     /* On Windows* OS, abort() by default causes a pop-up error box, which
437        stalls nightly testing. Unfortunately, we cannot reliably suppress the
438        pop-up error boxes. _set_abort_behavior() works well, but this function
439        is not available in VS7 (this is not a problem for the DLL, but it is a
440        problem for the static OpenMP RTL). SetErrorMode (and so, the timelimit
441        utility) does not help, at least in some versions of the MS C RTL.
442 
443        It seems the following sequence is the only way to simulate abort() and
444        avoid the pop-up error box. */
445     raise(SIGABRT);
446     _exit(3); // Just in case, if signal ignored, exit anyway.
447   } else {
448     __kmp_unregister_library();
449     abort();
450   }
451 
452   __kmp_infinite_loop();
453   __kmp_release_bootstrap_lock(&__kmp_exit_lock);
454 
455 } // __kmp_abort_process
456 
457 void __kmp_abort_thread(void) {
458   // TODO: Eliminate g_abort global variable and this function.
459   // In case of abort just call abort(), it will kill all the threads.
460   __kmp_infinite_loop();
461 } // __kmp_abort_thread
462 
463 /* Print out the storage map for the major kmp_info_t thread data structures
464    that are allocated together. */
465 
466 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
467   __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
468                                gtid);
469 
470   __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
471                                sizeof(kmp_desc_t), "th_%d.th_info", gtid);
472 
473   __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
474                                sizeof(kmp_local_t), "th_%d.th_local", gtid);
475 
476   __kmp_print_storage_map_gtid(
477       gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
478       sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
479 
480   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
481                                &thr->th.th_bar[bs_plain_barrier + 1],
482                                sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
483                                gtid);
484 
485   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
486                                &thr->th.th_bar[bs_forkjoin_barrier + 1],
487                                sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
488                                gtid);
489 
490 #if KMP_FAST_REDUCTION_BARRIER
491   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
492                                &thr->th.th_bar[bs_reduction_barrier + 1],
493                                sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
494                                gtid);
495 #endif // KMP_FAST_REDUCTION_BARRIER
496 }
497 
498 /* Print out the storage map for the major kmp_team_t team data structures
499    that are allocated together. */
500 
501 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
502                                          int team_id, int num_thr) {
503   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
504   __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
505                                header, team_id);
506 
507   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
508                                &team->t.t_bar[bs_last_barrier],
509                                sizeof(kmp_balign_team_t) * bs_last_barrier,
510                                "%s_%d.t_bar", header, team_id);
511 
512   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
513                                &team->t.t_bar[bs_plain_barrier + 1],
514                                sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
515                                header, team_id);
516 
517   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
518                                &team->t.t_bar[bs_forkjoin_barrier + 1],
519                                sizeof(kmp_balign_team_t),
520                                "%s_%d.t_bar[forkjoin]", header, team_id);
521 
522 #if KMP_FAST_REDUCTION_BARRIER
523   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
524                                &team->t.t_bar[bs_reduction_barrier + 1],
525                                sizeof(kmp_balign_team_t),
526                                "%s_%d.t_bar[reduction]", header, team_id);
527 #endif // KMP_FAST_REDUCTION_BARRIER
528 
529   __kmp_print_storage_map_gtid(
530       -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
531       sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
532 
533   __kmp_print_storage_map_gtid(
534       -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
535       sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
536 
537   __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
538                                &team->t.t_disp_buffer[num_disp_buff],
539                                sizeof(dispatch_shared_info_t) * num_disp_buff,
540                                "%s_%d.t_disp_buffer", header, team_id);
541 }
542 
543 static void __kmp_init_allocator() { __kmp_init_memkind(); }
544 static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
545 
546 /* ------------------------------------------------------------------------ */
547 
548 #if KMP_DYNAMIC_LIB
549 #if KMP_OS_WINDOWS
550 
551 static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
552   // TODO: Change to __kmp_break_bootstrap_lock().
553   __kmp_init_bootstrap_lock(lck); // make the lock released
554 }
555 
556 static void __kmp_reset_locks_on_process_detach(int gtid_req) {
557   int i;
558   int thread_count;
559 
560   // PROCESS_DETACH is expected to be called by a thread that executes
561   // ProcessExit() or FreeLibrary(). The OS terminates the other threads
562   // (except the one calling ProcessExit or FreeLibrary), so it might seem safe
563   // to access __kmp_threads[] without taking the forkjoin_lock. However, some
564   // threads may still be alive here, although they are about to be terminated.
565   // The threads in the array with ds_thread==0 are the most suspicious. In
566   // fact, it may not be safe to access __kmp_threads[] at all.
567 
568   // TODO: does it make sense to check __kmp_roots[] ?
569 
570   // Check that there are no other live threads registered with the OpenMP
571   // library.
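  // Spin until the OS reports that no registered thread other than gtid_req is
  // still alive.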
572   while (1) {
573     thread_count = 0;
574     for (i = 0; i < __kmp_threads_capacity; ++i) {
575       if (!__kmp_threads)
576         continue;
577       kmp_info_t *th = __kmp_threads[i];
578       if (th == NULL)
579         continue;
580       int gtid = th->th.th_info.ds.ds_gtid;
581       if (gtid == gtid_req)
582         continue;
583       if (gtid < 0)
584         continue;
585       DWORD exit_val;
586       int alive = __kmp_is_thread_alive(th, &exit_val);
587       if (alive) {
588         ++thread_count;
589       }
590     }
591     if (thread_count == 0)
592       break; // success
593   }
594 
595   // Assume that I'm alone. Now it might be safe to check and reset locks.
596   // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
597   __kmp_reset_lock(&__kmp_forkjoin_lock);
598 #ifdef KMP_DEBUG
599   __kmp_reset_lock(&__kmp_stdio_lock);
600 #endif // KMP_DEBUG
601 }
602 
603 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
604   //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
605 
606   switch (fdwReason) {
607 
608   case DLL_PROCESS_ATTACH:
609     KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
610 
611     return TRUE;
612 
613   case DLL_PROCESS_DETACH:
614     KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
615 
616     if (lpReserved != NULL) {
617       // lpReserved is used for telling the difference:
618       //   lpReserved == NULL when FreeLibrary() was called,
619       //   lpReserved != NULL when the process terminates.
620       // When FreeLibrary() is called, worker threads remain alive. So they will
621       // release the forkjoin lock by themselves. When the process terminates,
622       // worker threads disappear triggering the problem of unreleased forkjoin
623       // lock as described below.
624 
625       // A worker thread can take the forkjoin lock. The problem comes up if
626       // that worker thread becomes dead before it releases the forkjoin lock.
627       // The forkjoin lock remains taken, while the thread executing
628       // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try
629       // to take the forkjoin lock and will always fail, so that the application
630       // will never finish [normally]. This scenario is possible if
631       // __kmpc_end() has not been executed. It looks like it's not a corner
632       // case, but common cases:
633       // - the main function was compiled by an alternative compiler;
634       // - the main function was compiled by icl but without /Qopenmp
635       //   (application with plugins);
636       // - application terminates by calling C exit(), Fortran CALL EXIT() or
637       //   Fortran STOP.
638       // - alive foreign thread prevented __kmpc_end from doing cleanup.
639       //
640       // This is a hack to work around the problem.
641       // TODO: !!! figure out something better.
642       __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
643     }
644 
645     __kmp_internal_end_library(__kmp_gtid_get_specific());
646 
647     return TRUE;
648 
649   case DLL_THREAD_ATTACH:
650     KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
651 
652     /* if we want to register new sibling threads every time, call
653      * __kmp_get_gtid() here; */
654     return TRUE;
655 
656   case DLL_THREAD_DETACH:
657     KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
658 
659     __kmp_internal_end_thread(__kmp_gtid_get_specific());
660     return TRUE;
661   }
662 
663   return TRUE;
664 }
665 
666 #endif /* KMP_OS_WINDOWS */
667 #endif /* KMP_DYNAMIC_LIB */
668 
669 /* __kmp_parallel_deo -- Wait until it's our turn. */
670 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
671   int gtid = *gtid_ref;
672 #ifdef BUILD_PARALLEL_ORDERED
673   kmp_team_t *team = __kmp_team_from_gtid(gtid);
674 #endif /* BUILD_PARALLEL_ORDERED */
675 
676   if (__kmp_env_consistency_check) {
677     if (__kmp_threads[gtid]->th.th_root->r.r_active)
678 #if KMP_USE_DYNAMIC_LOCK
679       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
680 #else
681       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
682 #endif
683   }
684 #ifdef BUILD_PARALLEL_ORDERED
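  /* Each thread waits until the team's ordered ticket (t_ordered.dt.t_value)
     equals its own tid, i.e. until it is this thread's turn in the ordered
     sequence; __kmp_parallel_dxo below passes the ticket to the next tid. */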
685   if (!team->t.t_serialized) {
686     KMP_MB();
687     KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
688              NULL);
689     KMP_MB();
690   }
691 #endif /* BUILD_PARALLEL_ORDERED */
692 }
693 
694 /* __kmp_parallel_dxo -- Signal the next task. */
695 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
696   int gtid = *gtid_ref;
697 #ifdef BUILD_PARALLEL_ORDERED
698   int tid = __kmp_tid_from_gtid(gtid);
699   kmp_team_t *team = __kmp_team_from_gtid(gtid);
700 #endif /* BUILD_PARALLEL_ORDERED */
701 
702   if (__kmp_env_consistency_check) {
703     if (__kmp_threads[gtid]->th.th_root->r.r_active)
704       __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
705   }
706 #ifdef BUILD_PARALLEL_ORDERED
707   if (!team->t.t_serialized) {
708     KMP_MB(); /* Flush all pending memory write invalidates.  */
709 
710     /* use the tid of the next thread in this team */
711     /* TODO replace with general release procedure */
712     team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
713 
714     KMP_MB(); /* Flush all pending memory write invalidates.  */
715   }
716 #endif /* BUILD_PARALLEL_ORDERED */
717 }
718 
719 /* ------------------------------------------------------------------------ */
720 /* The BARRIER for a SINGLE process section is always explicit   */
721 
722 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
723   int status;
724   kmp_info_t *th;
725   kmp_team_t *team;
726 
727   if (!TCR_4(__kmp_init_parallel))
728     __kmp_parallel_initialize();
729   __kmp_resume_if_soft_paused();
730 
731   th = __kmp_threads[gtid];
732   team = th->th.th_team;
733   status = 0;
734 
735   th->th.th_ident = id_ref;
736 
737   if (team->t.t_serialized) {
738     status = 1;
739   } else {
740     kmp_int32 old_this = th->th.th_local.this_construct;
741 
742     ++th->th.th_local.this_construct;
743     /* try to advance the team's construct count to this thread's count --
744        success means this thread claimed the single block */
745     /* TODO: Should this be acquire or release? */
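    /* The thread that atomically advances t_construct from the old value to its
       own this_construct count claims the single block; all other threads see
       status == 0 and skip the block. */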
746     if (team->t.t_construct == old_this) {
747       status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
748                                               th->th.th_local.this_construct);
749     }
750 #if USE_ITT_BUILD
751     if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
752         KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
753         team->t.t_active_level ==
754             1) { // Only report metadata by master of active team at level 1
755       __kmp_itt_metadata_single(id_ref);
756     }
757 #endif /* USE_ITT_BUILD */
758   }
759 
760   if (__kmp_env_consistency_check) {
761     if (status && push_ws) {
762       __kmp_push_workshare(gtid, ct_psingle, id_ref);
763     } else {
764       __kmp_check_workshare(gtid, ct_psingle, id_ref);
765     }
766   }
767 #if USE_ITT_BUILD
768   if (status) {
769     __kmp_itt_single_start(gtid);
770   }
771 #endif /* USE_ITT_BUILD */
772   return status;
773 }
774 
775 void __kmp_exit_single(int gtid) {
776 #if USE_ITT_BUILD
777   __kmp_itt_single_end(gtid);
778 #endif /* USE_ITT_BUILD */
779   if (__kmp_env_consistency_check)
780     __kmp_pop_workshare(gtid, ct_psingle, NULL);
781 }
782 
783 /* Determine whether we can go parallel or must use a serialized parallel
784  * region, and how many threads we can use.
785  * set_nthreads is the number of threads requested for the team.
786  * Returns 1 if we should serialize or use only one thread,
787  * otherwise the number of threads to use.
788  * The forkjoin lock is held by the caller. */
789 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
790                                  int master_tid, int set_nthreads,
791                                  int enter_teams) {
792   int capacity;
793   int new_nthreads;
794   KMP_DEBUG_ASSERT(__kmp_init_serial);
795   KMP_DEBUG_ASSERT(root && parent_team);
796   kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
797 
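  /* The requested thread count is clamped in stages below: first by the dynamic
     adjustment mode (load balance, thread limit, or random), then by
     KMP_DEVICE_THREAD_LIMIT, then by OMP_THREAD_LIMIT, and finally by the
     remaining capacity of the __kmp_threads array. */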
798   // If dyn-var is set, dynamically adjust the number of desired threads,
799   // according to the method specified by dynamic_mode.
800   new_nthreads = set_nthreads;
801   if (!get__dynamic_2(parent_team, master_tid)) {
802     ;
803   }
804 #ifdef USE_LOAD_BALANCE
805   else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
806     new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
807     if (new_nthreads == 1) {
808       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
809                     "reservation to 1 thread\n",
810                     master_tid));
811       return 1;
812     }
813     if (new_nthreads < set_nthreads) {
814       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
815                     "reservation to %d threads\n",
816                     master_tid, new_nthreads));
817     }
818   }
819 #endif /* USE_LOAD_BALANCE */
820   else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
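    // Allow as many threads as there are available processors not already in
    // use, adding back the threads counted in __kmp_nth that this region will
    // reuse (the master, or the whole hot team when the root is not active).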
821     new_nthreads = __kmp_avail_proc - __kmp_nth +
822                    (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
823     if (new_nthreads <= 1) {
824       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
825                     "reservation to 1 thread\n",
826                     master_tid));
827       return 1;
828     }
829     if (new_nthreads < set_nthreads) {
830       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
831                     "reservation to %d threads\n",
832                     master_tid, new_nthreads));
833     } else {
834       new_nthreads = set_nthreads;
835     }
836   } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
837     if (set_nthreads > 2) {
838       new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
839       new_nthreads = (new_nthreads % set_nthreads) + 1;
840       if (new_nthreads == 1) {
841         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
842                       "reservation to 1 thread\n",
843                       master_tid));
844         return 1;
845       }
846       if (new_nthreads < set_nthreads) {
847         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
848                       "reservation to %d threads\n",
849                       master_tid, new_nthreads));
850       }
851     }
852   } else {
853     KMP_ASSERT(0);
854   }
855 
856   // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
857   if (__kmp_nth + new_nthreads -
858           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
859       __kmp_max_nth) {
860     int tl_nthreads = __kmp_max_nth - __kmp_nth +
861                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
862     if (tl_nthreads <= 0) {
863       tl_nthreads = 1;
864     }
865 
866     // If dyn-var is false, emit a 1-time warning.
867     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
868       __kmp_reserve_warn = 1;
869       __kmp_msg(kmp_ms_warning,
870                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
871                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
872     }
873     if (tl_nthreads == 1) {
874       KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
875                     "reduced reservation to 1 thread\n",
876                     master_tid));
877       return 1;
878     }
879     KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
880                   "reservation to %d threads\n",
881                   master_tid, tl_nthreads));
882     new_nthreads = tl_nthreads;
883   }
884 
885   // Respect OMP_THREAD_LIMIT
886   int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
887   int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
888   if (cg_nthreads + new_nthreads -
889           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
890       max_cg_threads) {
891     int tl_nthreads = max_cg_threads - cg_nthreads +
892                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
893     if (tl_nthreads <= 0) {
894       tl_nthreads = 1;
895     }
896 
897     // If dyn-var is false, emit a 1-time warning.
898     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
899       __kmp_reserve_warn = 1;
900       __kmp_msg(kmp_ms_warning,
901                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
902                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
903     }
904     if (tl_nthreads == 1) {
905       KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
906                     "reduced reservation to 1 thread\n",
907                     master_tid));
908       return 1;
909     }
910     KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
911                   "reservation to %d threads\n",
912                   master_tid, tl_nthreads));
913     new_nthreads = tl_nthreads;
914   }
915 
916   // Check if the threads array is large enough, or needs expanding.
917   // See comment in __kmp_register_root() about the adjustment if
918   // __kmp_threads[0] == NULL.
919   capacity = __kmp_threads_capacity;
920   if (TCR_PTR(__kmp_threads[0]) == NULL) {
921     --capacity;
922   }
923   if (__kmp_nth + new_nthreads -
924           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
925       capacity) {
926     // Expand the threads array.
927     int slotsRequired = __kmp_nth + new_nthreads -
928                         (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
929                         capacity;
930     int slotsAdded = __kmp_expand_threads(slotsRequired);
931     if (slotsAdded < slotsRequired) {
932       // The threads array was not expanded enough.
933       new_nthreads -= (slotsRequired - slotsAdded);
934       KMP_ASSERT(new_nthreads >= 1);
935 
936       // If dyn-var is false, emit a 1-time warning.
937       if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
938         __kmp_reserve_warn = 1;
939         if (__kmp_tp_cached) {
940           __kmp_msg(kmp_ms_warning,
941                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
942                     KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
943                     KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
944         } else {
945           __kmp_msg(kmp_ms_warning,
946                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
947                     KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
948         }
949       }
950     }
951   }
952 
953 #ifdef KMP_DEBUG
954   if (new_nthreads == 1) {
955     KC_TRACE(10,
956              ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
957               "dead roots and rechecking; requested %d threads\n",
958               __kmp_get_gtid(), set_nthreads));
959   } else {
960     KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
961                   " %d threads\n",
962                   __kmp_get_gtid(), new_nthreads, set_nthreads));
963   }
964 #endif // KMP_DEBUG
965   return new_nthreads;
966 }
967 
968 /* Allocate threads from the thread pool and assign them to the new team. We
969    are assured that there are enough threads available, because we checked
970    earlier while holding the forkjoin lock. */
971 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
972                                     kmp_info_t *master_th, int master_gtid) {
973   int i;
974   int use_hot_team;
975 
976   KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
977   KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
978   KMP_MB();
979 
980   /* first, let's setup the master thread */
981   master_th->th.th_info.ds.ds_tid = 0;
982   master_th->th.th_team = team;
983   master_th->th.th_team_nproc = team->t.t_nproc;
984   master_th->th.th_team_master = master_th;
985   master_th->th.th_team_serialized = FALSE;
986   master_th->th.th_dispatch = &team->t.t_dispatch[0];
987 
988 /* make sure we are not the optimized hot team */
989 #if KMP_NESTED_HOT_TEAMS
990   use_hot_team = 0;
991   kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
992   if (hot_teams) { // hot teams array is not allocated if
993     // KMP_HOT_TEAMS_MAX_LEVEL=0
994     int level = team->t.t_active_level - 1; // index in array of hot teams
995     if (master_th->th.th_teams_microtask) { // are we inside the teams?
996       if (master_th->th.th_teams_size.nteams > 1) {
997         ++level; // level was not increased in teams construct for
998         // team_of_masters
999       }
1000       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1001           master_th->th.th_teams_level == team->t.t_level) {
1002         ++level; // level was not increased in teams construct for
1003         // team_of_workers before the parallel
1004       } // team->t.t_level will be increased inside parallel
1005     }
1006     if (level < __kmp_hot_teams_max_level) {
1007       if (hot_teams[level].hot_team) {
1008         // hot team has already been allocated for given level
1009         KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1010         use_hot_team = 1; // the team is ready to use
1011       } else {
1012         use_hot_team = 0; // AC: threads are not allocated yet
1013         hot_teams[level].hot_team = team; // remember new hot team
1014         hot_teams[level].hot_team_nth = team->t.t_nproc;
1015       }
1016     } else {
1017       use_hot_team = 0;
1018     }
1019   }
1020 #else
1021   use_hot_team = team == root->r.r_hot_team;
1022 #endif
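  /* Only when the hot team is not being reused do we need to install the master
     thread, fork or reallocate the workers, and initialize their barrier
     state. */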
1023   if (!use_hot_team) {
1024 
1025     /* install the master thread */
1026     team->t.t_threads[0] = master_th;
1027     __kmp_initialize_info(master_th, team, 0, master_gtid);
1028 
1029     /* now, install the worker threads */
1030     for (i = 1; i < team->t.t_nproc; i++) {
1031 
1032       /* fork or reallocate a new thread and install it in team */
1033       kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1034       team->t.t_threads[i] = thr;
1035       KMP_DEBUG_ASSERT(thr);
1036       KMP_DEBUG_ASSERT(thr->th.th_team == team);
1037       /* align team and thread arrived states */
1038       KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1039                     "T#%d(%d:%d) join =%llu, plain=%llu\n",
1040                     __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1041                     __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1042                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1043                     team->t.t_bar[bs_plain_barrier].b_arrived));
1044       thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1045       thr->th.th_teams_level = master_th->th.th_teams_level;
1046       thr->th.th_teams_size = master_th->th.th_teams_size;
1047       { // Initialize threads' barrier data.
1048         int b;
1049         kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1050         for (b = 0; b < bs_last_barrier; ++b) {
1051           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1052           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1053 #if USE_DEBUGGER
1054           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1055 #endif
1056         }
1057       }
1058     }
1059 
1060 #if KMP_AFFINITY_SUPPORTED
1061     __kmp_partition_places(team);
1062 #endif
1063   }
1064 
1065   if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1066     for (i = 0; i < team->t.t_nproc; i++) {
1067       kmp_info_t *thr = team->t.t_threads[i];
1068       if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1069           thr->th.th_prev_level != team->t.t_level) {
1070         team->t.t_display_affinity = 1;
1071         break;
1072       }
1073     }
1074   }
1075 
1076   KMP_MB();
1077 }
1078 
1079 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1080 // Propagate any changes to the floating point control registers out to the
1081 // team. We try to avoid unnecessary writes to the relevant cache line in the
1082 // team structure, so we don't make changes unless they are needed.
1083 inline static void propagateFPControl(kmp_team_t *team) {
1084   if (__kmp_inherit_fp_control) {
1085     kmp_int16 x87_fpu_control_word;
1086     kmp_uint32 mxcsr;
1087 
1088     // Get master values of FPU control flags (both X87 and vector)
1089     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1090     __kmp_store_mxcsr(&mxcsr);
1091     mxcsr &= KMP_X86_MXCSR_MASK;
1092 
1093     // There is no point looking at t_fp_control_saved here.
1094     // If it is TRUE, we still have to update the values if they are different
1095     // from those we now have. If it is FALSE we didn't save anything yet, but
1096     // our objective is the same. We have to ensure that the values in the team
1097     // are the same as those we have.
1098     // So, this code achieves what we need whether or not t_fp_control_saved is
1099     // true. By checking whether the value needs updating we avoid unnecessary
1100     // writes that would put the cache-line into a written state, causing all
1101     // threads in the team to have to read it again.
1102     KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1103     KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1104     // Although we don't use this value, other code in the runtime wants to know
1105     // whether it should restore them. So we must ensure it is correct.
1106     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1107   } else {
1108     // Similarly here. Don't write to this cache-line in the team structure
1109     // unless we have to.
1110     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1111   }
1112 }
1113 
1114 // Do the opposite, setting the hardware registers to the updated values from
1115 // the team.
1116 inline static void updateHWFPControl(kmp_team_t *team) {
1117   if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1118     // Only reset the fp control regs if they have been changed in
1119     // the parallel region that we are exiting.
1120     kmp_int16 x87_fpu_control_word;
1121     kmp_uint32 mxcsr;
1122     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1123     __kmp_store_mxcsr(&mxcsr);
1124     mxcsr &= KMP_X86_MXCSR_MASK;
1125 
1126     if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1127       __kmp_clear_x87_fpu_status_word();
1128       __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1129     }
1130 
1131     if (team->t.t_mxcsr != mxcsr) {
1132       __kmp_load_mxcsr(&team->t.t_mxcsr);
1133     }
1134   }
1135 }
1136 #else
1137 #define propagateFPControl(x) ((void)0)
1138 #define updateHWFPControl(x) ((void)0)
1139 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1140 
1141 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1142                                      int realloc); // forward declaration
1143 
1144 /* Run a parallel region that has been serialized, so it runs only in a team
1145    consisting of the single master thread. */
1146 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1147   kmp_info_t *this_thr;
1148   kmp_team_t *serial_team;
1149 
1150   KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1151 
1152   /* Skip all this code for autopar serialized loops since it results in
1153      unacceptable overhead */
1154   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1155     return;
1156 
1157   if (!TCR_4(__kmp_init_parallel))
1158     __kmp_parallel_initialize();
1159   __kmp_resume_if_soft_paused();
1160 
1161   this_thr = __kmp_threads[global_tid];
1162   serial_team = this_thr->th.th_serial_team;
1163 
1164   /* utilize the serialized team held by this thread */
1165   KMP_DEBUG_ASSERT(serial_team);
1166   KMP_MB();
1167 
1168   if (__kmp_tasking_mode != tskm_immediate_exec) {
1169     KMP_DEBUG_ASSERT(
1170         this_thr->th.th_task_team ==
1171         this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1172     KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1173                      NULL);
1174     KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1175                   "team %p, new task_team = NULL\n",
1176                   global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1177     this_thr->th.th_task_team = NULL;
1178   }
1179 
1180   kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1181   if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1182     proc_bind = proc_bind_false;
1183   } else if (proc_bind == proc_bind_default) {
1184     // No proc_bind clause was specified, so use the current value
1185     // of proc-bind-var for this parallel region.
1186     proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1187   }
1188   // Reset for next parallel region
1189   this_thr->th.th_set_proc_bind = proc_bind_default;
1190 
1191 #if OMPT_SUPPORT
1192   ompt_data_t ompt_parallel_data = ompt_data_none;
1193   ompt_data_t *implicit_task_data;
1194   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1195   if (ompt_enabled.enabled &&
1196       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1197 
1198     ompt_task_info_t *parent_task_info;
1199     parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1200 
1201     parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1202     if (ompt_enabled.ompt_callback_parallel_begin) {
1203       int team_size = 1;
1204 
1205       ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1206           &(parent_task_info->task_data), &(parent_task_info->frame),
1207           &ompt_parallel_data, team_size,
1208           ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1209     }
1210   }
1211 #endif // OMPT_SUPPORT
1212 
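  /* Two cases below: if this thread is entering its first serialized level
     (th_team != serial_team), install the cached serial team, allocating a
     fresh one if the cached team is already in use; otherwise we are already
     inside the serial team, so just bump the nesting count and push a new
     dispatch buffer. */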
1213   if (this_thr->th.th_team != serial_team) {
1214     // Nested level will be an index in the nested nthreads array
1215     int level = this_thr->th.th_team->t.t_level;
1216 
1217     if (serial_team->t.t_serialized) {
1218       /* this serial team was already used
1219          TODO: increase performance by making these locks more specific */
1220       kmp_team_t *new_team;
1221 
1222       __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1223 
1224       new_team =
1225           __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1226 #if OMPT_SUPPORT
1227                               ompt_parallel_data,
1228 #endif
1229                               proc_bind, &this_thr->th.th_current_task->td_icvs,
1230                               0 USE_NESTED_HOT_ARG(NULL));
1231       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1232       KMP_ASSERT(new_team);
1233 
1234       /* setup new serialized team and install it */
1235       new_team->t.t_threads[0] = this_thr;
1236       new_team->t.t_parent = this_thr->th.th_team;
1237       serial_team = new_team;
1238       this_thr->th.th_serial_team = serial_team;
1239 
1240       KF_TRACE(
1241           10,
1242           ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1243            global_tid, serial_team));
1244 
1245       /* TODO the above breaks the requirement that if we run out of resources,
1246          then we can still guarantee that serialized teams are ok, since we may
1247          need to allocate a new one */
1248     } else {
1249       KF_TRACE(
1250           10,
1251           ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1252            global_tid, serial_team));
1253     }
1254 
1255     /* we have to initialize this serial team */
1256     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1257     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1258     KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1259     serial_team->t.t_ident = loc;
1260     serial_team->t.t_serialized = 1;
1261     serial_team->t.t_nproc = 1;
1262     serial_team->t.t_parent = this_thr->th.th_team;
1263     serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1264     this_thr->th.th_team = serial_team;
1265     serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1266 
1267     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1268                   this_thr->th.th_current_task));
1269     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1270     this_thr->th.th_current_task->td_flags.executing = 0;
1271 
1272     __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1273 
1274     /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1275        implicit task for each serialized task represented by
1276        team->t.t_serialized? */
1277     copy_icvs(&this_thr->th.th_current_task->td_icvs,
1278               &this_thr->th.th_current_task->td_parent->td_icvs);
1279 
1280     // Thread value exists in the nested nthreads array for the next nested
1281     // level
1282     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1283       this_thr->th.th_current_task->td_icvs.nproc =
1284           __kmp_nested_nth.nth[level + 1];
1285     }
1286 
1287     if (__kmp_nested_proc_bind.used &&
1288         (level + 1 < __kmp_nested_proc_bind.used)) {
1289       this_thr->th.th_current_task->td_icvs.proc_bind =
1290           __kmp_nested_proc_bind.bind_types[level + 1];
1291     }
1292 
1293 #if USE_DEBUGGER
1294     serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1295 #endif
1296     this_thr->th.th_info.ds.ds_tid = 0;
1297 
1298     /* set thread cache values */
1299     this_thr->th.th_team_nproc = 1;
1300     this_thr->th.th_team_master = this_thr;
1301     this_thr->th.th_team_serialized = 1;
1302 
1303     serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1304     serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1305     serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1306 
1307     propagateFPControl(serial_team);
1308 
1309     /* check if we need to allocate dispatch buffers stack */
1310     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1311     if (!serial_team->t.t_dispatch->th_disp_buffer) {
1312       serial_team->t.t_dispatch->th_disp_buffer =
1313           (dispatch_private_info_t *)__kmp_allocate(
1314               sizeof(dispatch_private_info_t));
1315     }
1316     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1317 
1318     KMP_MB();
1319 
1320   } else {
1321     /* this serialized team is already being used,
1322      * that's fine, just add another nested level */
1323     KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1324     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1325     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1326     ++serial_team->t.t_serialized;
1327     this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1328 
1329     // Nested level will be an index in the nested nthreads array
1330     int level = this_thr->th.th_team->t.t_level;
1331     // Thread value exists in the nested nthreads array for the next nested
1332     // level
1333     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1334       this_thr->th.th_current_task->td_icvs.nproc =
1335           __kmp_nested_nth.nth[level + 1];
1336     }
1337     serial_team->t.t_level++;
1338     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1339                   "of serial team %p to %d\n",
1340                   global_tid, serial_team, serial_team->t.t_level));
1341 
1342     /* allocate/push dispatch buffers stack */
1343     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1344     {
1345       dispatch_private_info_t *disp_buffer =
1346           (dispatch_private_info_t *)__kmp_allocate(
1347               sizeof(dispatch_private_info_t));
1348       disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1349       serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1350     }
1351     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1352 
1353     KMP_MB();
1354   }
1355   KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1356 
1357   // Perform the display affinity functionality for
1358   // serialized parallel regions
1359   if (__kmp_display_affinity) {
1360     if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1361         this_thr->th.th_prev_num_threads != 1) {
1362       // NULL means use the affinity-format-var ICV
1363       __kmp_aux_display_affinity(global_tid, NULL);
1364       this_thr->th.th_prev_level = serial_team->t.t_level;
1365       this_thr->th.th_prev_num_threads = 1;
1366     }
1367   }
1368 
1369   if (__kmp_env_consistency_check)
1370     __kmp_push_parallel(global_tid, NULL);
1371 #if OMPT_SUPPORT
1372   serial_team->t.ompt_team_info.master_return_address = codeptr;
1373   if (ompt_enabled.enabled &&
1374       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1375     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1376 
1377     ompt_lw_taskteam_t lw_taskteam;
1378     __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1379                             &ompt_parallel_data, codeptr);
1380 
1381     __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1382     // don't use lw_taskteam after linking. Its content was swapped.
1383 
1384     /* OMPT implicit task begin */
1385     implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
1386     if (ompt_enabled.ompt_callback_implicit_task) {
1387       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1388           ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1389           OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1390       OMPT_CUR_TASK_INFO(this_thr)
1391           ->thread_num = __kmp_tid_from_gtid(global_tid);
1392     }
1393 
1394     /* OMPT state */
1395     this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
        OMPT_GET_FRAME_ADDRESS(0);
1397   }
1398 #endif
1399 }
1400 
1401 /* most of the work for a fork */
1402 /* return true if we really went parallel, false if serialized */
1403 int __kmp_fork_call(ident_t *loc, int gtid,
1404                     enum fork_context_e call_context, // Intel, GNU, ...
1405                     kmp_int32 argc, microtask_t microtask, launch_t invoker,
1406                     kmp_va_list ap) {
1407   void **argv;
1408   int i;
1409   int master_tid;
1410   int master_this_cons;
1411   kmp_team_t *team;
1412   kmp_team_t *parent_team;
1413   kmp_info_t *master_th;
1414   kmp_root_t *root;
1415   int nthreads;
1416   int master_active;
1417   int master_set_numthreads;
1418   int level;
1419   int active_level;
1420   int teams_level;
1421 #if KMP_NESTED_HOT_TEAMS
1422   kmp_hot_team_ptr_t **p_hot_teams;
1423 #endif
1424   { // KMP_TIME_BLOCK
1425     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1426     KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1427 
1428     KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1429     if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1430       /* Some systems prefer the stack for the root thread(s) to start with */
1431       /* some gap from the parent stack to prevent false sharing. */
1432       void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1433       /* These 2 lines below are so this does not get optimized out */
1434       if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1435         __kmp_stkpadding += (short)((kmp_int64)dummy);
1436     }
1437 
1438     /* initialize if needed */
1439     KMP_DEBUG_ASSERT(
1440         __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1441     if (!TCR_4(__kmp_init_parallel))
1442       __kmp_parallel_initialize();
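    // If the runtime has been soft-paused, resume it before forking.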
1443     __kmp_resume_if_soft_paused();
1444 
1445     /* setup current data */
1446     master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1447     // shutdown
1448     parent_team = master_th->th.th_team;
1449     master_tid = master_th->th.th_info.ds.ds_tid;
1450     master_this_cons = master_th->th.th_local.this_construct;
1451     root = master_th->th.th_root;
1452     master_active = root->r.r_active;
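    // th_set_nproc is nonzero only if a num_threads clause was specified.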
1453     master_set_numthreads = master_th->th.th_set_nproc;
1454 
1455 #if OMPT_SUPPORT
1456     ompt_data_t ompt_parallel_data = ompt_data_none;
1457     ompt_data_t *parent_task_data;
1458     ompt_frame_t *ompt_frame;
1459     ompt_data_t *implicit_task_data;
1460     void *return_address = NULL;
1461 
1462     if (ompt_enabled.enabled) {
1463       __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1464                                     NULL, NULL);
1465       return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1466     }
1467 #endif
1468 
1469     // Nested level will be an index in the nested nthreads array
1470     level = parent_team->t.t_level;
    // used to launch non-serial teams even if nesting is not allowed
1472     active_level = parent_team->t.t_active_level;
1473     // needed to check nesting inside the teams
1474     teams_level = master_th->th.th_teams_level;
1475 #if KMP_NESTED_HOT_TEAMS
1476     p_hot_teams = &master_th->th.th_hot_teams;
1477     if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1478       *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1479           sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1480       (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1481       // it is either actual or not needed (when active_level > 0)
1482       (*p_hot_teams)[0].hot_team_nth = 1;
1483     }
1484 #endif
1485 
1486 #if OMPT_SUPPORT
1487     if (ompt_enabled.enabled) {
1488       if (ompt_enabled.ompt_callback_parallel_begin) {
1489         int team_size = master_set_numthreads
1490                             ? master_set_numthreads
1491                             : get__nproc_2(parent_team, master_tid);
1492         int flags = OMPT_INVOKER(call_context) |
1493                     ((microtask == (microtask_t)__kmp_teams_master)
1494                          ? ompt_parallel_league
1495                          : ompt_parallel_team);
1496         ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1497             parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1498             return_address);
1499       }
1500       master_th->th.ompt_thread_info.state = ompt_state_overhead;
1501     }
1502 #endif
1503 
1504     master_th->th.th_ident = loc;
1505 
1506     if (master_th->th.th_teams_microtask && ap &&
1507         microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1508       // AC: This is start of parallel that is nested inside teams construct.
1509       // The team is actual (hot), all workers are ready at the fork barrier.
1510       // No lock needed to initialize the team a bit, then free workers.
1511       parent_team->t.t_ident = loc;
1512       __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1513       parent_team->t.t_argc = argc;
1514       argv = (void **)parent_team->t.t_argv;
1515       for (i = argc - 1; i >= 0; --i)
1516         *argv++ = va_arg(kmp_va_deref(ap), void *);
      // Increment our nested depth levels, but do not increase serialization
1518       if (parent_team == master_th->th.th_serial_team) {
1519         // AC: we are in serialized parallel
1520         __kmpc_serialized_parallel(loc, gtid);
1521         KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1522 
1523         if (call_context == fork_context_gnu) {
1524           // AC: need to decrement t_serialized for enquiry functions to work
1525           // correctly, will restore at join time
1526           parent_team->t.t_serialized--;
1527           return TRUE;
1528         }
1529 
1530 #if OMPT_SUPPORT
1531         void *dummy;
1532         void **exit_frame_p;
1533 
1534         ompt_lw_taskteam_t lw_taskteam;
1535 
1536         if (ompt_enabled.enabled) {
1537           __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1538                                   &ompt_parallel_data, return_address);
1539           exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1540 
1541           __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
          // don't use lw_taskteam after linking. content was swapped
1543 
1544           /* OMPT implicit task begin */
1545           implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1546           if (ompt_enabled.ompt_callback_implicit_task) {
1547             OMPT_CUR_TASK_INFO(master_th)
1548                 ->thread_num = __kmp_tid_from_gtid(gtid);
1549             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1550                 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1551                 implicit_task_data, 1,
1552                 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1553           }
1554 
1555           /* OMPT state */
1556           master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1557         } else {
1558           exit_frame_p = &dummy;
1559         }
1560 #endif
1561         // AC: need to decrement t_serialized for enquiry functions to work
1562         // correctly, will restore at join time
1563         parent_team->t.t_serialized--;
1564 
1565         {
1566           KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1567           KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1568           __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1569 #if OMPT_SUPPORT
1570                                  ,
1571                                  exit_frame_p
1572 #endif
1573                                  );
1574         }
1575 
1576 #if OMPT_SUPPORT
1577         if (ompt_enabled.enabled) {
1578           *exit_frame_p = NULL;
1579           OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1580           if (ompt_enabled.ompt_callback_implicit_task) {
1581             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1582                 ompt_scope_end, NULL, implicit_task_data, 1,
1583                 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1584           }
1585           ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1586           __ompt_lw_taskteam_unlink(master_th);
1587           if (ompt_enabled.ompt_callback_parallel_end) {
1588             ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1589                 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1590                 OMPT_INVOKER(call_context) | ompt_parallel_team,
1591                 return_address);
1592           }
1593           master_th->th.ompt_thread_info.state = ompt_state_overhead;
1594         }
1595 #endif
1596         return TRUE;
1597       }
1598 
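      // Not serialized: reuse the hot parent team (its workers are already at
      // the fork barrier) to run the parallel nested inside teams.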
1599       parent_team->t.t_pkfn = microtask;
1600       parent_team->t.t_invoke = invoker;
1601       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1602       parent_team->t.t_active_level++;
1603       parent_team->t.t_level++;
1604       parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1605 
1606 #if OMPT_SUPPORT
1607       if (ompt_enabled.enabled) {
1608         ompt_lw_taskteam_t lw_taskteam;
1609         __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1610                                 &ompt_parallel_data, return_address);
1611         __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1612       }
1613 #endif
1614 
1615       /* Change number of threads in the team if requested */
1616       if (master_set_numthreads) { // The parallel has num_threads clause
1617         if (master_set_numthreads < master_th->th.th_teams_size.nth) {
          // AC: can only reduce the thread count dynamically, not increase it
1619           kmp_info_t **other_threads = parent_team->t.t_threads;
1620           parent_team->t.t_nproc = master_set_numthreads;
1621           for (i = 0; i < master_set_numthreads; ++i) {
1622             other_threads[i]->th.th_team_nproc = master_set_numthreads;
1623           }
1624           // Keep extra threads hot in the team for possible next parallels
1625         }
1626         master_th->th.th_set_nproc = 0;
1627       }
1628 
1629 #if USE_DEBUGGER
1630       if (__kmp_debugging) { // Let debugger override number of threads.
1631         int nth = __kmp_omp_num_threads(loc);
1632         if (nth > 0) { // 0 means debugger doesn't want to change num threads
1633           master_set_numthreads = nth;
1634         }
1635       }
1636 #endif
1637 
1638 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1639       if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1640            KMP_ITT_DEBUG) &&
1641           __kmp_forkjoin_frames_mode == 3 &&
1642           parent_team->t.t_active_level == 1 // only report frames at level 1
1643           && master_th->th.th_teams_size.nteams == 1) {
1644         kmp_uint64 tmp_time = __itt_get_timestamp();
1645         master_th->th.th_frame_time = tmp_time;
1646         parent_team->t.t_region_time = tmp_time;
1647       }
1648       if (__itt_stack_caller_create_ptr) {
1649         // create new stack stitching id before entering fork barrier
1650         parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1651       }
1652 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1653 
1654       KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1655                     "master_th=%p, gtid=%d\n",
1656                     root, parent_team, master_th, gtid));
1657       __kmp_internal_fork(loc, gtid, parent_team);
1658       KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1659                     "master_th=%p, gtid=%d\n",
1660                     root, parent_team, master_th, gtid));
1661 
1662       if (call_context == fork_context_gnu)
1663         return TRUE;
1664 
1665       /* Invoke microtask for MASTER thread */
1666       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1667                     parent_team->t.t_id, parent_team->t.t_pkfn));
1668 
1669       if (!parent_team->t.t_invoke(gtid)) {
1670         KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
1671       }
1672       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1673                     parent_team->t.t_id, parent_team->t.t_pkfn));
1674       KMP_MB(); /* Flush all pending memory write invalidates.  */
1675 
1676       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1677 
1678       return TRUE;
1679     } // Parallel closely nested in teams construct
1680 
1681 #if KMP_DEBUG
1682     if (__kmp_tasking_mode != tskm_immediate_exec) {
1683       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1684                        parent_team->t.t_task_team[master_th->th.th_task_state]);
1685     }
1686 #endif
1687 
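    // If the max-active-levels ICV has already been reached, serialize this
    // region (it will run with a team of one thread).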
1688     if (parent_team->t.t_active_level >=
1689         master_th->th.th_current_task->td_icvs.max_active_levels) {
1690       nthreads = 1;
1691     } else {
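      // enter_teams is set when this fork takes part in entering a teams
      // construct; such forks are not serialized by the nesting checks below.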
1692       int enter_teams = ((ap == NULL && active_level == 0) ||
1693                          (ap && teams_level > 0 && teams_level == level));
1694       nthreads =
1695           master_set_numthreads
1696               ? master_set_numthreads
1697               : get__nproc_2(
1698                     parent_team,
1699                     master_tid); // TODO: get nproc directly from current task
1700 
      // Check if we need to take the forkjoin lock (no need for serialized
      // parallel outside of a teams construct). This code was moved here from
      // __kmp_reserve_threads() to speed up nested serialized parallels.
1704       if (nthreads > 1) {
1705         if ((get__max_active_levels(master_th) == 1 &&
1706              (root->r.r_in_parallel && !enter_teams)) ||
1707             (__kmp_library == library_serial)) {
1708           KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1709                         " threads\n",
1710                         gtid, nthreads));
1711           nthreads = 1;
1712         }
1713       }
1714       if (nthreads > 1) {
1715         /* determine how many new threads we can use */
1716         __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1717         /* AC: If we execute teams from parallel region (on host), then teams
1718            should be created but each can only have 1 thread if nesting is
1719            disabled. If teams called from serial region, then teams and their
1720            threads should be created regardless of the nesting setting. */
1721         nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
1722                                          nthreads, enter_teams);
1723         if (nthreads == 1) {
1724           // Free lock for single thread execution here; for multi-thread
1725           // execution it will be freed later after team of threads created
1726           // and initialized
1727           __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1728         }
1729       }
1730     }
1731     KMP_DEBUG_ASSERT(nthreads > 0);
1732 
    // The num_threads clause value (if any) has been consumed; reset it now
1734     master_th->th.th_set_nproc = 0;
1735 
1736     /* create a serialized parallel region? */
1737     if (nthreads == 1) {
1738 /* josh todo: hypothetical question: what do we do for OS X*? */
1739 #if KMP_OS_LINUX &&                                                            \
1740     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1741       void *args[argc];
1742 #else
1743       void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1744 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1745           KMP_ARCH_AARCH64) */
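      // 'args' is scratch storage for the flattened varargs used when the
      // serialized microtask is invoked directly below.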
1746 
1747       KA_TRACE(20,
1748                ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1749 
1750       __kmpc_serialized_parallel(loc, gtid);
1751 
1752       if (call_context == fork_context_intel) {
1753         /* TODO this sucks, use the compiler itself to pass args! :) */
1754         master_th->th.th_serial_team->t.t_ident = loc;
1755         if (!ap) {
1756           // revert change made in __kmpc_serialized_parallel()
1757           master_th->th.th_serial_team->t.t_level--;
1758 // Get args from parent team for teams construct
1759 
1760 #if OMPT_SUPPORT
1761           void *dummy;
1762           void **exit_frame_p;
1763           ompt_task_info_t *task_info;
1764 
1765           ompt_lw_taskteam_t lw_taskteam;
1766 
1767           if (ompt_enabled.enabled) {
1768             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1769                                     &ompt_parallel_data, return_address);
1770 
1771             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
            // don't use lw_taskteam after linking. content was swapped
1773 
1774             task_info = OMPT_CUR_TASK_INFO(master_th);
1775             exit_frame_p = &(task_info->frame.exit_frame.ptr);
1776             if (ompt_enabled.ompt_callback_implicit_task) {
1777               OMPT_CUR_TASK_INFO(master_th)
1778                   ->thread_num = __kmp_tid_from_gtid(gtid);
1779               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1780                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1781                   &(task_info->task_data), 1,
1782                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1783                   ompt_task_implicit);
1784             }
1785 
1786             /* OMPT state */
1787             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1788           } else {
1789             exit_frame_p = &dummy;
1790           }
1791 #endif
1792 
1793           {
1794             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1795             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1796             __kmp_invoke_microtask(microtask, gtid, 0, argc,
1797                                    parent_team->t.t_argv
1798 #if OMPT_SUPPORT
1799                                    ,
1800                                    exit_frame_p
1801 #endif
1802                                    );
1803           }
1804 
1805 #if OMPT_SUPPORT
1806           if (ompt_enabled.enabled) {
1807             *exit_frame_p = NULL;
1808             if (ompt_enabled.ompt_callback_implicit_task) {
1809               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1810                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1811                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1812                   ompt_task_implicit);
1813             }
1814             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1815             __ompt_lw_taskteam_unlink(master_th);
1816             if (ompt_enabled.ompt_callback_parallel_end) {
1817               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1818                   &ompt_parallel_data, parent_task_data,
1819                   OMPT_INVOKER(call_context) | ompt_parallel_team,
1820                   return_address);
1821             }
1822             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1823           }
1824 #endif
1825         } else if (microtask == (microtask_t)__kmp_teams_master) {
1826           KMP_DEBUG_ASSERT(master_th->th.th_team ==
1827                            master_th->th.th_serial_team);
1828           team = master_th->th.th_team;
1829           // team->t.t_pkfn = microtask;
1830           team->t.t_invoke = invoker;
1831           __kmp_alloc_argv_entries(argc, team, TRUE);
1832           team->t.t_argc = argc;
1833           argv = (void **)team->t.t_argv;
1834           if (ap) {
1835             for (i = argc - 1; i >= 0; --i)
1836               *argv++ = va_arg(kmp_va_deref(ap), void *);
1837           } else {
1838             for (i = 0; i < argc; ++i)
1839               // Get args from parent team for teams construct
1840               argv[i] = parent_team->t.t_argv[i];
1841           }
1842           // AC: revert change made in __kmpc_serialized_parallel()
1843           //     because initial code in teams should have level=0
1844           team->t.t_level--;
1845           // AC: call special invoker for outer "parallel" of teams construct
1846           invoker(gtid);
1847 #if OMPT_SUPPORT
1848           if (ompt_enabled.enabled) {
1849             ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1850             if (ompt_enabled.ompt_callback_implicit_task) {
1851               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1852                   ompt_scope_end, NULL, &(task_info->task_data), 0,
1853                   OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1854             }
1855             if (ompt_enabled.ompt_callback_parallel_end) {
1856               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1857                   &ompt_parallel_data, parent_task_data,
1858                   OMPT_INVOKER(call_context) | ompt_parallel_league,
1859                   return_address);
1860             }
1861             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1862           }
1863 #endif
1864         } else {
1865           argv = args;
1866           for (i = argc - 1; i >= 0; --i)
1867             *argv++ = va_arg(kmp_va_deref(ap), void *);
1868           KMP_MB();
1869 
1870 #if OMPT_SUPPORT
1871           void *dummy;
1872           void **exit_frame_p;
1873           ompt_task_info_t *task_info;
1874 
1875           ompt_lw_taskteam_t lw_taskteam;
1876 
1877           if (ompt_enabled.enabled) {
1878             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1879                                     &ompt_parallel_data, return_address);
1880             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
            // don't use lw_taskteam after linking. content was swapped
1882             task_info = OMPT_CUR_TASK_INFO(master_th);
1883             exit_frame_p = &(task_info->frame.exit_frame.ptr);
1884 
1885             /* OMPT implicit task begin */
1886             implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1887             if (ompt_enabled.ompt_callback_implicit_task) {
1888               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1889                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1890                   implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1891                   ompt_task_implicit);
1892               OMPT_CUR_TASK_INFO(master_th)
1893                   ->thread_num = __kmp_tid_from_gtid(gtid);
1894             }
1895 
1896             /* OMPT state */
1897             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1898           } else {
1899             exit_frame_p = &dummy;
1900           }
1901 #endif
1902 
1903           {
1904             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1905             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1906             __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1907 #if OMPT_SUPPORT
1908                                    ,
1909                                    exit_frame_p
1910 #endif
1911                                    );
1912           }
1913 
1914 #if OMPT_SUPPORT
1915           if (ompt_enabled.enabled) {
1916             *exit_frame_p = NULL;
1917             if (ompt_enabled.ompt_callback_implicit_task) {
1918               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1919                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1920                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1921                   ompt_task_implicit);
1922             }
1923 
1924             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1925             __ompt_lw_taskteam_unlink(master_th);
1926             if (ompt_enabled.ompt_callback_parallel_end) {
1927               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1928                   &ompt_parallel_data, parent_task_data,
1929                   OMPT_INVOKER(call_context) | ompt_parallel_team,
1930                   return_address);
1931             }
1932             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1933           }
1934 #endif
1935         }
1936       } else if (call_context == fork_context_gnu) {
1937 #if OMPT_SUPPORT
1938         ompt_lw_taskteam_t lwt;
1939         __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1940                                 return_address);
1941 
1942         lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1943         __ompt_lw_taskteam_link(&lwt, master_th, 1);
// don't use lw_taskteam after linking. content was swapped
1945 #endif
1946 
1947         // we were called from GNU native code
1948         KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1949         return FALSE;
1950       } else {
1951         KMP_ASSERT2(call_context < fork_context_last,
1952                     "__kmp_fork_call: unknown fork_context parameter");
1953       }
1954 
1955       KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1956       KMP_MB();
1957       return FALSE;
1958     } // if (nthreads == 1)
1959 
1960     // GEH: only modify the executing flag in the case when not serialized
    //      serialized case is handled in __kmpc_serialized_parallel
1962     KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1963                   "curtask=%p, curtask_max_aclevel=%d\n",
1964                   parent_team->t.t_active_level, master_th,
1965                   master_th->th.th_current_task,
1966                   master_th->th.th_current_task->td_icvs.max_active_levels));
1967     // TODO: GEH - cannot do this assertion because root thread not set up as
1968     // executing
1969     // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
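    // The encountering task is marked as not executing while the region runs;
    // __kmp_join_call sets the flag back to 1.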
1970     master_th->th.th_current_task->td_flags.executing = 0;
1971 
1972     if (!master_th->th.th_teams_microtask || level > teams_level) {
1973       /* Increment our nested depth level */
1974       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1975     }
1976 
1977     // See if we need to make a copy of the ICVs.
1978     int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1979     if ((level + 1 < __kmp_nested_nth.used) &&
1980         (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1981       nthreads_icv = __kmp_nested_nth.nth[level + 1];
1982     } else {
1983       nthreads_icv = 0; // don't update
1984     }
1985 
1986     // Figure out the proc_bind_policy for the new team.
1987     kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1988     kmp_proc_bind_t proc_bind_icv =
1989         proc_bind_default; // proc_bind_default means don't update
1990     if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1991       proc_bind = proc_bind_false;
1992     } else {
1993       if (proc_bind == proc_bind_default) {
1994         // No proc_bind clause specified; use current proc-bind-var for this
1995         // parallel region
1996         proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1997       }
1998       /* else: The proc_bind policy was specified explicitly on parallel clause.
1999          This overrides proc-bind-var for this parallel region, but does not
2000          change proc-bind-var. */
2001       // Figure the value of proc-bind-var for the child threads.
2002       if ((level + 1 < __kmp_nested_proc_bind.used) &&
2003           (__kmp_nested_proc_bind.bind_types[level + 1] !=
2004            master_th->th.th_current_task->td_icvs.proc_bind)) {
2005         proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2006       }
2007     }
2008 
2009     // Reset for next parallel region
2010     master_th->th.th_set_proc_bind = proc_bind_default;
2011 
2012     if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
2013       kmp_internal_control_t new_icvs;
2014       copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2015       new_icvs.next = NULL;
2016       if (nthreads_icv > 0) {
2017         new_icvs.nproc = nthreads_icv;
2018       }
2019       if (proc_bind_icv != proc_bind_default) {
2020         new_icvs.proc_bind = proc_bind_icv;
2021       }
2022 
2023       /* allocate a new parallel team */
2024       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2025       team = __kmp_allocate_team(root, nthreads, nthreads,
2026 #if OMPT_SUPPORT
2027                                  ompt_parallel_data,
2028 #endif
2029                                  proc_bind, &new_icvs,
2030                                  argc USE_NESTED_HOT_ARG(master_th));
2031     } else {
2032       /* allocate a new parallel team */
2033       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2034       team = __kmp_allocate_team(root, nthreads, nthreads,
2035 #if OMPT_SUPPORT
2036                                  ompt_parallel_data,
2037 #endif
2038                                  proc_bind,
2039                                  &master_th->th.th_current_task->td_icvs,
2040                                  argc USE_NESTED_HOT_ARG(master_th));
2041     }
2042     KF_TRACE(
2043         10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2044 
2045     /* setup the new team */
2046     KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2047     KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2048     KMP_CHECK_UPDATE(team->t.t_ident, loc);
2049     KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2050     KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2051 #if OMPT_SUPPORT
2052     KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2053                           return_address);
2054 #endif
2055     KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2056     // TODO: parent_team->t.t_level == INT_MAX ???
2057     if (!master_th->th.th_teams_microtask || level > teams_level) {
2058       int new_level = parent_team->t.t_level + 1;
2059       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2060       new_level = parent_team->t.t_active_level + 1;
2061       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2062     } else {
2063       // AC: Do not increase parallel level at start of the teams construct
2064       int new_level = parent_team->t.t_level;
2065       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2066       new_level = parent_team->t.t_active_level;
2067       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2068     }
2069     kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2070     // set master's schedule as new run-time schedule
2071     KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2072 
2073     KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2074     KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2075 
2076     // Update the floating point rounding in the team if required.
2077     propagateFPControl(team);
2078 
2079     if (__kmp_tasking_mode != tskm_immediate_exec) {
      // Set master's task team to team's task team. Unless this is a hot team,
      // it should be NULL.
2082       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2083                        parent_team->t.t_task_team[master_th->th.th_task_state]);
2084       KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
2085                     "%p, new task_team %p / team %p\n",
2086                     __kmp_gtid_from_thread(master_th),
2087                     master_th->th.th_task_team, parent_team,
2088                     team->t.t_task_team[master_th->th.th_task_state], team));
2089 
2090       if (active_level || master_th->th.th_task_team) {
2091         // Take a memo of master's task_state
2092         KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2093         if (master_th->th.th_task_state_top >=
2094             master_th->th.th_task_state_stack_sz) { // increase size
2095           kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2096           kmp_uint8 *old_stack, *new_stack;
2097           kmp_uint32 i;
2098           new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2099           for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2100             new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2101           }
2102           for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2103                ++i) { // zero-init rest of stack
2104             new_stack[i] = 0;
2105           }
2106           old_stack = master_th->th.th_task_state_memo_stack;
2107           master_th->th.th_task_state_memo_stack = new_stack;
2108           master_th->th.th_task_state_stack_sz = new_size;
2109           __kmp_free(old_stack);
2110         }
2111         // Store master's task_state on stack
2112         master_th->th
2113             .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2114             master_th->th.th_task_state;
2115         master_th->th.th_task_state_top++;
2116 #if KMP_NESTED_HOT_TEAMS
2117         if (master_th->th.th_hot_teams &&
2118             active_level < __kmp_hot_teams_max_level &&
2119             team == master_th->th.th_hot_teams[active_level].hot_team) {
2120           // Restore master's nested state if nested hot team
2121           master_th->th.th_task_state =
2122               master_th->th
2123                   .th_task_state_memo_stack[master_th->th.th_task_state_top];
2124         } else {
2125 #endif
2126           master_th->th.th_task_state = 0;
2127 #if KMP_NESTED_HOT_TEAMS
2128         }
2129 #endif
2130       }
2131 #if !KMP_NESTED_HOT_TEAMS
2132       KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2133                        (team == root->r.r_hot_team));
2134 #endif
2135     }
2136 
2137     KA_TRACE(
2138         20,
2139         ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2140          gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2141          team->t.t_nproc));
2142     KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2143                      (team->t.t_master_tid == 0 &&
2144                       (team->t.t_parent == root->r.r_root_team ||
2145                        team->t.t_parent->t.t_serialized)));
2146     KMP_MB();
2147 
2148     /* now, setup the arguments */
2149     argv = (void **)team->t.t_argv;
2150     if (ap) {
2151       for (i = argc - 1; i >= 0; --i) {
2152         void *new_argv = va_arg(kmp_va_deref(ap), void *);
2153         KMP_CHECK_UPDATE(*argv, new_argv);
2154         argv++;
2155       }
2156     } else {
2157       for (i = 0; i < argc; ++i) {
2158         // Get args from parent team for teams construct
2159         KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2160       }
2161     }
2162 
2163     /* now actually fork the threads */
2164     KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2165     if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2166       root->r.r_active = TRUE;
2167 
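    // Attach worker threads to the new team and set up copying of the
    // master's ICVs to them.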
2168     __kmp_fork_team_threads(root, team, master_th, gtid);
2169     __kmp_setup_icv_copy(team, nthreads,
2170                          &master_th->th.th_current_task->td_icvs, loc);
2171 
2172 #if OMPT_SUPPORT
2173     master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2174 #endif
2175 
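    // Release the forkjoin lock acquired before __kmp_reserve_threads; it was
    // held across team allocation and setup in the multi-threaded case.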
2176     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2177 
2178 #if USE_ITT_BUILD
2179     if (team->t.t_active_level == 1 // only report frames at level 1
2180         && !master_th->th.th_teams_microtask) { // not in teams construct
2181 #if USE_ITT_NOTIFY
2182       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2183           (__kmp_forkjoin_frames_mode == 3 ||
2184            __kmp_forkjoin_frames_mode == 1)) {
2185         kmp_uint64 tmp_time = 0;
2186         if (__itt_get_timestamp_ptr)
2187           tmp_time = __itt_get_timestamp();
2188         // Internal fork - report frame begin
2189         master_th->th.th_frame_time = tmp_time;
2190         if (__kmp_forkjoin_frames_mode == 3)
2191           team->t.t_region_time = tmp_time;
2192       } else
2193 // only one notification scheme (either "submit" or "forking/joined", not both)
2194 #endif /* USE_ITT_NOTIFY */
2195           if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2196               __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2197         // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2198         __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2199       }
2200     }
2201 #endif /* USE_ITT_BUILD */
2202 
2203     /* now go on and do the work */
2204     KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2205     KMP_MB();
2206     KF_TRACE(10,
2207              ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2208               root, team, master_th, gtid));
2209 
2210 #if USE_ITT_BUILD
2211     if (__itt_stack_caller_create_ptr) {
2212       team->t.t_stack_id =
2213           __kmp_itt_stack_caller_create(); // create new stack stitching id
2214       // before entering fork barrier
2215     }
2216 #endif /* USE_ITT_BUILD */
2217 
2218     // AC: skip __kmp_internal_fork at teams construct, let only master
2219     // threads execute
2220     if (ap) {
2221       __kmp_internal_fork(loc, gtid, team);
2222       KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2223                     "master_th=%p, gtid=%d\n",
2224                     root, team, master_th, gtid));
2225     }
2226 
2227     if (call_context == fork_context_gnu) {
2228       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2229       return TRUE;
2230     }
2231 
2232     /* Invoke microtask for MASTER thread */
2233     KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2234                   team->t.t_id, team->t.t_pkfn));
2235   } // END of timer KMP_fork_call block
2236 
2237 #if KMP_STATS_ENABLED
2238   // If beginning a teams construct, then change thread state
2239   stats_state_e previous_state = KMP_GET_THREAD_STATE();
2240   if (!ap) {
2241     KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2242   }
2243 #endif
2244 
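  // The master thread now invokes the microtask itself, outside the
  // KMP_fork_call timer block.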
2245   if (!team->t.t_invoke(gtid)) {
2246     KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
2247   }
2248 
2249 #if KMP_STATS_ENABLED
2250   // If was beginning of a teams construct, then reset thread state
2251   if (!ap) {
2252     KMP_SET_THREAD_STATE(previous_state);
2253   }
2254 #endif
2255 
2256   KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2257                 team->t.t_id, team->t.t_pkfn));
2258   KMP_MB(); /* Flush all pending memory write invalidates.  */
2259 
2260   KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2261 
2262 #if OMPT_SUPPORT
2263   if (ompt_enabled.enabled) {
2264     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2265   }
2266 #endif
2267 
2268   return TRUE;
2269 }
2270 
2271 #if OMPT_SUPPORT
2272 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2273                                             kmp_team_t *team) {
2274   // restore state outside the region
2275   thread->th.ompt_thread_info.state =
2276       ((team->t.t_serialized) ? ompt_state_work_serial
2277                               : ompt_state_work_parallel);
2278 }
2279 
2280 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2281                                    kmp_team_t *team, ompt_data_t *parallel_data,
2282                                    int flags, void *codeptr) {
2283   ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2284   if (ompt_enabled.ompt_callback_parallel_end) {
2285     ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2286         parallel_data, &(task_info->task_data), flags, codeptr);
2287   }
2288 
2289   task_info->frame.enter_frame = ompt_data_none;
2290   __kmp_join_restore_state(thread, team);
2291 }
2292 #endif
2293 
2294 void __kmp_join_call(ident_t *loc, int gtid
2295 #if OMPT_SUPPORT
2296                      ,
2297                      enum fork_context_e fork_context
2298 #endif
2299                      ,
2300                      int exit_teams) {
2301   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2302   kmp_team_t *team;
2303   kmp_team_t *parent_team;
2304   kmp_info_t *master_th;
2305   kmp_root_t *root;
2306   int master_active;
2307 
2308   KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2309 
2310   /* setup current data */
2311   master_th = __kmp_threads[gtid];
2312   root = master_th->th.th_root;
2313   team = master_th->th.th_team;
2314   parent_team = team->t.t_parent;
2315 
2316   master_th->th.th_ident = loc;
2317 
2318 #if OMPT_SUPPORT
2319   void *team_microtask = (void *)team->t.t_pkfn;
  // For the GOMP interface with a serialized parallel region, we need
  // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task
  // and end-parallel events.
2323   if (ompt_enabled.enabled &&
2324       !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2325     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2326   }
2327 #endif
2328 
2329 #if KMP_DEBUG
2330   if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2331     KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2332                   "th_task_team = %p\n",
2333                   __kmp_gtid_from_thread(master_th), team,
2334                   team->t.t_task_team[master_th->th.th_task_state],
2335                   master_th->th.th_task_team));
2336     KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2337                      team->t.t_task_team[master_th->th.th_task_state]);
2338   }
2339 #endif
2340 
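  // A serialized region is finished via __kmpc_end_serialized_parallel; there
  // is no join barrier in that case.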
2341   if (team->t.t_serialized) {
2342     if (master_th->th.th_teams_microtask) {
2343       // We are in teams construct
2344       int level = team->t.t_level;
2345       int tlevel = master_th->th.th_teams_level;
2346       if (level == tlevel) {
2347         // AC: we haven't incremented it earlier at start of teams construct,
2348         //     so do it here - at the end of teams construct
2349         team->t.t_level++;
2350       } else if (level == tlevel + 1) {
2351         // AC: we are exiting parallel inside teams, need to increment
2352         // serialization in order to restore it in the next call to
2353         // __kmpc_end_serialized_parallel
2354         team->t.t_serialized++;
2355       }
2356     }
2357     __kmpc_end_serialized_parallel(loc, gtid);
2358 
2359 #if OMPT_SUPPORT
2360     if (ompt_enabled.enabled) {
2361       __kmp_join_restore_state(master_th, parent_team);
2362     }
2363 #endif
2364 
2365     return;
2366   }
2367 
2368   master_active = team->t.t_master_active;
2369 
2370   if (!exit_teams) {
2371     // AC: No barrier for internal teams at exit from teams construct.
2372     //     But there is barrier for external team (league).
2373     __kmp_internal_join(loc, gtid, team);
2374   } else {
    // AC: no tasking in teams (out of any parallel)
    master_th->th.th_task_state = 0;
2377   }
2378 
2379   KMP_MB();
2380 
2381 #if OMPT_SUPPORT
2382   ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2383   void *codeptr = team->t.ompt_team_info.master_return_address;
2384 #endif
2385 
2386 #if USE_ITT_BUILD
2387   if (__itt_stack_caller_create_ptr) {
2388     // destroy the stack stitching id after join barrier
2389     __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2390   }
2391   // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2392   if (team->t.t_active_level == 1 &&
2393       (!master_th->th.th_teams_microtask || /* not in teams construct */
2394        master_th->th.th_teams_size.nteams == 1)) {
2395     master_th->th.th_ident = loc;
2396     // only one notification scheme (either "submit" or "forking/joined", not
2397     // both)
2398     if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2399         __kmp_forkjoin_frames_mode == 3)
2400       __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2401                              master_th->th.th_frame_time, 0, loc,
2402                              master_th->th.th_team_nproc, 1);
2403     else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2404              !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2405       __kmp_itt_region_joined(gtid);
2406   } // active_level == 1
2407 #endif /* USE_ITT_BUILD */
2408 
2409   if (master_th->th.th_teams_microtask && !exit_teams &&
2410       team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2411       team->t.t_level == master_th->th.th_teams_level + 1) {
2412 // AC: We need to leave the team structure intact at the end of parallel
2413 // inside the teams construct, so that at the next parallel same (hot) team
2414 // works, only adjust nesting levels
2415 #if OMPT_SUPPORT
2416     ompt_data_t ompt_parallel_data = ompt_data_none;
2417     if (ompt_enabled.enabled) {
2418       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2419       if (ompt_enabled.ompt_callback_implicit_task) {
2420         int ompt_team_size = team->t.t_nproc;
2421         ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2422             ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2423             OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2424       }
2425       task_info->frame.exit_frame = ompt_data_none;
2426       task_info->task_data = ompt_data_none;
2427       ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2428       __ompt_lw_taskteam_unlink(master_th);
2429     }
2430 #endif
2431     /* Decrement our nested depth level */
2432     team->t.t_level--;
2433     team->t.t_active_level--;
2434     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2435 
2436     // Restore number of threads in the team if needed. This code relies on
2437     // the proper adjustment of th_teams_size.nth after the fork in
2438     // __kmp_teams_master on each teams master in the case that
2439     // __kmp_reserve_threads reduced it.
2440     if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2441       int old_num = master_th->th.th_team_nproc;
2442       int new_num = master_th->th.th_teams_size.nth;
2443       kmp_info_t **other_threads = team->t.t_threads;
2444       team->t.t_nproc = new_num;
2445       for (int i = 0; i < old_num; ++i) {
2446         other_threads[i]->th.th_team_nproc = new_num;
2447       }
2448       // Adjust states of non-used threads of the team
2449       for (int i = old_num; i < new_num; ++i) {
2450         // Re-initialize thread's barrier data.
2451         KMP_DEBUG_ASSERT(other_threads[i]);
2452         kmp_balign_t *balign = other_threads[i]->th.th_bar;
2453         for (int b = 0; b < bs_last_barrier; ++b) {
2454           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2455           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2456 #if USE_DEBUGGER
2457           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2458 #endif
2459         }
2460         if (__kmp_tasking_mode != tskm_immediate_exec) {
2461           // Synchronize thread's task state
2462           other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2463         }
2464       }
2465     }
2466 
2467 #if OMPT_SUPPORT
2468     if (ompt_enabled.enabled) {
2469       __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2470                       OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2471     }
2472 #endif
2473 
2474     return;
2475   }
2476 
2477   /* do cleanup and restore the parent team */
2478   master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2479   master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2480 
2481   master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2482 
2483   /* jc: The following lock has instructions with REL and ACQ semantics,
2484      separating the parallel user code called in this parallel region
2485      from the serial user code called after this function returns. */
2486   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2487 
2488   if (!master_th->th.th_teams_microtask ||
2489       team->t.t_level > master_th->th.th_teams_level) {
2490     /* Decrement our nested depth level */
2491     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2492   }
2493   KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2494 
2495 #if OMPT_SUPPORT
2496   if (ompt_enabled.enabled) {
2497     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2498     if (ompt_enabled.ompt_callback_implicit_task) {
2499       int flags = (team_microtask == (void *)__kmp_teams_master)
2500                       ? ompt_task_initial
2501                       : ompt_task_implicit;
2502       int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2503       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2504           ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2505           OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2506     }
2507     task_info->frame.exit_frame = ompt_data_none;
2508     task_info->task_data = ompt_data_none;
2509   }
2510 #endif
2511 
2512   KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2513                 master_th, team));
2514   __kmp_pop_current_task_from_thread(master_th);
2515 
2516 #if KMP_AFFINITY_SUPPORTED
2517   // Restore master thread's partition.
2518   master_th->th.th_first_place = team->t.t_first_place;
2519   master_th->th.th_last_place = team->t.t_last_place;
2520 #endif // KMP_AFFINITY_SUPPORTED
2521   master_th->th.th_def_allocator = team->t.t_def_allocator;
2522 
2523   updateHWFPControl(team);
2524 
2525   if (root->r.r_active != master_active)
2526     root->r.r_active = master_active;
2527 
2528   __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2529                             master_th)); // this will free worker threads
2530 
2531   /* this race was fun to find. make sure the following is in the critical
2532      region otherwise assertions may fail occasionally since the old team may be
2533      reallocated and the hierarchy appears inconsistent. it is actually safe to
2534      run and won't cause any bugs, but will cause those assertion failures. it's
2535      only one deref&assign so might as well put this in the critical region */
2536   master_th->th.th_team = parent_team;
2537   master_th->th.th_team_nproc = parent_team->t.t_nproc;
2538   master_th->th.th_team_master = parent_team->t.t_threads[0];
2539   master_th->th.th_team_serialized = parent_team->t.t_serialized;
2540 
2541   /* restore serialized team, if need be */
2542   if (parent_team->t.t_serialized &&
2543       parent_team != master_th->th.th_serial_team &&
2544       parent_team != root->r.r_root_team) {
2545     __kmp_free_team(root,
2546                     master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2547     master_th->th.th_serial_team = parent_team;
2548   }
2549 
2550   if (__kmp_tasking_mode != tskm_immediate_exec) {
2551     if (master_th->th.th_task_state_top >
2552         0) { // Restore task state from memo stack
2553       KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2554       // Remember master's state if we re-use this nested hot team
2555       master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2556           master_th->th.th_task_state;
2557       --master_th->th.th_task_state_top; // pop
2558       // Now restore state at this level
2559       master_th->th.th_task_state =
2560           master_th->th
2561               .th_task_state_memo_stack[master_th->th.th_task_state_top];
2562     }
2563     // Copy the task team from the parent team to the master thread
2564     master_th->th.th_task_team =
2565         parent_team->t.t_task_team[master_th->th.th_task_state];
2566     KA_TRACE(20,
2567              ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2568               __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2569               parent_team));
2570   }
2571 
2572   // TODO: GEH - cannot do this assertion because root thread not set up as
2573   // executing
2574   // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2575   master_th->th.th_current_task->td_flags.executing = 1;
2576 
2577   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2578 
2579 #if OMPT_SUPPORT
2580   int flags =
2581       OMPT_INVOKER(fork_context) |
2582       ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2583                                                       : ompt_parallel_team);
2584   if (ompt_enabled.enabled) {
2585     __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2586                     codeptr);
2587   }
2588 #endif
2589 
2590   KMP_MB();
2591   KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2592 }
2593 
2594 /* Check whether we should push an internal control record onto the
2595    serial team stack.  If so, do it.  */
2596 void __kmp_save_internal_controls(kmp_info_t *thread) {
2597 
2598   if (thread->th.th_team != thread->th.th_serial_team) {
2599     return;
2600   }
2601   if (thread->th.th_team->t.t_serialized > 1) {
2602     int push = 0;
2603 
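    // Push at most one control record per serialized nesting level.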
2604     if (thread->th.th_team->t.t_control_stack_top == NULL) {
2605       push = 1;
2606     } else {
2607       if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2608           thread->th.th_team->t.t_serialized) {
2609         push = 1;
2610       }
2611     }
2612     if (push) { /* push a record on the serial team's stack */
2613       kmp_internal_control_t *control =
2614           (kmp_internal_control_t *)__kmp_allocate(
2615               sizeof(kmp_internal_control_t));
2616 
2617       copy_icvs(control, &thread->th.th_current_task->td_icvs);
2618 
2619       control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2620 
2621       control->next = thread->th.th_team->t.t_control_stack_top;
2622       thread->th.th_team->t.t_control_stack_top = control;
2623     }
2624   }
2625 }
2626 
2627 /* Changes set_nproc */
2628 void __kmp_set_num_threads(int new_nth, int gtid) {
2629   kmp_info_t *thread;
2630   kmp_root_t *root;
2631 
2632   KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2633   KMP_DEBUG_ASSERT(__kmp_init_serial);
2634 
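  // Clamp the requested value to the supported range [1, __kmp_max_nth].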
2635   if (new_nth < 1)
2636     new_nth = 1;
2637   else if (new_nth > __kmp_max_nth)
2638     new_nth = __kmp_max_nth;
2639 
2640   KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2641   thread = __kmp_threads[gtid];
2642   if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2643     return; // nothing to do
2644 
2645   __kmp_save_internal_controls(thread);
2646 
2647   set__nproc(thread, new_nth);
2648 
2649   // If this omp_set_num_threads() call will cause the hot team size to be
2650   // reduced (in the absence of a num_threads clause), then reduce it now,
2651   // rather than waiting for the next parallel region.
2652   root = thread->th.th_root;
2653   if (__kmp_init_parallel && (!root->r.r_active) &&
2654       (root->r.r_hot_team->t.t_nproc > new_nth)
2655 #if KMP_NESTED_HOT_TEAMS
2656       && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2657 #endif
2658       ) {
2659     kmp_team_t *hot_team = root->r.r_hot_team;
2660     int f;
2661 
2662     __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2663 
2664     // Release the extra threads we don't need any more.
2665     for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2666       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2667       if (__kmp_tasking_mode != tskm_immediate_exec) {
        // When decreasing the team size, threads no longer in the team should
        // unreference the task team.
2670         hot_team->t.t_threads[f]->th.th_task_team = NULL;
2671       }
2672       __kmp_free_thread(hot_team->t.t_threads[f]);
2673       hot_team->t.t_threads[f] = NULL;
2674     }
2675     hot_team->t.t_nproc = new_nth;
2676 #if KMP_NESTED_HOT_TEAMS
2677     if (thread->th.th_hot_teams) {
2678       KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2679       thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2680     }
2681 #endif
2682 
2683     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2684 
2685     // Update the t_nproc field in the threads that are still active.
2686     for (f = 0; f < new_nth; f++) {
2687       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2688       hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2689     }
    // Special flag to indicate that the hot team size was changed via an
    // omp_set_num_threads() call
2691     hot_team->t.t_size_changed = -1;
2692   }
2693 }
2694 
2695 /* Changes max_active_levels */
2696 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2697   kmp_info_t *thread;
2698 
2699   KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2700                 "%d = (%d)\n",
2701                 gtid, max_active_levels));
2702   KMP_DEBUG_ASSERT(__kmp_init_serial);
2703 
2704   // validate max_active_levels
2705   if (max_active_levels < 0) {
2706     KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2707     // We ignore this call if the user has specified a negative value.
2708     // The current setting won't be changed. The last valid setting will be
2709     // used. A warning will be issued (if warnings are allowed as controlled by
2710     // the KMP_WARNINGS env var).
2711     KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2712                   "max_active_levels for thread %d = (%d)\n",
2713                   gtid, max_active_levels));
2714     return;
2715   }
2716   if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
    // OK: max_active_levels is within the valid range
    // [0; KMP_MAX_ACTIVE_LEVELS_LIMIT]. A zero value is allowed
    // (implementation-defined behavior).
2720   } else {
2721     KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2722                 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2723     max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
    // The current upper limit is MAX_INT (implementation-defined behavior).
    // If the input exceeds the upper limit, it is corrected to be the upper
    // limit (implementation-defined behavior). In practice this branch should
    // never be reached while the limit stays at MAX_INT.
2728   }
2729   KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2730                 "max_active_levels for thread %d = (%d)\n",
2731                 gtid, max_active_levels));
2732 
2733   thread = __kmp_threads[gtid];
2734 
2735   __kmp_save_internal_controls(thread);
2736 
2737   set__max_active_levels(thread, max_active_levels);
2738 }
2739 
2740 /* Gets max_active_levels */
2741 int __kmp_get_max_active_levels(int gtid) {
2742   kmp_info_t *thread;
2743 
2744   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2745   KMP_DEBUG_ASSERT(__kmp_init_serial);
2746 
2747   thread = __kmp_threads[gtid];
2748   KMP_DEBUG_ASSERT(thread->th.th_current_task);
2749   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2750                 "curtask_maxaclevel=%d\n",
2751                 gtid, thread->th.th_current_task,
2752                 thread->th.th_current_task->td_icvs.max_active_levels));
2753   return thread->th.th_current_task->td_icvs.max_active_levels;
2754 }
2755 
2756 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2757 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2758 
2759 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2760 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2761   kmp_info_t *thread;
2762   kmp_sched_t orig_kind;
2763   //    kmp_team_t *team;
2764 
2765   KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2766                 gtid, (int)kind, chunk));
2767   KMP_DEBUG_ASSERT(__kmp_init_serial);
2768 
2769   // Check if the kind parameter is valid, correct if needed.
2770   // Valid parameters should fit in one of two intervals - standard or extended:
2771   //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2772   // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
2773   orig_kind = kind;
2774   kind = __kmp_sched_without_mods(kind);
2775 
2776   if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2777       (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2778     // TODO: Hint needs attention in case we change the default schedule.
2779     __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2780               KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2781               __kmp_msg_null);
2782     kind = kmp_sched_default;
2783     chunk = 0; // ignore chunk value in case of bad kind
2784   }
2785 
2786   thread = __kmp_threads[gtid];
2787 
2788   __kmp_save_internal_controls(thread);
2789 
2790   if (kind < kmp_sched_upper_std) {
2791     if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
      // distinguish static chunked vs. unchunked: the chunk should be invalid
      // to indicate an unchunked schedule (which is the default)
2794       thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2795     } else {
2796       thread->th.th_current_task->td_icvs.sched.r_sched_type =
2797           __kmp_sch_map[kind - kmp_sched_lower - 1];
2798     }
2799   } else {
2800     //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2801     //    kmp_sched_lower - 2 ];
2802     thread->th.th_current_task->td_icvs.sched.r_sched_type =
2803         __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2804                       kmp_sched_lower - 2];
2805   }
2806   __kmp_sched_apply_mods_intkind(
2807       orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2808   if (kind == kmp_sched_auto || chunk < 1) {
2809     // ignore parameter chunk for schedule auto
2810     thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2811   } else {
2812     thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2813   }
2814 }
2815 
2816 /* Gets def_sched_var ICV values */
2817 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2818   kmp_info_t *thread;
2819   enum sched_type th_type;
2820 
2821   KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2822   KMP_DEBUG_ASSERT(__kmp_init_serial);
2823 
2824   thread = __kmp_threads[gtid];
2825 
2826   th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2827   switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2828   case kmp_sch_static:
2829   case kmp_sch_static_greedy:
2830   case kmp_sch_static_balanced:
2831     *kind = kmp_sched_static;
2832     __kmp_sched_apply_mods_stdkind(kind, th_type);
2833     *chunk = 0; // chunk was not set, try to show this fact via zero value
2834     return;
2835   case kmp_sch_static_chunked:
2836     *kind = kmp_sched_static;
2837     break;
2838   case kmp_sch_dynamic_chunked:
2839     *kind = kmp_sched_dynamic;
2840     break;
2841   case kmp_sch_guided_chunked:
2842   case kmp_sch_guided_iterative_chunked:
2843   case kmp_sch_guided_analytical_chunked:
2844     *kind = kmp_sched_guided;
2845     break;
2846   case kmp_sch_auto:
2847     *kind = kmp_sched_auto;
2848     break;
2849   case kmp_sch_trapezoidal:
2850     *kind = kmp_sched_trapezoidal;
2851     break;
2852 #if KMP_STATIC_STEAL_ENABLED
2853   case kmp_sch_static_steal:
2854     *kind = kmp_sched_static_steal;
2855     break;
2856 #endif
2857   default:
2858     KMP_FATAL(UnknownSchedulingType, th_type);
2859   }
2860 
2861   __kmp_sched_apply_mods_stdkind(kind, th_type);
2862   *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2863 }
2864 
2865 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2866 
2867   int ii, dd;
2868   kmp_team_t *team;
2869   kmp_info_t *thr;
2870 
2871   KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2872   KMP_DEBUG_ASSERT(__kmp_init_serial);
2873 
2874   // validate level
2875   if (level == 0)
2876     return 0;
2877   if (level < 0)
2878     return -1;
2879   thr = __kmp_threads[gtid];
2880   team = thr->th.th_team;
2881   ii = team->t.t_level;
2882   if (level > ii)
2883     return -1;
2884 
2885   if (thr->th.th_teams_microtask) {
    // AC: we are in a teams region where multiple nested teams have the same
    // level
2887     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2888     if (level <=
2889         tlevel) { // otherwise usual algorithm works (will not touch the teams)
2890       KMP_DEBUG_ASSERT(ii >= tlevel);
2891       // AC: As we need to pass by the teams league, we need to artificially
2892       // increase ii
2893       if (ii == tlevel) {
2894         ii += 2; // three teams have same level
2895       } else {
2896         ii++; // two teams have same level
2897       }
2898     }
2899   }
2900 
2901   if (ii == level)
2902     return __kmp_tid_from_gtid(gtid);
2903 
2904   dd = team->t.t_serialized;
2905   level++;
2906   while (ii > level) {
2907     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2908     }
2909     if ((team->t.t_serialized) && (!dd)) {
2910       team = team->t.t_parent;
2911       continue;
2912     }
2913     if (ii > level) {
2914       team = team->t.t_parent;
2915       dd = team->t.t_serialized;
2916       ii--;
2917     }
2918   }
2919 
2920   return (dd > 1) ? (0) : (team->t.t_master_tid);
2921 }
2922 
2923 int __kmp_get_team_size(int gtid, int level) {
2924 
2925   int ii, dd;
2926   kmp_team_t *team;
2927   kmp_info_t *thr;
2928 
2929   KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2930   KMP_DEBUG_ASSERT(__kmp_init_serial);
2931 
2932   // validate level
2933   if (level == 0)
2934     return 1;
2935   if (level < 0)
2936     return -1;
2937   thr = __kmp_threads[gtid];
2938   team = thr->th.th_team;
2939   ii = team->t.t_level;
2940   if (level > ii)
2941     return -1;
2942 
2943   if (thr->th.th_teams_microtask) {
    // AC: we are in a teams region where multiple nested teams have the same
    // level
2945     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2946     if (level <=
2947         tlevel) { // otherwise usual algorithm works (will not touch the teams)
2948       KMP_DEBUG_ASSERT(ii >= tlevel);
2949       // AC: As we need to pass by the teams league, we need to artificially
2950       // increase ii
2951       if (ii == tlevel) {
2952         ii += 2; // three teams have same level
2953       } else {
2954         ii++; // two teams have same level
2955       }
2956     }
2957   }
2958 
2959   while (ii > level) {
2960     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2961     }
2962     if (team->t.t_serialized && (!dd)) {
2963       team = team->t.t_parent;
2964       continue;
2965     }
2966     if (ii > level) {
2967       team = team->t.t_parent;
2968       ii--;
2969     }
2970   }
2971 
2972   return team->t.t_nproc;
2973 }
2974 
2975 kmp_r_sched_t __kmp_get_schedule_global() {
  // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
  // independently, so the updated schedule can be obtained here.
2979 
2980   kmp_r_sched_t r_sched;
2981 
2982   // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
2983   // __kmp_guided. __kmp_sched should keep original value, so that user can set
2984   // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
2985   // different roots (even in OMP 2.5)
2986   enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
2987   enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
2988   if (s == kmp_sch_static) {
2989     // replace STATIC with more detailed schedule (balanced or greedy)
2990     r_sched.r_sched_type = __kmp_static;
2991   } else if (s == kmp_sch_guided_chunked) {
2992     // replace GUIDED with more detailed schedule (iterative or analytical)
2993     r_sched.r_sched_type = __kmp_guided;
2994   } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2995     r_sched.r_sched_type = __kmp_sched;
2996   }
2997   SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
2998 
2999   if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
    // __kmp_chunk may be wrong here (if it was never set)
3001     r_sched.chunk = KMP_DEFAULT_CHUNK;
3002   } else {
3003     r_sched.chunk = __kmp_chunk;
3004   }
3005 
3006   return r_sched;
3007 }
3008 
/* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
   at least argc *t_argv entries for the requested team. */
3011 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3012 
3013   KMP_DEBUG_ASSERT(team);
3014   if (!realloc || argc > team->t.t_max_argc) {
3015 
3016     KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3017                    "current entries=%d\n",
3018                    team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3019     /* if previously allocated heap space for args, free them */
3020     if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3021       __kmp_free((void *)team->t.t_argv);
3022 
3023     if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3024       /* use unused space in the cache line for arguments */
3025       team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3026       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3027                      "argv entries\n",
3028                      team->t.t_id, team->t.t_max_argc));
3029       team->t.t_argv = &team->t.t_inline_argv[0];
3030       if (__kmp_storage_map) {
3031         __kmp_print_storage_map_gtid(
3032             -1, &team->t.t_inline_argv[0],
3033             &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3034             (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3035             team->t.t_id);
3036       }
3037     } else {
3038       /* allocate space for arguments in the heap */
3039       team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3040                                ? KMP_MIN_MALLOC_ARGV_ENTRIES
3041                                : 2 * argc;
3042       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3043                      "argv entries\n",
3044                      team->t.t_id, team->t.t_max_argc));
3045       team->t.t_argv =
3046           (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3047       if (__kmp_storage_map) {
3048         __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3049                                      &team->t.t_argv[team->t.t_max_argc],
3050                                      sizeof(void *) * team->t.t_max_argc,
3051                                      "team_%d.t_argv", team->t.t_id);
3052       }
3053     }
3054   }
3055 }
3056 
3057 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3058   int i;
3059   int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3060   team->t.t_threads =
3061       (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3062   team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3063       sizeof(dispatch_shared_info_t) * num_disp_buff);
3064   team->t.t_dispatch =
3065       (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3066   team->t.t_implicit_task_taskdata =
3067       (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3068   team->t.t_max_nproc = max_nth;
3069 
3070   /* setup dispatch buffers */
3071   for (i = 0; i < num_disp_buff; ++i) {
3072     team->t.t_disp_buffer[i].buffer_index = i;
3073     team->t.t_disp_buffer[i].doacross_buf_idx = i;
3074   }
3075 }
3076 
3077 static void __kmp_free_team_arrays(kmp_team_t *team) {
3078   /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3079   int i;
3080   for (i = 0; i < team->t.t_max_nproc; ++i) {
3081     if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3082       __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3083       team->t.t_dispatch[i].th_disp_buffer = NULL;
3084     }
3085   }
3086 #if KMP_USE_HIER_SCHED
3087   __kmp_dispatch_free_hierarchies(team);
3088 #endif
3089   __kmp_free(team->t.t_threads);
3090   __kmp_free(team->t.t_disp_buffer);
3091   __kmp_free(team->t.t_dispatch);
3092   __kmp_free(team->t.t_implicit_task_taskdata);
3093   team->t.t_threads = NULL;
3094   team->t.t_disp_buffer = NULL;
3095   team->t.t_dispatch = NULL;
3096   team->t.t_implicit_task_taskdata = 0;
3097 }
3098 
3099 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3100   kmp_info_t **oldThreads = team->t.t_threads;
3101 
3102   __kmp_free(team->t.t_disp_buffer);
3103   __kmp_free(team->t.t_dispatch);
3104   __kmp_free(team->t.t_implicit_task_taskdata);
3105   __kmp_allocate_team_arrays(team, max_nth);
3106 
3107   KMP_MEMCPY(team->t.t_threads, oldThreads,
3108              team->t.t_nproc * sizeof(kmp_info_t *));
3109 
3110   __kmp_free(oldThreads);
3111 }
3112 
3113 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3114 
3115   kmp_r_sched_t r_sched =
3116       __kmp_get_schedule_global(); // get current state of scheduling globals
3117 
3118   KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3119 
3120   kmp_internal_control_t g_icvs = {
3121     0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3122     (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3123     // adjustment of threads (per thread)
3124     (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3125     // whether blocktime is explicitly set
3126     __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3127 #if KMP_USE_MONITOR
3128     __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3129 // intervals
3130 #endif
3131     __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3132     // next parallel region (per thread)
3133     // (use a max ub on value if __kmp_parallel_initialize not called yet)
3134     __kmp_cg_max_nth, // int thread_limit;
3135     __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3136     // for max_active_levels
3137     r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3138     // {sched,chunk} pair
3139     __kmp_nested_proc_bind.bind_types[0],
3140     __kmp_default_device,
3141     NULL // struct kmp_internal_control *next;
3142   };
3143 
3144   return g_icvs;
3145 }
3146 
3147 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3148 
3149   kmp_internal_control_t gx_icvs;
  gx_icvs.serial_nesting_level =
      0; // probably should be team->t.t_serialized, as in
         // __kmp_save_internal_controls
3152   copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3153   gx_icvs.next = NULL;
3154 
3155   return gx_icvs;
3156 }
3157 
3158 static void __kmp_initialize_root(kmp_root_t *root) {
3159   int f;
3160   kmp_team_t *root_team;
3161   kmp_team_t *hot_team;
3162   int hot_team_max_nth;
3163   kmp_r_sched_t r_sched =
3164       __kmp_get_schedule_global(); // get current state of scheduling globals
3165   kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3166   KMP_DEBUG_ASSERT(root);
3167   KMP_ASSERT(!root->r.r_begin);
3168 
3169   /* setup the root state structure */
3170   __kmp_init_lock(&root->r.r_begin_lock);
3171   root->r.r_begin = FALSE;
3172   root->r.r_active = FALSE;
3173   root->r.r_in_parallel = 0;
3174   root->r.r_blocktime = __kmp_dflt_blocktime;
3175 
3176   /* setup the root team for this task */
3177   /* allocate the root team structure */
3178   KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3179 
3180   root_team =
3181       __kmp_allocate_team(root,
3182                           1, // new_nproc
3183                           1, // max_nproc
3184 #if OMPT_SUPPORT
3185                           ompt_data_none, // root parallel id
3186 #endif
3187                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3188                           0 // argc
3189                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3190                           );
3191 #if USE_DEBUGGER
3192   // Non-NULL value should be assigned to make the debugger display the root
3193   // team.
3194   TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3195 #endif
3196 
3197   KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3198 
3199   root->r.r_root_team = root_team;
3200   root_team->t.t_control_stack_top = NULL;
3201 
3202   /* initialize root team */
3203   root_team->t.t_threads[0] = NULL;
3204   root_team->t.t_nproc = 1;
3205   root_team->t.t_serialized = 1;
3206   // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3207   root_team->t.t_sched.sched = r_sched.sched;
3208   KA_TRACE(
3209       20,
3210       ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3211        root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3212 
3213   /* setup the  hot team for this task */
3214   /* allocate the hot team structure */
3215   KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3216 
3217   hot_team =
3218       __kmp_allocate_team(root,
3219                           1, // new_nproc
3220                           __kmp_dflt_team_nth_ub * 2, // max_nproc
3221 #if OMPT_SUPPORT
3222                           ompt_data_none, // root parallel id
3223 #endif
3224                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3225                           0 // argc
3226                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3227                           );
3228   KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3229 
3230   root->r.r_hot_team = hot_team;
3231   root_team->t.t_control_stack_top = NULL;
3232 
3233   /* first-time initialization */
3234   hot_team->t.t_parent = root_team;
3235 
3236   /* initialize hot team */
3237   hot_team_max_nth = hot_team->t.t_max_nproc;
3238   for (f = 0; f < hot_team_max_nth; ++f) {
3239     hot_team->t.t_threads[f] = NULL;
3240   }
3241   hot_team->t.t_nproc = 1;
3242   // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3243   hot_team->t.t_sched.sched = r_sched.sched;
3244   hot_team->t.t_size_changed = 0;
3245 }
3246 
3247 #ifdef KMP_DEBUG
3248 
3249 typedef struct kmp_team_list_item {
3250   kmp_team_p const *entry;
3251   struct kmp_team_list_item *next;
3252 } kmp_team_list_item_t;
3253 typedef kmp_team_list_item_t *kmp_team_list_t;
3254 
3255 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3256     kmp_team_list_t list, // List of teams.
3257     kmp_team_p const *team // Team to add.
3258     ) {
3259 
3260   // List must terminate with item where both entry and next are NULL.
3261   // Team is added to the list only once.
3262   // List is sorted in ascending order by team id.
3263   // Team id is *not* a key.
3264 
3265   kmp_team_list_t l;
3266 
3267   KMP_DEBUG_ASSERT(list != NULL);
3268   if (team == NULL) {
3269     return;
3270   }
3271 
3272   __kmp_print_structure_team_accum(list, team->t.t_parent);
3273   __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3274 
3275   // Search list for the team.
3276   l = list;
3277   while (l->next != NULL && l->entry != team) {
3278     l = l->next;
3279   }
3280   if (l->next != NULL) {
3281     return; // Team has been added before, exit.
3282   }
3283 
3284   // Team is not found. Search list again for insertion point.
3285   l = list;
3286   while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3287     l = l->next;
3288   }
3289 
3290   // Insert team.
3291   {
3292     kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3293         sizeof(kmp_team_list_item_t));
3294     *item = *l;
3295     l->entry = team;
3296     l->next = item;
3297   }
3298 }
3299 
static void __kmp_print_structure_team(char const *title,
                                       kmp_team_p const *team) {
3303   __kmp_printf("%s", title);
3304   if (team != NULL) {
3305     __kmp_printf("%2x %p\n", team->t.t_id, team);
3306   } else {
3307     __kmp_printf(" - (nil)\n");
3308   }
3309 }
3310 
3311 static void __kmp_print_structure_thread(char const *title,
3312                                          kmp_info_p const *thread) {
3313   __kmp_printf("%s", title);
3314   if (thread != NULL) {
3315     __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3316   } else {
3317     __kmp_printf(" - (nil)\n");
3318   }
3319 }
3320 
3321 void __kmp_print_structure(void) {
3322 
3323   kmp_team_list_t list;
3324 
3325   // Initialize list of teams.
3326   list =
3327       (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3328   list->entry = NULL;
3329   list->next = NULL;
3330 
3331   __kmp_printf("\n------------------------------\nGlobal Thread "
3332                "Table\n------------------------------\n");
3333   {
3334     int gtid;
3335     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3336       __kmp_printf("%2d", gtid);
3337       if (__kmp_threads != NULL) {
3338         __kmp_printf(" %p", __kmp_threads[gtid]);
3339       }
3340       if (__kmp_root != NULL) {
3341         __kmp_printf(" %p", __kmp_root[gtid]);
3342       }
3343       __kmp_printf("\n");
3344     }
3345   }
3346 
3347   // Print out __kmp_threads array.
3348   __kmp_printf("\n------------------------------\nThreads\n--------------------"
3349                "----------\n");
3350   if (__kmp_threads != NULL) {
3351     int gtid;
3352     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3353       kmp_info_t const *thread = __kmp_threads[gtid];
3354       if (thread != NULL) {
3355         __kmp_printf("GTID %2d %p:\n", gtid, thread);
3356         __kmp_printf("    Our Root:        %p\n", thread->th.th_root);
3357         __kmp_print_structure_team("    Our Team:     ", thread->th.th_team);
3358         __kmp_print_structure_team("    Serial Team:  ",
3359                                    thread->th.th_serial_team);
3360         __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
3361         __kmp_print_structure_thread("    Master:       ",
3362                                      thread->th.th_team_master);
3363         __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
3364         __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
3365         __kmp_printf("    Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3366         __kmp_print_structure_thread("    Next in pool: ",
3367                                      thread->th.th_next_pool);
3368         __kmp_printf("\n");
3369         __kmp_print_structure_team_accum(list, thread->th.th_team);
3370         __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3371       }
3372     }
3373   } else {
3374     __kmp_printf("Threads array is not allocated.\n");
3375   }
3376 
3377   // Print out __kmp_root array.
3378   __kmp_printf("\n------------------------------\nUbers\n----------------------"
3379                "--------\n");
3380   if (__kmp_root != NULL) {
3381     int gtid;
3382     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3383       kmp_root_t const *root = __kmp_root[gtid];
3384       if (root != NULL) {
3385         __kmp_printf("GTID %2d %p:\n", gtid, root);
3386         __kmp_print_structure_team("    Root Team:    ", root->r.r_root_team);
3387         __kmp_print_structure_team("    Hot Team:     ", root->r.r_hot_team);
3388         __kmp_print_structure_thread("    Uber Thread:  ",
3389                                      root->r.r_uber_thread);
3390         __kmp_printf("    Active?:      %2d\n", root->r.r_active);
3391         __kmp_printf("    In Parallel:  %2d\n",
3392                      KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3393         __kmp_printf("\n");
3394         __kmp_print_structure_team_accum(list, root->r.r_root_team);
3395         __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3396       }
3397     }
3398   } else {
3399     __kmp_printf("Ubers array is not allocated.\n");
3400   }
3401 
3402   __kmp_printf("\n------------------------------\nTeams\n----------------------"
3403                "--------\n");
3404   while (list->next != NULL) {
3405     kmp_team_p const *team = list->entry;
3406     int i;
3407     __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3408     __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
3409     __kmp_printf("    Master TID:       %2d\n", team->t.t_master_tid);
3410     __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
3411     __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
3412     __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
3413     for (i = 0; i < team->t.t_nproc; ++i) {
3414       __kmp_printf("    Thread %2d:      ", i);
3415       __kmp_print_structure_thread("", team->t.t_threads[i]);
3416     }
3417     __kmp_print_structure_team("    Next in pool:     ", team->t.t_next_pool);
3418     __kmp_printf("\n");
3419     list = list->next;
3420   }
3421 
3422   // Print out __kmp_thread_pool and __kmp_team_pool.
3423   __kmp_printf("\n------------------------------\nPools\n----------------------"
3424                "--------\n");
3425   __kmp_print_structure_thread("Thread pool:          ",
3426                                CCAST(kmp_info_t *, __kmp_thread_pool));
3427   __kmp_print_structure_team("Team pool:            ",
3428                              CCAST(kmp_team_t *, __kmp_team_pool));
3429   __kmp_printf("\n");
3430 
3431   // Free team list.
3432   while (list != NULL) {
3433     kmp_team_list_item_t *item = list;
3434     list = list->next;
3435     KMP_INTERNAL_FREE(item);
3436   }
3437 }
3438 
3439 #endif
3440 
3441 //---------------------------------------------------------------------------
3442 //  Stuff for per-thread fast random number generator
3443 //  Table of primes
3444 static const unsigned __kmp_primes[] = {
3445     0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3446     0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3447     0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3448     0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3449     0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3450     0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3451     0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3452     0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3453     0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3454     0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3455     0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3456 
3457 //---------------------------------------------------------------------------
3458 //  __kmp_get_random: Get a random number using a linear congruential method.
3459 unsigned short __kmp_get_random(kmp_info_t *thread) {
3460   unsigned x = thread->th.th_x;
3461   unsigned short r = (unsigned short)(x >> 16);
3462 
3463   thread->th.th_x = x * thread->th.th_a + 1;
3464 
3465   KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3466                 thread->th.th_info.ds.ds_tid, r));
3467 
3468   return r;
3469 }
3470 //--------------------------------------------------------
3471 // __kmp_init_random: Initialize a random number generator
3472 void __kmp_init_random(kmp_info_t *thread) {
3473   unsigned seed = thread->th.th_info.ds.ds_tid;
3474 
3475   thread->th.th_a =
3476       __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3477   thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3478   KA_TRACE(30,
3479            ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3480 }
3481 
3482 #if KMP_OS_WINDOWS
3483 /* reclaim array entries for root threads that are already dead, returns number
3484  * reclaimed */
3485 static int __kmp_reclaim_dead_roots(void) {
3486   int i, r = 0;
3487 
3488   for (i = 0; i < __kmp_threads_capacity; ++i) {
3489     if (KMP_UBER_GTID(i) &&
3490         !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3491         !__kmp_root[i]
3492              ->r.r_active) { // AC: reclaim only roots died in non-active state
3493       r += __kmp_unregister_root_other_thread(i);
3494     }
3495   }
3496   return r;
3497 }
3498 #endif
3499 
3500 /* This function attempts to create free entries in __kmp_threads and
3501    __kmp_root, and returns the number of free entries generated.
3502 
3503    For Windows* OS static library, the first mechanism used is to reclaim array
3504    entries for root threads that are already dead.
3505 
   On all platforms, expansion is attempted on the arrays __kmp_threads and
3507    __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3508    capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3509    threadprivate cache array has been created. Synchronization with
3510    __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3511 
3512    After any dead root reclamation, if the clipping value allows array expansion
3513    to result in the generation of a total of nNeed free slots, the function does
3514    that expansion. If not, nothing is done beyond the possible initial root
3515    thread reclamation.
3516 
3517    If any argument is negative, the behavior is undefined. */
3518 static int __kmp_expand_threads(int nNeed) {
3519   int added = 0;
3520   int minimumRequiredCapacity;
3521   int newCapacity;
3522   kmp_info_t **newThreads;
3523   kmp_root_t **newRoot;
3524 
3525 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3526 // resizing __kmp_threads does not need additional protection if foreign
3527 // threads are present
3528 
3529 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3530   /* only for Windows static library */
3531   /* reclaim array entries for root threads that are already dead */
3532   added = __kmp_reclaim_dead_roots();
3533 
3534   if (nNeed) {
3535     nNeed -= added;
3536     if (nNeed < 0)
3537       nNeed = 0;
3538   }
3539 #endif
3540   if (nNeed <= 0)
3541     return added;
3542 
3543   // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3544   // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3545   // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3546   // > __kmp_max_nth in one of two ways:
3547   //
3548   // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
3549   //    may not be reused by another thread, so we may need to increase
3550   //    __kmp_threads_capacity to __kmp_max_nth + 1.
3551   //
3552   // 2) New foreign root(s) are encountered.  We always register new foreign
3553   //    roots. This may cause a smaller # of threads to be allocated at
3554   //    subsequent parallel regions, but the worker threads hang around (and
3555   //    eventually go to sleep) and need slots in the __kmp_threads[] array.
3556   //
3557   // Anyway, that is the reason for moving the check to see if
3558   // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3559   // instead of having it performed here. -BB
3560 
3561   KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3562 
3563   /* compute expansion headroom to check if we can expand */
3564   if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3565     /* possible expansion too small -- give up */
3566     return added;
3567   }
3568   minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3569 
3570   newCapacity = __kmp_threads_capacity;
3571   do {
3572     newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3573                                                           : __kmp_sys_max_nth;
3574   } while (newCapacity < minimumRequiredCapacity);
3575   newThreads = (kmp_info_t **)__kmp_allocate(
3576       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3577   newRoot =
3578       (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3579   KMP_MEMCPY(newThreads, __kmp_threads,
3580              __kmp_threads_capacity * sizeof(kmp_info_t *));
3581   KMP_MEMCPY(newRoot, __kmp_root,
3582              __kmp_threads_capacity * sizeof(kmp_root_t *));
3583 
3584   kmp_info_t **temp_threads = __kmp_threads;
3585   *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3586   *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3587   __kmp_free(temp_threads);
3588   added += newCapacity - __kmp_threads_capacity;
3589   *(volatile int *)&__kmp_threads_capacity = newCapacity;
3590 
3591   if (newCapacity > __kmp_tp_capacity) {
3592     __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3593     if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3594       __kmp_threadprivate_resize_cache(newCapacity);
3595     } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3596       *(volatile int *)&__kmp_tp_capacity = newCapacity;
3597     }
3598     __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3599   }
3600 
3601   return added;
3602 }
3603 
3604 /* Register the current thread as a root thread and obtain our gtid. We must
   have the __kmp_initz_lock held at this point. The argument is TRUE only if
   we are the thread that calls from __kmp_do_serial_initialize() */
3607 int __kmp_register_root(int initial_thread) {
3608   kmp_info_t *root_thread;
3609   kmp_root_t *root;
3610   int gtid;
3611   int capacity;
3612   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3613   KA_TRACE(20, ("__kmp_register_root: entered\n"));
3614   KMP_MB();
3615 
3616   /* 2007-03-02:
3617      If initial thread did not invoke OpenMP RTL yet, and this thread is not an
3618      initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not
3619      work as expected -- it may return false (that means there is at least one
3620      empty slot in __kmp_threads array), but it is possible the only free slot
3621      is #0, which is reserved for initial thread and so cannot be used for this
3622      one. Following code workarounds this bug.
3623 
3624      However, right solution seems to be not reserving slot #0 for initial
3625      thread because:
3626      (1) there is no magic in slot #0,
3627      (2) we cannot detect initial thread reliably (the first thread which does
3628         serial initialization may be not a real initial thread).
3629   */
3630   capacity = __kmp_threads_capacity;
3631   if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3632     --capacity;
3633   }
3634 
3635   /* see if there are too many threads */
3636   if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3637     if (__kmp_tp_cached) {
3638       __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3639                   KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3640                   KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3641     } else {
3642       __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3643                   __kmp_msg_null);
3644     }
3645   }
3646 
3647   /* find an available thread slot */
3648   /* Don't reassign the zero slot since we need that to only be used by initial
3649      thread */
3650   for (gtid = (initial_thread ? 0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL;
3651        gtid++)
3652     ;
3653   KA_TRACE(1,
3654            ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3655   KMP_ASSERT(gtid < __kmp_threads_capacity);
3656 
3657   /* update global accounting */
3658   __kmp_all_nth++;
3659   TCW_4(__kmp_nth, __kmp_nth + 1);
3660 
3661   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3662   // numbers of procs, and method #2 (keyed API call) for higher numbers.
3663   if (__kmp_adjust_gtid_mode) {
3664     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3665       if (TCR_4(__kmp_gtid_mode) != 2) {
3666         TCW_4(__kmp_gtid_mode, 2);
3667       }
3668     } else {
3669       if (TCR_4(__kmp_gtid_mode) != 1) {
3670         TCW_4(__kmp_gtid_mode, 1);
3671       }
3672     }
3673   }
3674 
3675 #ifdef KMP_ADJUST_BLOCKTIME
3676   /* Adjust blocktime to zero if necessary            */
3677   /* Middle initialization might not have occurred yet */
3678   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3679     if (__kmp_nth > __kmp_avail_proc) {
3680       __kmp_zero_bt = TRUE;
3681     }
3682   }
3683 #endif /* KMP_ADJUST_BLOCKTIME */
3684 
3685   /* setup this new hierarchy */
3686   if (!(root = __kmp_root[gtid])) {
3687     root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3688     KMP_DEBUG_ASSERT(!root->r.r_root_team);
3689   }
3690 
3691 #if KMP_STATS_ENABLED
3692   // Initialize stats as soon as possible (right after gtid assignment).
3693   __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3694   __kmp_stats_thread_ptr->startLife();
3695   KMP_SET_THREAD_STATE(SERIAL_REGION);
3696   KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3697 #endif
3698   __kmp_initialize_root(root);
3699 
3700   /* setup new root thread structure */
3701   if (root->r.r_uber_thread) {
3702     root_thread = root->r.r_uber_thread;
3703   } else {
3704     root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3705     if (__kmp_storage_map) {
3706       __kmp_print_thread_storage_map(root_thread, gtid);
3707     }
3708     root_thread->th.th_info.ds.ds_gtid = gtid;
3709 #if OMPT_SUPPORT
3710     root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3711 #endif
3712     root_thread->th.th_root = root;
3713     if (__kmp_env_consistency_check) {
3714       root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3715     }
3716 #if USE_FAST_MEMORY
3717     __kmp_initialize_fast_memory(root_thread);
3718 #endif /* USE_FAST_MEMORY */
3719 
3720 #if KMP_USE_BGET
3721     KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3722     __kmp_initialize_bget(root_thread);
3723 #endif
3724     __kmp_init_random(root_thread); // Initialize random number generator
3725   }
3726 
3727   /* setup the serial team held in reserve by the root thread */
3728   if (!root_thread->th.th_serial_team) {
3729     kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3730     KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3731     root_thread->th.th_serial_team = __kmp_allocate_team(
3732         root, 1, 1,
3733 #if OMPT_SUPPORT
3734         ompt_data_none, // root parallel id
3735 #endif
3736         proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3737   }
3738   KMP_ASSERT(root_thread->th.th_serial_team);
3739   KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3740                 root_thread->th.th_serial_team));
3741 
3742   /* drop root_thread into place */
3743   TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3744 
3745   root->r.r_root_team->t.t_threads[0] = root_thread;
3746   root->r.r_hot_team->t.t_threads[0] = root_thread;
3747   root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
  // AC: the team is created in reserve, not for execution (it is unused for
  // now).
3749   root_thread->th.th_serial_team->t.t_serialized = 0;
3750   root->r.r_uber_thread = root_thread;
3751 
3752   /* initialize the thread, get it ready to go */
3753   __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3754   TCW_4(__kmp_init_gtid, TRUE);
3755 
3756   /* prepare the master thread for get_gtid() */
3757   __kmp_gtid_set_specific(gtid);
3758 
3759 #if USE_ITT_BUILD
3760   __kmp_itt_thread_name(gtid);
3761 #endif /* USE_ITT_BUILD */
3762 
3763 #ifdef KMP_TDATA_GTID
3764   __kmp_gtid = gtid;
3765 #endif
3766   __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3767   KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3768 
3769   KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3770                 "plain=%u\n",
3771                 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3772                 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3773                 KMP_INIT_BARRIER_STATE));
3774   { // Initialize barrier data.
3775     int b;
3776     for (b = 0; b < bs_last_barrier; ++b) {
3777       root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3778 #if USE_DEBUGGER
3779       root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3780 #endif
3781     }
3782   }
3783   KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3784                    KMP_INIT_BARRIER_STATE);
3785 
3786 #if KMP_AFFINITY_SUPPORTED
3787   root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3788   root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3789   root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3790   root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3791   if (TCR_4(__kmp_init_middle)) {
3792     __kmp_affinity_set_init_mask(gtid, TRUE);
3793   }
3794 #endif /* KMP_AFFINITY_SUPPORTED */
3795   root_thread->th.th_def_allocator = __kmp_def_allocator;
3796   root_thread->th.th_prev_level = 0;
3797   root_thread->th.th_prev_num_threads = 1;
3798 
3799   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3800   tmp->cg_root = root_thread;
3801   tmp->cg_thread_limit = __kmp_cg_max_nth;
3802   tmp->cg_nthreads = 1;
3803   KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3804                  " cg_nthreads init to 1\n",
3805                  root_thread, tmp));
3806   tmp->up = NULL;
3807   root_thread->th.th_cg_roots = tmp;
3808 
3809   __kmp_root_counter++;
3810 
3811 #if OMPT_SUPPORT
3812   if (!initial_thread && ompt_enabled.enabled) {
3813 
3814     kmp_info_t *root_thread = ompt_get_thread();
3815 
3816     ompt_set_thread_state(root_thread, ompt_state_overhead);
3817 
3818     if (ompt_enabled.ompt_callback_thread_begin) {
3819       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3820           ompt_thread_initial, __ompt_get_thread_data_internal());
3821     }
3822     ompt_data_t *task_data;
3823     ompt_data_t *parallel_data;
3824     __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
3825     if (ompt_enabled.ompt_callback_implicit_task) {
3826       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3827           ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3828     }
3829 
3830     ompt_set_thread_state(root_thread, ompt_state_work_serial);
3831   }
3832 #endif
3833 
3834   KMP_MB();
3835   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3836 
3837   return gtid;
3838 }
3839 
3840 #if KMP_NESTED_HOT_TEAMS
3841 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3842                                 const int max_level) {
3843   int i, n, nth;
3844   kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3845   if (!hot_teams || !hot_teams[level].hot_team) {
3846     return 0;
3847   }
3848   KMP_DEBUG_ASSERT(level < max_level);
3849   kmp_team_t *team = hot_teams[level].hot_team;
3850   nth = hot_teams[level].hot_team_nth;
3851   n = nth - 1; // master is not freed
3852   if (level < max_level - 1) {
3853     for (i = 0; i < nth; ++i) {
3854       kmp_info_t *th = team->t.t_threads[i];
3855       n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3856       if (i > 0 && th->th.th_hot_teams) {
3857         __kmp_free(th->th.th_hot_teams);
3858         th->th.th_hot_teams = NULL;
3859       }
3860     }
3861   }
3862   __kmp_free_team(root, team, NULL);
3863   return n;
3864 }
3865 #endif
3866 
// Resets a root thread and clears its root and hot teams.
3868 // Returns the number of __kmp_threads entries directly and indirectly freed.
3869 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3870   kmp_team_t *root_team = root->r.r_root_team;
3871   kmp_team_t *hot_team = root->r.r_hot_team;
3872   int n = hot_team->t.t_nproc;
3873   int i;
3874 
3875   KMP_DEBUG_ASSERT(!root->r.r_active);
3876 
3877   root->r.r_root_team = NULL;
3878   root->r.r_hot_team = NULL;
3879   // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3880   // before call to __kmp_free_team().
3881   __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3882 #if KMP_NESTED_HOT_TEAMS
3883   if (__kmp_hot_teams_max_level >
3884       0) { // need to free nested hot teams and their threads if any
3885     for (i = 0; i < hot_team->t.t_nproc; ++i) {
3886       kmp_info_t *th = hot_team->t.t_threads[i];
3887       if (__kmp_hot_teams_max_level > 1) {
3888         n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3889       }
3890       if (th->th.th_hot_teams) {
3891         __kmp_free(th->th.th_hot_teams);
3892         th->th.th_hot_teams = NULL;
3893       }
3894     }
3895   }
3896 #endif
3897   __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3898 
3899   // Before we can reap the thread, we need to make certain that all other
3900   // threads in the teams that had this root as ancestor have stopped trying to
3901   // steal tasks.
3902   if (__kmp_tasking_mode != tskm_immediate_exec) {
3903     __kmp_wait_to_unref_task_teams();
3904   }
3905 
3906 #if KMP_OS_WINDOWS
3907   /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3908   KA_TRACE(
3909       10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3910            "\n",
3911            (LPVOID) & (root->r.r_uber_thread->th),
3912            root->r.r_uber_thread->th.th_info.ds.ds_thread));
3913   __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3914 #endif /* KMP_OS_WINDOWS */
3915 
3916 #if OMPT_SUPPORT
3917   ompt_data_t *task_data;
3918   ompt_data_t *parallel_data;
3919   __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
3920   if (ompt_enabled.ompt_callback_implicit_task) {
3921     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3922         ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
3923   }
3924   if (ompt_enabled.ompt_callback_thread_end) {
3925     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3926         &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3927   }
3928 #endif
3929 
3930   TCW_4(__kmp_nth,
3931         __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3932   i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
3933   KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
3934                  " to %d\n",
3935                  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
3936                  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
3937   if (i == 1) {
3938     // need to free contention group structure
3939     KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
3940                      root->r.r_uber_thread->th.th_cg_roots->cg_root);
3941     KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
3942     __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
3943     root->r.r_uber_thread->th.th_cg_roots = NULL;
3944   }
3945   __kmp_reap_thread(root->r.r_uber_thread, 1);
3946 
  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
  // it instead of freeing it.
3949   root->r.r_uber_thread = NULL;
3950   /* mark root as no longer in use */
3951   root->r.r_begin = FALSE;
3952 
3953   return n;
3954 }
3955 
3956 void __kmp_unregister_root_current_thread(int gtid) {
3957   KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
3958   /* this lock should be ok, since unregister_root_current_thread is never
3959      called during an abort, only during a normal close. furthermore, if you
3960      have the forkjoin lock, you should never try to get the initz lock */
3961   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3962   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
3963     KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
3964                   "exiting T#%d\n",
3965                   gtid));
3966     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3967     return;
3968   }
3969   kmp_root_t *root = __kmp_root[gtid];
3970 
3971   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3972   KMP_ASSERT(KMP_UBER_GTID(gtid));
3973   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3974   KMP_ASSERT(root->r.r_active == FALSE);
3975 
3976   KMP_MB();
3977 
3978   kmp_info_t *thread = __kmp_threads[gtid];
3979   kmp_team_t *team = thread->th.th_team;
3980   kmp_task_team_t *task_team = thread->th.th_task_team;
3981 
3982   // we need to wait for the proxy tasks before finishing the thread
3983   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
3984 #if OMPT_SUPPORT
3985     // the runtime is shutting down so we won't report any events
3986     thread->th.ompt_thread_info.state = ompt_state_undefined;
3987 #endif
3988     __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
3989   }
3990 
3991   __kmp_reset_root(gtid, root);
3992 
3993   KMP_MB();
3994   KC_TRACE(10,
3995            ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
3996 
3997   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3998 }
3999 
4000 #if KMP_OS_WINDOWS
4001 /* __kmp_forkjoin_lock must be already held
4002    Unregisters a root thread that is not the current thread.  Returns the number
4003    of __kmp_threads entries freed as a result. */
4004 static int __kmp_unregister_root_other_thread(int gtid) {
4005   kmp_root_t *root = __kmp_root[gtid];
4006   int r;
4007 
4008   KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4009   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4010   KMP_ASSERT(KMP_UBER_GTID(gtid));
4011   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4012   KMP_ASSERT(root->r.r_active == FALSE);
4013 
4014   r = __kmp_reset_root(gtid, root);
4015   KC_TRACE(10,
4016            ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4017   return r;
4018 }
4019 #endif
4020 
4021 #if KMP_DEBUG
4022 void __kmp_task_info() {
4023 
4024   kmp_int32 gtid = __kmp_entry_gtid();
4025   kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4026   kmp_info_t *this_thr = __kmp_threads[gtid];
4027   kmp_team_t *steam = this_thr->th.th_serial_team;
4028   kmp_team_t *team = this_thr->th.th_team;
4029 
4030   __kmp_printf(
4031       "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4032       "ptask=%p\n",
4033       gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4034       team->t.t_implicit_task_taskdata[tid].td_parent);
4035 }
4036 #endif // KMP_DEBUG
4037 
4038 /* TODO optimize with one big memclr, take out what isn't needed, split
4039    responsibility to workers as much as possible, and delay initialization of
4040    features as much as possible  */
4041 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4042                                   int tid, int gtid) {
  /* this_thr->th.th_info.ds.ds_gtid is set up in
     __kmp_allocate_thread/__kmp_create_worker.
     this_thr->th.th_serial_team is set up in __kmp_allocate_thread */
4046   kmp_info_t *master = team->t.t_threads[0];
4047   KMP_DEBUG_ASSERT(this_thr != NULL);
4048   KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4049   KMP_DEBUG_ASSERT(team);
4050   KMP_DEBUG_ASSERT(team->t.t_threads);
4051   KMP_DEBUG_ASSERT(team->t.t_dispatch);
4052   KMP_DEBUG_ASSERT(master);
4053   KMP_DEBUG_ASSERT(master->th.th_root);
4054 
4055   KMP_MB();
4056 
4057   TCW_SYNC_PTR(this_thr->th.th_team, team);
4058 
4059   this_thr->th.th_info.ds.ds_tid = tid;
4060   this_thr->th.th_set_nproc = 0;
4061   if (__kmp_tasking_mode != tskm_immediate_exec)
4062     // When tasking is possible, threads are not safe to reap until they are
4063     // done tasking; this will be set when tasking code is exited in wait
4064     this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4065   else // no tasking --> always safe to reap
4066     this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4067   this_thr->th.th_set_proc_bind = proc_bind_default;
4068 #if KMP_AFFINITY_SUPPORTED
4069   this_thr->th.th_new_place = this_thr->th.th_current_place;
4070 #endif
4071   this_thr->th.th_root = master->th.th_root;
4072 
4073   /* setup the thread's cache of the team structure */
4074   this_thr->th.th_team_nproc = team->t.t_nproc;
4075   this_thr->th.th_team_master = master;
4076   this_thr->th.th_team_serialized = team->t.t_serialized;
4077   TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4078 
4079   KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4080 
4081   KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4082                 tid, gtid, this_thr, this_thr->th.th_current_task));
4083 
4084   __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4085                            team, tid, TRUE);
4086 
4087   KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4088                 tid, gtid, this_thr, this_thr->th.th_current_task));
4089   // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4090   // __kmp_initialize_team()?
4091 
4092   /* TODO no worksharing in speculative threads */
4093   this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4094 
4095   this_thr->th.th_local.this_construct = 0;
4096 
4097   if (!this_thr->th.th_pri_common) {
4098     this_thr->th.th_pri_common =
4099         (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4100     if (__kmp_storage_map) {
4101       __kmp_print_storage_map_gtid(
4102           gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4103           sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4104     }
4105     this_thr->th.th_pri_head = NULL;
4106   }
4107 
4108   if (this_thr != master && // Master's CG root is initialized elsewhere
4109       this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4110     // Make new thread's CG root same as master's
4111     KMP_DEBUG_ASSERT(master->th.th_cg_roots);
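    // cg_nthreads on a kmp_cg_root_t node is effectively a reference count of
    // the threads in that contention group: the last thread to leave a node
    // frees it, and a thread joining the master's group also inherits that
    // group's thread_limit ICV (see the end of this block).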
4112     kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4113     if (tmp) {
4114       // worker changes CG, need to check if old CG should be freed
4115       int i = tmp->cg_nthreads--;
4116       KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4117                      " on node %p of thread %p to %d\n",
4118                      this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4119       if (i == 1) {
4120         __kmp_free(tmp); // last thread left CG --> free it
4121       }
4122     }
4123     this_thr->th.th_cg_roots = master->th.th_cg_roots;
4124     // Increment new thread's CG root's counter to add the new thread
4125     this_thr->th.th_cg_roots->cg_nthreads++;
4126     KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4127                    " node %p of thread %p to %d\n",
4128                    this_thr, this_thr->th.th_cg_roots,
4129                    this_thr->th.th_cg_roots->cg_root,
4130                    this_thr->th.th_cg_roots->cg_nthreads));
4131     this_thr->th.th_current_task->td_icvs.thread_limit =
4132         this_thr->th.th_cg_roots->cg_thread_limit;
4133   }
4134 
4135   /* Initialize dynamic dispatch */
4136   {
4137     volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4138     // Use team max_nproc since this will never change for the team.
4139     size_t disp_size =
4140         sizeof(dispatch_private_info_t) *
4141         (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4142     KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4143                   team->t.t_max_nproc));
4144     KMP_ASSERT(dispatch);
4145     KMP_DEBUG_ASSERT(team->t.t_dispatch);
4146     KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4147 
4148     dispatch->th_disp_index = 0;
4149     dispatch->th_doacross_buf_idx = 0;
4150     if (!dispatch->th_disp_buffer) {
4151       dispatch->th_disp_buffer =
4152           (dispatch_private_info_t *)__kmp_allocate(disp_size);
4153 
4154       if (__kmp_storage_map) {
4155         __kmp_print_storage_map_gtid(
4156             gtid, &dispatch->th_disp_buffer[0],
4157             &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4158                                           ? 1
4159                                           : __kmp_dispatch_num_buffers],
4160             disp_size, "th_%d.th_dispatch.th_disp_buffer "
4161                        "(team_%d.t_dispatch[%d].th_disp_buffer)",
4162             gtid, team->t.t_id, gtid);
4163       }
4164     } else {
4165       memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4166     }
4167 
4168     dispatch->th_dispatch_pr_current = 0;
4169     dispatch->th_dispatch_sh_current = 0;
4170 
4171     dispatch->th_deo_fcn = 0; /* ORDERED     */
4172     dispatch->th_dxo_fcn = 0; /* END ORDERED */
4173   }
4174 
4175   this_thr->th.th_next_pool = NULL;
4176 
4177   if (!this_thr->th.th_task_state_memo_stack) {
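    // th_task_state_memo_stack saves th_task_state across nested (hot-team)
    // parallel levels; it starts with 4 entries here and is grown on demand
    // elsewhere in the runtime when nesting gets deeper.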
4178     size_t i;
4179     this_thr->th.th_task_state_memo_stack =
4180         (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4181     this_thr->th.th_task_state_top = 0;
4182     this_thr->th.th_task_state_stack_sz = 4;
4183     for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4184          ++i) // zero init the stack
4185       this_thr->th.th_task_state_memo_stack[i] = 0;
4186   }
4187 
4188   KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4189   KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4190 
4191   KMP_MB();
4192 }
4193 
4194 /* allocate a new thread for the requesting team. this is only called from
4195    within a forkjoin critical section. we will first try to get an available
4196    thread from the thread pool. if none is available, we will fork a new one
4197    assuming we are able to create a new one. this should be assured, as the
4198    caller should check on this first. */
4199 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4200                                   int new_tid) {
4201   kmp_team_t *serial_team;
4202   kmp_info_t *new_thr;
4203   int new_gtid;
4204 
4205   KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4206   KMP_DEBUG_ASSERT(root && team);
4207 #if !KMP_NESTED_HOT_TEAMS
4208   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4209 #endif
4210   KMP_MB();
4211 
4212   /* first, try to get one from the thread pool */
4213   if (__kmp_thread_pool) {
4214     new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4215     __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4216     if (new_thr == __kmp_thread_pool_insert_pt) {
4217       __kmp_thread_pool_insert_pt = NULL;
4218     }
4219     TCW_4(new_thr->th.th_in_pool, FALSE);
4220     __kmp_suspend_initialize_thread(new_thr);
4221     __kmp_lock_suspend_mx(new_thr);
4222     if (new_thr->th.th_active_in_pool == TRUE) {
4223       KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4224       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4225       new_thr->th.th_active_in_pool = FALSE;
4226     }
4227     __kmp_unlock_suspend_mx(new_thr);
4228 
4229     KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4230                   __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4231     KMP_ASSERT(!new_thr->th.th_team);
4232     KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4233 
4234     /* setup the thread structure */
4235     __kmp_initialize_info(new_thr, team, new_tid,
4236                           new_thr->th.th_info.ds.ds_gtid);
4237     KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4238 
4239     TCW_4(__kmp_nth, __kmp_nth + 1);
4240 
4241     new_thr->th.th_task_state = 0;
4242     new_thr->th.th_task_state_top = 0;
4243     new_thr->th.th_task_state_stack_sz = 4;
4244 
4245 #ifdef KMP_ADJUST_BLOCKTIME
4246     /* Adjust blocktime back to zero if necessary */
4247     /* Middle initialization might not have occurred yet */
4248     if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4249       if (__kmp_nth > __kmp_avail_proc) {
4250         __kmp_zero_bt = TRUE;
4251       }
4252     }
4253 #endif /* KMP_ADJUST_BLOCKTIME */
4254 
4255 #if KMP_DEBUG
    // If the thread entered the pool via __kmp_free_thread, its wait_flag
    // should not be KMP_BARRIER_PARENT_FLAG.
4258     int b;
4259     kmp_balign_t *balign = new_thr->th.th_bar;
4260     for (b = 0; b < bs_last_barrier; ++b)
4261       KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4262 #endif
4263 
4264     KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4265                   __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4266 
4267     KMP_MB();
4268     return new_thr;
4269   }
4270 
  /* none available in the pool, so we'll fork a new one */
4272   KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4273   KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4274 
4275 #if KMP_USE_MONITOR
4276   // If this is the first worker thread the RTL is creating, then also
4277   // launch the monitor thread.  We try to do this as early as possible.
4278   if (!TCR_4(__kmp_init_monitor)) {
4279     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4280     if (!TCR_4(__kmp_init_monitor)) {
4281       KF_TRACE(10, ("before __kmp_create_monitor\n"));
4282       TCW_4(__kmp_init_monitor, 1);
4283       __kmp_create_monitor(&__kmp_monitor);
4284       KF_TRACE(10, ("after __kmp_create_monitor\n"));
4285 #if KMP_OS_WINDOWS
4286       // AC: wait until monitor has started. This is a fix for CQ232808.
4287       // The reason is that if the library is loaded/unloaded in a loop with
      // small (parallel) work in between, then there is a high probability
      // that the monitor thread starts after the library shutdown. At shutdown
      // it is too late to cope with the problem, because when the master is in
      // DllMain (process detach) the monitor has no chance to start (it is
      // blocked), and the master has no means to inform the monitor that the
      // library has gone, because all the memory the monitor can access is
      // about to be released/reset.
4295       while (TCR_4(__kmp_init_monitor) < 2) {
4296         KMP_YIELD(TRUE);
4297       }
4298       KF_TRACE(10, ("after monitor thread has started\n"));
4299 #endif
4300     }
4301     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4302   }
4303 #endif
4304 
4305   KMP_MB();
4306   for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
4307     KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4308   }
4309 
4310   /* allocate space for it. */
4311   new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4312 
4313   TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4314 
4315 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4316   // suppress race conditions detection on synchronization flags in debug mode
4317   // this helps to analyze library internals eliminating false positives
4318   __itt_suppress_mark_range(
4319       __itt_suppress_range, __itt_suppress_threading_errors,
4320       &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4321   __itt_suppress_mark_range(
4322       __itt_suppress_range, __itt_suppress_threading_errors,
4323       &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4324 #if KMP_OS_WINDOWS
4325   __itt_suppress_mark_range(
4326       __itt_suppress_range, __itt_suppress_threading_errors,
4327       &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4328 #else
4329   __itt_suppress_mark_range(__itt_suppress_range,
4330                             __itt_suppress_threading_errors,
4331                             &new_thr->th.th_suspend_init_count,
4332                             sizeof(new_thr->th.th_suspend_init_count));
4333 #endif
4334   // TODO: check if we need to also suppress b_arrived flags
4335   __itt_suppress_mark_range(__itt_suppress_range,
4336                             __itt_suppress_threading_errors,
4337                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4338                             sizeof(new_thr->th.th_bar[0].bb.b_go));
4339   __itt_suppress_mark_range(__itt_suppress_range,
4340                             __itt_suppress_threading_errors,
4341                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4342                             sizeof(new_thr->th.th_bar[1].bb.b_go));
4343   __itt_suppress_mark_range(__itt_suppress_range,
4344                             __itt_suppress_threading_errors,
4345                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4346                             sizeof(new_thr->th.th_bar[2].bb.b_go));
4347 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4348   if (__kmp_storage_map) {
4349     __kmp_print_thread_storage_map(new_thr, new_gtid);
4350   }
4351 
4352   // add the reserve serialized team, initialized from the team's master thread
4353   {
4354     kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4355     KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4356     new_thr->th.th_serial_team = serial_team =
4357         (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4358 #if OMPT_SUPPORT
4359                                           ompt_data_none, // root parallel id
4360 #endif
4361                                           proc_bind_default, &r_icvs,
4362                                           0 USE_NESTED_HOT_ARG(NULL));
4363   }
4364   KMP_ASSERT(serial_team);
  // AC: the team is created in reserve, not for execution (it is unused for
  // now).
  serial_team->t.t_serialized = 0;
4367   serial_team->t.t_threads[0] = new_thr;
4368   KF_TRACE(10,
4369            ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4370             new_thr));
4371 
4372   /* setup the thread structures */
4373   __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4374 
4375 #if USE_FAST_MEMORY
4376   __kmp_initialize_fast_memory(new_thr);
4377 #endif /* USE_FAST_MEMORY */
4378 
4379 #if KMP_USE_BGET
4380   KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4381   __kmp_initialize_bget(new_thr);
4382 #endif
4383 
4384   __kmp_init_random(new_thr); // Initialize random number generator
4385 
4386   /* Initialize these only once when thread is grabbed for a team allocation */
4387   KA_TRACE(20,
4388            ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4389             __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4390 
4391   int b;
4392   kmp_balign_t *balign = new_thr->th.th_bar;
4393   for (b = 0; b < bs_last_barrier; ++b) {
4394     balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4395     balign[b].bb.team = NULL;
4396     balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4397     balign[b].bb.use_oncore_barrier = 0;
4398   }
4399 
4400   new_thr->th.th_spin_here = FALSE;
4401   new_thr->th.th_next_waiting = 0;
4402 #if KMP_OS_UNIX
4403   new_thr->th.th_blocking = false;
4404 #endif
4405 
4406 #if KMP_AFFINITY_SUPPORTED
4407   new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4408   new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4409   new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4410   new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4411 #endif
4412   new_thr->th.th_def_allocator = __kmp_def_allocator;
4413   new_thr->th.th_prev_level = 0;
4414   new_thr->th.th_prev_num_threads = 1;
4415 
4416   TCW_4(new_thr->th.th_in_pool, FALSE);
4417   new_thr->th.th_active_in_pool = FALSE;
4418   TCW_4(new_thr->th.th_active, TRUE);
4419 
4420   /* adjust the global counters */
4421   __kmp_all_nth++;
4422   __kmp_nth++;
4423 
4424   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4425   // numbers of procs, and method #2 (keyed API call) for higher numbers.
4426   if (__kmp_adjust_gtid_mode) {
4427     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4428       if (TCR_4(__kmp_gtid_mode) != 2) {
4429         TCW_4(__kmp_gtid_mode, 2);
4430       }
4431     } else {
4432       if (TCR_4(__kmp_gtid_mode) != 1) {
4433         TCW_4(__kmp_gtid_mode, 1);
4434       }
4435     }
4436   }
4437 
4438 #ifdef KMP_ADJUST_BLOCKTIME
4439   /* Adjust blocktime back to zero if necessary       */
4440   /* Middle initialization might not have occurred yet */
4441   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4442     if (__kmp_nth > __kmp_avail_proc) {
4443       __kmp_zero_bt = TRUE;
4444     }
4445   }
4446 #endif /* KMP_ADJUST_BLOCKTIME */
4447 
4448   /* actually fork it and create the new worker thread */
4449   KF_TRACE(
4450       10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4451   __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4452   KF_TRACE(10,
4453            ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4454 
4455   KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4456                 new_gtid));
4457   KMP_MB();
4458   return new_thr;
4459 }
4460 
4461 /* Reinitialize team for reuse.
   The hot team code calls this routine at every fork barrier, so the EPCC
   barrier tests are extremely sensitive to changes in it, especially writes to
   the team struct, which cause a cache invalidation in all threads.
4465    IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4466 static void __kmp_reinitialize_team(kmp_team_t *team,
4467                                     kmp_internal_control_t *new_icvs,
4468                                     ident_t *loc) {
4469   KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4470                 team->t.t_threads[0], team));
4471   KMP_DEBUG_ASSERT(team && new_icvs);
4472   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4473   KMP_CHECK_UPDATE(team->t.t_ident, loc);
4474 
4475   KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4476   // Copy ICVs to the master thread's implicit taskdata
4477   __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4478   copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4479 
4480   KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4481                 team->t.t_threads[0], team));
4482 }
4483 
4484 /* Initialize the team data structure.
4485    This assumes the t_threads and t_max_nproc are already set.
4486    Also, we don't touch the arguments */
4487 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4488                                   kmp_internal_control_t *new_icvs,
4489                                   ident_t *loc) {
4490   KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4491 
4492   /* verify */
4493   KMP_DEBUG_ASSERT(team);
4494   KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4495   KMP_DEBUG_ASSERT(team->t.t_threads);
4496   KMP_MB();
4497 
4498   team->t.t_master_tid = 0; /* not needed */
4499   /* team->t.t_master_bar;        not needed */
4500   team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4501   team->t.t_nproc = new_nproc;
4502 
4503   /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
4504   team->t.t_next_pool = NULL;
4505   /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4506    * up hot team */
4507 
4508   TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4509   team->t.t_invoke = NULL; /* not needed */
4510 
4511   // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
4512   team->t.t_sched.sched = new_icvs->sched.sched;
4513 
4514 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4515   team->t.t_fp_control_saved = FALSE; /* not needed */
4516   team->t.t_x87_fpu_control_word = 0; /* not needed */
4517   team->t.t_mxcsr = 0; /* not needed */
4518 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4519 
4520   team->t.t_construct = 0;
4521 
4522   team->t.t_ordered.dt.t_value = 0;
4523   team->t.t_master_active = FALSE;
4524 
4525 #ifdef KMP_DEBUG
4526   team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4527 #endif
4528 #if KMP_OS_WINDOWS
4529   team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4530 #endif
4531 
4532   team->t.t_control_stack_top = NULL;
4533 
4534   __kmp_reinitialize_team(team, new_icvs, loc);
4535 
4536   KMP_MB();
4537   KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4538 }
4539 
4540 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
4541 /* Sets full mask for thread and returns old mask, no changes to structures. */
4542 static void
4543 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4544   if (KMP_AFFINITY_CAPABLE()) {
4545     int status;
4546     if (old_mask != NULL) {
4547       status = __kmp_get_system_affinity(old_mask, TRUE);
4548       int error = errno;
4549       if (status != 0) {
4550         __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4551                     __kmp_msg_null);
4552       }
4553     }
4554     __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4555   }
4556 }
4557 #endif
4558 
4559 #if KMP_AFFINITY_SUPPORTED
4560 
4561 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4562 // It calculates the worker + master thread's partition based upon the parent
// thread's partition, and binds each worker to a place in its partition.
4564 // The master thread's partition should already include its current binding.
4565 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4566   // Copy the master thread's place partition to the team struct
4567   kmp_info_t *master_th = team->t.t_threads[0];
4568   KMP_DEBUG_ASSERT(master_th != NULL);
4569   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4570   int first_place = master_th->th.th_first_place;
4571   int last_place = master_th->th.th_last_place;
4572   int masters_place = master_th->th.th_current_place;
4573   team->t.t_first_place = first_place;
4574   team->t.t_last_place = last_place;
4575 
4576   KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4577                 "bound to place %d partition = [%d,%d]\n",
4578                 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4579                 team->t.t_id, masters_place, first_place, last_place));
4580 
4581   switch (proc_bind) {
4582 
4583   case proc_bind_default:
    // Serial teams might have the proc_bind policy set to proc_bind_default.
    // It doesn't matter, as we don't rebind the master thread for any
    // proc_bind policy.
4586     KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4587     break;
4588 
4589   case proc_bind_master: {
4590     int f;
4591     int n_th = team->t.t_nproc;
4592     for (f = 1; f < n_th; f++) {
4593       kmp_info_t *th = team->t.t_threads[f];
4594       KMP_DEBUG_ASSERT(th != NULL);
4595       th->th.th_first_place = first_place;
4596       th->th.th_last_place = last_place;
4597       th->th.th_new_place = masters_place;
4598       if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4599           team->t.t_display_affinity != 1) {
4600         team->t.t_display_affinity = 1;
4601       }
4602 
4603       KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
4604                      "partition = [%d,%d]\n",
4605                      __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4606                      f, masters_place, first_place, last_place));
4607     }
4608   } break;
4609 
4610   case proc_bind_close: {
4611     int f;
4612     int n_th = team->t.t_nproc;
4613     int n_places;
4614     if (first_place <= last_place) {
4615       n_places = last_place - first_place + 1;
4616     } else {
4617       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4618     }
4619     if (n_th <= n_places) {
4620       int place = masters_place;
4621       for (f = 1; f < n_th; f++) {
4622         kmp_info_t *th = team->t.t_threads[f];
4623         KMP_DEBUG_ASSERT(th != NULL);
4624 
4625         if (place == last_place) {
4626           place = first_place;
4627         } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4628           place = 0;
4629         } else {
4630           place++;
4631         }
4632         th->th.th_first_place = first_place;
4633         th->th.th_last_place = last_place;
4634         th->th.th_new_place = place;
4635         if (__kmp_display_affinity && place != th->th.th_current_place &&
4636             team->t.t_display_affinity != 1) {
4637           team->t.t_display_affinity = 1;
4638         }
4639 
4640         KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4641                        "partition = [%d,%d]\n",
4642                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4643                        team->t.t_id, f, place, first_place, last_place));
4644       }
4645     } else {
4646       int S, rem, gap, s_count;
4647       S = n_th / n_places;
4648       s_count = 0;
4649       rem = n_th - (S * n_places);
4650       gap = rem > 0 ? n_places / rem : n_places;
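      // Worked example (values assumed for illustration): with n_th = 10 and
      // n_places = 4, S = 2, rem = 2 and gap = 2, so the loop below packs
      // 3, 2, 3 and 2 threads onto consecutive places starting at
      // masters_place.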
4651       int place = masters_place;
4652       int gap_ct = gap;
4653       for (f = 0; f < n_th; f++) {
4654         kmp_info_t *th = team->t.t_threads[f];
4655         KMP_DEBUG_ASSERT(th != NULL);
4656 
4657         th->th.th_first_place = first_place;
4658         th->th.th_last_place = last_place;
4659         th->th.th_new_place = place;
4660         if (__kmp_display_affinity && place != th->th.th_current_place &&
4661             team->t.t_display_affinity != 1) {
4662           team->t.t_display_affinity = 1;
4663         }
4664         s_count++;
4665 
4666         if ((s_count == S) && rem && (gap_ct == gap)) {
4667           // do nothing, add an extra thread to place on next iteration
4668         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4669           // we added an extra thread to this place; move to next place
4670           if (place == last_place) {
4671             place = first_place;
4672           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4673             place = 0;
4674           } else {
4675             place++;
4676           }
4677           s_count = 0;
4678           gap_ct = 1;
4679           rem--;
4680         } else if (s_count == S) { // place full; don't add extra
4681           if (place == last_place) {
4682             place = first_place;
4683           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4684             place = 0;
4685           } else {
4686             place++;
4687           }
4688           gap_ct++;
4689           s_count = 0;
4690         }
4691 
4692         KA_TRACE(100,
4693                  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4694                   "partition = [%d,%d]\n",
4695                   __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4696                   th->th.th_new_place, first_place, last_place));
4697       }
4698       KMP_DEBUG_ASSERT(place == masters_place);
4699     }
4700   } break;
4701 
4702   case proc_bind_spread: {
4703     int f;
4704     int n_th = team->t.t_nproc;
4705     int n_places;
4706     int thidx;
4707     if (first_place <= last_place) {
4708       n_places = last_place - first_place + 1;
4709     } else {
4710       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4711     }
4712     if (n_th <= n_places) {
4713       int place = -1;
4714 
4715       if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4716         int S = n_places / n_th;
4717         int s_count, rem, gap, gap_ct;
4718 
4719         place = masters_place;
4720         rem = n_places - n_th * S;
4721         gap = rem ? n_th / rem : 1;
4722         gap_ct = gap;
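        // Worked example (values assumed for illustration): with n_th = 4 and
        // a partition of n_places = 10 places, S = 2, rem = 2 and gap = 2, so
        // the loop below gives the threads sub-partitions of 3, 2, 3 and 2
        // places, binding each thread to the first place of its sub-partition.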
4723         thidx = n_th;
4724         if (update_master_only == 1)
4725           thidx = 1;
4726         for (f = 0; f < thidx; f++) {
4727           kmp_info_t *th = team->t.t_threads[f];
4728           KMP_DEBUG_ASSERT(th != NULL);
4729 
4730           th->th.th_first_place = place;
4731           th->th.th_new_place = place;
4732           if (__kmp_display_affinity && place != th->th.th_current_place &&
4733               team->t.t_display_affinity != 1) {
4734             team->t.t_display_affinity = 1;
4735           }
4736           s_count = 1;
4737           while (s_count < S) {
4738             if (place == last_place) {
4739               place = first_place;
4740             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4741               place = 0;
4742             } else {
4743               place++;
4744             }
4745             s_count++;
4746           }
4747           if (rem && (gap_ct == gap)) {
4748             if (place == last_place) {
4749               place = first_place;
4750             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4751               place = 0;
4752             } else {
4753               place++;
4754             }
4755             rem--;
4756             gap_ct = 0;
4757           }
4758           th->th.th_last_place = place;
4759           gap_ct++;
4760 
4761           if (place == last_place) {
4762             place = first_place;
4763           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4764             place = 0;
4765           } else {
4766             place++;
4767           }
4768 
4769           KA_TRACE(100,
4770                    ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4771                     "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4772                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4773                     f, th->th.th_new_place, th->th.th_first_place,
4774                     th->th.th_last_place, __kmp_affinity_num_masks));
4775         }
4776       } else {
        /* With a uniform space of available computation places we can create
           T partitions of roughly P/T places each and put threads into the
           first place of each partition. */
4780         double current = static_cast<double>(masters_place);
4781         double spacing =
4782             (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
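        // Worked example (values assumed for illustration): with the full mask
        // of n_places = 8 places, n_th = 3 and masters_place = 0, spacing is
        // 3.0, giving partitions [0,2], [3,5] and [6,7], each thread bound to
        // the first place of its partition.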
4783         int first, last;
4784         kmp_info_t *th;
4785 
4786         thidx = n_th + 1;
4787         if (update_master_only == 1)
4788           thidx = 1;
4789         for (f = 0; f < thidx; f++) {
4790           first = static_cast<int>(current);
4791           last = static_cast<int>(current + spacing) - 1;
4792           KMP_DEBUG_ASSERT(last >= first);
4793           if (first >= n_places) {
4794             if (masters_place) {
4795               first -= n_places;
4796               last -= n_places;
4797               if (first == (masters_place + 1)) {
4798                 KMP_DEBUG_ASSERT(f == n_th);
4799                 first--;
4800               }
4801               if (last == masters_place) {
4802                 KMP_DEBUG_ASSERT(f == (n_th - 1));
4803                 last--;
4804               }
4805             } else {
4806               KMP_DEBUG_ASSERT(f == n_th);
4807               first = 0;
4808               last = 0;
4809             }
4810           }
4811           if (last >= n_places) {
4812             last = (n_places - 1);
4813           }
4814           place = first;
4815           current += spacing;
4816           if (f < n_th) {
4817             KMP_DEBUG_ASSERT(0 <= first);
4818             KMP_DEBUG_ASSERT(n_places > first);
4819             KMP_DEBUG_ASSERT(0 <= last);
4820             KMP_DEBUG_ASSERT(n_places > last);
4821             KMP_DEBUG_ASSERT(last_place >= first_place);
4822             th = team->t.t_threads[f];
4823             KMP_DEBUG_ASSERT(th);
4824             th->th.th_first_place = first;
4825             th->th.th_new_place = place;
4826             th->th.th_last_place = last;
4827             if (__kmp_display_affinity && place != th->th.th_current_place &&
4828                 team->t.t_display_affinity != 1) {
4829               team->t.t_display_affinity = 1;
4830             }
4831             KA_TRACE(100,
4832                      ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4833                       "partition = [%d,%d], spacing = %.4f\n",
4834                       __kmp_gtid_from_thread(team->t.t_threads[f]),
4835                       team->t.t_id, f, th->th.th_new_place,
4836                       th->th.th_first_place, th->th.th_last_place, spacing));
4837           }
4838         }
4839       }
4840       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4841     } else {
4842       int S, rem, gap, s_count;
4843       S = n_th / n_places;
4844       s_count = 0;
4845       rem = n_th - (S * n_places);
4846       gap = rem > 0 ? n_places / rem : n_places;
4847       int place = masters_place;
4848       int gap_ct = gap;
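      // The loop below follows the same S/rem/gap distribution as the
      // proc_bind_close case above, except that each thread's partition is
      // narrowed to the single place it is assigned to.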
4849       thidx = n_th;
4850       if (update_master_only == 1)
4851         thidx = 1;
4852       for (f = 0; f < thidx; f++) {
4853         kmp_info_t *th = team->t.t_threads[f];
4854         KMP_DEBUG_ASSERT(th != NULL);
4855 
4856         th->th.th_first_place = place;
4857         th->th.th_last_place = place;
4858         th->th.th_new_place = place;
4859         if (__kmp_display_affinity && place != th->th.th_current_place &&
4860             team->t.t_display_affinity != 1) {
4861           team->t.t_display_affinity = 1;
4862         }
4863         s_count++;
4864 
4865         if ((s_count == S) && rem && (gap_ct == gap)) {
4866           // do nothing, add an extra thread to place on next iteration
4867         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4868           // we added an extra thread to this place; move on to next place
4869           if (place == last_place) {
4870             place = first_place;
4871           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4872             place = 0;
4873           } else {
4874             place++;
4875           }
4876           s_count = 0;
4877           gap_ct = 1;
4878           rem--;
4879         } else if (s_count == S) { // place is full; don't add extra thread
4880           if (place == last_place) {
4881             place = first_place;
4882           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4883             place = 0;
4884           } else {
4885             place++;
4886           }
4887           gap_ct++;
4888           s_count = 0;
4889         }
4890 
4891         KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4892                        "partition = [%d,%d]\n",
4893                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4894                        team->t.t_id, f, th->th.th_new_place,
4895                        th->th.th_first_place, th->th.th_last_place));
4896       }
4897       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4898     }
4899   } break;
4900 
4901   default:
4902     break;
4903   }
4904 
4905   KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4906 }
4907 
4908 #endif // KMP_AFFINITY_SUPPORTED
4909 
4910 /* allocate a new team data structure to use.  take one off of the free pool if
4911    available */
4912 kmp_team_t *
4913 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4914 #if OMPT_SUPPORT
4915                     ompt_data_t ompt_parallel_data,
4916 #endif
4917                     kmp_proc_bind_t new_proc_bind,
4918                     kmp_internal_control_t *new_icvs,
4919                     int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4920   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4921   int f;
4922   kmp_team_t *team;
4923   int use_hot_team = !root->r.r_active;
4924   int level = 0;
4925 
4926   KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4927   KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4928   KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4929   KMP_MB();
4930 
4931 #if KMP_NESTED_HOT_TEAMS
4932   kmp_hot_team_ptr_t *hot_teams;
4933   if (master) {
4934     team = master->th.th_team;
4935     level = team->t.t_active_level;
4936     if (master->th.th_teams_microtask) { // in teams construct?
4937       if (master->th.th_teams_size.nteams > 1 &&
4938           ( // #teams > 1
4939               team->t.t_pkfn ==
4940                   (microtask_t)__kmp_teams_master || // inner fork of the teams
4941               master->th.th_teams_level <
4942                   team->t.t_level)) { // or nested parallel inside the teams
        ++level; // don't increment if #teams==1 or for the outer fork of the
        // teams; increment otherwise
4945       }
4946     }
4947     hot_teams = master->th.th_hot_teams;
4948     if (level < __kmp_hot_teams_max_level && hot_teams &&
4949         hot_teams[level].hot_team) {
4950       // hot team has already been allocated for given level
4951       use_hot_team = 1;
4952     } else {
4953       use_hot_team = 0;
4954     }
4955   } else {
4956     // check we won't access uninitialized hot_teams, just in case
4957     KMP_DEBUG_ASSERT(new_nproc == 1);
4958   }
4959 #endif
4960   // Optimization to use a "hot" team
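  // A "hot" team is the persistent team kept attached to the root (or, with
  // KMP_NESTED_HOT_TEAMS, to a nesting level) so consecutive parallel regions
  // can reuse its threads instead of building a team from scratch each time.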
4961   if (use_hot_team && new_nproc > 1) {
4962     KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
4963 #if KMP_NESTED_HOT_TEAMS
4964     team = hot_teams[level].hot_team;
4965 #else
4966     team = root->r.r_hot_team;
4967 #endif
4968 #if KMP_DEBUG
4969     if (__kmp_tasking_mode != tskm_immediate_exec) {
4970       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
4971                     "task_team[1] = %p before reinit\n",
4972                     team->t.t_task_team[0], team->t.t_task_team[1]));
4973     }
4974 #endif
4975 
4976     // Has the number of threads changed?
4977     /* Let's assume the most common case is that the number of threads is
4978        unchanged, and put that case first. */
4979     if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4980       KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
4981       // This case can mean that omp_set_num_threads() was called and the hot
4982       // team size was already reduced, so we check the special flag
4983       if (team->t.t_size_changed == -1) {
4984         team->t.t_size_changed = 1;
4985       } else {
4986         KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
4987       }
4988 
4989       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4990       kmp_r_sched_t new_sched = new_icvs->sched;
4991       // set master's schedule as new run-time schedule
4992       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
4993 
4994       __kmp_reinitialize_team(team, new_icvs,
4995                               root->r.r_uber_thread->th.th_ident);
4996 
4997       KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
4998                     team->t.t_threads[0], team));
4999       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5000 
5001 #if KMP_AFFINITY_SUPPORTED
5002       if ((team->t.t_size_changed == 0) &&
5003           (team->t.t_proc_bind == new_proc_bind)) {
5004         if (new_proc_bind == proc_bind_spread) {
5005           __kmp_partition_places(
5006               team, 1); // add flag to update only master for spread
5007         }
5008         KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5009                        "proc_bind = %d, partition = [%d,%d]\n",
5010                        team->t.t_id, new_proc_bind, team->t.t_first_place,
5011                        team->t.t_last_place));
5012       } else {
5013         KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5014         __kmp_partition_places(team);
5015       }
5016 #else
5017       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5018 #endif /* KMP_AFFINITY_SUPPORTED */
5019     } else if (team->t.t_nproc > new_nproc) {
5020       KA_TRACE(20,
5021                ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5022                 new_nproc));
5023 
5024       team->t.t_size_changed = 1;
5025 #if KMP_NESTED_HOT_TEAMS
5026       if (__kmp_hot_teams_mode == 0) {
5027         // AC: saved number of threads should correspond to team's value in this
5028         // mode, can be bigger in mode 1, when hot team has threads in reserve
5029         KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5030         hot_teams[level].hot_team_nth = new_nproc;
5031 #endif // KMP_NESTED_HOT_TEAMS
5032         /* release the extra threads we don't need any more */
5033         for (f = new_nproc; f < team->t.t_nproc; f++) {
5034           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5035           if (__kmp_tasking_mode != tskm_immediate_exec) {
5036             // When decreasing team size, threads no longer in the team should
5037             // unref task team.
5038             team->t.t_threads[f]->th.th_task_team = NULL;
5039           }
5040           __kmp_free_thread(team->t.t_threads[f]);
5041           team->t.t_threads[f] = NULL;
5042         }
5043 #if KMP_NESTED_HOT_TEAMS
5044       } // (__kmp_hot_teams_mode == 0)
5045       else {
5046         // When keeping extra threads in team, switch threads to wait on own
5047         // b_go flag
5048         for (f = new_nproc; f < team->t.t_nproc; ++f) {
5049           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5050           kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5051           for (int b = 0; b < bs_last_barrier; ++b) {
5052             if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5053               balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5054             }
5055             KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5056           }
5057         }
5058       }
5059 #endif // KMP_NESTED_HOT_TEAMS
5060       team->t.t_nproc = new_nproc;
5061       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5062       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5063       __kmp_reinitialize_team(team, new_icvs,
5064                               root->r.r_uber_thread->th.th_ident);
5065 
5066       // Update remaining threads
5067       for (f = 0; f < new_nproc; ++f) {
5068         team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5069       }
5070 
5071       // restore the current task state of the master thread: should be the
5072       // implicit task
5073       KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5074                     team->t.t_threads[0], team));
5075 
5076       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5077 
5078 #ifdef KMP_DEBUG
5079       for (f = 0; f < team->t.t_nproc; f++) {
5080         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5081                          team->t.t_threads[f]->th.th_team_nproc ==
5082                              team->t.t_nproc);
5083       }
5084 #endif
5085 
5086       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5087 #if KMP_AFFINITY_SUPPORTED
5088       __kmp_partition_places(team);
5089 #endif
5090     } else { // team->t.t_nproc < new_nproc
5091 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5092       kmp_affin_mask_t *old_mask;
5093       if (KMP_AFFINITY_CAPABLE()) {
5094         KMP_CPU_ALLOC(old_mask);
5095       }
5096 #endif
5097 
5098       KA_TRACE(20,
5099                ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5100                 new_nproc));
5101 
5102       team->t.t_size_changed = 1;
5103 
5104 #if KMP_NESTED_HOT_TEAMS
5105       int avail_threads = hot_teams[level].hot_team_nth;
5106       if (new_nproc < avail_threads)
5107         avail_threads = new_nproc;
5108       kmp_info_t **other_threads = team->t.t_threads;
5109       for (f = team->t.t_nproc; f < avail_threads; ++f) {
5110         // Adjust barrier data of reserved threads (if any) of the team
5111         // Other data will be set in __kmp_initialize_info() below.
5112         int b;
5113         kmp_balign_t *balign = other_threads[f]->th.th_bar;
5114         for (b = 0; b < bs_last_barrier; ++b) {
5115           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5116           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5117 #if USE_DEBUGGER
5118           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5119 #endif
5120         }
5121       }
5122       if (hot_teams[level].hot_team_nth >= new_nproc) {
5123         // we have all needed threads in reserve, no need to allocate any
        // this is only possible in mode 1; mode 0 cannot have reserved threads
5125         KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5126         team->t.t_nproc = new_nproc; // just get reserved threads involved
5127       } else {
5128         // we may have some threads in reserve, but not enough
5129         team->t.t_nproc =
5130             hot_teams[level]
5131                 .hot_team_nth; // get reserved threads involved if any
5132         hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5133 #endif // KMP_NESTED_HOT_TEAMS
5134         if (team->t.t_max_nproc < new_nproc) {
5135           /* reallocate larger arrays */
5136           __kmp_reallocate_team_arrays(team, new_nproc);
5137           __kmp_reinitialize_team(team, new_icvs, NULL);
5138         }
5139 
5140 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5141         /* Temporarily set full mask for master thread before creation of
           workers. The reason is that workers inherit the affinity from the
           master, so if many workers are created quickly on a single core,
           they don't get a chance to set their own affinity for a long time. */
5145         __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5146 #endif
5147 
5148         /* allocate new threads for the hot team */
5149         for (f = team->t.t_nproc; f < new_nproc; f++) {
5150           kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5151           KMP_DEBUG_ASSERT(new_worker);
5152           team->t.t_threads[f] = new_worker;
5153 
          KA_TRACE(20,
                   ("__kmp_allocate_team: team %d init T#%d(%d:%d) arrived: "
5156                     "join=%llu, plain=%llu\n",
5157                     team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5158                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5159                     team->t.t_bar[bs_plain_barrier].b_arrived));
5160 
5161           { // Initialize barrier data for new threads.
5162             int b;
5163             kmp_balign_t *balign = new_worker->th.th_bar;
5164             for (b = 0; b < bs_last_barrier; ++b) {
5165               balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5166               KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5167                                KMP_BARRIER_PARENT_FLAG);
5168 #if USE_DEBUGGER
5169               balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5170 #endif
5171             }
5172           }
5173         }
5174 
5175 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5176         if (KMP_AFFINITY_CAPABLE()) {
5177           /* Restore initial master thread's affinity mask */
5178           __kmp_set_system_affinity(old_mask, TRUE);
5179           KMP_CPU_FREE(old_mask);
5180         }
5181 #endif
5182 #if KMP_NESTED_HOT_TEAMS
5183       } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5184 #endif // KMP_NESTED_HOT_TEAMS
      /* make sure everyone is synchronized */
5186       int old_nproc = team->t.t_nproc; // save old value and use to update only
5187       // new threads below
5188       __kmp_initialize_team(team, new_nproc, new_icvs,
5189                             root->r.r_uber_thread->th.th_ident);
5190 
5191       /* reinitialize the threads */
5192       KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5193       for (f = 0; f < team->t.t_nproc; ++f)
5194         __kmp_initialize_info(team->t.t_threads[f], team, f,
5195                               __kmp_gtid_from_tid(f, team));
5196 
5197       if (level) { // set th_task_state for new threads in nested hot team
5198         // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5199         // only need to set the th_task_state for the new threads. th_task_state
5200         // for master thread will not be accurate until after this in
5201         // __kmp_fork_call(), so we look to the master's memo_stack to get the
5202         // correct value.
5203         for (f = old_nproc; f < team->t.t_nproc; ++f)
5204           team->t.t_threads[f]->th.th_task_state =
5205               team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5206       } else { // set th_task_state for new threads in non-nested hot team
5207         kmp_uint8 old_state =
5208             team->t.t_threads[0]->th.th_task_state; // copy master's state
5209         for (f = old_nproc; f < team->t.t_nproc; ++f)
5210           team->t.t_threads[f]->th.th_task_state = old_state;
5211       }
5212 
5213 #ifdef KMP_DEBUG
5214       for (f = 0; f < team->t.t_nproc; ++f) {
5215         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5216                          team->t.t_threads[f]->th.th_team_nproc ==
5217                              team->t.t_nproc);
5218       }
5219 #endif
5220 
5221       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5222 #if KMP_AFFINITY_SUPPORTED
5223       __kmp_partition_places(team);
5224 #endif
5225     } // Check changes in number of threads
5226 
5227     kmp_info_t *master = team->t.t_threads[0];
5228     if (master->th.th_teams_microtask) {
5229       for (f = 1; f < new_nproc; ++f) {
5230         // propagate teams construct specific info to workers
5231         kmp_info_t *thr = team->t.t_threads[f];
5232         thr->th.th_teams_microtask = master->th.th_teams_microtask;
5233         thr->th.th_teams_level = master->th.th_teams_level;
5234         thr->th.th_teams_size = master->th.th_teams_size;
5235       }
5236     }
5237 #if KMP_NESTED_HOT_TEAMS
5238     if (level) {
5239       // Sync barrier state for nested hot teams, not needed for outermost hot
5240       // team.
5241       for (f = 1; f < new_nproc; ++f) {
5242         kmp_info_t *thr = team->t.t_threads[f];
5243         int b;
5244         kmp_balign_t *balign = thr->th.th_bar;
5245         for (b = 0; b < bs_last_barrier; ++b) {
5246           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5247           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5248 #if USE_DEBUGGER
5249           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5250 #endif
5251         }
5252       }
5253     }
5254 #endif // KMP_NESTED_HOT_TEAMS
5255 
5256     /* reallocate space for arguments if necessary */
5257     __kmp_alloc_argv_entries(argc, team, TRUE);
5258     KMP_CHECK_UPDATE(team->t.t_argc, argc);
5259     // The hot team re-uses the previous task team,
5260     // if untouched during the previous release->gather phase.
5261 
5262     KF_TRACE(10, (" hot_team = %p\n", team));
5263 
5264 #if KMP_DEBUG
5265     if (__kmp_tasking_mode != tskm_immediate_exec) {
5266       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5267                     "task_team[1] = %p after reinit\n",
5268                     team->t.t_task_team[0], team->t.t_task_team[1]));
5269     }
5270 #endif
5271 
5272 #if OMPT_SUPPORT
5273     __ompt_team_assign_id(team, ompt_parallel_data);
5274 #endif
5275 
5276     KMP_MB();
5277 
5278     return team;
5279   }
5280 
5281   /* next, let's try to take one from the team pool */
5282   KMP_MB();
5283   for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5284     /* TODO: consider resizing undersized teams instead of reaping them, now
5285        that we have a resizing mechanism */
5286     if (team->t.t_max_nproc >= max_nproc) {
5287       /* take this team from the team pool */
5288       __kmp_team_pool = team->t.t_next_pool;
5289 
5290       /* setup the team for fresh use */
5291       __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5292 
5293       KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5294                     "task_team[1] %p to NULL\n",
5295                     &team->t.t_task_team[0], &team->t.t_task_team[1]));
5296       team->t.t_task_team[0] = NULL;
5297       team->t.t_task_team[1] = NULL;
5298 
5299       /* reallocate space for arguments if necessary */
5300       __kmp_alloc_argv_entries(argc, team, TRUE);
5301       KMP_CHECK_UPDATE(team->t.t_argc, argc);
5302 
5303       KA_TRACE(
5304           20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5305                team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5306       { // Initialize barrier data.
5307         int b;
5308         for (b = 0; b < bs_last_barrier; ++b) {
5309           team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5310 #if USE_DEBUGGER
5311           team->t.t_bar[b].b_master_arrived = 0;
5312           team->t.t_bar[b].b_team_arrived = 0;
5313 #endif
5314         }
5315       }
5316 
5317       team->t.t_proc_bind = new_proc_bind;
5318 
5319       KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5320                     team->t.t_id));
5321 
5322 #if OMPT_SUPPORT
5323       __ompt_team_assign_id(team, ompt_parallel_data);
5324 #endif
5325 
5326       KMP_MB();
5327 
5328       return team;
5329     }
5330 
5331     /* reap team if it is too small, then loop back and check the next one */
    // Not sure if this is wise, but it will be redone during the hot-teams
    // rewrite.
5334     /* TODO: Use technique to find the right size hot-team, don't reap them */
5335     team = __kmp_reap_team(team);
5336     __kmp_team_pool = team;
5337   }
5338 
5339   /* nothing available in the pool, no matter, make a new team! */
5340   KMP_MB();
5341   team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5342 
5343   /* and set it up */
5344   team->t.t_max_nproc = max_nproc;
  /* NOTE: for some reason, allocating one big buffer and dividing it up
     seems to hurt performance significantly on the P4, so let's not do that */
5347   __kmp_allocate_team_arrays(team, max_nproc);
5348 
5349   KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5350   __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5351 
5352   KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5353                 "%p to NULL\n",
5354                 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5355   team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5356   // memory, no need to duplicate
5357   team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5358   // memory, no need to duplicate
5359 
5360   if (__kmp_storage_map) {
5361     __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5362   }
5363 
5364   /* allocate space for arguments */
5365   __kmp_alloc_argv_entries(argc, team, FALSE);
5366   team->t.t_argc = argc;
5367 
5368   KA_TRACE(20,
5369            ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5370             team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5371   { // Initialize barrier data.
5372     int b;
5373     for (b = 0; b < bs_last_barrier; ++b) {
5374       team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5375 #if USE_DEBUGGER
5376       team->t.t_bar[b].b_master_arrived = 0;
5377       team->t.t_bar[b].b_team_arrived = 0;
5378 #endif
5379     }
5380   }
5381 
5382   team->t.t_proc_bind = new_proc_bind;
5383 
5384 #if OMPT_SUPPORT
5385   __ompt_team_assign_id(team, ompt_parallel_data);
5386   team->t.ompt_serialized_team_info = NULL;
5387 #endif
5388 
5389   KMP_MB();
5390 
5391   KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5392                 team->t.t_id));
5393 
5394   return team;
5395 }
5396 
5397 /* TODO implement hot-teams at all levels */
5398 /* TODO implement lazy thread release on demand (disband request) */
5399 
5400 /* free the team.  return it to the team pool.  release all the threads
5401  * associated with it */
5402 void __kmp_free_team(kmp_root_t *root,
5403                      kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5404   int f;
5405   KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5406                 team->t.t_id));
5407 
5408   /* verify state */
5409   KMP_DEBUG_ASSERT(root);
5410   KMP_DEBUG_ASSERT(team);
5411   KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5412   KMP_DEBUG_ASSERT(team->t.t_threads);
5413 
5414   int use_hot_team = team == root->r.r_hot_team;
5415 #if KMP_NESTED_HOT_TEAMS
5416   int level;
5417   kmp_hot_team_ptr_t *hot_teams;
5418   if (master) {
5419     level = team->t.t_active_level - 1;
5420     if (master->th.th_teams_microtask) { // in teams construct?
5421       if (master->th.th_teams_size.nteams > 1) {
5422         ++level; // level was not increased in teams construct for
5423         // team_of_masters
5424       }
5425       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5426           master->th.th_teams_level == team->t.t_level) {
5427         ++level; // level was not increased in teams construct for
5428         // team_of_workers before the parallel
5429       } // team->t.t_level will be increased inside parallel
5430     }
5431     hot_teams = master->th.th_hot_teams;
5432     if (level < __kmp_hot_teams_max_level) {
5433       KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5434       use_hot_team = 1;
5435     }
5436   }
5437 #endif // KMP_NESTED_HOT_TEAMS
5438 
5439   /* team is done working */
5440   TCW_SYNC_PTR(team->t.t_pkfn,
5441                NULL); // Important for Debugging Support Library.
5442 #if KMP_OS_WINDOWS
5443   team->t.t_copyin_counter = 0; // init counter for possible reuse
5444 #endif
5445   // Do not reset pointer to parent team to NULL for hot teams.
5446 
  /* if this is not a hot team, release its threads */
5448   if (!use_hot_team) {
5449     if (__kmp_tasking_mode != tskm_immediate_exec) {
5450       // Wait for threads to reach reapable state
5451       for (f = 1; f < team->t.t_nproc; ++f) {
5452         KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5453         kmp_info_t *th = team->t.t_threads[f];
5454         volatile kmp_uint32 *state = &th->th.th_reap_state;
5455         while (*state != KMP_SAFE_TO_REAP) {
5456 #if KMP_OS_WINDOWS
5457           // On Windows a thread can be killed at any time, check this
5458           DWORD ecode;
5459           if (!__kmp_is_thread_alive(th, &ecode)) {
5460             *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5461             break;
5462           }
5463 #endif
5464           // first check if thread is sleeping
5465           kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5466           if (fl.is_sleeping())
5467             fl.resume(__kmp_gtid_from_thread(th));
5468           KMP_CPU_PAUSE();
5469         }
5470       }
5471 
5472       // Delete task teams
5473       int tt_idx;
5474       for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5475         kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5476         if (task_team != NULL) {
5477           for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5478             KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5479             team->t.t_threads[f]->th.th_task_team = NULL;
5480           }
5481           KA_TRACE(
5482               20,
5483               ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5484                __kmp_get_gtid(), task_team, team->t.t_id));
5485 #if KMP_NESTED_HOT_TEAMS
5486           __kmp_free_task_team(master, task_team);
5487 #endif
5488           team->t.t_task_team[tt_idx] = NULL;
5489         }
5490       }
5491     }
5492 
5493     // Reset pointer to parent team only for non-hot teams.
5494     team->t.t_parent = NULL;
5495     team->t.t_level = 0;
5496     team->t.t_active_level = 0;
5497 
5498     /* free the worker threads */
5499     for (f = 1; f < team->t.t_nproc; ++f) {
5500       KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5501       __kmp_free_thread(team->t.t_threads[f]);
5502       team->t.t_threads[f] = NULL;
5503     }
5504 
5505     /* put the team back in the team pool */
5506     /* TODO limit size of team pool, call reap_team if pool too large */
5507     team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5508     __kmp_team_pool = (volatile kmp_team_t *)team;
5509   } else { // Check if team was created for the masters in a teams construct
5510     // See if first worker is a CG root
5511     KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5512                      team->t.t_threads[1]->th.th_cg_roots);
5513     if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5514       // Clean up the CG root nodes on workers so that this team can be re-used
5515       for (f = 1; f < team->t.t_nproc; ++f) {
5516         kmp_info_t *thr = team->t.t_threads[f];
5517         KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5518                          thr->th.th_cg_roots->cg_root == thr);
5519         // Pop current CG root off list
5520         kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5521         thr->th.th_cg_roots = tmp->up;
5522         KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5523                        " up to node %p. cg_nthreads was %d\n",
5524                        thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5525         int i = tmp->cg_nthreads--;
5526         if (i == 1) {
5527           __kmp_free(tmp); // free CG if we are the last thread in it
5528         }
5529         // Restore current task's thread_limit from CG root
5530         if (thr->th.th_cg_roots)
5531           thr->th.th_current_task->td_icvs.thread_limit =
5532               thr->th.th_cg_roots->cg_thread_limit;
5533       }
5534     }
5535   }
5536 
5537   KMP_MB();
5538 }
5539 
5540 /* reap the team.  destroy it, reclaim all its resources and free its memory */
5541 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5542   kmp_team_t *next_pool = team->t.t_next_pool;
5543 
5544   KMP_DEBUG_ASSERT(team);
5545   KMP_DEBUG_ASSERT(team->t.t_dispatch);
5546   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5547   KMP_DEBUG_ASSERT(team->t.t_threads);
5548   KMP_DEBUG_ASSERT(team->t.t_argv);
5549 
5550   /* TODO clean the threads that are a part of this? */
5551 
5552   /* free stuff */
5553   __kmp_free_team_arrays(team);
5554   if (team->t.t_argv != &team->t.t_inline_argv[0])
5555     __kmp_free((void *)team->t.t_argv);
5556   __kmp_free(team);
5557 
5558   KMP_MB();
5559   return next_pool;
5560 }
5561 
5562 // Free the thread.  Don't reap it, just place it on the pool of available
5563 // threads.
5564 //
5565 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5566 // binding for the affinity mechanism to be useful.
5567 //
5568 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5569 // However, we want to avoid a potential performance problem by always
5570 // scanning through the list to find the correct point at which to insert
5571 // the thread (potential N**2 behavior).  To do this we keep track of the
5572 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5573 // With single-level parallelism, threads will always be added to the tail
5574 // of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
5575 // parallelism, all bets are off and we may need to scan through the entire
5576 // free list.
5577 //
// This change also has a potentially large performance benefit for some
// applications.  Previously, as threads were freed from the hot team, they
// would be placed back on the free list in inverse order.  If the hot team
// grew back to its original size, then the freed threads would be placed
// back on the hot team in reverse order.  This could cause bad cache
// locality problems on programs where the size of the hot team regularly
// grew and shrank.
5585 //
5586 // Now, for single-level parallelism, the OMP tid is always == gtid.
5587 void __kmp_free_thread(kmp_info_t *this_th) {
5588   int gtid;
5589   kmp_info_t **scan;
5590 
5591   KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5592                 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5593 
5594   KMP_DEBUG_ASSERT(this_th);
5595 
  // When moving a thread to the pool, switch it to wait on its own b_go flag
  // and clear its team pointer (NULL team).
5598   int b;
5599   kmp_balign_t *balign = this_th->th.th_bar;
5600   for (b = 0; b < bs_last_barrier; ++b) {
5601     if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5602       balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5603     balign[b].bb.team = NULL;
5604     balign[b].bb.leaf_kids = 0;
5605   }
5606   this_th->th.th_task_state = 0;
5607   this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5608 
5609   /* put thread back on the free pool */
5610   TCW_PTR(this_th->th.th_team, NULL);
5611   TCW_PTR(this_th->th.th_root, NULL);
5612   TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5613 
5614   while (this_th->th.th_cg_roots) {
5615     this_th->th.th_cg_roots->cg_nthreads--;
5616     KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5617                    " %p of thread  %p to %d\n",
5618                    this_th, this_th->th.th_cg_roots,
5619                    this_th->th.th_cg_roots->cg_root,
5620                    this_th->th.th_cg_roots->cg_nthreads));
5621     kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5622     if (tmp->cg_root == this_th) { // Thread is a cg_root
5623       KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5624       KA_TRACE(
5625           5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5626       this_th->th.th_cg_roots = tmp->up;
5627       __kmp_free(tmp);
5628     } else { // Worker thread
5629       if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5630         __kmp_free(tmp);
5631       }
5632       this_th->th.th_cg_roots = NULL;
5633       break;
5634     }
5635   }
5636 
  /* The implicit task assigned to this thread may be used by other threads;
   * in that case multiple threads can share the task data and try to free it
   * in __kmp_reap_thread at exit. This duplicate use of the task data is more
   * likely when the hot team is disabled, but it can occur even when the hot
   * team is enabled. */
5642   __kmp_free_implicit_task(this_th);
5643   this_th->th.th_current_task = NULL;
5644 
5645   // If the __kmp_thread_pool_insert_pt is already past the new insert
5646   // point, then we need to re-scan the entire list.
5647   gtid = this_th->th.th_info.ds.ds_gtid;
5648   if (__kmp_thread_pool_insert_pt != NULL) {
5649     KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5650     if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5651       __kmp_thread_pool_insert_pt = NULL;
5652     }
5653   }
5654 
5655   // Scan down the list to find the place to insert the thread.
5656   // scan is the address of a link in the list, possibly the address of
5657   // __kmp_thread_pool itself.
5658   //
5659   // In the absence of nested parallelism, the for loop will have 0 iterations.
5660   if (__kmp_thread_pool_insert_pt != NULL) {
5661     scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5662   } else {
5663     scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5664   }
5665   for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5666        scan = &((*scan)->th.th_next_pool))
5667     ;
5668 
5669   // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5670   // to its address.
5671   TCW_PTR(this_th->th.th_next_pool, *scan);
5672   __kmp_thread_pool_insert_pt = *scan = this_th;
5673   KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5674                    (this_th->th.th_info.ds.ds_gtid <
5675                     this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5676   TCW_4(this_th->th.th_in_pool, TRUE);
5677   __kmp_suspend_initialize_thread(this_th);
5678   __kmp_lock_suspend_mx(this_th);
5679   if (this_th->th.th_active == TRUE) {
5680     KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5681     this_th->th.th_active_in_pool = TRUE;
5682   }
5683 #if KMP_DEBUG
5684   else {
5685     KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5686   }
5687 #endif
5688   __kmp_unlock_suspend_mx(this_th);
5689 
5690   TCW_4(__kmp_nth, __kmp_nth - 1);
5691 
5692 #ifdef KMP_ADJUST_BLOCKTIME
5693   /* Adjust blocktime back to user setting or default if necessary */
5694   /* Middle initialization might never have occurred                */
5695   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5696     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5697     if (__kmp_nth <= __kmp_avail_proc) {
5698       __kmp_zero_bt = FALSE;
5699     }
5700   }
5701 #endif /* KMP_ADJUST_BLOCKTIME */
5702 
5703   KMP_MB();
5704 }
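
// A minimal, self-contained sketch (illustrative only, not part of the
// runtime) of the sorted-insert strategy described in the comment above
// __kmp_free_thread: keep a singly linked free list ordered by key (the gtid
// here) and cache the last insertion point so that the common single-level
// case appends in O(1) instead of rescanning the whole list. The names
// node_t, pool, insert_pt and insert_sorted below are hypothetical.
//
//   struct node_t { int key; node_t *next; };
//
//   static node_t *pool = nullptr;      // head of the sorted free list
//   static node_t *insert_pt = nullptr; // cached last insertion point
//
//   static void insert_sorted(node_t *n) {
//     // If the cached point is already past the new key, rescan from head.
//     if (insert_pt != nullptr && insert_pt->key > n->key)
//       insert_pt = nullptr;
//     node_t **scan = insert_pt ? &insert_pt->next : &pool;
//     while (*scan != nullptr && (*scan)->key < n->key)
//       scan = &(*scan)->next;
//     n->next = *scan; // splice in, preserving ascending key order
//     *scan = n;
//     insert_pt = n;   // remember where we inserted for next time
//   }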
5705 
5706 /* ------------------------------------------------------------------------ */
5707 
5708 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5709 #if OMPTARGET_PROFILING_SUPPORT
5710   ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
5711   // TODO: add a configuration option for time granularity
5712   if (ProfileTraceFile)
5713     llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
5714 #endif
5715 
5716   int gtid = this_thr->th.th_info.ds.ds_gtid;
5717   /*    void                 *stack_data;*/
5718   kmp_team_t **volatile pteam;
5719 
5720   KMP_MB();
5721   KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5722 
5723   if (__kmp_env_consistency_check) {
5724     this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5725   }
5726 
5727 #if OMPT_SUPPORT
5728   ompt_data_t *thread_data;
5729   if (ompt_enabled.enabled) {
5730     thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5731     *thread_data = ompt_data_none;
5732 
5733     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5734     this_thr->th.ompt_thread_info.wait_id = 0;
5735     this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5736     this_thr->th.ompt_thread_info.parallel_flags = 0;
5737     if (ompt_enabled.ompt_callback_thread_begin) {
5738       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5739           ompt_thread_worker, thread_data);
5740     }
5741     this_thr->th.ompt_thread_info.state = ompt_state_idle;
5742   }
5743 #endif
5744 
5745   /* This is the place where threads wait for work */
5746   while (!TCR_4(__kmp_global.g.g_done)) {
5747     KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5748     KMP_MB();
5749 
5750     /* wait for work to do */
5751     KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5752 
5753     /* No tid yet since not part of a team */
5754     __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5755 
5756 #if OMPT_SUPPORT
5757     if (ompt_enabled.enabled) {
5758       this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5759     }
5760 #endif
5761 
5762     pteam = &this_thr->th.th_team;
5763 
5764     /* have we been allocated? */
5765     if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5766       /* we were just woken up, so run our new task */
5767       if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5768         int rc;
5769         KA_TRACE(20,
5770                  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5771                   gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5772                   (*pteam)->t.t_pkfn));
5773 
5774         updateHWFPControl(*pteam);
5775 
5776 #if OMPT_SUPPORT
5777         if (ompt_enabled.enabled) {
5778           this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5779         }
5780 #endif
5781 
5782         rc = (*pteam)->t.t_invoke(gtid);
5783         KMP_ASSERT(rc);
5784 
5785         KMP_MB();
5786         KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5787                       gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5788                       (*pteam)->t.t_pkfn));
5789       }
5790 #if OMPT_SUPPORT
5791       if (ompt_enabled.enabled) {
5792         /* no frame set while outside task */
5793         __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
5794 
5795         this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5796       }
5797 #endif
5798       /* join barrier after parallel region */
5799       __kmp_join_barrier(gtid);
5800     }
5801   }
5802   TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5803 
5804 #if OMPT_SUPPORT
5805   if (ompt_enabled.ompt_callback_thread_end) {
5806     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5807   }
5808 #endif
5809 
5810   this_thr->th.th_task_team = NULL;
5811   /* run the destructors for the threadprivate data for this thread */
5812   __kmp_common_destroy_gtid(gtid);
5813 
5814   KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5815   KMP_MB();
5816 
5817 #if OMPTARGET_PROFILING_SUPPORT
5818   llvm::timeTraceProfilerFinishThread();
5819 #endif
5820   return this_thr;
5821 }
5822 
5823 /* ------------------------------------------------------------------------ */
5824 
5825 void __kmp_internal_end_dest(void *specific_gtid) {
5826   // Make sure no significant bits are lost
5827   int gtid;
5828   __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
5829 
5830   KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
  /* NOTE: the gtid is stored as gtid+1 in thread-local storage because 0 is
   * reserved for the nothing-stored case */
5833 
5834   __kmp_internal_end_thread(gtid);
5835 }
5836 
5837 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5838 
5839 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5840   __kmp_internal_end_atexit();
5841 }
5842 
5843 #endif
5844 
5845 /* [Windows] josh: when the atexit handler is called, there may still be more
5846    than one thread alive */
5847 void __kmp_internal_end_atexit(void) {
5848   KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5849   /* [Windows]
5850      josh: ideally, we want to completely shutdown the library in this atexit
5851      handler, but stat code that depends on thread specific data for gtid fails
5852      because that data becomes unavailable at some point during the shutdown, so
5853      we call __kmp_internal_end_thread instead. We should eventually remove the
5854      dependency on __kmp_get_specific_gtid in the stat code and use
5855      __kmp_internal_end_library to cleanly shutdown the library.
5856 
5857      // TODO: Can some of this comment about GVS be removed?
5858      I suspect that the offending stat code is executed when the calling thread
5859      tries to clean up a dead root thread's data structures, resulting in GVS
5860      code trying to close the GVS structures for that thread, but since the stat
5861      code uses __kmp_get_specific_gtid to get the gtid with the assumption that
     the calling thread is cleaning up itself instead of another thread, it
     gets confused. This happens because allowing a thread to unregister and
     clean up another thread is a recent modification for addressing an issue.
5865      Based on the current design (20050722), a thread may end up
5866      trying to unregister another thread only if thread death does not trigger
5867      the calling of __kmp_internal_end_thread.  For Linux* OS, there is the
5868      thread specific data destructor function to detect thread death. For
5869      Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5870      is nothing.  Thus, the workaround is applicable only for Windows static
5871      stat library. */
5872   __kmp_internal_end_library(-1);
5873 #if KMP_OS_WINDOWS
5874   __kmp_close_console();
5875 #endif
5876 }
5877 
5878 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5879   // It is assumed __kmp_forkjoin_lock is acquired.
5880 
5881   int gtid;
5882 
5883   KMP_DEBUG_ASSERT(thread != NULL);
5884 
5885   gtid = thread->th.th_info.ds.ds_gtid;
5886 
5887   if (!is_root) {
5888     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5889       /* Assume the threads are at the fork barrier here */
5890       KA_TRACE(
5891           20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5892                gtid));
5893       /* Need release fence here to prevent seg faults for tree forkjoin barrier
5894        * (GEH) */
5895       ANNOTATE_HAPPENS_BEFORE(thread);
5896       kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
5897                          thread);
5898       __kmp_release_64(&flag);
5899     }
5900 
5901     // Terminate OS thread.
5902     __kmp_reap_worker(thread);
5903 
5904     // The thread was killed asynchronously.  If it was actively
5905     // spinning in the thread pool, decrement the global count.
5906     //
5907     // There is a small timing hole here - if the worker thread was just waking
    // up after sleeping in the pool, had reset its th_active_in_pool flag but
5909     // not decremented the global counter __kmp_thread_pool_active_nth yet, then
5910     // the global counter might not get updated.
5911     //
5912     // Currently, this can only happen as the library is unloaded,
5913     // so there are no harmful side effects.
5914     if (thread->th.th_active_in_pool) {
5915       thread->th.th_active_in_pool = FALSE;
5916       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
5917       KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
5918     }
5919   }
5920 
5921   __kmp_free_implicit_task(thread);
5922 
5923 // Free the fast memory for tasking
5924 #if USE_FAST_MEMORY
5925   __kmp_free_fast_memory(thread);
5926 #endif /* USE_FAST_MEMORY */
5927 
5928   __kmp_suspend_uninitialize_thread(thread);
5929 
5930   KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5931   TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5932 
5933   --__kmp_all_nth;
// __kmp_nth was decremented when the thread was added to the pool.
5935 
5936 #ifdef KMP_ADJUST_BLOCKTIME
5937   /* Adjust blocktime back to user setting or default if necessary */
5938   /* Middle initialization might never have occurred                */
5939   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5940     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5941     if (__kmp_nth <= __kmp_avail_proc) {
5942       __kmp_zero_bt = FALSE;
5943     }
5944   }
5945 #endif /* KMP_ADJUST_BLOCKTIME */
5946 
5947   /* free the memory being used */
5948   if (__kmp_env_consistency_check) {
5949     if (thread->th.th_cons) {
5950       __kmp_free_cons_stack(thread->th.th_cons);
5951       thread->th.th_cons = NULL;
5952     }
5953   }
5954 
5955   if (thread->th.th_pri_common != NULL) {
5956     __kmp_free(thread->th.th_pri_common);
5957     thread->th.th_pri_common = NULL;
5958   }
5959 
5960   if (thread->th.th_task_state_memo_stack != NULL) {
5961     __kmp_free(thread->th.th_task_state_memo_stack);
5962     thread->th.th_task_state_memo_stack = NULL;
5963   }
5964 
5965 #if KMP_USE_BGET
5966   if (thread->th.th_local.bget_data != NULL) {
5967     __kmp_finalize_bget(thread);
5968   }
5969 #endif
5970 
5971 #if KMP_AFFINITY_SUPPORTED
5972   if (thread->th.th_affin_mask != NULL) {
5973     KMP_CPU_FREE(thread->th.th_affin_mask);
5974     thread->th.th_affin_mask = NULL;
5975   }
5976 #endif /* KMP_AFFINITY_SUPPORTED */
5977 
5978 #if KMP_USE_HIER_SCHED
5979   if (thread->th.th_hier_bar_data != NULL) {
5980     __kmp_free(thread->th.th_hier_bar_data);
5981     thread->th.th_hier_bar_data = NULL;
5982   }
5983 #endif
5984 
5985   __kmp_reap_team(thread->th.th_serial_team);
5986   thread->th.th_serial_team = NULL;
5987   __kmp_free(thread);
5988 
5989   KMP_MB();
5990 
5991 } // __kmp_reap_thread
5992 
5993 static void __kmp_internal_end(void) {
5994   int i;
5995 
5996   /* First, unregister the library */
5997   __kmp_unregister_library();
5998 
5999 #if KMP_OS_WINDOWS
6000   /* In Win static library, we can't tell when a root actually dies, so we
6001      reclaim the data structures for any root threads that have died but not
6002      unregistered themselves, in order to shut down cleanly.
6003      In Win dynamic library we also can't tell when a thread dies.  */
6004   __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6005 // dead roots
6006 #endif
6007 
6008   for (i = 0; i < __kmp_threads_capacity; i++)
6009     if (__kmp_root[i])
6010       if (__kmp_root[i]->r.r_active)
6011         break;
6012   KMP_MB(); /* Flush all pending memory write invalidates.  */
6013   TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6014 
6015   if (i < __kmp_threads_capacity) {
6016 #if KMP_USE_MONITOR
6017     // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6018     KMP_MB(); /* Flush all pending memory write invalidates.  */
6019 
6020     // Need to check that monitor was initialized before reaping it. If we are
    // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6022     // __kmp_monitor will appear to contain valid data, but it is only valid in
6023     // the parent process, not the child.
6024     // New behavior (201008): instead of keying off of the flag
6025     // __kmp_init_parallel, the monitor thread creation is keyed off
6026     // of the new flag __kmp_init_monitor.
6027     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6028     if (TCR_4(__kmp_init_monitor)) {
6029       __kmp_reap_monitor(&__kmp_monitor);
6030       TCW_4(__kmp_init_monitor, 0);
6031     }
6032     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6033     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6034 #endif // KMP_USE_MONITOR
6035   } else {
6036 /* TODO move this to cleanup code */
6037 #ifdef KMP_DEBUG
6038     /* make sure that everything has properly ended */
6039     for (i = 0; i < __kmp_threads_capacity; i++) {
6040       if (__kmp_root[i]) {
        // KMP_ASSERT(!KMP_UBER_GTID(i)); // AC: there can be uber threads
        // alive here
6043         KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6044       }
6045     }
6046 #endif
6047 
6048     KMP_MB();
6049 
6050     // Reap the worker threads.
6051     // This is valid for now, but be careful if threads are reaped sooner.
    while (__kmp_thread_pool != NULL) { // Loop thru all the threads in the pool.
6053       // Get the next thread from the pool.
6054       kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6055       __kmp_thread_pool = thread->th.th_next_pool;
6056       // Reap it.
6057       KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6058       thread->th.th_next_pool = NULL;
6059       thread->th.th_in_pool = FALSE;
6060       __kmp_reap_thread(thread, 0);
6061     }
6062     __kmp_thread_pool_insert_pt = NULL;
6063 
6064     // Reap teams.
6065     while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
6066       // Get the next team from the pool.
6067       kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6068       __kmp_team_pool = team->t.t_next_pool;
6069       // Reap it.
6070       team->t.t_next_pool = NULL;
6071       __kmp_reap_team(team);
6072     }
6073 
6074     __kmp_reap_task_teams();
6075 
6076 #if KMP_OS_UNIX
6077     // Threads that are not reaped should not access any resources since they
6078     // are going to be deallocated soon, so the shutdown sequence should wait
6079     // until all threads either exit the final spin-waiting loop or begin
6080     // sleeping after the given blocktime.
6081     for (i = 0; i < __kmp_threads_capacity; i++) {
6082       kmp_info_t *thr = __kmp_threads[i];
6083       while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6084         KMP_CPU_PAUSE();
6085     }
6086 #endif
6087 
6088     for (i = 0; i < __kmp_threads_capacity; ++i) {
6089       // TBD: Add some checking...
6090       // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6091     }
6092 
6093     /* Make sure all threadprivate destructors get run by joining with all
6094        worker threads before resetting this flag */
6095     TCW_SYNC_4(__kmp_init_common, FALSE);
6096 
6097     KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6098     KMP_MB();
6099 
6100 #if KMP_USE_MONITOR
6101     // See note above: One of the possible fixes for CQ138434 / CQ140126
6102     //
6103     // FIXME: push both code fragments down and CSE them?
6104     // push them into __kmp_cleanup() ?
6105     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6106     if (TCR_4(__kmp_init_monitor)) {
6107       __kmp_reap_monitor(&__kmp_monitor);
6108       TCW_4(__kmp_init_monitor, 0);
6109     }
6110     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6111     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6112 #endif
6113   } /* else !__kmp_global.t_active */
6114   TCW_4(__kmp_init_gtid, FALSE);
6115   KMP_MB(); /* Flush all pending memory write invalidates.  */
6116 
6117   __kmp_cleanup();
6118 #if OMPT_SUPPORT
6119   ompt_fini();
6120 #endif
6121 }
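
// Rough summary of the shutdown sequence implemented above (for orientation
// only; the code remains authoritative): (1) unregister the library record
// (environment variable or shared memory); (2) set __kmp_global.g.g_done so
// spinning workers stop; (3) if no root is still active, reap the worker
// thread pool, the team pool and the task teams, then clear __kmp_init_common;
// (4) finally run __kmp_cleanup() and, when OMPT is enabled, ompt_fini().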
6122 
6123 void __kmp_internal_end_library(int gtid_req) {
6124   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6125   /* this shouldn't be a race condition because __kmp_internal_end() is the
6126      only place to clear __kmp_serial_init */
6127   /* we'll check this later too, after we get the lock */
6128   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6129   // redundant, because the next check will work in any case.
6130   if (__kmp_global.g.g_abort) {
6131     KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6132     /* TODO abort? */
6133     return;
6134   }
6135   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6136     KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6137     return;
6138   }
6139 
6140   KMP_MB(); /* Flush all pending memory write invalidates.  */
6141   /* find out who we are and what we should do */
6142   {
6143     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6144     KA_TRACE(
6145         10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req));
6146     if (gtid == KMP_GTID_SHUTDOWN) {
6147       KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6148                     "already shutdown\n"));
6149       return;
6150     } else if (gtid == KMP_GTID_MONITOR) {
6151       KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6152                     "registered, or system shutdown\n"));
6153       return;
6154     } else if (gtid == KMP_GTID_DNE) {
6155       KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6156                     "shutdown\n"));
6157       /* we don't know who we are, but we may still shutdown the library */
6158     } else if (KMP_UBER_GTID(gtid)) {
6159       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6160       if (__kmp_root[gtid]->r.r_active) {
6161         __kmp_global.g.g_abort = -1;
6162         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6163         __kmp_unregister_library();
6164         KA_TRACE(10,
6165                  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6166                   gtid));
6167         return;
6168       } else {
6169         KA_TRACE(
6170             10,
6171             ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6172         __kmp_unregister_root_current_thread(gtid);
6173       }
6174     } else {
6175 /* worker threads may call this function through the atexit handler, if they
6176  * call exit() */
6177 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6178    TODO: do a thorough shutdown instead */
6179 #ifdef DUMP_DEBUG_ON_EXIT
6180       if (__kmp_debug_buf)
6181         __kmp_dump_debug_buffer();
6182 #endif
      // The unregister-library call was added here when we switched to shm on
      // Linux; without it, lots of files would be left in /dev/shm.
      // Clean up the shared memory file before exiting.
6186       __kmp_unregister_library();
6187       return;
6188     }
6189   }
6190   /* synchronize the termination process */
6191   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6192 
6193   /* have we already finished */
6194   if (__kmp_global.g.g_abort) {
6195     KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6196     /* TODO abort? */
6197     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6198     return;
6199   }
6200   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6201     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6202     return;
6203   }
6204 
  /* We need this lock to enforce mutual exclusion between this reading of
     __kmp_threads_capacity and the writing by __kmp_register_root.
6207      Alternatively, we can use a counter of roots that is atomically updated by
6208      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6209      __kmp_internal_end_*.  */
6210   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6211 
6212   /* now we can safely conduct the actual termination */
6213   __kmp_internal_end();
6214 
6215   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6216   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6217 
6218   KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6219 
6220 #ifdef DUMP_DEBUG_ON_EXIT
6221   if (__kmp_debug_buf)
6222     __kmp_dump_debug_buffer();
6223 #endif
6224 
6225 #if KMP_OS_WINDOWS
6226   __kmp_close_console();
6227 #endif
6228 
6229   __kmp_fini_allocator();
6230 
6231 } // __kmp_internal_end_library
6232 
6233 void __kmp_internal_end_thread(int gtid_req) {
6234   int i;
6235 
6236   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6237   /* this shouldn't be a race condition because __kmp_internal_end() is the
6238    * only place to clear __kmp_serial_init */
6239   /* we'll check this later too, after we get the lock */
6240   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6241   // redundant, because the next check will work in any case.
6242   if (__kmp_global.g.g_abort) {
6243     KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6244     /* TODO abort? */
6245     return;
6246   }
6247   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6248     KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6249     return;
6250   }
6251 
6252   KMP_MB(); /* Flush all pending memory write invalidates.  */
6253 
6254   /* find out who we are and what we should do */
6255   {
6256     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6257     KA_TRACE(10,
6258              ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req));
6259     if (gtid == KMP_GTID_SHUTDOWN) {
6260       KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6261                     "already shutdown\n"));
6262       return;
6263     } else if (gtid == KMP_GTID_MONITOR) {
6264       KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6265                     "registered, or system shutdown\n"));
6266       return;
6267     } else if (gtid == KMP_GTID_DNE) {
6268       KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6269                     "shutdown\n"));
6270       return;
6271       /* we don't know who we are */
6272     } else if (KMP_UBER_GTID(gtid)) {
6273       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6274       if (__kmp_root[gtid]->r.r_active) {
6275         __kmp_global.g.g_abort = -1;
6276         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6277         KA_TRACE(10,
6278                  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6279                   gtid));
6280         return;
6281       } else {
6282         KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6283                       gtid));
6284         __kmp_unregister_root_current_thread(gtid);
6285       }
6286     } else {
6287       /* just a worker thread, let's leave */
6288       KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6289 
6290       if (gtid >= 0) {
6291         __kmp_threads[gtid]->th.th_task_team = NULL;
6292       }
6293 
6294       KA_TRACE(10,
6295                ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6296                 gtid));
6297       return;
6298     }
6299   }
6300 #if KMP_DYNAMIC_LIB
6301   if (__kmp_pause_status != kmp_hard_paused)
  // AC: let's not shut down the dynamic library at the exit of an uber thread,
  // because it is better to shut down later in the library destructor.
6304   {
6305     KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6306     return;
6307   }
6308 #endif
6309   /* synchronize the termination process */
6310   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6311 
6312   /* have we already finished */
6313   if (__kmp_global.g.g_abort) {
6314     KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6315     /* TODO abort? */
6316     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6317     return;
6318   }
6319   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6320     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6321     return;
6322   }
6323 
  /* We need this lock to enforce mutual exclusion between this reading of
     __kmp_threads_capacity and the writing by __kmp_register_root.
6326      Alternatively, we can use a counter of roots that is atomically updated by
6327      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6328      __kmp_internal_end_*.  */
6329 
6330   /* should we finish the run-time?  are all siblings done? */
6331   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6332 
6333   for (i = 0; i < __kmp_threads_capacity; ++i) {
6334     if (KMP_UBER_GTID(i)) {
6335       KA_TRACE(
6336           10,
6337           ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6338       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6339       __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6340       return;
6341     }
6342   }
6343 
6344   /* now we can safely conduct the actual termination */
6345 
6346   __kmp_internal_end();
6347 
6348   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6349   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6350 
6351   KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6352 
6353 #ifdef DUMP_DEBUG_ON_EXIT
6354   if (__kmp_debug_buf)
6355     __kmp_dump_debug_buffer();
6356 #endif
6357 } // __kmp_internal_end_thread
6358 
6359 // -----------------------------------------------------------------------------
6360 // Library registration stuff.
6361 
6362 static long __kmp_registration_flag = 0;
6363 // Random value used to indicate library initialization.
6364 static char *__kmp_registration_str = NULL;
6365 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6366 
6367 static inline char *__kmp_reg_status_name() {
/* On RHEL 3u5, if linked statically, getpid() returns different values in
   each thread. If registration and unregistration happen in different threads
   (omp_misc_other_root_exit.cpp test case), the registered_lib_env env var
   cannot be found, because its name will contain a different pid. */
// macOS* complains about the name being too long with the additional getuid()
6373 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6374   return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6375                           (int)getuid());
6376 #else
6377   return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6378 #endif
} // __kmp_reg_status_name
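
// With the format strings above, the resulting variable name looks like
// "__KMP_REGISTERED_LIB_<pid>_<uid>" (e.g., the hypothetical
// "__KMP_REGISTERED_LIB_12345_1000") on Unix dynamic-library builds other
// than macOS*, and "__KMP_REGISTERED_LIB_<pid>" everywhere else.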
6380 
6381 void __kmp_register_library_startup(void) {
6382 
6383   char *name = __kmp_reg_status_name(); // Name of the environment variable.
6384   int done = 0;
6385   union {
6386     double dtime;
6387     long ltime;
6388   } time;
6389 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6390   __kmp_initialize_system_tick();
6391 #endif
6392   __kmp_read_system_time(&time.dtime);
6393   __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
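  // The flag combines a fixed 0xCAFE tag in the upper 16 bits with the low 16
  // bits of the time value read above, giving each process a quasi-unique
  // marker that a later copy of the runtime can check for liveness.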
6394   __kmp_registration_str =
6395       __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6396                        __kmp_registration_flag, KMP_LIBRARY_FILE);
6397 
6398   KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6399                 __kmp_registration_str));
6400 
6401   while (!done) {
6402 
6403     char *value = NULL; // Actual value of the environment variable.
6404 
6405 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6406     char *shm_name = __kmp_str_format("/%s", name);
6407     int shm_preexist = 0;
6408     char *data1;
6409     int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6410     if ((fd1 == -1) && (errno == EEXIST)) {
6411       // file didn't open because it already exists.
6412       // try opening existing file
6413       fd1 = shm_open(shm_name, O_RDWR, 0666);
6414       if (fd1 == -1) { // file didn't open
6415         // error out here
6416         __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0),
6417                     __kmp_msg_null);
6418       } else {
6419         // able to open existing file
6420         shm_preexist = 1;
6421       }
    } else if (fd1 == -1) {
      // SHM didn't open due to an error other than EEXIST.
      // error out here.
6425       __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno),
6426                   __kmp_msg_null);
6427     }
6428     if (shm_preexist == 0) {
      // we created the SHM; now set its size
6430       if (ftruncate(fd1, SHM_SIZE) == -1) {
        // error occurred setting the size
6432         __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"),
6433                     KMP_ERR(errno), __kmp_msg_null);
6434       }
6435     }
6436     data1 =
6437         (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
6438     if (data1 == MAP_FAILED) {
6439       // failed to map shared memory
6440       __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno),
6441                   __kmp_msg_null);
6442     }
6443     if (shm_preexist == 0) { // set data to SHM, set value
6444       KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6445     }
6446     // Read value from either what we just wrote or existing file.
6447     value = __kmp_str_format("%s", data1); // read value from SHM
6448     munmap(data1, SHM_SIZE);
6449     close(fd1);
6450 #else // Windows and unix with static library
    // Set the environment variable, but do not overwrite it if it already
    // exists.
6452     __kmp_env_set(name, __kmp_registration_str, 0);
6453     // read value to see if it got set
6454     value = __kmp_env_get(name);
6455 #endif
6456 
6457     if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6458       done = 1; // Ok, environment variable set successfully, exit the loop.
6459     } else {
      // Oops. Write failed. Another copy of the OpenMP RTL is in memory.
      // Check whether it is alive or dead.
6462       int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6463       char *tail = value;
6464       char *flag_addr_str = NULL;
6465       char *flag_val_str = NULL;
6466       char const *file_name = NULL;
6467       __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6468       __kmp_str_split(tail, '-', &flag_val_str, &tail);
6469       file_name = tail;
6470       if (tail != NULL) {
6471         long *flag_addr = 0;
6472         long flag_val = 0;
6473         KMP_SSCANF(flag_addr_str, "%p", RCAST(void**, &flag_addr));
6474         KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6475         if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6476           // First, check whether environment-encoded address is mapped into
6477           // addr space.
6478           // If so, dereference it to see if it still has the right value.
6479           if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6480             neighbor = 1;
6481           } else {
6482             // If not, then we know the other copy of the library is no longer
6483             // running.
6484             neighbor = 2;
6485           }
6486         }
6487       }
6488       switch (neighbor) {
6489       case 0: // Cannot parse environment variable -- neighbor status unknown.
        // Assume it is the incompatible format of a future version of the
        // library. Assume the other library is alive.
6492         // WARN( ... ); // TODO: Issue a warning.
6493         file_name = "unknown library";
6494         KMP_FALLTHROUGH();
      // Attention! Falling through to the next case is intentional.
6496       case 1: { // Neighbor is alive.
6497         // Check it is allowed.
6498         char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6499         if (!__kmp_str_match_true(duplicate_ok)) {
6500           // That's not allowed. Issue fatal error.
6501           __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6502                       KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6503         }
6504         KMP_INTERNAL_FREE(duplicate_ok);
6505         __kmp_duplicate_library_ok = 1;
6506         done = 1; // Exit the loop.
6507       } break;
6508       case 2: { // Neighbor is dead.
6509 
6510 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
        // Remove the stale shared memory segment.
6512         shm_unlink(shm_name); // this removes file in /dev/shm
6513 #else
6514         // Clear the variable and try to register library again.
6515         __kmp_env_unset(name);
6516 #endif
6517       } break;
6518       default: { KMP_DEBUG_ASSERT(0); } break;
6519       }
6520     }
6521     KMP_INTERNAL_FREE((void *)value);
6522 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6523     KMP_INTERNAL_FREE((void *)shm_name);
6524 #endif
6525   } // while
6526   KMP_INTERNAL_FREE((void *)name);
6527 
6528 } // func __kmp_register_library_startup
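
// A hedged illustration of the registration record written above (values are
// hypothetical). The stored string has the form "%p-%lx-%s", i.e.
// "<address of __kmp_registration_flag>-<flag value>-<KMP_LIBRARY_FILE>",
// for example "0x7f12a3400010-cafe1234-libomp.so".
//
// Another copy of the runtime that finds this record decides whether the
// registering copy is still alive roughly as the loop above does (sketch):
//
//   long *flag_addr; long flag_val; // parsed back out of the stored string
//   if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
//     // Neighbor alive: honor KMP_DUPLICATE_LIB_OK or report a fatal error.
//   } else {
//     // Neighbor dead: discard the stale record and try to register again.
//   }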
6529 
6530 void __kmp_unregister_library(void) {
6531 
6532   char *name = __kmp_reg_status_name();
6533   char *value = NULL;
6534 
6535 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6536   char *shm_name = __kmp_str_format("/%s", name);
6537   int fd1 = shm_open(shm_name, O_RDONLY, 0666);
6538   if (fd1 == -1) {
6539     // file did not open. return.
6540     return;
6541   }
6542   char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6543   if (data1 != MAP_FAILED) {
6544     value = __kmp_str_format("%s", data1); // read value from SHM
6545     munmap(data1, SHM_SIZE);
6546   }
6547   close(fd1);
6548 #else
6549   value = __kmp_env_get(name);
6550 #endif
6551 
6552   KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6553   KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6554   if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6555 //  Ok, this is our variable. Delete it.
6556 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6557     shm_unlink(shm_name); // this removes file in /dev/shm
6558 #else
6559     __kmp_env_unset(name);
6560 #endif
6561   }
6562 
6563 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6564   KMP_INTERNAL_FREE(shm_name);
6565 #endif
6566 
6567   KMP_INTERNAL_FREE(__kmp_registration_str);
6568   KMP_INTERNAL_FREE(value);
6569   KMP_INTERNAL_FREE(name);
6570 
6571   __kmp_registration_flag = 0;
6572   __kmp_registration_str = NULL;
6573 
6574 } // __kmp_unregister_library
6575 
6576 // End of Library registration stuff.
6577 // -----------------------------------------------------------------------------
6578 
6579 #if KMP_MIC_SUPPORTED
6580 
6581 static void __kmp_check_mic_type() {
6582   kmp_cpuid_t cpuid_state = {0};
6583   kmp_cpuid_t *cs_p = &cpuid_state;
6584   __kmp_x86_cpuid(1, 0, cs_p);
6585   // We don't support mic1 at the moment
6586   if ((cs_p->eax & 0xff0) == 0xB10) {
6587     __kmp_mic_type = mic2;
6588   } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6589     __kmp_mic_type = mic3;
6590   } else {
6591     __kmp_mic_type = non_mic;
6592   }
6593 }
6594 
6595 #endif /* KMP_MIC_SUPPORTED */
6596 
6597 #if KMP_HAVE_UMWAIT
6598 static void __kmp_user_level_mwait_init() {
6599   struct kmp_cpuid buf;
6600   __kmp_x86_cpuid(7, 0, &buf);
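  // CPUID leaf 7, sub-leaf 0: ECX bit 5 is the WAITPKG feature bit
  // (umonitor/umwait/tpause), which the next line combines with the
  // __kmp_user_level_mwait setting.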
6601   __kmp_umwait_enabled = ((buf.ecx >> 5) & 1) && __kmp_user_level_mwait;
6602   KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
6603                 __kmp_umwait_enabled));
6604 }
6605 #elif KMP_HAVE_MWAIT
6606 #ifndef AT_INTELPHIUSERMWAIT
// Spurious, non-existent value that should always fail to return anything.
// Will be replaced with the correct value once it is known.
6609 #define AT_INTELPHIUSERMWAIT 10000
6610 #endif
// The getauxval() function is available in RHEL7 and SLES12. If the RTL is
// built on a system with an earlier OS, we'll use the following internal
// function when the entry is not found.
6614 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
6615 unsigned long getauxval(unsigned long) { return 0; }
6616 
6617 static void __kmp_user_level_mwait_init() {
  // When getauxval() and the correct value of AT_INTELPHIUSERMWAIT are
  // available, use them to determine whether user-level mwait is enabled.
  // Otherwise, forcibly set __kmp_mwait_enabled=TRUE on Intel MIC if the
  // environment variable KMP_USER_LEVEL_MWAIT was set to TRUE.
6622   if (__kmp_mic_type == mic3) {
6623     unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
6624     if ((res & 0x1) || __kmp_user_level_mwait) {
6625       __kmp_mwait_enabled = TRUE;
6626       if (__kmp_user_level_mwait) {
6627         KMP_INFORM(EnvMwaitWarn);
6628       }
6629     } else {
6630       __kmp_mwait_enabled = FALSE;
6631     }
6632   }
6633   KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
6634                 "__kmp_mwait_enabled = %d\n",
6635                 __kmp_mic_type, __kmp_mwait_enabled));
6636 }
6637 #endif /* KMP_HAVE_UMWAIT */
6638 
6639 static void __kmp_do_serial_initialize(void) {
6640   int i, gtid;
6641   size_t size;
6642 
6643   KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6644 
6645   KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6646   KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6647   KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6648   KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6649   KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6650 
6651 #if OMPT_SUPPORT
6652   ompt_pre_init();
6653 #endif
6654 
6655   __kmp_validate_locks();
6656 
6657   /* Initialize internal memory allocator */
6658   __kmp_init_allocator();
6659 
6660   /* Register the library startup via an environment variable and check to see
6661      whether another copy of the library is already registered. */
6662 
6663   __kmp_register_library_startup();
6664 
6665   /* TODO reinitialization of library */
6666   if (TCR_4(__kmp_global.g.g_done)) {
6667     KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6668   }
6669 
6670   __kmp_global.g.g_abort = 0;
6671   TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6672 
6673 /* initialize the locks */
6674 #if KMP_USE_ADAPTIVE_LOCKS
6675 #if KMP_DEBUG_ADAPTIVE_LOCKS
6676   __kmp_init_speculative_stats();
6677 #endif
6678 #endif
6679 #if KMP_STATS_ENABLED
6680   __kmp_stats_init();
6681 #endif
6682   __kmp_init_lock(&__kmp_global_lock);
6683   __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6684   __kmp_init_lock(&__kmp_debug_lock);
6685   __kmp_init_atomic_lock(&__kmp_atomic_lock);
6686   __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6687   __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6688   __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6689   __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6690   __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6691   __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6692   __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6693   __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6694   __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6695   __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6696   __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6697   __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6698   __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6699   __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6700 #if KMP_USE_MONITOR
6701   __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6702 #endif
6703   __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6704 
6705   /* conduct initialization and initial setup of configuration */
6706 
6707   __kmp_runtime_initialize();
6708 
6709 #if KMP_MIC_SUPPORTED
6710   __kmp_check_mic_type();
6711 #endif
6712 
6713 // Some global variable initialization moved here from kmp_env_initialize()
6714 #ifdef KMP_DEBUG
6715   kmp_diag = 0;
6716 #endif
6717   __kmp_abort_delay = 0;
6718 
6719   // From __kmp_init_dflt_team_nth()
6720   /* assume the entire machine will be used */
6721   __kmp_dflt_team_nth_ub = __kmp_xproc;
6722   if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6723     __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6724   }
6725   if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6726     __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6727   }
6728   __kmp_max_nth = __kmp_sys_max_nth;
6729   __kmp_cg_max_nth = __kmp_sys_max_nth;
6730   __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6731   if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6732     __kmp_teams_max_nth = __kmp_sys_max_nth;
6733   }
6734 
6735   // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6736   // part
6737   __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6738 #if KMP_USE_MONITOR
6739   __kmp_monitor_wakeups =
6740       KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6741   __kmp_bt_intervals =
6742       KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6743 #endif
6744   // From "KMP_LIBRARY" part of __kmp_env_initialize()
6745   __kmp_library = library_throughput;
6746   // From KMP_SCHEDULE initialization
6747   __kmp_static = kmp_sch_static_balanced;
// AC: do not use analytical here, because it is non-monotonic
6749 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6750 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6751 // need to repeat assignment
6752 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6753 // bit control and barrier method control parts
6754 #if KMP_FAST_REDUCTION_BARRIER
6755 #define kmp_reduction_barrier_gather_bb ((int)1)
6756 #define kmp_reduction_barrier_release_bb ((int)1)
6757 #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6758 #define kmp_reduction_barrier_release_pat bp_hyper_bar
6759 #endif // KMP_FAST_REDUCTION_BARRIER
6760   for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6761     __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6762     __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6763     __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6764     __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6765 #if KMP_FAST_REDUCTION_BARRIER
6766     if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
6767       // lin_64 ): hyper,1
6768       __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6769       __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6770       __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6771       __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6772     }
6773 #endif // KMP_FAST_REDUCTION_BARRIER
6774   }
6775 #if KMP_FAST_REDUCTION_BARRIER
6776 #undef kmp_reduction_barrier_release_pat
6777 #undef kmp_reduction_barrier_gather_pat
6778 #undef kmp_reduction_barrier_release_bb
6779 #undef kmp_reduction_barrier_gather_bb
6780 #endif // KMP_FAST_REDUCTION_BARRIER
6781 #if KMP_MIC_SUPPORTED
6782   if (__kmp_mic_type == mic2) { // KNC
6783     // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6784     __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6785     __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6786         1; // forkjoin release
6787     __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6788     __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6789   }
6790 #if KMP_FAST_REDUCTION_BARRIER
6791   if (__kmp_mic_type == mic2) { // KNC
6792     __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6793     __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6794   }
6795 #endif // KMP_FAST_REDUCTION_BARRIER
6796 #endif // KMP_MIC_SUPPORTED
6797 
6798 // From KMP_CHECKS initialization
6799 #ifdef KMP_DEBUG
6800   __kmp_env_checks = TRUE; /* development versions have the extra checks */
6801 #else
6802   __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6803 #endif
6804 
6805   // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6806   __kmp_foreign_tp = TRUE;
6807 
6808   __kmp_global.g.g_dynamic = FALSE;
6809   __kmp_global.g.g_dynamic_mode = dynamic_default;
6810 
6811   __kmp_env_initialize(NULL);
6812 
6813 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
6814   __kmp_user_level_mwait_init();
6815 #endif
6816 // Print all messages in message catalog for testing purposes.
6817 #ifdef KMP_DEBUG
6818   char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6819   if (__kmp_str_match_true(val)) {
6820     kmp_str_buf_t buffer;
6821     __kmp_str_buf_init(&buffer);
6822     __kmp_i18n_dump_catalog(&buffer);
6823     __kmp_printf("%s", buffer.str);
6824     __kmp_str_buf_free(&buffer);
6825   }
6826   __kmp_env_free(&val);
6827 #endif
6828 
6829   __kmp_threads_capacity =
6830       __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6831   // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6832   __kmp_tp_capacity = __kmp_default_tp_capacity(
6833       __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6834 
6835   // If the library is shut down properly, both pools must be NULL. Just in
6836   // case, set them to NULL -- some memory may leak, but subsequent code will
6837   // work even if pools are not freed.
6838   KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6839   KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6840   KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6841   __kmp_thread_pool = NULL;
6842   __kmp_thread_pool_insert_pt = NULL;
6843   __kmp_team_pool = NULL;
6844 
6845   /* Allocate all of the variable sized records */
6846   /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6847    * expandable */
6848   /* Since allocation is cache-aligned, just add extra padding at the end */
6849   size =
6850       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6851       CACHE_LINE;
6852   __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6853   __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6854                                sizeof(kmp_info_t *) * __kmp_threads_capacity);
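  // Layout sketch (illustrative only) of the single cache-aligned block
  // allocated above:
  //
  //   __kmp_threads                            __kmp_root
  //   |                                        |
  //   v                                        v
  //   [ kmp_info_t* x capacity ][ kmp_root_t* x capacity ][ CACHE_LINE pad ]
  //
  // Both pointer arrays share one allocation of `size` bytes, which is why
  // __kmp_cleanup() later frees only __kmp_threads.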
6855 
6856   /* init thread counts */
6857   KMP_DEBUG_ASSERT(__kmp_all_nth ==
6858                    0); // Asserts fail if the library is reinitializing and
6859   KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
6860   __kmp_all_nth = 0;
6861   __kmp_nth = 0;
6862 
6863   /* setup the uber master thread and hierarchy */
6864   gtid = __kmp_register_root(TRUE);
6865   KA_TRACE(10, ("__kmp_do_serial_initialize  T#%d\n", gtid));
6866   KMP_ASSERT(KMP_UBER_GTID(gtid));
6867   KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6868 
6869   KMP_MB(); /* Flush all pending memory write invalidates.  */
6870 
6871   __kmp_common_initialize();
6872 
6873 #if KMP_OS_UNIX
6874   /* invoke the child fork handler */
6875   __kmp_register_atfork();
6876 #endif
6877 
6878 #if !KMP_DYNAMIC_LIB
6879   {
    /* Invoke the exit handler when the program finishes, only for the static
       library. For the dynamic library, we already have _fini and DllMain. */
6882     int rc = atexit(__kmp_internal_end_atexit);
6883     if (rc != 0) {
6884       __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6885                   __kmp_msg_null);
6886     }
6887   }
6888 #endif
6889 
6890 #if KMP_HANDLE_SIGNALS
6891 #if KMP_OS_UNIX
  /* NOTE: make sure that this is called before the user installs their own
     signal handlers so that the user handlers are called first. This way they
     can return false, not call our handler, avoid terminating the library, and
     continue execution where they left off. */
6896   __kmp_install_signals(FALSE);
6897 #endif /* KMP_OS_UNIX */
6898 #if KMP_OS_WINDOWS
6899   __kmp_install_signals(TRUE);
6900 #endif /* KMP_OS_WINDOWS */
6901 #endif
6902 
6903   /* we have finished the serial initialization */
6904   __kmp_init_counter++;
6905 
6906   __kmp_init_serial = TRUE;
6907 
6908   if (__kmp_settings) {
6909     __kmp_env_print();
6910   }
6911 
6912   if (__kmp_display_env || __kmp_display_env_verbose) {
6913     __kmp_env_print_2();
6914   }
6915 
6916 #if OMPT_SUPPORT
6917   ompt_post_init();
6918 #endif
6919 
6920   KMP_MB();
6921 
6922   KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
6923 }
6924 
6925 void __kmp_serial_initialize(void) {
6926   if (__kmp_init_serial) {
6927     return;
6928   }
6929   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6930   if (__kmp_init_serial) {
6931     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6932     return;
6933   }
6934   __kmp_do_serial_initialize();
6935   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6936 }
6937 
6938 static void __kmp_do_middle_initialize(void) {
6939   int i, j;
6940   int prev_dflt_team_nth;
6941 
6942   if (!__kmp_init_serial) {
6943     __kmp_do_serial_initialize();
6944   }
6945 
6946   KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
6947 
6948   // Save the previous value for the __kmp_dflt_team_nth so that
6949   // we can avoid some reinitialization if it hasn't changed.
6950   prev_dflt_team_nth = __kmp_dflt_team_nth;
6951 
6952 #if KMP_AFFINITY_SUPPORTED
6953   // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6954   // number of cores on the machine.
6955   __kmp_affinity_initialize();
6956 
6957   // Run through the __kmp_threads array and set the affinity mask
6958   // for each root thread that is currently registered with the RTL.
6959   for (i = 0; i < __kmp_threads_capacity; i++) {
6960     if (TCR_PTR(__kmp_threads[i]) != NULL) {
6961       __kmp_affinity_set_init_mask(i, TRUE);
6962     }
6963   }
6964 #endif /* KMP_AFFINITY_SUPPORTED */
6965 
6966   KMP_ASSERT(__kmp_xproc > 0);
6967   if (__kmp_avail_proc == 0) {
6968     __kmp_avail_proc = __kmp_xproc;
6969   }
6970 
6971   // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
6972   // correct them now
6973   j = 0;
6974   while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
6975     __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
6976         __kmp_avail_proc;
6977     j++;
6978   }
6979 
6980   if (__kmp_dflt_team_nth == 0) {
6981 #ifdef KMP_DFLT_NTH_CORES
6982     // Default #threads = #cores
6983     __kmp_dflt_team_nth = __kmp_ncores;
6984     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6985                   "__kmp_ncores (%d)\n",
6986                   __kmp_dflt_team_nth));
6987 #else
6988     // Default #threads = #available OS procs
6989     __kmp_dflt_team_nth = __kmp_avail_proc;
6990     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6991                   "__kmp_avail_proc(%d)\n",
6992                   __kmp_dflt_team_nth));
6993 #endif /* KMP_DFLT_NTH_CORES */
6994   }
6995 
6996   if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
6997     __kmp_dflt_team_nth = KMP_MIN_NTH;
6998   }
6999   if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7000     __kmp_dflt_team_nth = __kmp_sys_max_nth;
7001   }
7002 
7003   // There's no harm in continuing if the following check fails,
7004   // but it indicates an error in the previous logic.
7005   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7006 
7007   if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7008     // Run through the __kmp_threads array and set the num threads icv for each
7009     // root thread that is currently registered with the RTL (which has not
7010     // already explicitly set its nthreads-var with a call to
7011     // omp_set_num_threads()).
7012     for (i = 0; i < __kmp_threads_capacity; i++) {
7013       kmp_info_t *thread = __kmp_threads[i];
7014       if (thread == NULL)
7015         continue;
7016       if (thread->th.th_current_task->td_icvs.nproc != 0)
7017         continue;
7018 
7019       set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7020     }
7021   }
7022   KA_TRACE(
7023       20,
7024       ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7025        __kmp_dflt_team_nth));
7026 
7027 #ifdef KMP_ADJUST_BLOCKTIME
7028   /* Adjust blocktime to zero if necessary  now that __kmp_avail_proc is set */
7029   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7030     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7031     if (__kmp_nth > __kmp_avail_proc) {
7032       __kmp_zero_bt = TRUE;
7033     }
7034   }
7035 #endif /* KMP_ADJUST_BLOCKTIME */
7036 
7037   /* we have finished middle initialization */
7038   TCW_SYNC_4(__kmp_init_middle, TRUE);
7039 
7040   KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7041 }
7042 
7043 void __kmp_middle_initialize(void) {
7044   if (__kmp_init_middle) {
7045     return;
7046   }
7047   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7048   if (__kmp_init_middle) {
7049     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7050     return;
7051   }
7052   __kmp_do_middle_initialize();
7053   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7054 }
7055 
7056 void __kmp_parallel_initialize(void) {
7057   int gtid = __kmp_entry_gtid(); // this might be a new root
7058 
7059   /* synchronize parallel initialization (for sibling) */
7060   if (TCR_4(__kmp_init_parallel))
7061     return;
7062   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7063   if (TCR_4(__kmp_init_parallel)) {
7064     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7065     return;
7066   }
7067 
7068   /* TODO reinitialization after we have already shut down */
7069   if (TCR_4(__kmp_global.g.g_done)) {
7070     KA_TRACE(
7071         10,
7072         ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7073     __kmp_infinite_loop();
7074   }
7075 
7076   /* jc: The lock __kmp_initz_lock is already held, so calling
7077      __kmp_serial_initialize would cause a deadlock.  So we call
7078      __kmp_do_serial_initialize directly. */
7079   if (!__kmp_init_middle) {
7080     __kmp_do_middle_initialize();
7081   }
7082   __kmp_resume_if_hard_paused();
7083 
7084   /* begin initialization */
7085   KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7086   KMP_ASSERT(KMP_UBER_GTID(gtid));
7087 
7088 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7089   // Save the FP control regs.
7090   // Worker threads will set theirs to these values at thread startup.
7091   __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7092   __kmp_store_mxcsr(&__kmp_init_mxcsr);
7093   __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7094 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7095 
7096 #if KMP_OS_UNIX
7097 #if KMP_HANDLE_SIGNALS
7098   /*  must be after __kmp_serial_initialize  */
7099   __kmp_install_signals(TRUE);
7100 #endif
7101 #endif
7102 
7103   __kmp_suspend_initialize();
7104 
7105 #if defined(USE_LOAD_BALANCE)
7106   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7107     __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7108   }
7109 #else
7110   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7111     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7112   }
7113 #endif
7114 
7115   if (__kmp_version) {
7116     __kmp_print_version_2();
7117   }
7118 
7119   /* we have finished parallel initialization */
7120   TCW_SYNC_4(__kmp_init_parallel, TRUE);
7121 
7122   KMP_MB();
7123   KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7124 
7125   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7126 }
7127 
7128 /* ------------------------------------------------------------------------ */
7129 
7130 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7131                                    kmp_team_t *team) {
7132   kmp_disp_t *dispatch;
7133 
7134   KMP_MB();
7135 
7136   /* none of the threads have encountered any constructs, yet. */
7137   this_thr->th.th_local.this_construct = 0;
7138 #if KMP_CACHE_MANAGE
7139   KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7140 #endif /* KMP_CACHE_MANAGE */
7141   dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7142   KMP_DEBUG_ASSERT(dispatch);
7143   KMP_DEBUG_ASSERT(team->t.t_dispatch);
7144   // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7145   // this_thr->th.th_info.ds.ds_tid ] );
7146 
7147   dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7148   dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7149   if (__kmp_env_consistency_check)
7150     __kmp_push_parallel(gtid, team->t.t_ident);
7151 
7152   KMP_MB(); /* Flush all pending memory write invalidates.  */
7153 }
7154 
7155 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7156                                   kmp_team_t *team) {
7157   if (__kmp_env_consistency_check)
7158     __kmp_pop_parallel(gtid, team->t.t_ident);
7159 
7160   __kmp_finish_implicit_task(this_thr);
7161 }
7162 
7163 int __kmp_invoke_task_func(int gtid) {
7164   int rc;
7165   int tid = __kmp_tid_from_gtid(gtid);
7166   kmp_info_t *this_thr = __kmp_threads[gtid];
7167   kmp_team_t *team = this_thr->th.th_team;
7168 
7169   __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7170 #if USE_ITT_BUILD
7171   if (__itt_stack_caller_create_ptr) {
7172     __kmp_itt_stack_callee_enter(
7173         (__itt_caller)
7174             team->t.t_stack_id); // inform ittnotify about entering user's code
7175   }
7176 #endif /* USE_ITT_BUILD */
7177 #if INCLUDE_SSC_MARKS
7178   SSC_MARK_INVOKING();
7179 #endif
7180 
7181 #if OMPT_SUPPORT
7182   void *dummy;
7183   void **exit_frame_p;
7184   ompt_data_t *my_task_data;
7185   ompt_data_t *my_parallel_data;
7186   int ompt_team_size;
7187 
7188   if (ompt_enabled.enabled) {
7189     exit_frame_p = &(
7190         team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_frame.ptr);
7191   } else {
7192     exit_frame_p = &dummy;
7193   }
7194 
7195   my_task_data =
7196       &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7197   my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7198   if (ompt_enabled.ompt_callback_implicit_task) {
7199     ompt_team_size = team->t.t_nproc;
7200     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7201         ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7202         __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7203     OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7204   }
7205 #endif
7206 
7207 #if KMP_STATS_ENABLED
7208   stats_state_e previous_state = KMP_GET_THREAD_STATE();
7209   if (previous_state == stats_state_e::TEAMS_REGION) {
7210     KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7211   } else {
7212     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7213   }
7214   KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7215 #endif
7216 
7217   rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7218                               tid, (int)team->t.t_argc, (void **)team->t.t_argv
7219 #if OMPT_SUPPORT
7220                               ,
7221                               exit_frame_p
7222 #endif
7223                               );
7224 #if OMPT_SUPPORT
7225   *exit_frame_p = NULL;
7226    this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7227 #endif
7228 
7229 #if KMP_STATS_ENABLED
7230   if (previous_state == stats_state_e::TEAMS_REGION) {
7231     KMP_SET_THREAD_STATE(previous_state);
7232   }
7233   KMP_POP_PARTITIONED_TIMER();
7234 #endif
7235 
7236 #if USE_ITT_BUILD
7237   if (__itt_stack_caller_create_ptr) {
7238     __kmp_itt_stack_callee_leave(
7239         (__itt_caller)
7240             team->t.t_stack_id); // inform ittnotify about leaving user's code
7241   }
7242 #endif /* USE_ITT_BUILD */
7243   __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7244 
7245   return rc;
7246 }
7247 
7248 void __kmp_teams_master(int gtid) {
  // This routine is called by all master threads in the teams construct.
7250   kmp_info_t *thr = __kmp_threads[gtid];
7251   kmp_team_t *team = thr->th.th_team;
7252   ident_t *loc = team->t.t_ident;
7253   thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7254   KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7255   KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7256   KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7257                 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7258 
7259   // This thread is a new CG root.  Set up the proper variables.
7260   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7261   tmp->cg_root = thr; // Make thr the CG root
7262   // Init to thread limit that was stored when league masters were forked
7263   tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7264   tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7265   KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7266                  " cg_nthreads to 1\n",
7267                  thr, tmp));
7268   tmp->up = thr->th.th_cg_roots;
7269   thr->th.th_cg_roots = tmp;
7270 
// Launch the league of teams now, but do not let workers execute
// (they hang on the fork barrier until the next parallel region)
7273 #if INCLUDE_SSC_MARKS
7274   SSC_MARK_FORKING();
7275 #endif
7276   __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7277                   (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7278                   VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7279 #if INCLUDE_SSC_MARKS
7280   SSC_MARK_JOINING();
7281 #endif
7282   // If the team size was reduced from the limit, set it to the new size
7283   if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7284     thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7285   // AC: last parameter "1" eliminates join barrier which won't work because
7286   // worker threads are in a fork barrier waiting for more parallel regions
7287   __kmp_join_call(loc, gtid
7288 #if OMPT_SUPPORT
7289                   ,
7290                   fork_context_intel
7291 #endif
7292                   ,
7293                   1);
7294 }
7295 
7296 int __kmp_invoke_teams_master(int gtid) {
7297   kmp_info_t *this_thr = __kmp_threads[gtid];
7298   kmp_team_t *team = this_thr->th.th_team;
7299 #if KMP_DEBUG
7300   if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7301     KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7302                      (void *)__kmp_teams_master);
7303 #endif
7304   __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7305 #if OMPT_SUPPORT
7306   int tid = __kmp_tid_from_gtid(gtid);
7307   ompt_data_t *task_data =
7308       &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7309   ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7310   if (ompt_enabled.ompt_callback_implicit_task) {
7311     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7312         ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7313         ompt_task_initial);
7314     OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7315   }
7316 #endif
7317   __kmp_teams_master(gtid);
7318 #if OMPT_SUPPORT
7319   this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7320 #endif
7321   __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7322   return 1;
7323 }
7324 
/* This sets the requested number of threads for the next parallel region
   encountered by this team. Since this should be enclosed in the forkjoin
   critical section, it should avoid race conditions with asymmetrical nested
   parallelism. */
7329 
7330 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7331   kmp_info_t *thr = __kmp_threads[gtid];
7332 
7333   if (num_threads > 0)
7334     thr->th.th_set_nproc = num_threads;
7335 }
7336 
7337 /* this sets the requested number of teams for the teams region and/or
7338    the number of threads for the next parallel region encountered  */
7339 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7340                           int num_threads) {
7341   kmp_info_t *thr = __kmp_threads[gtid];
7342   KMP_DEBUG_ASSERT(num_teams >= 0);
7343   KMP_DEBUG_ASSERT(num_threads >= 0);
7344 
7345   if (num_teams == 0)
7346     num_teams = 1; // default number of teams is 1.
  if (num_teams > __kmp_teams_max_nth) { // were too many teams requested?
7348     if (!__kmp_reserve_warn) {
7349       __kmp_reserve_warn = 1;
7350       __kmp_msg(kmp_ms_warning,
7351                 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7352                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7353     }
7354     num_teams = __kmp_teams_max_nth;
7355   }
7356   // Set number of teams (number of threads in the outer "parallel" of the
7357   // teams)
7358   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7359 
7360   // Remember the number of threads for inner parallel regions
7361   if (!TCR_4(__kmp_init_middle))
7362     __kmp_middle_initialize(); // get internal globals calculated
7363   KMP_DEBUG_ASSERT(__kmp_avail_proc);
7364   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7365   if (num_threads == 0) {
7366     num_threads = __kmp_avail_proc / num_teams;
    // adjust num_threads without a warning since it is not a user setting
    // num_threads = min(num_threads, nthreads-var, thread-limit-var)
    // no thread_limit clause specified - do not change thread-limit-var ICV
7370     if (num_threads > __kmp_dflt_team_nth) {
7371       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7372     }
7373     if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7374       num_threads = thr->th.th_current_task->td_icvs.thread_limit;
    } // prevent team size from exceeding thread-limit-var
7376     if (num_teams * num_threads > __kmp_teams_max_nth) {
7377       num_threads = __kmp_teams_max_nth / num_teams;
7378     }
7379   } else {
7380     // This thread will be the master of the league masters
7381     // Store new thread limit; old limit is saved in th_cg_roots list
7382     thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7383     // num_threads = min(num_threads, nthreads-var)
7384     if (num_threads > __kmp_dflt_team_nth) {
7385       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7386     }
7387     if (num_teams * num_threads > __kmp_teams_max_nth) {
7388       int new_threads = __kmp_teams_max_nth / num_teams;
7389       if (!__kmp_reserve_warn) { // user asked for too many threads
7390         __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7391         __kmp_msg(kmp_ms_warning,
7392                   KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7393                   KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7394       }
7395       num_threads = new_threads;
7396     }
7397   }
7398   thr->th.th_teams_size.nth = num_threads;
7399 }
7400 
7401 // Set the proc_bind var to use in the following parallel region.
7402 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7403   kmp_info_t *thr = __kmp_threads[gtid];
7404   thr->th.th_set_proc_bind = proc_bind;
7405 }
7406 
7407 /* Launch the worker threads into the microtask. */
7408 
7409 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7410   kmp_info_t *this_thr = __kmp_threads[gtid];
7411 
7412 #ifdef KMP_DEBUG
7413   int f;
7414 #endif /* KMP_DEBUG */
7415 
7416   KMP_DEBUG_ASSERT(team);
7417   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7418   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7419   KMP_MB(); /* Flush all pending memory write invalidates.  */
7420 
7421   team->t.t_construct = 0; /* no single directives seen yet */
7422   team->t.t_ordered.dt.t_value =
7423       0; /* thread 0 enters the ordered section first */
7424 
7425   /* Reset the identifiers on the dispatch buffer */
7426   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7427   if (team->t.t_max_nproc > 1) {
7428     int i;
7429     for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7430       team->t.t_disp_buffer[i].buffer_index = i;
7431       team->t.t_disp_buffer[i].doacross_buf_idx = i;
7432     }
7433   } else {
7434     team->t.t_disp_buffer[0].buffer_index = 0;
7435     team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7436   }
7437 
7438   KMP_MB(); /* Flush all pending memory write invalidates.  */
7439   KMP_ASSERT(this_thr->th.th_team == team);
7440 
7441 #ifdef KMP_DEBUG
7442   for (f = 0; f < team->t.t_nproc; f++) {
7443     KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7444                      team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7445   }
7446 #endif /* KMP_DEBUG */
7447 
7448   /* release the worker threads so they may begin working */
7449   __kmp_fork_barrier(gtid, 0);
7450 }
7451 
7452 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7453   kmp_info_t *this_thr = __kmp_threads[gtid];
7454 
7455   KMP_DEBUG_ASSERT(team);
7456   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7457   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7458   KMP_MB(); /* Flush all pending memory write invalidates.  */
7459 
7460 /* Join barrier after fork */
7461 
7462 #ifdef KMP_DEBUG
7463   if (__kmp_threads[gtid] &&
7464       __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7465     __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7466                  __kmp_threads[gtid]);
7467     __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7468                  "team->t.t_nproc=%d\n",
7469                  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7470                  team->t.t_nproc);
7471     __kmp_print_structure();
7472   }
7473   KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7474                    __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7475 #endif /* KMP_DEBUG */
7476 
7477   __kmp_join_barrier(gtid); /* wait for everyone */
7478 #if OMPT_SUPPORT
7479   if (ompt_enabled.enabled &&
7480       this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7481     int ds_tid = this_thr->th.th_info.ds.ds_tid;
7482     ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7483     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7484 #if OMPT_OPTIONAL
7485     void *codeptr = NULL;
7486     if (KMP_MASTER_TID(ds_tid) &&
7487         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7488          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7489       codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7490 
7491     if (ompt_enabled.ompt_callback_sync_region_wait) {
7492       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7493           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7494           codeptr);
7495     }
7496     if (ompt_enabled.ompt_callback_sync_region) {
7497       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7498           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7499           codeptr);
7500     }
7501 #endif
7502     if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7503       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7504           ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7505     }
7506   }
7507 #endif
7508 
7509   KMP_MB(); /* Flush all pending memory write invalidates.  */
7510   KMP_ASSERT(this_thr->th.th_team == team);
7511 }
7512 
7513 /* ------------------------------------------------------------------------ */
7514 
7515 #ifdef USE_LOAD_BALANCE
7516 
// Return the number of worker threads actively spinning in the hot team, if
// we are at the outermost level of parallelism.  Otherwise, return 0.
7519 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7520   int i;
7521   int retval;
7522   kmp_team_t *hot_team;
7523 
7524   if (root->r.r_active) {
7525     return 0;
7526   }
7527   hot_team = root->r.r_hot_team;
7528   if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7529     return hot_team->t.t_nproc - 1; // Don't count master thread
7530   }
7531 
7532   // Skip the master thread - it is accounted for elsewhere.
7533   retval = 0;
7534   for (i = 1; i < hot_team->t.t_nproc; i++) {
7535     if (hot_team->t.t_threads[i]->th.th_active) {
7536       retval++;
7537     }
7538   }
7539   return retval;
7540 }
7541 
7542 // Perform an automatic adjustment to the number of
7543 // threads used by the next parallel region.
7544 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7545   int retval;
7546   int pool_active;
7547   int hot_team_active;
7548   int team_curr_active;
7549   int system_active;
7550 
7551   KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7552                 set_nproc));
7553   KMP_DEBUG_ASSERT(root);
7554   KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7555                        ->th.th_current_task->td_icvs.dynamic == TRUE);
7556   KMP_DEBUG_ASSERT(set_nproc > 1);
7557 
7558   if (set_nproc == 1) {
7559     KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7560     return 1;
7561   }
7562 
7563   // Threads that are active in the thread pool, active in the hot team for this
7564   // particular root (if we are at the outer par level), and the currently
7565   // executing thread (to become the master) are available to add to the new
7566   // team, but are currently contributing to the system load, and must be
7567   // accounted for.
7568   pool_active = __kmp_thread_pool_active_nth;
7569   hot_team_active = __kmp_active_hot_team_nproc(root);
7570   team_curr_active = pool_active + hot_team_active + 1;
7571 
7572   // Check the system load.
7573   system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7574   KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7575                 "hot team active = %d\n",
7576                 system_active, pool_active, hot_team_active));
7577 
7578   if (system_active < 0) {
7579     // There was an error reading the necessary info from /proc, so use the
7580     // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7581     // = dynamic_thread_limit, we shouldn't wind up getting back here.
7582     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7583     KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7584 
7585     // Make this call behave like the thread limit algorithm.
7586     retval = __kmp_avail_proc - __kmp_nth +
7587              (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7588     if (retval > set_nproc) {
7589       retval = set_nproc;
7590     }
7591     if (retval < KMP_MIN_NTH) {
7592       retval = KMP_MIN_NTH;
7593     }
7594 
7595     KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7596                   retval));
7597     return retval;
7598   }
7599 
  // There is a slight delay in the load balance algorithm in detecting new
  // running procs. The real system load at this instant should be at least as
  // large as the number of active OMP threads available to add to the team.
7603   if (system_active < team_curr_active) {
7604     system_active = team_curr_active;
7605   }
7606   retval = __kmp_avail_proc - system_active + team_curr_active;
7607   if (retval > set_nproc) {
7608     retval = set_nproc;
7609   }
7610   if (retval < KMP_MIN_NTH) {
7611     retval = KMP_MIN_NTH;
7612   }
7613 
7614   KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7615   return retval;
7616 } // __kmp_load_balance_nproc()
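// Worked example (hypothetical numbers) of the computation above:
//   __kmp_avail_proc = 8, pool_active = 2, hot_team_active = 1
//     => team_curr_active = 2 + 1 + 1 = 4
//   __kmp_get_load_balance() reports system_active = 6
//     => retval = 8 - 6 + 4 = 6, then clamped to [KMP_MIN_NTH, set_nproc].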
7617 
7618 #endif /* USE_LOAD_BALANCE */
7619 
7620 /* ------------------------------------------------------------------------ */
7621 
7622 /* NOTE: this is called with the __kmp_init_lock held */
7623 void __kmp_cleanup(void) {
7624   int f;
7625 
7626   KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7627 
7628   if (TCR_4(__kmp_init_parallel)) {
7629 #if KMP_HANDLE_SIGNALS
7630     __kmp_remove_signals();
7631 #endif
7632     TCW_4(__kmp_init_parallel, FALSE);
7633   }
7634 
7635   if (TCR_4(__kmp_init_middle)) {
7636 #if KMP_AFFINITY_SUPPORTED
7637     __kmp_affinity_uninitialize();
7638 #endif /* KMP_AFFINITY_SUPPORTED */
7639     __kmp_cleanup_hierarchy();
7640     TCW_4(__kmp_init_middle, FALSE);
7641   }
7642 
7643   KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7644 
7645   if (__kmp_init_serial) {
7646     __kmp_runtime_destroy();
7647     __kmp_init_serial = FALSE;
7648   }
7649 
7650   __kmp_cleanup_threadprivate_caches();
7651 
7652   for (f = 0; f < __kmp_threads_capacity; f++) {
7653     if (__kmp_root[f] != NULL) {
7654       __kmp_free(__kmp_root[f]);
7655       __kmp_root[f] = NULL;
7656     }
7657   }
7658   __kmp_free(__kmp_threads);
  // __kmp_threads and __kmp_root were allocated at once, as a single block, so
  // there is no need to free __kmp_root separately.
7661   __kmp_threads = NULL;
7662   __kmp_root = NULL;
7663   __kmp_threads_capacity = 0;
7664 
7665 #if KMP_USE_DYNAMIC_LOCK
7666   __kmp_cleanup_indirect_user_locks();
7667 #else
7668   __kmp_cleanup_user_locks();
7669 #endif
7670 
7671 #if KMP_AFFINITY_SUPPORTED
7672   KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7673   __kmp_cpuinfo_file = NULL;
7674 #endif /* KMP_AFFINITY_SUPPORTED */
7675 
7676 #if KMP_USE_ADAPTIVE_LOCKS
7677 #if KMP_DEBUG_ADAPTIVE_LOCKS
7678   __kmp_print_speculative_stats();
7679 #endif
7680 #endif
7681   KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7682   __kmp_nested_nth.nth = NULL;
7683   __kmp_nested_nth.size = 0;
7684   __kmp_nested_nth.used = 0;
7685   KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7686   __kmp_nested_proc_bind.bind_types = NULL;
7687   __kmp_nested_proc_bind.size = 0;
7688   __kmp_nested_proc_bind.used = 0;
7689   if (__kmp_affinity_format) {
7690     KMP_INTERNAL_FREE(__kmp_affinity_format);
7691     __kmp_affinity_format = NULL;
7692   }
7693 
7694   __kmp_i18n_catclose();
7695 
7696 #if KMP_USE_HIER_SCHED
7697   __kmp_hier_scheds.deallocate();
7698 #endif
7699 
7700 #if KMP_STATS_ENABLED
7701   __kmp_stats_fini();
7702 #endif
7703 
7704   KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7705 }
7706 
7707 /* ------------------------------------------------------------------------ */
7708 
7709 int __kmp_ignore_mppbeg(void) {
7710   char *env;
7711 
7712   if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7713     if (__kmp_str_match_false(env))
7714       return FALSE;
7715   }
7716   // By default __kmpc_begin() is no-op.
7717   return TRUE;
7718 }
7719 
7720 int __kmp_ignore_mppend(void) {
7721   char *env;
7722 
7723   if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7724     if (__kmp_str_match_false(env))
7725       return FALSE;
7726   }
7727   // By default __kmpc_end() is no-op.
7728   return TRUE;
7729 }
7730 
7731 void __kmp_internal_begin(void) {
7732   int gtid;
7733   kmp_root_t *root;
7734 
  /* This is a very important step, as it will register new sibling threads
     and assign these new uber threads a new gtid. */
7737   gtid = __kmp_entry_gtid();
7738   root = __kmp_threads[gtid]->th.th_root;
7739   KMP_ASSERT(KMP_UBER_GTID(gtid));
7740 
7741   if (root->r.r_begin)
7742     return;
7743   __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7744   if (root->r.r_begin) {
7745     __kmp_release_lock(&root->r.r_begin_lock, gtid);
7746     return;
7747   }
7748 
7749   root->r.r_begin = TRUE;
7750 
7751   __kmp_release_lock(&root->r.r_begin_lock, gtid);
7752 }
7753 
7754 /* ------------------------------------------------------------------------ */
7755 
7756 void __kmp_user_set_library(enum library_type arg) {
7757   int gtid;
7758   kmp_root_t *root;
7759   kmp_info_t *thread;
7760 
7761   /* first, make sure we are initialized so we can get our gtid */
7762 
7763   gtid = __kmp_entry_gtid();
7764   thread = __kmp_threads[gtid];
7765 
7766   root = thread->th.th_root;
7767 
7768   KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7769                 library_serial));
7770   if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7771                                   thread */
7772     KMP_WARNING(SetLibraryIncorrectCall);
7773     return;
7774   }
7775 
7776   switch (arg) {
7777   case library_serial:
7778     thread->th.th_set_nproc = 0;
7779     set__nproc(thread, 1);
7780     break;
7781   case library_turnaround:
7782     thread->th.th_set_nproc = 0;
7783     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7784                                            : __kmp_dflt_team_nth_ub);
7785     break;
7786   case library_throughput:
7787     thread->th.th_set_nproc = 0;
7788     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7789                                            : __kmp_dflt_team_nth_ub);
7790     break;
7791   default:
7792     KMP_FATAL(UnknownLibraryType, arg);
7793   }
7794 
7795   __kmp_aux_set_library(arg);
7796 }
7797 
7798 void __kmp_aux_set_stacksize(size_t arg) {
7799   if (!__kmp_init_serial)
7800     __kmp_serial_initialize();
7801 
7802 #if KMP_OS_DARWIN
7803   if (arg & (0x1000 - 1)) {
7804     arg &= ~(0x1000 - 1);
7805     if (arg + 0x1000) /* check for overflow if we round up */
7806       arg += 0x1000;
7807   }
7808 #endif
7809   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7810 
7811   /* only change the default stacksize before the first parallel region */
7812   if (!TCR_4(__kmp_init_parallel)) {
7813     size_t value = arg; /* argument is in bytes */
7814 
7815     if (value < __kmp_sys_min_stksize)
7816       value = __kmp_sys_min_stksize;
7817     else if (value > KMP_MAX_STKSIZE)
7818       value = KMP_MAX_STKSIZE;
7819 
7820     __kmp_stksize = value;
7821 
7822     __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
7823   }
7824 
7825   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7826 }
7827 
7828 /* set the behaviour of the runtime library */
7829 /* TODO this can cause some odd behaviour with sibling parallelism... */
7830 void __kmp_aux_set_library(enum library_type arg) {
7831   __kmp_library = arg;
7832 
7833   switch (__kmp_library) {
7834   case library_serial: {
7835     KMP_INFORM(LibraryIsSerial);
7836   } break;
7837   case library_turnaround:
7838     if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
7839       __kmp_use_yield = 2; // only yield when oversubscribed
7840     break;
7841   case library_throughput:
7842     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
7843       __kmp_dflt_blocktime = 200;
7844     break;
7845   default:
7846     KMP_FATAL(UnknownLibraryType, arg);
7847   }
7848 }
7849 
7850 /* Getting team information common for all team API */
7851 // Returns NULL if not in teams construct
7852 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
7853   kmp_info_t *thr = __kmp_entry_thread();
7854   teams_serialized = 0;
7855   if (thr->th.th_teams_microtask) {
7856     kmp_team_t *team = thr->th.th_team;
7857     int tlevel = thr->th.th_teams_level; // the level of the teams construct
7858     int ii = team->t.t_level;
7859     teams_serialized = team->t.t_serialized;
7860     int level = tlevel + 1;
7861     KMP_DEBUG_ASSERT(ii >= tlevel);
7862     while (ii > level) {
7863       for (teams_serialized = team->t.t_serialized;
7864            (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
7865       }
7866       if (team->t.t_serialized && (!teams_serialized)) {
7867         team = team->t.t_parent;
7868         continue;
7869       }
7870       if (ii > level) {
7871         team = team->t.t_parent;
7872         ii--;
7873       }
7874     }
7875     return team;
7876   }
7877   return NULL;
7878 }
7879 
7880 int __kmp_aux_get_team_num() {
7881   int serialized;
7882   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7883   if (team) {
7884     if (serialized > 1) {
7885       return 0; // teams region is serialized ( 1 team of 1 thread ).
7886     } else {
7887       return team->t.t_master_tid;
7888     }
7889   }
7890   return 0;
7891 }
7892 
7893 int __kmp_aux_get_num_teams() {
7894   int serialized;
7895   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7896   if (team) {
7897     if (serialized > 1) {
7898       return 1;
7899     } else {
7900       return team->t.t_parent->t.t_nproc;
7901     }
7902   }
7903   return 1;
7904 }
7905 
7906 /* ------------------------------------------------------------------------ */
7907 
7908 /*
7909  * Affinity Format Parser
7910  *
7911  * Field is in form of: %[[[0].]size]type
7912  * % and type are required (%% means print a literal '%')
7913  * type is either single char or long name surrounded by {},
7914  * e.g., N or {num_threads}
7915  * 0 => leading zeros
7916  * . => right justified when size is specified
7917  * by default output is left justified
7918  * size is the *minimum* field length
7919  * All other characters are printed as is
7920  *
7921  * Available field types:
7922  * L {thread_level}      - omp_get_level()
7923  * n {thread_num}        - omp_get_thread_num()
7924  * h {host}              - name of host machine
7925  * P {process_id}        - process id (integer)
7926  * T {thread_identifier} - native thread identifier (integer)
7927  * N {num_threads}       - omp_get_num_threads()
7928  * A {ancestor_tnum}     - omp_get_ancestor_thread_num(omp_get_level()-1)
7929  * a {thread_affinity}   - comma separated list of integers or integer ranges
7930  *                         (values of affinity mask)
7931  *
7932  * Implementation-specific field types can be added
7933  * If a type is unknown, print "undefined"
7934 */
7935 
7936 // Structure holding the short name, long name, and corresponding data type
7937 // for snprintf.  A table of these will represent the entire valid keyword
7938 // field types.
7939 typedef struct kmp_affinity_format_field_t {
7940   char short_name; // from spec e.g., L -> thread level
7941   const char *long_name; // from spec thread_level -> thread level
7942   char field_format; // data type for snprintf (typically 'd' or 's'
7943   // for integer or string)
7944 } kmp_affinity_format_field_t;
7945 
7946 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
7947 #if KMP_AFFINITY_SUPPORTED
7948     {'A', "thread_affinity", 's'},
7949 #endif
7950     {'t', "team_num", 'd'},
7951     {'T', "num_teams", 'd'},
7952     {'L', "nesting_level", 'd'},
7953     {'n', "thread_num", 'd'},
7954     {'N', "num_threads", 'd'},
7955     {'a', "ancestor_tnum", 'd'},
7956     {'H', "host", 's'},
7957     {'P', "process_id", 'd'},
7958     {'i', "native_thread_id", 'd'}};
7959 
7960 // Return the number of characters it takes to hold field
7961 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
7962                                             const char **ptr,
7963                                             kmp_str_buf_t *field_buffer) {
7964   int rc, format_index, field_value;
7965   const char *width_left, *width_right;
7966   bool pad_zeros, right_justify, parse_long_name, found_valid_name;
7967   static const int FORMAT_SIZE = 20;
7968   char format[FORMAT_SIZE] = {0};
7969   char absolute_short_name = 0;
7970 
7971   KMP_DEBUG_ASSERT(gtid >= 0);
7972   KMP_DEBUG_ASSERT(th);
7973   KMP_DEBUG_ASSERT(**ptr == '%');
7974   KMP_DEBUG_ASSERT(field_buffer);
7975 
7976   __kmp_str_buf_clear(field_buffer);
7977 
7978   // Skip the initial %
7979   (*ptr)++;
7980 
7981   // Check for %% first
7982   if (**ptr == '%') {
7983     __kmp_str_buf_cat(field_buffer, "%", 1);
7984     (*ptr)++; // skip over the second %
7985     return 1;
7986   }
7987 
7988   // Parse field modifiers if they are present
7989   pad_zeros = false;
7990   if (**ptr == '0') {
7991     pad_zeros = true;
7992     (*ptr)++; // skip over 0
7993   }
7994   right_justify = false;
7995   if (**ptr == '.') {
7996     right_justify = true;
7997     (*ptr)++; // skip over .
7998   }
7999   // Parse width of field: [width_left, width_right)
8000   width_left = width_right = NULL;
8001   if (**ptr >= '0' && **ptr <= '9') {
8002     width_left = *ptr;
8003     SKIP_DIGITS(*ptr);
8004     width_right = *ptr;
8005   }
8006 
8007   // Create the format for KMP_SNPRINTF based on flags parsed above
8008   format_index = 0;
8009   format[format_index++] = '%';
8010   if (!right_justify)
8011     format[format_index++] = '-';
8012   if (pad_zeros)
8013     format[format_index++] = '0';
8014   if (width_left && width_right) {
8015     int i = 0;
    // Only allow 8-digit number widths.
    // This also prevents overflowing the format variable.
8018     while (i < 8 && width_left < width_right) {
8019       format[format_index++] = *width_left;
8020       width_left++;
8021       i++;
8022     }
8023   }
8024 
8025   // Parse a name (long or short)
8026   // Canonicalize the name into absolute_short_name
8027   found_valid_name = false;
8028   parse_long_name = (**ptr == '{');
8029   if (parse_long_name)
8030     (*ptr)++; // skip initial left brace
8031   for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8032                              sizeof(__kmp_affinity_format_table[0]);
8033        ++i) {
8034     char short_name = __kmp_affinity_format_table[i].short_name;
8035     const char *long_name = __kmp_affinity_format_table[i].long_name;
8036     char field_format = __kmp_affinity_format_table[i].field_format;
8037     if (parse_long_name) {
8038       size_t length = KMP_STRLEN(long_name);
8039       if (strncmp(*ptr, long_name, length) == 0) {
8040         found_valid_name = true;
8041         (*ptr) += length; // skip the long name
8042       }
8043     } else if (**ptr == short_name) {
8044       found_valid_name = true;
8045       (*ptr)++; // skip the short name
8046     }
8047     if (found_valid_name) {
8048       format[format_index++] = field_format;
8049       format[format_index++] = '\0';
8050       absolute_short_name = short_name;
8051       break;
8052     }
8053   }
8054   if (parse_long_name) {
8055     if (**ptr != '}') {
8056       absolute_short_name = 0;
8057     } else {
8058       (*ptr)++; // skip over the right brace
8059     }
8060   }
8061 
8062   // Attempt to fill the buffer with the requested
8063   // value using snprintf within __kmp_str_buf_print()
8064   switch (absolute_short_name) {
8065   case 't':
8066     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8067     break;
8068   case 'T':
8069     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8070     break;
8071   case 'L':
8072     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8073     break;
8074   case 'n':
8075     rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8076     break;
8077   case 'H': {
8078     static const int BUFFER_SIZE = 256;
8079     char buf[BUFFER_SIZE];
8080     __kmp_expand_host_name(buf, BUFFER_SIZE);
8081     rc = __kmp_str_buf_print(field_buffer, format, buf);
8082   } break;
8083   case 'P':
8084     rc = __kmp_str_buf_print(field_buffer, format, getpid());
8085     break;
8086   case 'i':
8087     rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8088     break;
8089   case 'N':
8090     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8091     break;
8092   case 'a':
8093     field_value =
8094         __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8095     rc = __kmp_str_buf_print(field_buffer, format, field_value);
8096     break;
8097 #if KMP_AFFINITY_SUPPORTED
8098   case 'A': {
8099     kmp_str_buf_t buf;
8100     __kmp_str_buf_init(&buf);
8101     __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8102     rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8103     __kmp_str_buf_free(&buf);
8104   } break;
8105 #endif
8106   default:
    // According to the spec, if an implementation does not have info for the
    // field type, then "undefined" is printed
8109     rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8110     // Skip the field
8111     if (parse_long_name) {
8112       SKIP_TOKEN(*ptr);
8113       if (**ptr == '}')
8114         (*ptr)++;
8115     } else {
8116       (*ptr)++;
8117     }
8118   }
8119 
8120   KMP_ASSERT(format_index <= FORMAT_SIZE);
8121   return rc;
8122 }
8123 
8124 /*
8125  * Return number of characters needed to hold the affinity string
8126  * (not including null byte character)
8127  * The resultant string is printed to buffer, which the caller can then
8128  * handle afterwards
8129 */
8130 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8131                                   kmp_str_buf_t *buffer) {
8132   const char *parse_ptr;
8133   size_t retval;
8134   const kmp_info_t *th;
8135   kmp_str_buf_t field;
8136 
8137   KMP_DEBUG_ASSERT(buffer);
8138   KMP_DEBUG_ASSERT(gtid >= 0);
8139 
8140   __kmp_str_buf_init(&field);
8141   __kmp_str_buf_clear(buffer);
8142 
8143   th = __kmp_threads[gtid];
8144   retval = 0;
8145 
8146   // If format is NULL or zero-length string, then we use
8147   // affinity-format-var ICV
8148   parse_ptr = format;
8149   if (parse_ptr == NULL || *parse_ptr == '\0') {
8150     parse_ptr = __kmp_affinity_format;
8151   }
8152   KMP_DEBUG_ASSERT(parse_ptr);
8153 
8154   while (*parse_ptr != '\0') {
8155     // Parse a field
8156     if (*parse_ptr == '%') {
8157       // Put field in the buffer
8158       int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8159       __kmp_str_buf_catbuf(buffer, &field);
8160       retval += rc;
8161     } else {
8162       // Put literal character in buffer
8163       __kmp_str_buf_cat(buffer, parse_ptr, 1);
8164       retval++;
8165       parse_ptr++;
8166     }
8167   }
8168   __kmp_str_buf_free(&field);
8169   return retval;
8170 }
8171 
8172 // Displays the affinity string to stdout
8173 void __kmp_aux_display_affinity(int gtid, const char *format) {
8174   kmp_str_buf_t buf;
8175   __kmp_str_buf_init(&buf);
8176   __kmp_aux_capture_affinity(gtid, format, &buf);
8177   __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8178   __kmp_str_buf_free(&buf);
8179 }
8180 
8181 /* ------------------------------------------------------------------------ */
8182 
8183 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8184   int blocktime = arg; /* argument is in milliseconds */
8185 #if KMP_USE_MONITOR
8186   int bt_intervals;
8187 #endif
8188   kmp_int8 bt_set;
8189 
8190   __kmp_save_internal_controls(thread);
8191 
8192   /* Normalize and set blocktime for the teams */
8193   if (blocktime < KMP_MIN_BLOCKTIME)
8194     blocktime = KMP_MIN_BLOCKTIME;
8195   else if (blocktime > KMP_MAX_BLOCKTIME)
8196     blocktime = KMP_MAX_BLOCKTIME;
8197 
8198   set__blocktime_team(thread->th.th_team, tid, blocktime);
8199   set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8200 
8201 #if KMP_USE_MONITOR
8202   /* Calculate and set blocktime intervals for the teams */
8203   bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8204 
8205   set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8206   set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8207 #endif
8208 
  /* Record that blocktime has been explicitly set */
8210   bt_set = TRUE;
8211 
8212   set__bt_set_team(thread->th.th_team, tid, bt_set);
8213   set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8214 #if KMP_USE_MONITOR
8215   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8216                 "bt_intervals=%d, monitor_updates=%d\n",
8217                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8218                 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8219                 __kmp_monitor_wakeups));
8220 #else
8221   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8222                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8223                 thread->th.th_team->t.t_id, tid, blocktime));
8224 #endif
8225 }
8226 
8227 void __kmp_aux_set_defaults(char const *str, size_t len) {
8228   if (!__kmp_init_serial) {
8229     __kmp_serial_initialize();
8230   }
8231   __kmp_env_initialize(str);
8232 
8233   if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8234     __kmp_env_print();
8235   }
8236 } // __kmp_aux_set_defaults
8237 
8238 /* ------------------------------------------------------------------------ */
8239 /* internal fast reduction routines */
8240 
8241 PACKED_REDUCTION_METHOD_T
8242 __kmp_determine_reduction_method(
8243     ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8244     void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8245     kmp_critical_name *lck) {
8246 
  // Default reduction method: critical construct (lck != NULL, like in current
  // PAROPT).
  // If (reduce_data != NULL && reduce_func != NULL): the tree-reduction method
  // can be selected by the RTL.
  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
  // can be selected by the RTL.
  // Finally, it is up to the OpenMP RTL to decide which method to select
  // among those generated by PAROPT.
8255 
8256   PACKED_REDUCTION_METHOD_T retval;
8257 
8258   int team_size;
8259 
8260   KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8261   KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8262 
8263 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
8264   ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
8265 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8266 
8267   retval = critical_reduce_block;
8268 
  // An alternative way of getting the team size (with one dynamic dereference)
  // is slower.
  team_size = __kmp_get_team_num_threads(global_tid);
  if (team_size == 1) {

    retval = empty_reduce_block;

  } else {

    int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;

#if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 ||                   \
    KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64

#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
    KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD

    int teamsize_cutoff = 4;

#if KMP_MIC_SUPPORTED
    if (__kmp_mic_type != non_mic) {
      teamsize_cutoff = 8;
    }
#endif
    int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
    if (tree_available) {
      if (team_size <= teamsize_cutoff) {
        if (atomic_available) {
          retval = atomic_reduce_block;
        }
      } else {
        retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
      }
    } else if (atomic_available) {
      retval = atomic_reduce_block;
    }
#else
#error "Unknown or unsupported OS"
#endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
       // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD

#elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS

#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD

    // basic tuning

    if (atomic_available) {
      if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
        retval = atomic_reduce_block;
      }
    } // otherwise: use critical section

#elif KMP_OS_DARWIN

    int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
    if (atomic_available && (num_vars <= 3)) {
      retval = atomic_reduce_block;
    } else if (tree_available) {
      if ((reduce_size > (9 * sizeof(kmp_real64))) &&
          (reduce_size < (2000 * sizeof(kmp_real64)))) {
        retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
      }
    } // otherwise: use critical section

#else
#error "Unknown or unsupported OS"
#endif

#else
#error "Unknown or unsupported architecture"
#endif
  }

  // KMP_FORCE_REDUCTION

  // If the team is serialized (team_size == 1), ignore the forced reduction
  // method and stay with the unsynchronized method (empty_reduce_block)
  if (__kmp_force_reduction_method != reduction_method_not_defined &&
      team_size != 1) {

    PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;

    int atomic_available, tree_available;

    switch ((forced_retval = __kmp_force_reduction_method)) {
    case critical_reduce_block:
      KMP_ASSERT(lck); // lck should be != 0
      break;

    case atomic_reduce_block:
      atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
      if (!atomic_available) {
        KMP_WARNING(RedMethodNotSupported, "atomic");
        forced_retval = critical_reduce_block;
      }
      break;

    case tree_reduce_block:
      tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
      if (!tree_available) {
        KMP_WARNING(RedMethodNotSupported, "tree");
        forced_retval = critical_reduce_block;
      } else {
#if KMP_FAST_REDUCTION_BARRIER
        forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
#endif
      }
      break;

    default:
      KMP_ASSERT(0); // "unsupported method specified"
    }

    retval = forced_retval;
  }

  KA_TRACE(10, ("reduction method selected=%08x\n", retval));

#undef FAST_REDUCTION_TREE_METHOD_GENERATED
#undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED

  return (retval);
}
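// Illustrative summary (kept under "#if 0", not compiled): on the 64-bit
// Linux-like targets handled above, the selection logic reduces to roughly
// the decision tree below. The helper name is hypothetical; the cutoff is 4,
// or 8 when running on a MIC device, mirroring the code above.
#if 0
static PACKED_REDUCTION_METHOD_T
__example_select_method_64bit(int team_size, int atomic_available,
                              int tree_available, int teamsize_cutoff) {
  if (team_size == 1)
    return empty_reduce_block; // serialized team: no synchronization needed
  if (tree_available) {
    if (team_size <= teamsize_cutoff) // small team: prefer atomics if possible
      return atomic_available ? atomic_reduce_block : critical_reduce_block;
    return TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER; // large team
  }
  return atomic_available ? atomic_reduce_block : critical_reduce_block;
}
#endif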
// this function is for testing set/get/determine reduce method
kmp_int32 __kmp_get_reduce_method(void) {
  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
}
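// Note on the encoding (an assumption based on the shift above and the
// enumerator layout in kmp.h): the reduction method is assumed to occupy the
// upper bits of the packed value, with the barrier type used by the tree
// method in the low byte, so shifting right by 8 recovers the method alone.
// A hypothetical round-trip, kept under "#if 0" and not compiled:
#if 0
static void __example_packed_method_roundtrip(kmp_info_t *th) {
  PACKED_REDUCTION_METHOD_T packed = th->th.th_local.packed_reduction_method;
  kmp_int32 method_only = packed >> 8; // what __kmp_get_reduce_method returns
  (void)method_only;
}
#endif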

// Soft pause sets up threads to ignore blocktime and just go to sleep.
// Spin-wait code checks __kmp_pause_status and reacts accordingly.
void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }

// Hard pause shuts down the runtime completely. Resume happens naturally when
// OpenMP is used subsequently.
void __kmp_hard_pause() {
  __kmp_pause_status = kmp_hard_paused;
  __kmp_internal_end_thread(-1);
}

// Soft resume sets __kmp_pause_status, and wakes up all threads.
void __kmp_resume_if_soft_paused() {
  if (__kmp_pause_status == kmp_soft_paused) {
    __kmp_pause_status = kmp_not_paused;

    for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
      kmp_info_t *thread = __kmp_threads[gtid];
      if (thread) { // Wake it if sleeping
        kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
                         thread);
        if (fl.is_sleeping())
          fl.resume(gtid);
        else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
          __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
        } else { // thread holds the lock and may sleep soon
          do { // until either the thread sleeps, or we can get the lock
            if (fl.is_sleeping()) {
              fl.resume(gtid);
              break;
            } else if (__kmp_try_suspend_mx(thread)) {
              __kmp_unlock_suspend_mx(thread);
              break;
            }
          } while (1);
        }
      }
    }
  }
}
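// Note on the loop above: waking a soft-paused worker races with that worker
// going to sleep under its own suspend mutex. The do/while resolves the race
// by retrying until one of two outcomes sticks: either the worker is observed
// sleeping (then it is resumed), or the suspend mutex is acquired (then the
// worker can no longer go to sleep, so no wake-up is needed).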

// This function is called via __kmpc_pause_resource. Returns 0 if successful.
// TODO: add warning messages
int __kmp_pause_resource(kmp_pause_status_t level) {
  if (level == kmp_not_paused) { // requesting resume
    if (__kmp_pause_status == kmp_not_paused) {
      // error message about runtime not being paused, so can't resume
      return 1;
    } else {
      KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
                       __kmp_pause_status == kmp_hard_paused);
      __kmp_pause_status = kmp_not_paused;
      return 0;
    }
  } else if (level == kmp_soft_paused) { // requesting soft pause
    if (__kmp_pause_status != kmp_not_paused) {
      // error message about already being paused
      return 1;
    } else {
      __kmp_soft_pause();
      return 0;
    }
  } else if (level == kmp_hard_paused) { // requesting hard pause
    if (__kmp_pause_status != kmp_not_paused) {
      // error message about already being paused
      return 1;
    } else {
      __kmp_hard_pause();
      return 0;
    }
  } else {
    // error message about invalid level
    return 1;
  }
}
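// Illustrative sketch (kept under "#if 0", not compiled): user code is
// expected to reach this routine through the OpenMP 5.0 pause API; the exact
// call chain (omp_pause_resource_all -> __kmpc_pause_resource ->
// __kmp_pause_resource) is an assumption here. A return value of 0 means
// success, matching the convention above.
#if 0
#include <omp.h>
int example_soft_pause(void) {
  // Park the runtime's threads between application phases; resume happens
  // naturally the next time OpenMP is used (see __kmp_hard_pause above).
  return omp_pause_resource_all(omp_pause_soft); // 0 on success
}
#endif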

void __kmp_omp_display_env(int verbose) {
  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
  if (__kmp_init_serial == 0)
    __kmp_do_serial_initialize();
  __kmp_display_env_impl(!verbose, verbose);
  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
}

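// Illustrative sketch (kept under "#if 0", not compiled): this helper is the
// runtime-side path for displaying the OpenMP environment on demand; whether
// it is reached via the OpenMP 5.1 omp_display_env() entry point is an
// assumption here. A nonzero verbose argument requests the long listing,
// analogous to setting OMP_DISPLAY_ENV=verbose.
#if 0
#include <omp.h>
void example_show_icvs(void) {
  omp_display_env(/* verbose = */ 1); // print the full ICV listing
}
#endif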