1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
29 #endif
30 
31 #if OMPT_SUPPORT
32 #include "ompt-specific.h"
33 #endif
34 
35 /* these are temporary issues to be dealt with */
36 #define KMP_USE_PRCTL 0
37 
38 #if KMP_OS_WINDOWS
39 #include <process.h>
40 #endif
41 
42 #include "tsan_annotations.h"
43 
44 #if KMP_OS_WINDOWS
// Windows does not need these include files as it doesn't use shared memory
46 #else
47 #include <sys/mman.h>
48 #include <sys/stat.h>
49 #include <fcntl.h>
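// Size in bytes of the shared memory segment used by the library registration
// code on non-Windows systems.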
50 #define SHM_SIZE 1024
51 #endif
52 
53 #if defined(KMP_GOMP_COMPAT)
54 char const __kmp_version_alt_comp[] =
55     KMP_VERSION_PREFIX "alternative compiler support: yes";
56 #endif /* defined(KMP_GOMP_COMPAT) */
57 
58 char const __kmp_version_omp_api[] =
59     KMP_VERSION_PREFIX "API version: 5.0 (201611)";
60 
61 #ifdef KMP_DEBUG
62 char const __kmp_version_lock[] =
63     KMP_VERSION_PREFIX "lock type: run time selectable";
64 #endif /* KMP_DEBUG */
65 
66 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
67 
68 /* ------------------------------------------------------------------------ */
69 
70 #if KMP_USE_MONITOR
71 kmp_info_t __kmp_monitor;
72 #endif
73 
74 /* Forward declarations */
75 
76 void __kmp_cleanup(void);
77 
78 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
79                                   int gtid);
80 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
81                                   kmp_internal_control_t *new_icvs,
82                                   ident_t *loc);
83 #if KMP_AFFINITY_SUPPORTED
84 static void __kmp_partition_places(kmp_team_t *team,
85                                    int update_master_only = 0);
86 #endif
87 static void __kmp_do_serial_initialize(void);
88 void __kmp_fork_barrier(int gtid, int tid);
89 void __kmp_join_barrier(int gtid);
90 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
91                           kmp_internal_control_t *new_icvs, ident_t *loc);
92 
93 #ifdef USE_LOAD_BALANCE
94 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
95 #endif
96 
97 static int __kmp_expand_threads(int nNeed);
98 #if KMP_OS_WINDOWS
99 static int __kmp_unregister_root_other_thread(int gtid);
100 #endif
101 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
102 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
103 
/* Calculate the identifier of the current thread */
/* Fast (and somewhat portable) way to get the unique identifier of the
   executing thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
107 int __kmp_get_global_thread_id() {
108   int i;
109   kmp_info_t **other_threads;
110   size_t stack_data;
111   char *stack_addr;
112   size_t stack_size;
113   char *stack_base;
114 
115   KA_TRACE(
116       1000,
117       ("*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
118        __kmp_nth, __kmp_all_nth));
119 
  /* JPH - To handle the case where __kmpc_end(0) is called immediately prior
     to a parallel region, this returns KMP_GTID_DNE to force the caller to
     perform serial initialization. All call sites must either handle
     KMP_GTID_DNE or guarantee __kmp_init_gtid for this to work. */
124 
125   if (!TCR_4(__kmp_init_gtid))
126     return KMP_GTID_DNE;
127 
128 #ifdef KMP_TDATA_GTID
129   if (TCR_4(__kmp_gtid_mode) >= 3) {
130     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
131     return __kmp_gtid;
132   }
133 #endif
134   if (TCR_4(__kmp_gtid_mode) >= 2) {
135     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
136     return __kmp_gtid_get_specific();
137   }
138   KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
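  // Fall back to the stack-address heuristic: take the address of a local
  // variable and find the registered thread whose recorded stack range
  // contains it.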
139 
140   stack_addr = (char *)&stack_data;
141   other_threads = __kmp_threads;
142 
143   /* ATT: The code below is a source of potential bugs due to unsynchronized
144      access to __kmp_threads array. For example:
145      1. Current thread loads other_threads[i] to thr and checks it, it is
146         non-NULL.
147      2. Current thread is suspended by OS.
148      3. Another thread unregisters and finishes (debug versions of free()
149         may fill memory with something like 0xEF).
150      4. Current thread is resumed.
151      5. Current thread reads junk from *thr.
152      TODO: Fix it.  --ln  */
153 
154   for (i = 0; i < __kmp_threads_capacity; i++) {
155 
156     kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
157     if (!thr)
158       continue;
159 
160     stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
161     stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
162 
163     /* stack grows down -- search through all of the active threads */
164 
165     if (stack_addr <= stack_base) {
166       size_t stack_diff = stack_base - stack_addr;
167 
168       if (stack_diff <= stack_size) {
169         /* The only way we can be closer than the allocated */
170         /* stack size is if we are running on this thread. */
171         KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
172         return i;
173       }
174     }
175   }
176 
  /* fall back to __kmp_gtid_get_specific() to try to determine our gtid */
178   KA_TRACE(1000,
179            ("*** __kmp_get_global_thread_id: internal alg. failed to find "
180             "thread, using TLS\n"));
181   i = __kmp_gtid_get_specific();
182 
183   /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
184 
  /* if we haven't been assigned a gtid, return the (negative) code */
186   if (i < 0)
187     return i;
188 
189   /* dynamically updated stack window for uber threads to avoid get_specific
190      call */
191   if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
192     KMP_FATAL(StackOverflow, i);
193   }
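  // Refine the recorded stack window so that it covers the current stack
  // address: either raise the recorded base (if the address is above it) or
  // extend the recorded size down to the current address.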
194 
195   stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
196   if (stack_addr > stack_base) {
197     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
198     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
199             other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
200                 stack_base);
201   } else {
202     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
203             stack_base - stack_addr);
204   }
205 
206   /* Reprint stack bounds for ubermaster since they have been refined */
207   if (__kmp_storage_map) {
208     char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
209     char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
210     __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
211                                  other_threads[i]->th.th_info.ds.ds_stacksize,
212                                  "th_%d stack (refinement)", i);
213   }
214   return i;
215 }
216 
217 int __kmp_get_global_thread_id_reg() {
218   int gtid;
219 
220   if (!__kmp_init_serial) {
221     gtid = KMP_GTID_DNE;
222   } else
223 #ifdef KMP_TDATA_GTID
224       if (TCR_4(__kmp_gtid_mode) >= 3) {
225     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
226     gtid = __kmp_gtid;
227   } else
228 #endif
229       if (TCR_4(__kmp_gtid_mode) >= 2) {
230     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
231     gtid = __kmp_gtid_get_specific();
232   } else {
233     KA_TRACE(1000,
234              ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
235     gtid = __kmp_get_global_thread_id();
236   }
237 
238   /* we must be a new uber master sibling thread */
239   if (gtid == KMP_GTID_DNE) {
240     KA_TRACE(10,
241              ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
242               "Registering a new gtid.\n"));
243     __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
244     if (!__kmp_init_serial) {
245       __kmp_do_serial_initialize();
246       gtid = __kmp_gtid_get_specific();
247     } else {
248       gtid = __kmp_register_root(FALSE);
249     }
250     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
251     /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
252   }
253 
254   KMP_DEBUG_ASSERT(gtid >= 0);
255 
256   return gtid;
257 }
258 
259 /* caller must hold forkjoin_lock */
260 void __kmp_check_stack_overlap(kmp_info_t *th) {
261   int f;
262   char *stack_beg = NULL;
263   char *stack_end = NULL;
264   int gtid;
265 
266   KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
267   if (__kmp_storage_map) {
268     stack_end = (char *)th->th.th_info.ds.ds_stackbase;
269     stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
270 
271     gtid = __kmp_gtid_from_thread(th);
272 
273     if (gtid == KMP_GTID_MONITOR) {
274       __kmp_print_storage_map_gtid(
275           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
276           "th_%s stack (%s)", "mon",
277           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
278     } else {
279       __kmp_print_storage_map_gtid(
280           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
281           "th_%d stack (%s)", gtid,
282           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
283     }
284   }
285 
286   /* No point in checking ubermaster threads since they use refinement and
287    * cannot overlap */
288   gtid = __kmp_gtid_from_thread(th);
289   if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
290     KA_TRACE(10,
291              ("__kmp_check_stack_overlap: performing extensive checking\n"));
292     if (stack_beg == NULL) {
293       stack_end = (char *)th->th.th_info.ds.ds_stackbase;
294       stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
295     }
296 
297     for (f = 0; f < __kmp_threads_capacity; f++) {
298       kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
299 
300       if (f_th && f_th != th) {
301         char *other_stack_end =
302             (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
303         char *other_stack_beg =
304             other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
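        // The stacks overlap if either endpoint of this thread's stack falls
        // strictly inside the other thread's stack range.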
305         if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
306             (stack_end > other_stack_beg && stack_end < other_stack_end)) {
307 
308           /* Print the other stack values before the abort */
309           if (__kmp_storage_map)
310             __kmp_print_storage_map_gtid(
311                 -1, other_stack_beg, other_stack_end,
312                 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
313                 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
314 
315           __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
316                       __kmp_msg_null);
317         }
318       }
319     }
320   }
321   KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
322 }
323 
324 /* ------------------------------------------------------------------------ */
325 
326 void __kmp_infinite_loop(void) {
327   static int done = FALSE;
328 
329   while (!done) {
330     KMP_YIELD(TRUE);
331   }
332 }
333 
334 #define MAX_MESSAGE 512
335 
336 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
337                                   char const *format, ...) {
338   char buffer[MAX_MESSAGE];
339   va_list ap;
340 
341   va_start(ap, format);
342   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
343                p2, (unsigned long)size, format);
344   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
345   __kmp_vprintf(kmp_err, buffer, ap);
346 #if KMP_PRINT_DATA_PLACEMENT
347   int node;
348   if (gtid >= 0) {
349     if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
350       if (__kmp_storage_map_verbose) {
351         node = __kmp_get_host_node(p1);
352         if (node < 0) /* doesn't work, so don't try this next time */
353           __kmp_storage_map_verbose = FALSE;
354         else {
355           char *last;
356           int lastNode;
357           int localProc = __kmp_get_cpu_from_gtid(gtid);
358 
359           const int page_size = KMP_GET_PAGE_SIZE();
360 
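          // Align p1 down to its page start and p2 to the start of the
          // range's last page so host-node queries operate on whole pages.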
361           p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
362           p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
363           if (localProc >= 0)
364             __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
365                                  localProc >> 1);
366           else
367             __kmp_printf_no_lock("  GTID %d\n", gtid);
368 #if KMP_USE_PRCTL
369           /* The more elaborate format is disabled for now because of the prctl
370            * hanging bug. */
371           do {
372             last = p1;
373             lastNode = node;
374             /* This loop collates adjacent pages with the same host node. */
375             do {
              p1 = (char *)p1 + page_size;
377             } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
378             __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
379                                  lastNode);
380           } while (p1 <= p2);
381 #else
382           __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
383                                (char *)p1 + (page_size - 1),
384                                __kmp_get_host_node(p1));
385           if (p1 < p2) {
386             __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
387                                  (char *)p2 + (page_size - 1),
388                                  __kmp_get_host_node(p2));
389           }
390 #endif
391         }
392       }
393     } else
394       __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
395   }
396 #endif /* KMP_PRINT_DATA_PLACEMENT */
397   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
398 }
399 
400 void __kmp_warn(char const *format, ...) {
401   char buffer[MAX_MESSAGE];
402   va_list ap;
403 
404   if (__kmp_generate_warnings == kmp_warnings_off) {
405     return;
406   }
407 
408   va_start(ap, format);
409 
410   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
411   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
412   __kmp_vprintf(kmp_err, buffer, ap);
413   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
414 
415   va_end(ap);
416 }
417 
418 void __kmp_abort_process() {
419   // Later threads may stall here, but that's ok because abort() will kill them.
420   __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
421 
422   if (__kmp_debug_buf) {
423     __kmp_dump_debug_buffer();
424   }
425 
426   if (KMP_OS_WINDOWS) {
427     // Let other threads know of abnormal termination and prevent deadlock
428     // if abort happened during library initialization or shutdown
429     __kmp_global.g.g_abort = SIGABRT;
430 
    /* On Windows* OS, abort() by default causes a pop-up error box, which
       stalls nightly testing. Unfortunately, we cannot reliably suppress
       pop-up error boxes. _set_abort_behavior() works well, but this function
       is not available in VS7 (this is not a problem for the DLL, but it is a
       problem for the static OpenMP RTL). SetErrorMode (and so, the timelimit
       utility) does not help, at least in some versions of the MS C RTL.

       It seems the following sequence is the only way to simulate abort() and
       avoid the pop-up error box. */
440     raise(SIGABRT);
441     _exit(3); // Just in case, if signal ignored, exit anyway.
442   } else {
443     __kmp_unregister_library();
444     abort();
445   }
446 
447   __kmp_infinite_loop();
448   __kmp_release_bootstrap_lock(&__kmp_exit_lock);
449 
450 } // __kmp_abort_process
451 
452 void __kmp_abort_thread(void) {
453   // TODO: Eliminate g_abort global variable and this function.
454   // In case of abort just call abort(), it will kill all the threads.
455   __kmp_infinite_loop();
456 } // __kmp_abort_thread
457 
458 /* Print out the storage map for the major kmp_info_t thread data structures
459    that are allocated together. */
460 
461 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
462   __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
463                                gtid);
464 
465   __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
466                                sizeof(kmp_desc_t), "th_%d.th_info", gtid);
467 
468   __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
469                                sizeof(kmp_local_t), "th_%d.th_local", gtid);
470 
471   __kmp_print_storage_map_gtid(
472       gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
473       sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
474 
475   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
476                                &thr->th.th_bar[bs_plain_barrier + 1],
477                                sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
478                                gtid);
479 
480   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
481                                &thr->th.th_bar[bs_forkjoin_barrier + 1],
482                                sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
483                                gtid);
484 
485 #if KMP_FAST_REDUCTION_BARRIER
486   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
487                                &thr->th.th_bar[bs_reduction_barrier + 1],
488                                sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
489                                gtid);
490 #endif // KMP_FAST_REDUCTION_BARRIER
491 }
492 
493 /* Print out the storage map for the major kmp_team_t team data structures
494    that are allocated together. */
495 
496 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
497                                          int team_id, int num_thr) {
498   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
499   __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
500                                header, team_id);
501 
502   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
503                                &team->t.t_bar[bs_last_barrier],
504                                sizeof(kmp_balign_team_t) * bs_last_barrier,
505                                "%s_%d.t_bar", header, team_id);
506 
507   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
508                                &team->t.t_bar[bs_plain_barrier + 1],
509                                sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
510                                header, team_id);
511 
512   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
513                                &team->t.t_bar[bs_forkjoin_barrier + 1],
514                                sizeof(kmp_balign_team_t),
515                                "%s_%d.t_bar[forkjoin]", header, team_id);
516 
517 #if KMP_FAST_REDUCTION_BARRIER
518   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
519                                &team->t.t_bar[bs_reduction_barrier + 1],
520                                sizeof(kmp_balign_team_t),
521                                "%s_%d.t_bar[reduction]", header, team_id);
522 #endif // KMP_FAST_REDUCTION_BARRIER
523 
524   __kmp_print_storage_map_gtid(
525       -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
526       sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
527 
528   __kmp_print_storage_map_gtid(
529       -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
530       sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
531 
532   __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
533                                &team->t.t_disp_buffer[num_disp_buff],
534                                sizeof(dispatch_shared_info_t) * num_disp_buff,
535                                "%s_%d.t_disp_buffer", header, team_id);
536 }
537 
538 static void __kmp_init_allocator() { __kmp_init_memkind(); }
539 static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
540 
541 /* ------------------------------------------------------------------------ */
542 
543 #if KMP_DYNAMIC_LIB
544 #if KMP_OS_WINDOWS
545 
546 static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
547   // TODO: Change to __kmp_break_bootstrap_lock().
548   __kmp_init_bootstrap_lock(lck); // make the lock released
549 }
550 
551 static void __kmp_reset_locks_on_process_detach(int gtid_req) {
552   int i;
553   int thread_count;
554 
  // PROCESS_DETACH is expected to be called by a thread that executes
  // ProcessExit() or FreeLibrary(). The OS terminates the other threads
  // (except the one calling ProcessExit or FreeLibrary), so it might seem
  // safe to access __kmp_threads[] without taking the forkjoin_lock.
  // However, some threads may still be alive here, although they are about
  // to be terminated. The entries with ds_thread==0 are the most suspicious,
  // so it may in fact not be safe to access __kmp_threads[].
562 
563   // TODO: does it make sense to check __kmp_roots[] ?
564 
  // Wait until no other live threads are registered with the OpenMP library.
567   while (1) {
568     thread_count = 0;
569     for (i = 0; i < __kmp_threads_capacity; ++i) {
570       if (!__kmp_threads)
571         continue;
572       kmp_info_t *th = __kmp_threads[i];
573       if (th == NULL)
574         continue;
575       int gtid = th->th.th_info.ds.ds_gtid;
576       if (gtid == gtid_req)
577         continue;
578       if (gtid < 0)
579         continue;
580       DWORD exit_val;
581       int alive = __kmp_is_thread_alive(th, &exit_val);
582       if (alive) {
583         ++thread_count;
584       }
585     }
586     if (thread_count == 0)
587       break; // success
588   }
589 
590   // Assume that I'm alone. Now it might be safe to check and reset locks.
591   // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
592   __kmp_reset_lock(&__kmp_forkjoin_lock);
593 #ifdef KMP_DEBUG
594   __kmp_reset_lock(&__kmp_stdio_lock);
595 #endif // KMP_DEBUG
596 }
597 
598 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
599   //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
600 
601   switch (fdwReason) {
602 
603   case DLL_PROCESS_ATTACH:
604     KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
605 
606     return TRUE;
607 
608   case DLL_PROCESS_DETACH:
609     KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
610 
611     if (lpReserved != NULL) {
612       // lpReserved is used for telling the difference:
613       //   lpReserved == NULL when FreeLibrary() was called,
614       //   lpReserved != NULL when the process terminates.
615       // When FreeLibrary() is called, worker threads remain alive. So they will
616       // release the forkjoin lock by themselves. When the process terminates,
617       // worker threads disappear triggering the problem of unreleased forkjoin
618       // lock as described below.
619 
620       // A worker thread can take the forkjoin lock. The problem comes up if
621       // that worker thread becomes dead before it releases the forkjoin lock.
622       // The forkjoin lock remains taken, while the thread executing
623       // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try
624       // to take the forkjoin lock and will always fail, so that the application
      // will never finish [normally]. This scenario is possible if
      // __kmpc_end() has not been executed. This is not a corner case; it
      // happens in common situations:
628       // - the main function was compiled by an alternative compiler;
629       // - the main function was compiled by icl but without /Qopenmp
630       //   (application with plugins);
631       // - application terminates by calling C exit(), Fortran CALL EXIT() or
632       //   Fortran STOP.
633       // - alive foreign thread prevented __kmpc_end from doing cleanup.
634       //
635       // This is a hack to work around the problem.
636       // TODO: !!! figure out something better.
637       __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
638     }
639 
640     __kmp_internal_end_library(__kmp_gtid_get_specific());
641 
642     return TRUE;
643 
644   case DLL_THREAD_ATTACH:
645     KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
646 
    /* to register new sibling threads on every thread attach, call
     * __kmp_get_gtid() here */
649     return TRUE;
650 
651   case DLL_THREAD_DETACH:
652     KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
653 
654     __kmp_internal_end_thread(__kmp_gtid_get_specific());
655     return TRUE;
656   }
657 
658   return TRUE;
659 }
660 
661 #endif /* KMP_OS_WINDOWS */
662 #endif /* KMP_DYNAMIC_LIB */
663 
664 /* __kmp_parallel_deo -- Wait until it's our turn. */
665 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
666   int gtid = *gtid_ref;
667 #ifdef BUILD_PARALLEL_ORDERED
668   kmp_team_t *team = __kmp_team_from_gtid(gtid);
669 #endif /* BUILD_PARALLEL_ORDERED */
670 
671   if (__kmp_env_consistency_check) {
672     if (__kmp_threads[gtid]->th.th_root->r.r_active)
673 #if KMP_USE_DYNAMIC_LOCK
674       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
675 #else
676       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
677 #endif
678   }
679 #ifdef BUILD_PARALLEL_ORDERED
680   if (!team->t.t_serialized) {
681     KMP_MB();
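    // Wait until the team's ordered counter equals our tid, i.e. until it is
    // our turn to enter the ordered section.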
682     KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
683              NULL);
684     KMP_MB();
685   }
686 #endif /* BUILD_PARALLEL_ORDERED */
687 }
688 
689 /* __kmp_parallel_dxo -- Signal the next task. */
690 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
691   int gtid = *gtid_ref;
692 #ifdef BUILD_PARALLEL_ORDERED
693   int tid = __kmp_tid_from_gtid(gtid);
694   kmp_team_t *team = __kmp_team_from_gtid(gtid);
695 #endif /* BUILD_PARALLEL_ORDERED */
696 
697   if (__kmp_env_consistency_check) {
698     if (__kmp_threads[gtid]->th.th_root->r.r_active)
699       __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
700   }
701 #ifdef BUILD_PARALLEL_ORDERED
702   if (!team->t.t_serialized) {
703     KMP_MB(); /* Flush all pending memory write invalidates.  */
704 
705     /* use the tid of the next thread in this team */
706     /* TODO replace with general release procedure */
707     team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
708 
709     KMP_MB(); /* Flush all pending memory write invalidates.  */
710   }
711 #endif /* BUILD_PARALLEL_ORDERED */
712 }
713 
714 /* ------------------------------------------------------------------------ */
715 /* The BARRIER for a SINGLE process section is always explicit   */
716 
717 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
718   int status;
719   kmp_info_t *th;
720   kmp_team_t *team;
721 
722   if (!TCR_4(__kmp_init_parallel))
723     __kmp_parallel_initialize();
724   __kmp_resume_if_soft_paused();
725 
726   th = __kmp_threads[gtid];
727   team = th->th.th_team;
728   status = 0;
729 
730   th->th.th_ident = id_ref;
731 
732   if (team->t.t_serialized) {
733     status = 1;
734   } else {
735     kmp_int32 old_this = th->th.th_local.this_construct;
736 
737     ++th->th.th_local.this_construct;
738     /* try to set team count to thread count--success means thread got the
739        single block */
740     /* TODO: Should this be acquire or release? */
741     if (team->t.t_construct == old_this) {
742       status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
743                                               th->th.th_local.this_construct);
744     }
745 #if USE_ITT_BUILD
746     if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
747         KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
748         team->t.t_active_level ==
749             1) { // Only report metadata by master of active team at level 1
750       __kmp_itt_metadata_single(id_ref);
751     }
752 #endif /* USE_ITT_BUILD */
753   }
754 
755   if (__kmp_env_consistency_check) {
756     if (status && push_ws) {
757       __kmp_push_workshare(gtid, ct_psingle, id_ref);
758     } else {
759       __kmp_check_workshare(gtid, ct_psingle, id_ref);
760     }
761   }
762 #if USE_ITT_BUILD
763   if (status) {
764     __kmp_itt_single_start(gtid);
765   }
766 #endif /* USE_ITT_BUILD */
767   return status;
768 }
769 
770 void __kmp_exit_single(int gtid) {
771 #if USE_ITT_BUILD
772   __kmp_itt_single_end(gtid);
773 #endif /* USE_ITT_BUILD */
774   if (__kmp_env_consistency_check)
775     __kmp_pop_workshare(gtid, ct_psingle, NULL);
776 }
777 
/* Determine whether we can go parallel or must use a serialized parallel
 * region, and how many threads we can use.
 * set_nthreads is the number of threads requested for the team.
 * Returns 1 if we should serialize or only use one thread,
 * otherwise the number of threads to use.
 * The forkjoin lock is held by the caller. */
784 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
785                                  int master_tid, int set_nthreads,
786                                  int enter_teams) {
787   int capacity;
788   int new_nthreads;
789   KMP_DEBUG_ASSERT(__kmp_init_serial);
790   KMP_DEBUG_ASSERT(root && parent_team);
791   kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
792 
793   // If dyn-var is set, dynamically adjust the number of desired threads,
794   // according to the method specified by dynamic_mode.
795   new_nthreads = set_nthreads;
796   if (!get__dynamic_2(parent_team, master_tid)) {
797     ;
798   }
799 #ifdef USE_LOAD_BALANCE
800   else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
801     new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
802     if (new_nthreads == 1) {
803       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
804                     "reservation to 1 thread\n",
805                     master_tid));
806       return 1;
807     }
808     if (new_nthreads < set_nthreads) {
809       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
810                     "reservation to %d threads\n",
811                     master_tid, new_nthreads));
812     }
813   }
814 #endif /* USE_LOAD_BALANCE */
815   else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
816     new_nthreads = __kmp_avail_proc - __kmp_nth +
817                    (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
818     if (new_nthreads <= 1) {
819       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
820                     "reservation to 1 thread\n",
821                     master_tid));
822       return 1;
823     }
824     if (new_nthreads < set_nthreads) {
825       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
826                     "reservation to %d threads\n",
827                     master_tid, new_nthreads));
828     } else {
829       new_nthreads = set_nthreads;
830     }
831   } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
832     if (set_nthreads > 2) {
833       new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
834       new_nthreads = (new_nthreads % set_nthreads) + 1;
835       if (new_nthreads == 1) {
836         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
837                       "reservation to 1 thread\n",
838                       master_tid));
839         return 1;
840       }
841       if (new_nthreads < set_nthreads) {
842         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
843                       "reservation to %d threads\n",
844                       master_tid, new_nthreads));
845       }
846     }
847   } else {
848     KMP_ASSERT(0);
849   }
850 
851   // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
852   if (__kmp_nth + new_nthreads -
853           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
854       __kmp_max_nth) {
855     int tl_nthreads = __kmp_max_nth - __kmp_nth +
856                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
857     if (tl_nthreads <= 0) {
858       tl_nthreads = 1;
859     }
860 
861     // If dyn-var is false, emit a 1-time warning.
862     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
863       __kmp_reserve_warn = 1;
864       __kmp_msg(kmp_ms_warning,
865                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
866                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
867     }
868     if (tl_nthreads == 1) {
869       KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
870                     "reduced reservation to 1 thread\n",
871                     master_tid));
872       return 1;
873     }
874     KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
875                   "reservation to %d threads\n",
876                   master_tid, tl_nthreads));
877     new_nthreads = tl_nthreads;
878   }
879 
880   // Respect OMP_THREAD_LIMIT
881   int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
882   int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
883   if (cg_nthreads + new_nthreads -
884           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
885       max_cg_threads) {
886     int tl_nthreads = max_cg_threads - cg_nthreads +
887                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
888     if (tl_nthreads <= 0) {
889       tl_nthreads = 1;
890     }
891 
892     // If dyn-var is false, emit a 1-time warning.
893     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
894       __kmp_reserve_warn = 1;
895       __kmp_msg(kmp_ms_warning,
896                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
897                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
898     }
899     if (tl_nthreads == 1) {
900       KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
901                     "reduced reservation to 1 thread\n",
902                     master_tid));
903       return 1;
904     }
905     KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
906                   "reservation to %d threads\n",
907                   master_tid, tl_nthreads));
908     new_nthreads = tl_nthreads;
909   }
910 
911   // Check if the threads array is large enough, or needs expanding.
912   // See comment in __kmp_register_root() about the adjustment if
913   // __kmp_threads[0] == NULL.
914   capacity = __kmp_threads_capacity;
915   if (TCR_PTR(__kmp_threads[0]) == NULL) {
916     --capacity;
917   }
918   if (__kmp_nth + new_nthreads -
919           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
920       capacity) {
921     // Expand the threads array.
922     int slotsRequired = __kmp_nth + new_nthreads -
923                         (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
924                         capacity;
925     int slotsAdded = __kmp_expand_threads(slotsRequired);
926     if (slotsAdded < slotsRequired) {
927       // The threads array was not expanded enough.
928       new_nthreads -= (slotsRequired - slotsAdded);
929       KMP_ASSERT(new_nthreads >= 1);
930 
931       // If dyn-var is false, emit a 1-time warning.
932       if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
933         __kmp_reserve_warn = 1;
934         if (__kmp_tp_cached) {
935           __kmp_msg(kmp_ms_warning,
936                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
937                     KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
938                     KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
939         } else {
940           __kmp_msg(kmp_ms_warning,
941                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
942                     KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
943         }
944       }
945     }
946   }
947 
948 #ifdef KMP_DEBUG
949   if (new_nthreads == 1) {
950     KC_TRACE(10,
951              ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
952               "dead roots and rechecking; requested %d threads\n",
953               __kmp_get_gtid(), set_nthreads));
954   } else {
955     KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
956                   " %d threads\n",
957                   __kmp_get_gtid(), new_nthreads, set_nthreads));
958   }
959 #endif // KMP_DEBUG
960   return new_nthreads;
961 }
962 
/* Allocate threads from the thread pool and assign them to the new team. We
   are assured that there are enough threads available, because we checked
   that earlier while holding the forkjoin lock. */
966 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
967                                     kmp_info_t *master_th, int master_gtid) {
968   int i;
969   int use_hot_team;
970 
971   KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
972   KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
973   KMP_MB();
974 
975   /* first, let's setup the master thread */
976   master_th->th.th_info.ds.ds_tid = 0;
977   master_th->th.th_team = team;
978   master_th->th.th_team_nproc = team->t.t_nproc;
979   master_th->th.th_team_master = master_th;
980   master_th->th.th_team_serialized = FALSE;
981   master_th->th.th_dispatch = &team->t.t_dispatch[0];
982 
983 /* make sure we are not the optimized hot team */
984 #if KMP_NESTED_HOT_TEAMS
985   use_hot_team = 0;
986   kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
987   if (hot_teams) { // hot teams array is not allocated if
988     // KMP_HOT_TEAMS_MAX_LEVEL=0
989     int level = team->t.t_active_level - 1; // index in array of hot teams
990     if (master_th->th.th_teams_microtask) { // are we inside the teams?
991       if (master_th->th.th_teams_size.nteams > 1) {
992         ++level; // level was not increased in teams construct for
993         // team_of_masters
994       }
995       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
996           master_th->th.th_teams_level == team->t.t_level) {
997         ++level; // level was not increased in teams construct for
998         // team_of_workers before the parallel
999       } // team->t.t_level will be increased inside parallel
1000     }
1001     if (level < __kmp_hot_teams_max_level) {
1002       if (hot_teams[level].hot_team) {
1003         // hot team has already been allocated for given level
1004         KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1005         use_hot_team = 1; // the team is ready to use
1006       } else {
1007         use_hot_team = 0; // AC: threads are not allocated yet
1008         hot_teams[level].hot_team = team; // remember new hot team
1009         hot_teams[level].hot_team_nth = team->t.t_nproc;
1010       }
1011     } else {
1012       use_hot_team = 0;
1013     }
1014   }
1015 #else
1016   use_hot_team = team == root->r.r_hot_team;
1017 #endif
1018   if (!use_hot_team) {
1019 
1020     /* install the master thread */
1021     team->t.t_threads[0] = master_th;
1022     __kmp_initialize_info(master_th, team, 0, master_gtid);
1023 
1024     /* now, install the worker threads */
1025     for (i = 1; i < team->t.t_nproc; i++) {
1026 
1027       /* fork or reallocate a new thread and install it in team */
1028       kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1029       team->t.t_threads[i] = thr;
1030       KMP_DEBUG_ASSERT(thr);
1031       KMP_DEBUG_ASSERT(thr->th.th_team == team);
1032       /* align team and thread arrived states */
1033       KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1034                     "T#%d(%d:%d) join =%llu, plain=%llu\n",
1035                     __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1036                     __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1037                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1038                     team->t.t_bar[bs_plain_barrier].b_arrived));
1039       thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1040       thr->th.th_teams_level = master_th->th.th_teams_level;
1041       thr->th.th_teams_size = master_th->th.th_teams_size;
1042       { // Initialize threads' barrier data.
1043         int b;
1044         kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1045         for (b = 0; b < bs_last_barrier; ++b) {
1046           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1047           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1048 #if USE_DEBUGGER
1049           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1050 #endif
1051         }
1052       }
1053     }
1054 
1055 #if KMP_AFFINITY_SUPPORTED
1056     __kmp_partition_places(team);
1057 #endif
1058   }
1059 
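  // Decide whether thread affinity must be displayed again: flag the team if
  // any thread's nesting level or team size changed since its previous
  // parallel region.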
1060   if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1061     for (i = 0; i < team->t.t_nproc; i++) {
1062       kmp_info_t *thr = team->t.t_threads[i];
1063       if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1064           thr->th.th_prev_level != team->t.t_level) {
1065         team->t.t_display_affinity = 1;
1066         break;
1067       }
1068     }
1069   }
1070 
1071   KMP_MB();
1072 }
1073 
1074 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
// Propagate any changes to the floating point control registers out to the
// team. We try to avoid unnecessary writes to the relevant cache line in the
// team structure, so we don't make changes unless they are needed.
1078 inline static void propagateFPControl(kmp_team_t *team) {
1079   if (__kmp_inherit_fp_control) {
1080     kmp_int16 x87_fpu_control_word;
1081     kmp_uint32 mxcsr;
1082 
1083     // Get master values of FPU control flags (both X87 and vector)
1084     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1085     __kmp_store_mxcsr(&mxcsr);
1086     mxcsr &= KMP_X86_MXCSR_MASK;
1087 
1088     // There is no point looking at t_fp_control_saved here.
1089     // If it is TRUE, we still have to update the values if they are different
1090     // from those we now have. If it is FALSE we didn't save anything yet, but
1091     // our objective is the same. We have to ensure that the values in the team
1092     // are the same as those we have.
1093     // So, this code achieves what we need whether or not t_fp_control_saved is
1094     // true. By checking whether the value needs updating we avoid unnecessary
1095     // writes that would put the cache-line into a written state, causing all
1096     // threads in the team to have to read it again.
1097     KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1098     KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1099     // Although we don't use this value, other code in the runtime wants to know
1100     // whether it should restore them. So we must ensure it is correct.
1101     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1102   } else {
1103     // Similarly here. Don't write to this cache-line in the team structure
1104     // unless we have to.
1105     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1106   }
1107 }
1108 
1109 // Do the opposite, setting the hardware registers to the updated values from
1110 // the team.
1111 inline static void updateHWFPControl(kmp_team_t *team) {
1112   if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
    // Only reset the fp control regs if they have been changed in the team
    // during the parallel region that we are exiting.
1115     kmp_int16 x87_fpu_control_word;
1116     kmp_uint32 mxcsr;
1117     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1118     __kmp_store_mxcsr(&mxcsr);
1119     mxcsr &= KMP_X86_MXCSR_MASK;
1120 
1121     if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1122       __kmp_clear_x87_fpu_status_word();
1123       __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1124     }
1125 
1126     if (team->t.t_mxcsr != mxcsr) {
1127       __kmp_load_mxcsr(&team->t.t_mxcsr);
1128     }
1129   }
1130 }
1131 #else
1132 #define propagateFPControl(x) ((void)0)
1133 #define updateHWFPControl(x) ((void)0)
1134 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1135 
1136 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1137                                      int realloc); // forward declaration
1138 
/* Run a parallel region that has been serialized, so it runs only in a team
   consisting of the single master thread. */
1141 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1142   kmp_info_t *this_thr;
1143   kmp_team_t *serial_team;
1144 
1145   KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1146 
1147   /* Skip all this code for autopar serialized loops since it results in
1148      unacceptable overhead */
1149   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1150     return;
1151 
1152   if (!TCR_4(__kmp_init_parallel))
1153     __kmp_parallel_initialize();
1154   __kmp_resume_if_soft_paused();
1155 
1156   this_thr = __kmp_threads[global_tid];
1157   serial_team = this_thr->th.th_serial_team;
1158 
1159   /* utilize the serialized team held by this thread */
1160   KMP_DEBUG_ASSERT(serial_team);
1161   KMP_MB();
1162 
1163   if (__kmp_tasking_mode != tskm_immediate_exec) {
1164     KMP_DEBUG_ASSERT(
1165         this_thr->th.th_task_team ==
1166         this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1167     KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1168                      NULL);
1169     KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1170                   "team %p, new task_team = NULL\n",
1171                   global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1172     this_thr->th.th_task_team = NULL;
1173   }
1174 
1175   kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1176   if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1177     proc_bind = proc_bind_false;
1178   } else if (proc_bind == proc_bind_default) {
1179     // No proc_bind clause was specified, so use the current value
1180     // of proc-bind-var for this parallel region.
1181     proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1182   }
1183   // Reset for next parallel region
1184   this_thr->th.th_set_proc_bind = proc_bind_default;
1185 
1186 #if OMPT_SUPPORT
1187   ompt_data_t ompt_parallel_data = ompt_data_none;
1188   ompt_data_t *implicit_task_data;
1189   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1190   if (ompt_enabled.enabled &&
1191       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1192 
1193     ompt_task_info_t *parent_task_info;
1194     parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1195 
1196     parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1197     if (ompt_enabled.ompt_callback_parallel_begin) {
1198       int team_size = 1;
1199 
1200       ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1201           &(parent_task_info->task_data), &(parent_task_info->frame),
1202           &ompt_parallel_data, team_size,
1203           ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1204     }
1205   }
1206 #endif // OMPT_SUPPORT
1207 
1208   if (this_thr->th.th_team != serial_team) {
1209     // Nested level will be an index in the nested nthreads array
1210     int level = this_thr->th.th_team->t.t_level;
1211 
1212     if (serial_team->t.t_serialized) {
      /* this serial team was already used
         TODO increase performance by making these locks more specific */
1215       kmp_team_t *new_team;
1216 
1217       __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1218 
1219       new_team =
1220           __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1221 #if OMPT_SUPPORT
1222                               ompt_parallel_data,
1223 #endif
1224                               proc_bind, &this_thr->th.th_current_task->td_icvs,
1225                               0 USE_NESTED_HOT_ARG(NULL));
1226       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1227       KMP_ASSERT(new_team);
1228 
1229       /* setup new serialized team and install it */
1230       new_team->t.t_threads[0] = this_thr;
1231       new_team->t.t_parent = this_thr->th.th_team;
1232       serial_team = new_team;
1233       this_thr->th.th_serial_team = serial_team;
1234 
1235       KF_TRACE(
1236           10,
1237           ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1238            global_tid, serial_team));
1239 
1240       /* TODO the above breaks the requirement that if we run out of resources,
1241          then we can still guarantee that serialized teams are ok, since we may
1242          need to allocate a new one */
1243     } else {
1244       KF_TRACE(
1245           10,
1246           ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1247            global_tid, serial_team));
1248     }
1249 
1250     /* we have to initialize this serial team */
1251     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1252     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1253     KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1254     serial_team->t.t_ident = loc;
1255     serial_team->t.t_serialized = 1;
1256     serial_team->t.t_nproc = 1;
1257     serial_team->t.t_parent = this_thr->th.th_team;
1258     serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1259     this_thr->th.th_team = serial_team;
1260     serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1261 
    KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1263                   this_thr->th.th_current_task));
1264     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1265     this_thr->th.th_current_task->td_flags.executing = 0;
1266 
1267     __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1268 
1269     /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1270        implicit task for each serialized task represented by
1271        team->t.t_serialized? */
1272     copy_icvs(&this_thr->th.th_current_task->td_icvs,
1273               &this_thr->th.th_current_task->td_parent->td_icvs);
1274 
1275     // Thread value exists in the nested nthreads array for the next nested
1276     // level
1277     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1278       this_thr->th.th_current_task->td_icvs.nproc =
1279           __kmp_nested_nth.nth[level + 1];
1280     }
1281 
1282     if (__kmp_nested_proc_bind.used &&
1283         (level + 1 < __kmp_nested_proc_bind.used)) {
1284       this_thr->th.th_current_task->td_icvs.proc_bind =
1285           __kmp_nested_proc_bind.bind_types[level + 1];
1286     }
1287 
1288 #if USE_DEBUGGER
1289     serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1290 #endif
1291     this_thr->th.th_info.ds.ds_tid = 0;
1292 
1293     /* set thread cache values */
1294     this_thr->th.th_team_nproc = 1;
1295     this_thr->th.th_team_master = this_thr;
1296     this_thr->th.th_team_serialized = 1;
1297 
1298     serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1299     serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1300     serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1301 
1302     propagateFPControl(serial_team);
1303 
1304     /* check if we need to allocate dispatch buffers stack */
1305     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1306     if (!serial_team->t.t_dispatch->th_disp_buffer) {
1307       serial_team->t.t_dispatch->th_disp_buffer =
1308           (dispatch_private_info_t *)__kmp_allocate(
1309               sizeof(dispatch_private_info_t));
1310     }
1311     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1312 
1313     KMP_MB();
1314 
1315   } else {
1316     /* this serialized team is already being used,
1317      * that's fine, just add another nested level */
1318     KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1319     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1320     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1321     ++serial_team->t.t_serialized;
1322     this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1323 
1324     // Nested level will be an index in the nested nthreads array
1325     int level = this_thr->th.th_team->t.t_level;
1326     // Thread value exists in the nested nthreads array for the next nested
1327     // level
1328     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1329       this_thr->th.th_current_task->td_icvs.nproc =
1330           __kmp_nested_nth.nth[level + 1];
1331     }
1332     serial_team->t.t_level++;
1333     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1334                   "of serial team %p to %d\n",
1335                   global_tid, serial_team, serial_team->t.t_level));
1336 
1337     /* allocate/push dispatch buffers stack */
1338     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1339     {
1340       dispatch_private_info_t *disp_buffer =
1341           (dispatch_private_info_t *)__kmp_allocate(
1342               sizeof(dispatch_private_info_t));
1343       disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1344       serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1345     }
1346     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1347 
1348     KMP_MB();
1349   }
1350   KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1351 
1352   // Perform the display affinity functionality for
1353   // serialized parallel regions
1354   if (__kmp_display_affinity) {
1355     if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1356         this_thr->th.th_prev_num_threads != 1) {
1357       // NULL means use the affinity-format-var ICV
1358       __kmp_aux_display_affinity(global_tid, NULL);
1359       this_thr->th.th_prev_level = serial_team->t.t_level;
1360       this_thr->th.th_prev_num_threads = 1;
1361     }
1362   }
1363 
1364   if (__kmp_env_consistency_check)
1365     __kmp_push_parallel(global_tid, NULL);
1366 #if OMPT_SUPPORT
1367   serial_team->t.ompt_team_info.master_return_address = codeptr;
1368   if (ompt_enabled.enabled &&
1369       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1370     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1371 
1372     ompt_lw_taskteam_t lw_taskteam;
1373     __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1374                             &ompt_parallel_data, codeptr);
1375 
1376     __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
    // don't use lw_taskteam after linking; its content was swapped
1378 
1379     /* OMPT implicit task begin */
1380     implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
1381     if (ompt_enabled.ompt_callback_implicit_task) {
1382       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1383           ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1384           OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1385       OMPT_CUR_TASK_INFO(this_thr)
1386           ->thread_num = __kmp_tid_from_gtid(global_tid);
1387     }
1388 
1389     /* OMPT state */
1390     this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1391     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1392   }
1393 #endif
1394 }
1395 
1396 /* most of the work for a fork */
1397 /* return true if we really went parallel, false if serialized */
1398 int __kmp_fork_call(ident_t *loc, int gtid,
1399                     enum fork_context_e call_context, // Intel, GNU, ...
1400                     kmp_int32 argc, microtask_t microtask, launch_t invoker,
1401                     kmp_va_list ap) {
1402   void **argv;
1403   int i;
1404   int master_tid;
1405   int master_this_cons;
1406   kmp_team_t *team;
1407   kmp_team_t *parent_team;
1408   kmp_info_t *master_th;
1409   kmp_root_t *root;
1410   int nthreads;
1411   int master_active;
1412   int master_set_numthreads;
1413   int level;
1414   int active_level;
1415   int teams_level;
1416 #if KMP_NESTED_HOT_TEAMS
1417   kmp_hot_team_ptr_t **p_hot_teams;
1418 #endif
1419   { // KMP_TIME_BLOCK
1420     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1421     KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1422 
1423     KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1424     if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
      /* Some systems prefer the stack for the root thread(s) to start with a
         gap from the parent stack to prevent false sharing. */
      void *dummy = KMP_ALLOCA(__kmp_stkpadding);
      /* The two lines below keep the alloca from being optimized away */
1429       if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1430         __kmp_stkpadding += (short)((kmp_int64)dummy);
1431     }
1432 
1433     /* initialize if needed */
1434     KMP_DEBUG_ASSERT(
1435         __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1436     if (!TCR_4(__kmp_init_parallel))
1437       __kmp_parallel_initialize();
1438     __kmp_resume_if_soft_paused();
1439 
1440     /* setup current data */
1441     master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1442     // shutdown
1443     parent_team = master_th->th.th_team;
1444     master_tid = master_th->th.th_info.ds.ds_tid;
1445     master_this_cons = master_th->th.th_local.this_construct;
1446     root = master_th->th.th_root;
1447     master_active = root->r.r_active;
1448     master_set_numthreads = master_th->th.th_set_nproc;
1449 
1450 #if OMPT_SUPPORT
1451     ompt_data_t ompt_parallel_data = ompt_data_none;
1452     ompt_data_t *parent_task_data;
1453     ompt_frame_t *ompt_frame;
1454     ompt_data_t *implicit_task_data;
1455     void *return_address = NULL;
1456 
1457     if (ompt_enabled.enabled) {
1458       __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1459                                     NULL, NULL);
1460       return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1461     }
1462 #endif
1463 
1464     // Nested level will be an index in the nested nthreads array
1465     level = parent_team->t.t_level;
    // used to launch non-serial teams even if nesting is not allowed
1467     active_level = parent_team->t.t_active_level;
1468     // needed to check nesting inside the teams
1469     teams_level = master_th->th.th_teams_level;
1470 #if KMP_NESTED_HOT_TEAMS
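    // Lazily allocate this thread's array of nested hot team descriptors the
    // first time it forks, sized by the maximum hot-team nesting level.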
1471     p_hot_teams = &master_th->th.th_hot_teams;
1472     if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1473       *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1474           sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1475       (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
      // either the actual root hot team, or unused (when active_level > 0)
1477       (*p_hot_teams)[0].hot_team_nth = 1;
1478     }
1479 #endif
1480 
1481 #if OMPT_SUPPORT
1482     if (ompt_enabled.enabled) {
1483       if (ompt_enabled.ompt_callback_parallel_begin) {
1484         int team_size = master_set_numthreads
1485                             ? master_set_numthreads
1486                             : get__nproc_2(parent_team, master_tid);
1487         int flags = OMPT_INVOKER(call_context) |
1488                     ((microtask == (microtask_t)__kmp_teams_master)
1489                          ? ompt_parallel_league
1490                          : ompt_parallel_team);
1491         ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1492             parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1493             return_address);
1494       }
1495       master_th->th.ompt_thread_info.state = ompt_state_overhead;
1496     }
1497 #endif
1498 
1499     master_th->th.th_ident = loc;
1500 
1501     if (master_th->th.th_teams_microtask && ap &&
1502         microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1503       // AC: This is start of parallel that is nested inside teams construct.
1504       // The team is actual (hot), all workers are ready at the fork barrier.
1505       // No lock needed to initialize the team a bit, then free workers.
1506       parent_team->t.t_ident = loc;
1507       __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1508       parent_team->t.t_argc = argc;
1509       argv = (void **)parent_team->t.t_argv;
1510       for (i = argc - 1; i >= 0; --i)
1511         *argv++ = va_arg(kmp_va_deref(ap), void *);
      // Increment our nesting depth level, but do not increase serialization
1513       if (parent_team == master_th->th.th_serial_team) {
1514         // AC: we are in serialized parallel
1515         __kmpc_serialized_parallel(loc, gtid);
1516         KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1517 
1518         if (call_context == fork_context_gnu) {
1519           // AC: need to decrement t_serialized for enquiry functions to work
1520           // correctly, will restore at join time
1521           parent_team->t.t_serialized--;
1522           return TRUE;
1523         }
1524 
1525 #if OMPT_SUPPORT
1526         void *dummy;
1527         void **exit_frame_p;
1528 
1529         ompt_lw_taskteam_t lw_taskteam;
1530 
1531         if (ompt_enabled.enabled) {
1532           __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1533                                   &ompt_parallel_data, return_address);
1534           exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1535 
1536           __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
          // Don't use lw_taskteam after linking; its content was swapped.
1538 
1539           /* OMPT implicit task begin */
1540           implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1541           if (ompt_enabled.ompt_callback_implicit_task) {
1542             OMPT_CUR_TASK_INFO(master_th)
1543                 ->thread_num = __kmp_tid_from_gtid(gtid);
1544             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1545                 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1546                 implicit_task_data, 1,
1547                 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1548           }
1549 
1550           /* OMPT state */
1551           master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1552         } else {
1553           exit_frame_p = &dummy;
1554         }
1555 #endif
1556         // AC: need to decrement t_serialized for enquiry functions to work
1557         // correctly, will restore at join time
1558         parent_team->t.t_serialized--;
1559 
1560         {
1561           KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1562           KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1563           __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1564 #if OMPT_SUPPORT
1565                                  ,
1566                                  exit_frame_p
1567 #endif
1568                                  );
1569         }
1570 
1571 #if OMPT_SUPPORT
1572         if (ompt_enabled.enabled) {
1573           *exit_frame_p = NULL;
1574           OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1575           if (ompt_enabled.ompt_callback_implicit_task) {
1576             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1577                 ompt_scope_end, NULL, implicit_task_data, 1,
1578                 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1579           }
1580           ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1581           __ompt_lw_taskteam_unlink(master_th);
1582           if (ompt_enabled.ompt_callback_parallel_end) {
1583             ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1584                 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1585                 OMPT_INVOKER(call_context) | ompt_parallel_team,
1586                 return_address);
1587           }
1588           master_th->th.ompt_thread_info.state = ompt_state_overhead;
1589         }
1590 #endif
1591         return TRUE;
1592       }
1593 
1594       parent_team->t.t_pkfn = microtask;
1595       parent_team->t.t_invoke = invoker;
1596       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1597       parent_team->t.t_active_level++;
1598       parent_team->t.t_level++;
1599       parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1600 
1601 #if OMPT_SUPPORT
1602       if (ompt_enabled.enabled) {
1603         ompt_lw_taskteam_t lw_taskteam;
1604         __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1605                                 &ompt_parallel_data, return_address);
1606         __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1607       }
1608 #endif
1609 
1610       /* Change number of threads in the team if requested */
1611       if (master_set_numthreads) { // The parallel has num_threads clause
1612         if (master_set_numthreads < master_th->th.th_teams_size.nth) {
          // AC: the thread count can only be reduced dynamically, not increased
1614           kmp_info_t **other_threads = parent_team->t.t_threads;
1615           parent_team->t.t_nproc = master_set_numthreads;
1616           for (i = 0; i < master_set_numthreads; ++i) {
1617             other_threads[i]->th.th_team_nproc = master_set_numthreads;
1618           }
1619           // Keep extra threads hot in the team for possible next parallels
1620         }
1621         master_th->th.th_set_nproc = 0;
1622       }
1623 
1624 #if USE_DEBUGGER
1625       if (__kmp_debugging) { // Let debugger override number of threads.
1626         int nth = __kmp_omp_num_threads(loc);
1627         if (nth > 0) { // 0 means debugger doesn't want to change num threads
1628           master_set_numthreads = nth;
1629         }
1630       }
1631 #endif
1632 
1633 #if USE_ITT_BUILD
1634       if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1635            KMP_ITT_DEBUG) &&
1636           __kmp_forkjoin_frames_mode == 3 &&
1637           parent_team->t.t_active_level == 1 // only report frames at level 1
1638           && master_th->th.th_teams_size.nteams == 1) {
1639         kmp_uint64 tmp_time = __itt_get_timestamp();
1640         master_th->th.th_frame_time = tmp_time;
1641         parent_team->t.t_region_time = tmp_time;
1642       }
1643       if (__itt_stack_caller_create_ptr) {
1644         // create new stack stitching id before entering fork barrier
1645         parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1646       }
1647 #endif /* USE_ITT_BUILD */
1648 
1649       KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1650                     "master_th=%p, gtid=%d\n",
1651                     root, parent_team, master_th, gtid));
1652       __kmp_internal_fork(loc, gtid, parent_team);
1653       KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1654                     "master_th=%p, gtid=%d\n",
1655                     root, parent_team, master_th, gtid));
1656 
1657       if (call_context == fork_context_gnu)
1658         return TRUE;
1659 
1660       /* Invoke microtask for MASTER thread */
1661       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1662                     parent_team->t.t_id, parent_team->t.t_pkfn));
1663 
1664       if (!parent_team->t.t_invoke(gtid)) {
1665         KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
1666       }
1667       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1668                     parent_team->t.t_id, parent_team->t.t_pkfn));
1669       KMP_MB(); /* Flush all pending memory write invalidates.  */
1670 
1671       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1672 
1673       return TRUE;
1674     } // Parallel closely nested in teams construct
1675 
1676 #if KMP_DEBUG
1677     if (__kmp_tasking_mode != tskm_immediate_exec) {
1678       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1679                        parent_team->t.t_task_team[master_th->th.th_task_state]);
1680     }
1681 #endif
1682 
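    // Determine the number of threads for the new region: serialize if the
    // active level has already reached max-active-levels; otherwise honor a
    // num_threads clause if present (else the nproc ICV) and let
    // __kmp_reserve_threads() cap the request under the forkjoin lock.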
1683     if (parent_team->t.t_active_level >=
1684         master_th->th.th_current_task->td_icvs.max_active_levels) {
1685       nthreads = 1;
1686     } else {
1687       int enter_teams = ((ap == NULL && active_level == 0) ||
1688                          (ap && teams_level > 0 && teams_level == level));
1689       nthreads =
1690           master_set_numthreads
1691               ? master_set_numthreads
1692               : get__nproc_2(
1693                     parent_team,
1694                     master_tid); // TODO: get nproc directly from current task
1695 
      // Check whether we need to take the forkjoin lock (not needed for a
      // serialized parallel outside of a teams construct). This code was moved
      // here from __kmp_reserve_threads() to speed up nested serialized
      // parallels.
1699       if (nthreads > 1) {
1700         if ((get__max_active_levels(master_th) == 1 &&
1701              (root->r.r_in_parallel && !enter_teams)) ||
1702             (__kmp_library == library_serial)) {
1703           KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1704                         " threads\n",
1705                         gtid, nthreads));
1706           nthreads = 1;
1707         }
1708       }
1709       if (nthreads > 1) {
1710         /* determine how many new threads we can use */
1711         __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
        /* AC: If we execute teams from a parallel region (on the host), then
           the teams should be created, but each can have only 1 thread if
           nesting is disabled. If teams is called from a serial region, then
           the teams and their threads should be created regardless of the
           nesting setting. */
1716         nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
1717                                          nthreads, enter_teams);
1718         if (nthreads == 1) {
          // Release the lock here for single-threaded execution; for
          // multi-threaded execution it will be released later, after the
          // team of threads has been created and initialized
1722           __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1723         }
1724       }
1725     }
1726     KMP_DEBUG_ASSERT(nthreads > 0);
1727 
1728     // If we temporarily changed the set number of threads then restore it now
1729     master_th->th.th_set_nproc = 0;
1730 
1731     /* create a serialized parallel region? */
1732     if (nthreads == 1) {
1733 /* josh todo: hypothetical question: what do we do for OS X*? */
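/* Where the compiler supports variable-length arrays, place the outlined
   argument array directly on the stack; otherwise fall back to KMP_ALLOCA. */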
1734 #if KMP_OS_LINUX &&                                                            \
1735     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1736       void *args[argc];
1737 #else
1738       void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1739 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1740           KMP_ARCH_AARCH64) */
1741 
1742       KA_TRACE(20,
1743                ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1744 
1745       __kmpc_serialized_parallel(loc, gtid);
1746 
1747       if (call_context == fork_context_intel) {
1748         /* TODO this sucks, use the compiler itself to pass args! :) */
1749         master_th->th.th_serial_team->t.t_ident = loc;
1750         if (!ap) {
1751           // revert change made in __kmpc_serialized_parallel()
1752           master_th->th.th_serial_team->t.t_level--;
          // Get args from parent team for teams construct
1754 
1755 #if OMPT_SUPPORT
1756           void *dummy;
1757           void **exit_frame_p;
1758           ompt_task_info_t *task_info;
1759 
1760           ompt_lw_taskteam_t lw_taskteam;
1761 
1762           if (ompt_enabled.enabled) {
1763             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1764                                     &ompt_parallel_data, return_address);
1765 
1766             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
            // Don't use lw_taskteam after linking; its content was swapped.
1768 
1769             task_info = OMPT_CUR_TASK_INFO(master_th);
1770             exit_frame_p = &(task_info->frame.exit_frame.ptr);
1771             if (ompt_enabled.ompt_callback_implicit_task) {
1772               OMPT_CUR_TASK_INFO(master_th)
1773                   ->thread_num = __kmp_tid_from_gtid(gtid);
1774               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1775                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1776                   &(task_info->task_data), 1,
1777                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1778                   ompt_task_implicit);
1779             }
1780 
1781             /* OMPT state */
1782             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1783           } else {
1784             exit_frame_p = &dummy;
1785           }
1786 #endif
1787 
1788           {
1789             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1790             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1791             __kmp_invoke_microtask(microtask, gtid, 0, argc,
1792                                    parent_team->t.t_argv
1793 #if OMPT_SUPPORT
1794                                    ,
1795                                    exit_frame_p
1796 #endif
1797                                    );
1798           }
1799 
1800 #if OMPT_SUPPORT
1801           if (ompt_enabled.enabled) {
1802             *exit_frame_p = NULL;
1803             if (ompt_enabled.ompt_callback_implicit_task) {
1804               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1805                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1806                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1807                   ompt_task_implicit);
1808             }
1809             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1810             __ompt_lw_taskteam_unlink(master_th);
1811             if (ompt_enabled.ompt_callback_parallel_end) {
1812               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1813                   &ompt_parallel_data, parent_task_data,
1814                   OMPT_INVOKER(call_context) | ompt_parallel_team,
1815                   return_address);
1816             }
1817             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1818           }
1819 #endif
1820         } else if (microtask == (microtask_t)__kmp_teams_master) {
1821           KMP_DEBUG_ASSERT(master_th->th.th_team ==
1822                            master_th->th.th_serial_team);
1823           team = master_th->th.th_team;
1824           // team->t.t_pkfn = microtask;
1825           team->t.t_invoke = invoker;
1826           __kmp_alloc_argv_entries(argc, team, TRUE);
1827           team->t.t_argc = argc;
1828           argv = (void **)team->t.t_argv;
1829           if (ap) {
1830             for (i = argc - 1; i >= 0; --i)
1831               *argv++ = va_arg(kmp_va_deref(ap), void *);
1832           } else {
1833             for (i = 0; i < argc; ++i)
1834               // Get args from parent team for teams construct
1835               argv[i] = parent_team->t.t_argv[i];
1836           }
1837           // AC: revert change made in __kmpc_serialized_parallel()
1838           //     because initial code in teams should have level=0
1839           team->t.t_level--;
1840           // AC: call special invoker for outer "parallel" of teams construct
1841           invoker(gtid);
1842 #if OMPT_SUPPORT
1843           if (ompt_enabled.enabled) {
1844             ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1845             if (ompt_enabled.ompt_callback_implicit_task) {
1846               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1847                   ompt_scope_end, NULL, &(task_info->task_data), 0,
1848                   OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1849             }
1850             if (ompt_enabled.ompt_callback_parallel_end) {
1851               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1852                   &ompt_parallel_data, parent_task_data,
1853                   OMPT_INVOKER(call_context) | ompt_parallel_league,
1854                   return_address);
1855             }
1856             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1857           }
1858 #endif
1859         } else {
1860           argv = args;
1861           for (i = argc - 1; i >= 0; --i)
1862             *argv++ = va_arg(kmp_va_deref(ap), void *);
1863           KMP_MB();
1864 
1865 #if OMPT_SUPPORT
1866           void *dummy;
1867           void **exit_frame_p;
1868           ompt_task_info_t *task_info;
1869 
1870           ompt_lw_taskteam_t lw_taskteam;
1871 
1872           if (ompt_enabled.enabled) {
1873             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1874                                     &ompt_parallel_data, return_address);
1875             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
            // Don't use lw_taskteam after linking; its content was swapped.
1877             task_info = OMPT_CUR_TASK_INFO(master_th);
1878             exit_frame_p = &(task_info->frame.exit_frame.ptr);
1879 
1880             /* OMPT implicit task begin */
1881             implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1882             if (ompt_enabled.ompt_callback_implicit_task) {
1883               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1884                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1885                   implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1886                   ompt_task_implicit);
1887               OMPT_CUR_TASK_INFO(master_th)
1888                   ->thread_num = __kmp_tid_from_gtid(gtid);
1889             }
1890 
1891             /* OMPT state */
1892             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1893           } else {
1894             exit_frame_p = &dummy;
1895           }
1896 #endif
1897 
1898           {
1899             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1900             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1901             __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1902 #if OMPT_SUPPORT
1903                                    ,
1904                                    exit_frame_p
1905 #endif
1906                                    );
1907           }
1908 
1909 #if OMPT_SUPPORT
1910           if (ompt_enabled.enabled) {
1911             *exit_frame_p = NULL;
1912             if (ompt_enabled.ompt_callback_implicit_task) {
1913               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1914                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1915                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1916                   ompt_task_implicit);
1917             }
1918 
1919             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1920             __ompt_lw_taskteam_unlink(master_th);
1921             if (ompt_enabled.ompt_callback_parallel_end) {
1922               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1923                   &ompt_parallel_data, parent_task_data,
1924                   OMPT_INVOKER(call_context) | ompt_parallel_team,
1925                   return_address);
1926             }
1927             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1928           }
1929 #endif
1930         }
1931       } else if (call_context == fork_context_gnu) {
1932 #if OMPT_SUPPORT
1933         ompt_lw_taskteam_t lwt;
1934         __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1935                                 return_address);
1936 
1937         lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1938         __ompt_lw_taskteam_link(&lwt, master_th, 1);
// Don't use lw_taskteam after linking; its content was swapped.
1940 #endif
1941 
1942         // we were called from GNU native code
1943         KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1944         return FALSE;
1945       } else {
1946         KMP_ASSERT2(call_context < fork_context_last,
1947                     "__kmp_fork_call: unknown fork_context parameter");
1948       }
1949 
1950       KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1951       KMP_MB();
1952       return FALSE;
1953     } // if (nthreads == 1)
1954 
    // GEH: only modify the executing flag in the case when not serialized;
    //      the serialized case is handled in __kmpc_serialized_parallel
1957     KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1958                   "curtask=%p, curtask_max_aclevel=%d\n",
1959                   parent_team->t.t_active_level, master_th,
1960                   master_th->th.th_current_task,
1961                   master_th->th.th_current_task->td_icvs.max_active_levels));
1962     // TODO: GEH - cannot do this assertion because root thread not set up as
1963     // executing
1964     // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1965     master_th->th.th_current_task->td_flags.executing = 0;
1966 
1967     if (!master_th->th.th_teams_microtask || level > teams_level) {
1968       /* Increment our nested depth level */
1969       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1970     }
1971 
1972     // See if we need to make a copy of the ICVs.
1973     int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1974     if ((level + 1 < __kmp_nested_nth.used) &&
1975         (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1976       nthreads_icv = __kmp_nested_nth.nth[level + 1];
1977     } else {
1978       nthreads_icv = 0; // don't update
1979     }
1980 
1981     // Figure out the proc_bind_policy for the new team.
1982     kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1983     kmp_proc_bind_t proc_bind_icv =
1984         proc_bind_default; // proc_bind_default means don't update
1985     if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1986       proc_bind = proc_bind_false;
1987     } else {
1988       if (proc_bind == proc_bind_default) {
1989         // No proc_bind clause specified; use current proc-bind-var for this
1990         // parallel region
1991         proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1992       }
1993       /* else: The proc_bind policy was specified explicitly on parallel clause.
1994          This overrides proc-bind-var for this parallel region, but does not
1995          change proc-bind-var. */
1996       // Figure the value of proc-bind-var for the child threads.
1997       if ((level + 1 < __kmp_nested_proc_bind.used) &&
1998           (__kmp_nested_proc_bind.bind_types[level + 1] !=
1999            master_th->th.th_current_task->td_icvs.proc_bind)) {
2000         proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2001       }
2002     }
2003 
2004     // Reset for next parallel region
2005     master_th->th.th_set_proc_bind = proc_bind_default;
2006 
2007     if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
2008       kmp_internal_control_t new_icvs;
2009       copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2010       new_icvs.next = NULL;
2011       if (nthreads_icv > 0) {
2012         new_icvs.nproc = nthreads_icv;
2013       }
2014       if (proc_bind_icv != proc_bind_default) {
2015         new_icvs.proc_bind = proc_bind_icv;
2016       }
2017 
2018       /* allocate a new parallel team */
2019       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2020       team = __kmp_allocate_team(root, nthreads, nthreads,
2021 #if OMPT_SUPPORT
2022                                  ompt_parallel_data,
2023 #endif
2024                                  proc_bind, &new_icvs,
2025                                  argc USE_NESTED_HOT_ARG(master_th));
2026     } else {
2027       /* allocate a new parallel team */
2028       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2029       team = __kmp_allocate_team(root, nthreads, nthreads,
2030 #if OMPT_SUPPORT
2031                                  ompt_parallel_data,
2032 #endif
2033                                  proc_bind,
2034                                  &master_th->th.th_current_task->td_icvs,
2035                                  argc USE_NESTED_HOT_ARG(master_th));
2036     }
2037     KF_TRACE(
2038         10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2039 
2040     /* setup the new team */
2041     KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2042     KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2043     KMP_CHECK_UPDATE(team->t.t_ident, loc);
2044     KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2045     KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2046 #if OMPT_SUPPORT
2047     KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2048                           return_address);
2049 #endif
2050     KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2051     // TODO: parent_team->t.t_level == INT_MAX ???
2052     if (!master_th->th.th_teams_microtask || level > teams_level) {
2053       int new_level = parent_team->t.t_level + 1;
2054       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2055       new_level = parent_team->t.t_active_level + 1;
2056       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2057     } else {
2058       // AC: Do not increase parallel level at start of the teams construct
2059       int new_level = parent_team->t.t_level;
2060       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2061       new_level = parent_team->t.t_active_level;
2062       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2063     }
2064     kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2065     // set master's schedule as new run-time schedule
2066     KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2067 
2068     KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2069     KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2070 
2071     // Update the floating point rounding in the team if required.
2072     propagateFPControl(team);
2073 
2074     if (__kmp_tasking_mode != tskm_immediate_exec) {
      // Set master's task team to team's task team. Unless this is a hot
      // team, it should be NULL.
2077       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2078                        parent_team->t.t_task_team[master_th->th.th_task_state]);
2079       KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
2080                     "%p, new task_team %p / team %p\n",
2081                     __kmp_gtid_from_thread(master_th),
2082                     master_th->th.th_task_team, parent_team,
2083                     team->t.t_task_team[master_th->th.th_task_state], team));
2084 
2085       if (active_level || master_th->th.th_task_team) {
2086         // Take a memo of master's task_state
2087         KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
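        // Grow the memo stack geometrically (doubling) when it is full.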
2088         if (master_th->th.th_task_state_top >=
2089             master_th->th.th_task_state_stack_sz) { // increase size
2090           kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2091           kmp_uint8 *old_stack, *new_stack;
2092           kmp_uint32 i;
2093           new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2094           for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2095             new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2096           }
2097           for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2098                ++i) { // zero-init rest of stack
2099             new_stack[i] = 0;
2100           }
2101           old_stack = master_th->th.th_task_state_memo_stack;
2102           master_th->th.th_task_state_memo_stack = new_stack;
2103           master_th->th.th_task_state_stack_sz = new_size;
2104           __kmp_free(old_stack);
2105         }
2106         // Store master's task_state on stack
2107         master_th->th
2108             .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2109             master_th->th.th_task_state;
2110         master_th->th.th_task_state_top++;
2111 #if KMP_NESTED_HOT_TEAMS
2112         if (master_th->th.th_hot_teams &&
2113             active_level < __kmp_hot_teams_max_level &&
2114             team == master_th->th.th_hot_teams[active_level].hot_team) {
2115           // Restore master's nested state if nested hot team
2116           master_th->th.th_task_state =
2117               master_th->th
2118                   .th_task_state_memo_stack[master_th->th.th_task_state_top];
2119         } else {
2120 #endif
2121           master_th->th.th_task_state = 0;
2122 #if KMP_NESTED_HOT_TEAMS
2123         }
2124 #endif
2125       }
2126 #if !KMP_NESTED_HOT_TEAMS
2127       KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2128                        (team == root->r.r_hot_team));
2129 #endif
2130     }
2131 
2132     KA_TRACE(
2133         20,
2134         ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2135          gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2136          team->t.t_nproc));
2137     KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2138                      (team->t.t_master_tid == 0 &&
2139                       (team->t.t_parent == root->r.r_root_team ||
2140                        team->t.t_parent->t.t_serialized)));
2141     KMP_MB();
2142 
2143     /* now, setup the arguments */
2144     argv = (void **)team->t.t_argv;
2145     if (ap) {
2146       for (i = argc - 1; i >= 0; --i) {
2147         void *new_argv = va_arg(kmp_va_deref(ap), void *);
2148         KMP_CHECK_UPDATE(*argv, new_argv);
2149         argv++;
2150       }
2151     } else {
2152       for (i = 0; i < argc; ++i) {
2153         // Get args from parent team for teams construct
2154         KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2155       }
2156     }
2157 
2158     /* now actually fork the threads */
2159     KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2160     if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2161       root->r.r_active = TRUE;
2162 
2163     __kmp_fork_team_threads(root, team, master_th, gtid);
2164     __kmp_setup_icv_copy(team, nthreads,
2165                          &master_th->th.th_current_task->td_icvs, loc);
2166 
2167 #if OMPT_SUPPORT
2168     master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2169 #endif
2170 
2171     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2172 
2173 #if USE_ITT_BUILD
2174     if (team->t.t_active_level == 1 // only report frames at level 1
2175         && !master_th->th.th_teams_microtask) { // not in teams construct
2176 #if USE_ITT_NOTIFY
2177       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2178           (__kmp_forkjoin_frames_mode == 3 ||
2179            __kmp_forkjoin_frames_mode == 1)) {
2180         kmp_uint64 tmp_time = 0;
2181         if (__itt_get_timestamp_ptr)
2182           tmp_time = __itt_get_timestamp();
2183         // Internal fork - report frame begin
2184         master_th->th.th_frame_time = tmp_time;
2185         if (__kmp_forkjoin_frames_mode == 3)
2186           team->t.t_region_time = tmp_time;
2187       } else
2188 // only one notification scheme (either "submit" or "forking/joined", not both)
2189 #endif /* USE_ITT_NOTIFY */
2190           if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2191               __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2192         // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2193         __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2194       }
2195     }
2196 #endif /* USE_ITT_BUILD */
2197 
2198     /* now go on and do the work */
2199     KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2200     KMP_MB();
2201     KF_TRACE(10,
2202              ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2203               root, team, master_th, gtid));
2204 
2205 #if USE_ITT_BUILD
2206     if (__itt_stack_caller_create_ptr) {
2207       team->t.t_stack_id =
2208           __kmp_itt_stack_caller_create(); // create new stack stitching id
2209       // before entering fork barrier
2210     }
2211 #endif /* USE_ITT_BUILD */
2212 
    // AC: skip __kmp_internal_fork at teams construct; let only the master
    // threads execute
2215     if (ap) {
2216       __kmp_internal_fork(loc, gtid, team);
2217       KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2218                     "master_th=%p, gtid=%d\n",
2219                     root, team, master_th, gtid));
2220     }
2221 
2222     if (call_context == fork_context_gnu) {
2223       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2224       return TRUE;
2225     }
2226 
2227     /* Invoke microtask for MASTER thread */
2228     KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2229                   team->t.t_id, team->t.t_pkfn));
2230   } // END of timer KMP_fork_call block
2231 
2232 #if KMP_STATS_ENABLED
2233   // If beginning a teams construct, then change thread state
2234   stats_state_e previous_state = KMP_GET_THREAD_STATE();
2235   if (!ap) {
2236     KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2237   }
2238 #endif
2239 
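  // Run the outlined microtask on this (master) thread via the team's
  // invoker; a zero return value means the invocation failed.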
2240   if (!team->t.t_invoke(gtid)) {
2241     KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
2242   }
2243 
2244 #if KMP_STATS_ENABLED
2245   // If was beginning of a teams construct, then reset thread state
2246   if (!ap) {
2247     KMP_SET_THREAD_STATE(previous_state);
2248   }
2249 #endif
2250 
2251   KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2252                 team->t.t_id, team->t.t_pkfn));
2253   KMP_MB(); /* Flush all pending memory write invalidates.  */
2254 
2255   KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2256 
2257 #if OMPT_SUPPORT
2258   if (ompt_enabled.enabled) {
2259     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2260   }
2261 #endif
2262 
2263   return TRUE;
2264 }
2265 
2266 #if OMPT_SUPPORT
2267 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2268                                             kmp_team_t *team) {
2269   // restore state outside the region
2270   thread->th.ompt_thread_info.state =
2271       ((team->t.t_serialized) ? ompt_state_work_serial
2272                               : ompt_state_work_parallel);
2273 }
2274 
2275 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2276                                    kmp_team_t *team, ompt_data_t *parallel_data,
2277                                    int flags, void *codeptr) {
2278   ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2279   if (ompt_enabled.ompt_callback_parallel_end) {
2280     ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2281         parallel_data, &(task_info->task_data), flags, codeptr);
2282   }
2283 
2284   task_info->frame.enter_frame = ompt_data_none;
2285   __kmp_join_restore_state(thread, team);
2286 }
2287 #endif
2288 
2289 void __kmp_join_call(ident_t *loc, int gtid
2290 #if OMPT_SUPPORT
2291                      ,
2292                      enum fork_context_e fork_context
2293 #endif
2294                      ,
2295                      int exit_teams) {
2296   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2297   kmp_team_t *team;
2298   kmp_team_t *parent_team;
2299   kmp_info_t *master_th;
2300   kmp_root_t *root;
2301   int master_active;
2302 
2303   KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2304 
2305   /* setup current data */
2306   master_th = __kmp_threads[gtid];
2307   root = master_th->th.th_root;
2308   team = master_th->th.th_team;
2309   parent_team = team->t.t_parent;
2310 
2311   master_th->th.th_ident = loc;
2312 
2313 #if OMPT_SUPPORT
2314   void *team_microtask = (void *)team->t.t_pkfn;
  // For the GOMP interface with a serialized parallel, we need
  // __kmpc_end_serialized_parallel to call the hooks for the OMPT
  // end-implicit-task and end-parallel events.
2318   if (ompt_enabled.enabled &&
2319       !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2320     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2321   }
2322 #endif
2323 
2324 #if KMP_DEBUG
2325   if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2326     KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2327                   "th_task_team = %p\n",
2328                   __kmp_gtid_from_thread(master_th), team,
2329                   team->t.t_task_team[master_th->th.th_task_state],
2330                   master_th->th.th_task_team));
2331     KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2332                      team->t.t_task_team[master_th->th.th_task_state]);
2333   }
2334 #endif
2335 
2336   if (team->t.t_serialized) {
2337     if (master_th->th.th_teams_microtask) {
2338       // We are in teams construct
2339       int level = team->t.t_level;
2340       int tlevel = master_th->th.th_teams_level;
2341       if (level == tlevel) {
2342         // AC: we haven't incremented it earlier at start of teams construct,
2343         //     so do it here - at the end of teams construct
2344         team->t.t_level++;
2345       } else if (level == tlevel + 1) {
2346         // AC: we are exiting parallel inside teams, need to increment
2347         // serialization in order to restore it in the next call to
2348         // __kmpc_end_serialized_parallel
2349         team->t.t_serialized++;
2350       }
2351     }
2352     __kmpc_end_serialized_parallel(loc, gtid);
2353 
2354 #if OMPT_SUPPORT
2355     if (ompt_enabled.enabled) {
2356       __kmp_join_restore_state(master_th, parent_team);
2357     }
2358 #endif
2359 
2360     return;
2361   }
2362 
2363   master_active = team->t.t_master_active;
2364 
2365   if (!exit_teams) {
2366     // AC: No barrier for internal teams at exit from teams construct.
    //     But there is a barrier for the external team (league).
2368     __kmp_internal_join(loc, gtid, team);
2369   } else {
2370     master_th->th.th_task_state =
2371         0; // AC: no tasking in teams (out of any parallel)
2372   }
2373 
2374   KMP_MB();
2375 
2376 #if OMPT_SUPPORT
2377   ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2378   void *codeptr = team->t.ompt_team_info.master_return_address;
2379 #endif
2380 
2381 #if USE_ITT_BUILD
2382   if (__itt_stack_caller_create_ptr) {
2383     // destroy the stack stitching id after join barrier
2384     __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2385   }
2386   // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2387   if (team->t.t_active_level == 1 &&
2388       (!master_th->th.th_teams_microtask || /* not in teams construct */
2389        master_th->th.th_teams_size.nteams == 1)) {
2390     master_th->th.th_ident = loc;
2391     // only one notification scheme (either "submit" or "forking/joined", not
2392     // both)
2393     if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2394         __kmp_forkjoin_frames_mode == 3)
2395       __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2396                              master_th->th.th_frame_time, 0, loc,
2397                              master_th->th.th_team_nproc, 1);
2398     else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2399              !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2400       __kmp_itt_region_joined(gtid);
2401   } // active_level == 1
2402 #endif /* USE_ITT_BUILD */
2403 
2404   if (master_th->th.th_teams_microtask && !exit_teams &&
2405       team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2406       team->t.t_level == master_th->th.th_teams_level + 1) {
// AC: We need to leave the team structure intact at the end of a parallel
// inside the teams construct, so that the same (hot) team works at the next
// parallel; only adjust the nesting levels
2410 #if OMPT_SUPPORT
2411     ompt_data_t ompt_parallel_data = ompt_data_none;
2412     if (ompt_enabled.enabled) {
2413       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2414       if (ompt_enabled.ompt_callback_implicit_task) {
2415         int ompt_team_size = team->t.t_nproc;
2416         ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2417             ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2418             OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2419       }
2420       task_info->frame.exit_frame = ompt_data_none;
2421       task_info->task_data = ompt_data_none;
2422       ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2423       __ompt_lw_taskteam_unlink(master_th);
2424     }
2425 #endif
2426     /* Decrement our nested depth level */
2427     team->t.t_level--;
2428     team->t.t_active_level--;
2429     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2430 
2431     // Restore number of threads in the team if needed. This code relies on
2432     // the proper adjustment of th_teams_size.nth after the fork in
2433     // __kmp_teams_master on each teams master in the case that
2434     // __kmp_reserve_threads reduced it.
2435     if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2436       int old_num = master_th->th.th_team_nproc;
2437       int new_num = master_th->th.th_teams_size.nth;
2438       kmp_info_t **other_threads = team->t.t_threads;
2439       team->t.t_nproc = new_num;
2440       for (int i = 0; i < old_num; ++i) {
2441         other_threads[i]->th.th_team_nproc = new_num;
2442       }
      // Adjust the state of the team's unused threads
2444       for (int i = old_num; i < new_num; ++i) {
2445         // Re-initialize thread's barrier data.
2446         KMP_DEBUG_ASSERT(other_threads[i]);
2447         kmp_balign_t *balign = other_threads[i]->th.th_bar;
2448         for (int b = 0; b < bs_last_barrier; ++b) {
2449           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2450           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2451 #if USE_DEBUGGER
2452           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2453 #endif
2454         }
2455         if (__kmp_tasking_mode != tskm_immediate_exec) {
2456           // Synchronize thread's task state
2457           other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2458         }
2459       }
2460     }
2461 
2462 #if OMPT_SUPPORT
2463     if (ompt_enabled.enabled) {
2464       __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2465                       OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2466     }
2467 #endif
2468 
2469     return;
2470   }
2471 
2472   /* do cleanup and restore the parent team */
2473   master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2474   master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2475 
2476   master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2477 
2478   /* jc: The following lock has instructions with REL and ACQ semantics,
2479      separating the parallel user code called in this parallel region
2480      from the serial user code called after this function returns. */
2481   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2482 
2483   if (!master_th->th.th_teams_microtask ||
2484       team->t.t_level > master_th->th.th_teams_level) {
2485     /* Decrement our nested depth level */
2486     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2487   }
2488   KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2489 
2490 #if OMPT_SUPPORT
2491   if (ompt_enabled.enabled) {
2492     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2493     if (ompt_enabled.ompt_callback_implicit_task) {
2494       int flags = (team_microtask == (void *)__kmp_teams_master)
2495                       ? ompt_task_initial
2496                       : ompt_task_implicit;
2497       int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2498       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2499           ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2500           OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2501     }
2502     task_info->frame.exit_frame = ompt_data_none;
2503     task_info->task_data = ompt_data_none;
2504   }
2505 #endif
2506 
2507   KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2508                 master_th, team));
2509   __kmp_pop_current_task_from_thread(master_th);
2510 
2511 #if KMP_AFFINITY_SUPPORTED
2512   // Restore master thread's partition.
2513   master_th->th.th_first_place = team->t.t_first_place;
2514   master_th->th.th_last_place = team->t.t_last_place;
2515 #endif // KMP_AFFINITY_SUPPORTED
2516   master_th->th.th_def_allocator = team->t.t_def_allocator;
2517 
2518   updateHWFPControl(team);
2519 
2520   if (root->r.r_active != master_active)
2521     root->r.r_active = master_active;
2522 
2523   __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2524                             master_th)); // this will free worker threads
2525 
  /* This race was fun to find. Make sure the following is inside the critical
     region; otherwise assertions may fail occasionally because the old team
     may be reallocated and the hierarchy will appear inconsistent. It is
     actually safe to run and won't cause any bugs, but it will cause those
     assertion failures. It's only one dereference and assignment, so we might
     as well put it in the critical region. */
2531   master_th->th.th_team = parent_team;
2532   master_th->th.th_team_nproc = parent_team->t.t_nproc;
2533   master_th->th.th_team_master = parent_team->t.t_threads[0];
2534   master_th->th.th_team_serialized = parent_team->t.t_serialized;
2535 
2536   /* restore serialized team, if need be */
2537   if (parent_team->t.t_serialized &&
2538       parent_team != master_th->th.th_serial_team &&
2539       parent_team != root->r.r_root_team) {
2540     __kmp_free_team(root,
2541                     master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2542     master_th->th.th_serial_team = parent_team;
2543   }
2544 
2545   if (__kmp_tasking_mode != tskm_immediate_exec) {
2546     if (master_th->th.th_task_state_top >
2547         0) { // Restore task state from memo stack
2548       KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2549       // Remember master's state if we re-use this nested hot team
2550       master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2551           master_th->th.th_task_state;
2552       --master_th->th.th_task_state_top; // pop
2553       // Now restore state at this level
2554       master_th->th.th_task_state =
2555           master_th->th
2556               .th_task_state_memo_stack[master_th->th.th_task_state_top];
2557     }
2558     // Copy the task team from the parent team to the master thread
2559     master_th->th.th_task_team =
2560         parent_team->t.t_task_team[master_th->th.th_task_state];
2561     KA_TRACE(20,
2562              ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2563               __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2564               parent_team));
2565   }
2566 
2567   // TODO: GEH - cannot do this assertion because root thread not set up as
2568   // executing
2569   // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2570   master_th->th.th_current_task->td_flags.executing = 1;
2571 
2572   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2573 
2574 #if OMPT_SUPPORT
2575   int flags =
2576       OMPT_INVOKER(fork_context) |
2577       ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2578                                                       : ompt_parallel_team);
2579   if (ompt_enabled.enabled) {
2580     __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2581                     codeptr);
2582   }
2583 #endif
2584 
2585   KMP_MB();
2586   KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2587 }
2588 
2589 /* Check whether we should push an internal control record onto the
2590    serial team stack.  If so, do it.  */
2591 void __kmp_save_internal_controls(kmp_info_t *thread) {
2592 
2593   if (thread->th.th_team != thread->th.th_serial_team) {
2594     return;
2595   }
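  // A record is pushed only when this serial team is nested more than one
  // level deep and no record exists yet for the current serial nesting level.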
2596   if (thread->th.th_team->t.t_serialized > 1) {
2597     int push = 0;
2598 
2599     if (thread->th.th_team->t.t_control_stack_top == NULL) {
2600       push = 1;
2601     } else {
2602       if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2603           thread->th.th_team->t.t_serialized) {
2604         push = 1;
2605       }
2606     }
2607     if (push) { /* push a record on the serial team's stack */
2608       kmp_internal_control_t *control =
2609           (kmp_internal_control_t *)__kmp_allocate(
2610               sizeof(kmp_internal_control_t));
2611 
2612       copy_icvs(control, &thread->th.th_current_task->td_icvs);
2613 
2614       control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2615 
2616       control->next = thread->th.th_team->t.t_control_stack_top;
2617       thread->th.th_team->t.t_control_stack_top = control;
2618     }
2619   }
2620 }
2621 
2622 /* Changes set_nproc */
2623 void __kmp_set_num_threads(int new_nth, int gtid) {
2624   kmp_info_t *thread;
2625   kmp_root_t *root;
2626 
2627   KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2628   KMP_DEBUG_ASSERT(__kmp_init_serial);
2629 
2630   if (new_nth < 1)
2631     new_nth = 1;
2632   else if (new_nth > __kmp_max_nth)
2633     new_nth = __kmp_max_nth;
2634 
2635   KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2636   thread = __kmp_threads[gtid];
2637   if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2638     return; // nothing to do
2639 
2640   __kmp_save_internal_controls(thread);
2641 
2642   set__nproc(thread, new_nth);
2643 
2644   // If this omp_set_num_threads() call will cause the hot team size to be
2645   // reduced (in the absence of a num_threads clause), then reduce it now,
2646   // rather than waiting for the next parallel region.
2647   root = thread->th.th_root;
2648   if (__kmp_init_parallel && (!root->r.r_active) &&
2649       (root->r.r_hot_team->t.t_nproc > new_nth)
2650 #if KMP_NESTED_HOT_TEAMS
2651       && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2652 #endif
2653       ) {
2654     kmp_team_t *hot_team = root->r.r_hot_team;
2655     int f;
2656 
2657     __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2658 
2659     // Release the extra threads we don't need any more.
2660     for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2661       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2662       if (__kmp_tasking_mode != tskm_immediate_exec) {
        // When decreasing the team size, threads that are no longer in the
        // team should unreference the task team.
2665         hot_team->t.t_threads[f]->th.th_task_team = NULL;
2666       }
2667       __kmp_free_thread(hot_team->t.t_threads[f]);
2668       hot_team->t.t_threads[f] = NULL;
2669     }
2670     hot_team->t.t_nproc = new_nth;
2671 #if KMP_NESTED_HOT_TEAMS
2672     if (thread->th.th_hot_teams) {
2673       KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2674       thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2675     }
2676 #endif
2677 
2678     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2679 
2680     // Update the t_nproc field in the threads that are still active.
2681     for (f = 0; f < new_nth; f++) {
2682       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2683       hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2684     }
2685     // Special flag marking a size change caused by omp_set_num_threads()
2686     hot_team->t.t_size_changed = -1;
2687   }
2688 }
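
/* Usage sketch (assumed mapping, for illustration only): the user-level entry
   point omp_set_num_threads() reaches this routine with the caller's global
   thread id, roughly:

       int gtid = __kmp_entry_gtid();
       __kmp_set_num_threads(4, gtid); // clamps to [1, __kmp_max_nth]

   After the call the nproc ICV of the calling thread's current task is 4, and
   an idle hot team that was larger than 4 is trimmed immediately. */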
2689 
2690 /* Changes max_active_levels */
2691 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2692   kmp_info_t *thread;
2693 
2694   KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2695                 "%d = (%d)\n",
2696                 gtid, max_active_levels));
2697   KMP_DEBUG_ASSERT(__kmp_init_serial);
2698 
2699   // validate max_active_levels
2700   if (max_active_levels < 0) {
2701     KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2702     // We ignore this call if the user has specified a negative value.
2703     // The current setting won't be changed. The last valid setting will be
2704     // used. A warning will be issued (if warnings are allowed as controlled by
2705     // the KMP_WARNINGS env var).
2706     KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2707                   "max_active_levels for thread %d = (%d)\n",
2708                   gtid, max_active_levels));
2709     return;
2710   }
2711   if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2712     // It's OK: max_active_levels is within the valid range
2713     // [0, KMP_MAX_ACTIVE_LEVELS_LIMIT].
2714     // We allow a zero value (implementation-defined behavior).
2715   } else {
2716     KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2717                 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2718     max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2719     // Current upper limit is MAX_INT. (implementation defined behavior)
2720     // If the input exceeds the upper limit, we correct the input to be the
2721     // upper limit. (implementation defined behavior)
2722     // In practice this branch is not reached while the limit stays at MAX_INT.
2723   }
2724   KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2725                 "max_active_levels for thread %d = (%d)\n",
2726                 gtid, max_active_levels));
2727 
2728   thread = __kmp_threads[gtid];
2729 
2730   __kmp_save_internal_controls(thread);
2731 
2732   set__max_active_levels(thread, max_active_levels);
2733 }
2734 
2735 /* Gets max_active_levels */
2736 int __kmp_get_max_active_levels(int gtid) {
2737   kmp_info_t *thread;
2738 
2739   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2740   KMP_DEBUG_ASSERT(__kmp_init_serial);
2741 
2742   thread = __kmp_threads[gtid];
2743   KMP_DEBUG_ASSERT(thread->th.th_current_task);
2744   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2745                 "curtask_maxaclevel=%d\n",
2746                 gtid, thread->th.th_current_task,
2747                 thread->th.th_current_task->td_icvs.max_active_levels));
2748   return thread->th.th_current_task->td_icvs.max_active_levels;
2749 }
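
/* Illustrative pairing of the two routines above (sketch, not runtime code):

       __kmp_set_max_active_levels(gtid, 3); // validated, stored in the ICVs
       int lvls = __kmp_get_max_active_levels(gtid); // reads the same ICV, 3

   A negative argument is ignored with a warning; a value larger than
   KMP_MAX_ACTIVE_LEVELS_LIMIT is clamped to that limit. */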
2750 
2751 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2752 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2753 
2754 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2755 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2756   kmp_info_t *thread;
2757   kmp_sched_t orig_kind;
2758   //    kmp_team_t *team;
2759 
2760   KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2761                 gtid, (int)kind, chunk));
2762   KMP_DEBUG_ASSERT(__kmp_init_serial);
2763 
2764   // Check if the kind parameter is valid, correct if needed.
2765   // Valid parameters should fit in one of two intervals - standard or extended:
2766   //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2767   // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
2768   orig_kind = kind;
2769   kind = __kmp_sched_without_mods(kind);
2770 
2771   if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2772       (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2773     // TODO: Hint needs attention in case we change the default schedule.
2774     __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2775               KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2776               __kmp_msg_null);
2777     kind = kmp_sched_default;
2778     chunk = 0; // ignore chunk value in case of bad kind
2779   }
2780 
2781   thread = __kmp_threads[gtid];
2782 
2783   __kmp_save_internal_controls(thread);
2784 
2785   if (kind < kmp_sched_upper_std) {
2786     if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2787       // differentiate static chunked vs. unchunked: an invalid chunk value
2788       // indicates the unchunked schedule (which is the default)
2789       thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2790     } else {
2791       thread->th.th_current_task->td_icvs.sched.r_sched_type =
2792           __kmp_sch_map[kind - kmp_sched_lower - 1];
2793     }
2794   } else {
2795     //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2796     //    kmp_sched_lower - 2 ];
2797     thread->th.th_current_task->td_icvs.sched.r_sched_type =
2798         __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2799                       kmp_sched_lower - 2];
2800   }
2801   __kmp_sched_apply_mods_intkind(
2802       orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2803   if (kind == kmp_sched_auto || chunk < 1) {
2804     // ignore parameter chunk for schedule auto
2805     thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2806   } else {
2807     thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2808   }
2809 }
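
/* Effect sketch (illustrative, not authoritative): a call corresponding to
   omp_set_schedule(omp_sched_dynamic, 4) ends up here as

       __kmp_set_schedule(gtid, kmp_sched_dynamic, 4);
       // td_icvs.sched.r_sched_type == kmp_sch_dynamic_chunked
       // td_icvs.sched.chunk        == 4

   whereas kmp_sched_auto, or any chunk < 1, stores KMP_DEFAULT_CHUNK. */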
2810 
2811 /* Gets def_sched_var ICV values */
2812 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2813   kmp_info_t *thread;
2814   enum sched_type th_type;
2815 
2816   KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2817   KMP_DEBUG_ASSERT(__kmp_init_serial);
2818 
2819   thread = __kmp_threads[gtid];
2820 
2821   th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2822   switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2823   case kmp_sch_static:
2824   case kmp_sch_static_greedy:
2825   case kmp_sch_static_balanced:
2826     *kind = kmp_sched_static;
2827     __kmp_sched_apply_mods_stdkind(kind, th_type);
2828     *chunk = 0; // chunk was not set; report this fact via a zero value
2829     return;
2830   case kmp_sch_static_chunked:
2831     *kind = kmp_sched_static;
2832     break;
2833   case kmp_sch_dynamic_chunked:
2834     *kind = kmp_sched_dynamic;
2835     break;
2836   case kmp_sch_guided_chunked:
2837   case kmp_sch_guided_iterative_chunked:
2838   case kmp_sch_guided_analytical_chunked:
2839     *kind = kmp_sched_guided;
2840     break;
2841   case kmp_sch_auto:
2842     *kind = kmp_sched_auto;
2843     break;
2844   case kmp_sch_trapezoidal:
2845     *kind = kmp_sched_trapezoidal;
2846     break;
2847 #if KMP_STATIC_STEAL_ENABLED
2848   case kmp_sch_static_steal:
2849     *kind = kmp_sched_static_steal;
2850     break;
2851 #endif
2852   default:
2853     KMP_FATAL(UnknownSchedulingType, th_type);
2854   }
2855 
2856   __kmp_sched_apply_mods_stdkind(kind, th_type);
2857   *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2858 }
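
/* Round-trip sketch (illustrative only): after the __kmp_set_schedule() call
   shown above, querying the ICV returns the standard kind and chunk:

       kmp_sched_t kind;
       int chunk;
       __kmp_get_schedule(gtid, &kind, &chunk);
       // kind == kmp_sched_dynamic, chunk == 4

   An unchunked static schedule deliberately reports chunk == 0. */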
2859 
2860 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2861 
2862   int ii, dd;
2863   kmp_team_t *team;
2864   kmp_info_t *thr;
2865 
2866   KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2867   KMP_DEBUG_ASSERT(__kmp_init_serial);
2868 
2869   // validate level
2870   if (level == 0)
2871     return 0;
2872   if (level < 0)
2873     return -1;
2874   thr = __kmp_threads[gtid];
2875   team = thr->th.th_team;
2876   ii = team->t.t_level;
2877   if (level > ii)
2878     return -1;
2879 
2880   if (thr->th.th_teams_microtask) {
2881     // AC: we are in a teams region; multiple nested teams have the same level
2882     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2883     if (level <=
2884         tlevel) { // otherwise the usual algorithm works (teams are untouched)
2885       KMP_DEBUG_ASSERT(ii >= tlevel);
2886       // AC: as we need to pass through the teams league, we artificially
2887       // increase ii
2888       if (ii == tlevel) {
2889         ii += 2; // three teams have the same level
2890       } else {
2891         ii++; // two teams have the same level
2892       }
2893     }
2894   }
2895 
2896   if (ii == level)
2897     return __kmp_tid_from_gtid(gtid);
2898 
2899   dd = team->t.t_serialized;
2900   level++;
2901   while (ii > level) {
2902     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2903     }
2904     if ((team->t.t_serialized) && (!dd)) {
2905       team = team->t.t_parent;
2906       continue;
2907     }
2908     if (ii > level) {
2909       team = team->t.t_parent;
2910       dd = team->t.t_serialized;
2911       ii--;
2912     }
2913   }
2914 
2915   return (dd > 1) ? (0) : (team->t.t_master_tid);
2916 }
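
/* Semantics sketch (assumed example): for a thread with tid 3 inside a single
   non-serialized parallel region, level 0 denotes the implicit initial team:

       __kmp_get_ancestor_thread_num(gtid, 0); // -> 0 (initial thread)
       __kmp_get_ancestor_thread_num(gtid, 1); // -> 3 (tid in current team)
       __kmp_get_ancestor_thread_num(gtid, 2); // -> -1 (no such level)

   Serialized levels and teams constructs are walked through t_serialized and
   the artificial ii adjustment above. */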
2917 
2918 int __kmp_get_team_size(int gtid, int level) {
2919 
2920   int ii, dd;
2921   kmp_team_t *team;
2922   kmp_info_t *thr;
2923 
2924   KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2925   KMP_DEBUG_ASSERT(__kmp_init_serial);
2926 
2927   // validate level
2928   if (level == 0)
2929     return 1;
2930   if (level < 0)
2931     return -1;
2932   thr = __kmp_threads[gtid];
2933   team = thr->th.th_team;
2934   ii = team->t.t_level;
2935   if (level > ii)
2936     return -1;
2937 
2938   if (thr->th.th_teams_microtask) {
2939     // AC: we are in a teams region; multiple nested teams have the same level
2940     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2941     if (level <=
2942         tlevel) { // otherwise the usual algorithm works (teams are untouched)
2943       KMP_DEBUG_ASSERT(ii >= tlevel);
2944       // AC: as we need to pass through the teams league, we artificially
2945       // increase ii
2946       if (ii == tlevel) {
2947         ii += 2; // three teams have the same level
2948       } else {
2949         ii++; // two teams have the same level
2950       }
2951     }
2952   }
2953 
2954   while (ii > level) {
2955     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2956     }
2957     if (team->t.t_serialized && (!dd)) {
2958       team = team->t.t_parent;
2959       continue;
2960     }
2961     if (ii > level) {
2962       team = team->t.t_parent;
2963       ii--;
2964     }
2965   }
2966 
2967   return team->t.t_nproc;
2968 }
2969 
2970 kmp_r_sched_t __kmp_get_schedule_global() {
2971   // This routine exists because the pairs (__kmp_sched, __kmp_chunk) and
2972   // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
2973   // independently, so the up-to-date schedule can be obtained here.
2974 
2975   kmp_r_sched_t r_sched;
2976 
2977   // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
2978   // __kmp_guided. __kmp_sched should keep its original value so that the user
2979   // can set KMP_SCHEDULE multiple times and thus have different run-time
2980   // schedules in different roots (even in OMP 2.5).
2981   enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
2982   enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
2983   if (s == kmp_sch_static) {
2984     // replace STATIC with more detailed schedule (balanced or greedy)
2985     r_sched.r_sched_type = __kmp_static;
2986   } else if (s == kmp_sch_guided_chunked) {
2987     // replace GUIDED with more detailed schedule (iterative or analytical)
2988     r_sched.r_sched_type = __kmp_guided;
2989   } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2990     r_sched.r_sched_type = __kmp_sched;
2991   }
2992   SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
2993 
2994   if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
2995     // __kmp_chunk may be invalid here (if it was never set)
2996     r_sched.chunk = KMP_DEFAULT_CHUNK;
2997   } else {
2998     r_sched.chunk = __kmp_chunk;
2999   }
3000 
3001   return r_sched;
3002 }
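
/* Composition sketch (illustrative): with KMP_SCHEDULE selecting guided and
   no chunk ever set, the result is roughly

       kmp_r_sched_t rs = __kmp_get_schedule_global();
       // rs.r_sched_type == __kmp_guided (iterative or analytical variant)
       // rs.chunk        == KMP_DEFAULT_CHUNK

   so downstream consumers always see a concrete {schedule, chunk} pair. */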
3003 
3004 /* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
3005    at least argc number of *t_argv entries for the requested team. */
3006 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3007 
3008   KMP_DEBUG_ASSERT(team);
3009   if (!realloc || argc > team->t.t_max_argc) {
3010 
3011     KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3012                    "current entries=%d\n",
3013                    team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3014     /* if previously allocated heap space for args, free them */
3015     if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3016       __kmp_free((void *)team->t.t_argv);
3017 
3018     if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3019       /* use unused space in the cache line for arguments */
3020       team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3021       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3022                      "argv entries\n",
3023                      team->t.t_id, team->t.t_max_argc));
3024       team->t.t_argv = &team->t.t_inline_argv[0];
3025       if (__kmp_storage_map) {
3026         __kmp_print_storage_map_gtid(
3027             -1, &team->t.t_inline_argv[0],
3028             &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3029             (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3030             team->t.t_id);
3031       }
3032     } else {
3033       /* allocate space for arguments in the heap */
3034       team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3035                                ? KMP_MIN_MALLOC_ARGV_ENTRIES
3036                                : 2 * argc;
3037       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3038                      "argv entries\n",
3039                      team->t.t_id, team->t.t_max_argc));
3040       team->t.t_argv =
3041           (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3042       if (__kmp_storage_map) {
3043         __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3044                                      &team->t.t_argv[team->t.t_max_argc],
3045                                      sizeof(void *) * team->t.t_max_argc,
3046                                      "team_%d.t_argv", team->t.t_id);
3047       }
3048     }
3049   }
3050 }
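
/* Sizing example (the constants below are assumptions for illustration):
   with KMP_INLINE_ARGV_ENTRIES == 10 and KMP_MIN_MALLOC_ARGV_ENTRIES == 100,

       __kmp_alloc_argv_entries(4, team, FALSE);  // inline: t_inline_argv
       __kmp_alloc_argv_entries(30, team, TRUE);  // heap, t_max_argc == 100
       __kmp_alloc_argv_entries(120, team, TRUE); // heap, t_max_argc == 240

   i.e. small argument lists stay in the cache line, while larger ones get at
   least KMP_MIN_MALLOC_ARGV_ENTRIES (or 2 * argc) page-allocated entries. */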
3051 
3052 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3053   int i;
3054   int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3055   team->t.t_threads =
3056       (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3057   team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3058       sizeof(dispatch_shared_info_t) * num_disp_buff);
3059   team->t.t_dispatch =
3060       (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3061   team->t.t_implicit_task_taskdata =
3062       (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3063   team->t.t_max_nproc = max_nth;
3064 
3065   /* setup dispatch buffers */
3066   for (i = 0; i < num_disp_buff; ++i) {
3067     team->t.t_disp_buffer[i].buffer_index = i;
3068     team->t.t_disp_buffer[i].doacross_buf_idx = i;
3069   }
3070 }
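
/* Allocation sketch (illustrative): for a team with max_nth == 8 and the
   default __kmp_dispatch_num_buffers (assumed to be 7 here), the routine
   above allocates per team:

       t_threads[8], t_dispatch[8], t_implicit_task_taskdata[8],
       t_disp_buffer[7] // buffer_index / doacross_buf_idx preset to 0..6

   A serialized team (max_nth == 1) still gets 2 dispatch buffers. */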
3071 
3072 static void __kmp_free_team_arrays(kmp_team_t *team) {
3073   /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3074   int i;
3075   for (i = 0; i < team->t.t_max_nproc; ++i) {
3076     if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3077       __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3078       team->t.t_dispatch[i].th_disp_buffer = NULL;
3079     }
3080   }
3081 #if KMP_USE_HIER_SCHED
3082   __kmp_dispatch_free_hierarchies(team);
3083 #endif
3084   __kmp_free(team->t.t_threads);
3085   __kmp_free(team->t.t_disp_buffer);
3086   __kmp_free(team->t.t_dispatch);
3087   __kmp_free(team->t.t_implicit_task_taskdata);
3088   team->t.t_threads = NULL;
3089   team->t.t_disp_buffer = NULL;
3090   team->t.t_dispatch = NULL;
3091   team->t.t_implicit_task_taskdata = 0;
3092 }
3093 
3094 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3095   kmp_info_t **oldThreads = team->t.t_threads;
3096 
3097   __kmp_free(team->t.t_disp_buffer);
3098   __kmp_free(team->t.t_dispatch);
3099   __kmp_free(team->t.t_implicit_task_taskdata);
3100   __kmp_allocate_team_arrays(team, max_nth);
3101 
3102   KMP_MEMCPY(team->t.t_threads, oldThreads,
3103              team->t.t_nproc * sizeof(kmp_info_t *));
3104 
3105   __kmp_free(oldThreads);
3106 }
3107 
3108 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3109 
3110   kmp_r_sched_t r_sched =
3111       __kmp_get_schedule_global(); // get current state of scheduling globals
3112 
3113   KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3114 
3115   kmp_internal_control_t g_icvs = {
3116     0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3117     (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3118     // adjustment of threads (per thread)
3119     (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3120     // whether blocktime is explicitly set
3121     __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3122 #if KMP_USE_MONITOR
3123     __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3124 // intervals
3125 #endif
3126     __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3127     // next parallel region (per thread)
3128     // (use a max upper bound if __kmp_parallel_initialize was not called yet)
3129     __kmp_cg_max_nth, // int thread_limit;
3130     __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3131     // for max_active_levels
3132     r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3133     // {sched,chunk} pair
3134     __kmp_nested_proc_bind.bind_types[0],
3135     __kmp_default_device,
3136     NULL // struct kmp_internal_control *next;
3137   };
3138 
3139   return g_icvs;
3140 }
3141 
3142 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3143 
3144   kmp_internal_control_t gx_icvs;
3145   gx_icvs.serial_nesting_level =
3146       0; // probably = team->t.t_serialized as in __kmp_save_internal_controls
3147   copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3148   gx_icvs.next = NULL;
3149 
3150   return gx_icvs;
3151 }
3152 
3153 static void __kmp_initialize_root(kmp_root_t *root) {
3154   int f;
3155   kmp_team_t *root_team;
3156   kmp_team_t *hot_team;
3157   int hot_team_max_nth;
3158   kmp_r_sched_t r_sched =
3159       __kmp_get_schedule_global(); // get current state of scheduling globals
3160   kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3161   KMP_DEBUG_ASSERT(root);
3162   KMP_ASSERT(!root->r.r_begin);
3163 
3164   /* setup the root state structure */
3165   __kmp_init_lock(&root->r.r_begin_lock);
3166   root->r.r_begin = FALSE;
3167   root->r.r_active = FALSE;
3168   root->r.r_in_parallel = 0;
3169   root->r.r_blocktime = __kmp_dflt_blocktime;
3170 
3171   /* setup the root team for this task */
3172   /* allocate the root team structure */
3173   KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3174 
3175   root_team =
3176       __kmp_allocate_team(root,
3177                           1, // new_nproc
3178                           1, // max_nproc
3179 #if OMPT_SUPPORT
3180                           ompt_data_none, // root parallel id
3181 #endif
3182                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3183                           0 // argc
3184                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3185                           );
3186 #if USE_DEBUGGER
3187   // Non-NULL value should be assigned to make the debugger display the root
3188   // team.
3189   TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3190 #endif
3191 
3192   KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3193 
3194   root->r.r_root_team = root_team;
3195   root_team->t.t_control_stack_top = NULL;
3196 
3197   /* initialize root team */
3198   root_team->t.t_threads[0] = NULL;
3199   root_team->t.t_nproc = 1;
3200   root_team->t.t_serialized = 1;
3201   // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3202   root_team->t.t_sched.sched = r_sched.sched;
3203   KA_TRACE(
3204       20,
3205       ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3206        root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3207 
3208   /* setup the hot team for this task */
3209   /* allocate the hot team structure */
3210   KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3211 
3212   hot_team =
3213       __kmp_allocate_team(root,
3214                           1, // new_nproc
3215                           __kmp_dflt_team_nth_ub * 2, // max_nproc
3216 #if OMPT_SUPPORT
3217                           ompt_data_none, // root parallel id
3218 #endif
3219                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3220                           0 // argc
3221                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3222                           );
3223   KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3224 
3225   root->r.r_hot_team = hot_team;
3226   root_team->t.t_control_stack_top = NULL;
3227 
3228   /* first-time initialization */
3229   hot_team->t.t_parent = root_team;
3230 
3231   /* initialize hot team */
3232   hot_team_max_nth = hot_team->t.t_max_nproc;
3233   for (f = 0; f < hot_team_max_nth; ++f) {
3234     hot_team->t.t_threads[f] = NULL;
3235   }
3236   hot_team->t.t_nproc = 1;
3237   // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3238   hot_team->t.t_sched.sched = r_sched.sched;
3239   hot_team->t.t_size_changed = 0;
3240 }
3241 
3242 #ifdef KMP_DEBUG
3243 
3244 typedef struct kmp_team_list_item {
3245   kmp_team_p const *entry;
3246   struct kmp_team_list_item *next;
3247 } kmp_team_list_item_t;
3248 typedef kmp_team_list_item_t *kmp_team_list_t;
3249 
3250 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3251     kmp_team_list_t list, // List of teams.
3252     kmp_team_p const *team // Team to add.
3253     ) {
3254 
3255   // List must terminate with item where both entry and next are NULL.
3256   // Team is added to the list only once.
3257   // List is sorted in ascending order by team id.
3258   // Team id is *not* a key.
3259 
3260   kmp_team_list_t l;
3261 
3262   KMP_DEBUG_ASSERT(list != NULL);
3263   if (team == NULL) {
3264     return;
3265   }
3266 
3267   __kmp_print_structure_team_accum(list, team->t.t_parent);
3268   __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3269 
3270   // Search list for the team.
3271   l = list;
3272   while (l->next != NULL && l->entry != team) {
3273     l = l->next;
3274   }
3275   if (l->next != NULL) {
3276     return; // Team has been added before, exit.
3277   }
3278 
3279   // Team is not found. Search list again for insertion point.
3280   l = list;
3281   while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3282     l = l->next;
3283   }
3284 
3285   // Insert team.
3286   {
3287     kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3288         sizeof(kmp_team_list_item_t));
3289     *item = *l;
3290     l->entry = team;
3291     l->next = item;
3292   }
3293 }
3294 
3295 static void __kmp_print_structure_team(char const *title,
3296                                        kmp_team_p const *team) {
3298   __kmp_printf("%s", title);
3299   if (team != NULL) {
3300     __kmp_printf("%2x %p\n", team->t.t_id, team);
3301   } else {
3302     __kmp_printf(" - (nil)\n");
3303   }
3304 }
3305 
3306 static void __kmp_print_structure_thread(char const *title,
3307                                          kmp_info_p const *thread) {
3308   __kmp_printf("%s", title);
3309   if (thread != NULL) {
3310     __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3311   } else {
3312     __kmp_printf(" - (nil)\n");
3313   }
3314 }
3315 
3316 void __kmp_print_structure(void) {
3317 
3318   kmp_team_list_t list;
3319 
3320   // Initialize list of teams.
3321   list =
3322       (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3323   list->entry = NULL;
3324   list->next = NULL;
3325 
3326   __kmp_printf("\n------------------------------\nGlobal Thread "
3327                "Table\n------------------------------\n");
3328   {
3329     int gtid;
3330     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3331       __kmp_printf("%2d", gtid);
3332       if (__kmp_threads != NULL) {
3333         __kmp_printf(" %p", __kmp_threads[gtid]);
3334       }
3335       if (__kmp_root != NULL) {
3336         __kmp_printf(" %p", __kmp_root[gtid]);
3337       }
3338       __kmp_printf("\n");
3339     }
3340   }
3341 
3342   // Print out __kmp_threads array.
3343   __kmp_printf("\n------------------------------\nThreads\n--------------------"
3344                "----------\n");
3345   if (__kmp_threads != NULL) {
3346     int gtid;
3347     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3348       kmp_info_t const *thread = __kmp_threads[gtid];
3349       if (thread != NULL) {
3350         __kmp_printf("GTID %2d %p:\n", gtid, thread);
3351         __kmp_printf("    Our Root:        %p\n", thread->th.th_root);
3352         __kmp_print_structure_team("    Our Team:     ", thread->th.th_team);
3353         __kmp_print_structure_team("    Serial Team:  ",
3354                                    thread->th.th_serial_team);
3355         __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
3356         __kmp_print_structure_thread("    Master:       ",
3357                                      thread->th.th_team_master);
3358         __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
3359         __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
3360         __kmp_printf("    Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3361         __kmp_print_structure_thread("    Next in pool: ",
3362                                      thread->th.th_next_pool);
3363         __kmp_printf("\n");
3364         __kmp_print_structure_team_accum(list, thread->th.th_team);
3365         __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3366       }
3367     }
3368   } else {
3369     __kmp_printf("Threads array is not allocated.\n");
3370   }
3371 
3372   // Print out __kmp_root array.
3373   __kmp_printf("\n------------------------------\nUbers\n----------------------"
3374                "--------\n");
3375   if (__kmp_root != NULL) {
3376     int gtid;
3377     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3378       kmp_root_t const *root = __kmp_root[gtid];
3379       if (root != NULL) {
3380         __kmp_printf("GTID %2d %p:\n", gtid, root);
3381         __kmp_print_structure_team("    Root Team:    ", root->r.r_root_team);
3382         __kmp_print_structure_team("    Hot Team:     ", root->r.r_hot_team);
3383         __kmp_print_structure_thread("    Uber Thread:  ",
3384                                      root->r.r_uber_thread);
3385         __kmp_printf("    Active?:      %2d\n", root->r.r_active);
3386         __kmp_printf("    In Parallel:  %2d\n",
3387                      KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3388         __kmp_printf("\n");
3389         __kmp_print_structure_team_accum(list, root->r.r_root_team);
3390         __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3391       }
3392     }
3393   } else {
3394     __kmp_printf("Ubers array is not allocated.\n");
3395   }
3396 
3397   __kmp_printf("\n------------------------------\nTeams\n----------------------"
3398                "--------\n");
3399   while (list->next != NULL) {
3400     kmp_team_p const *team = list->entry;
3401     int i;
3402     __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3403     __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
3404     __kmp_printf("    Master TID:       %2d\n", team->t.t_master_tid);
3405     __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
3406     __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
3407     __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
3408     for (i = 0; i < team->t.t_nproc; ++i) {
3409       __kmp_printf("    Thread %2d:      ", i);
3410       __kmp_print_structure_thread("", team->t.t_threads[i]);
3411     }
3412     __kmp_print_structure_team("    Next in pool:     ", team->t.t_next_pool);
3413     __kmp_printf("\n");
3414     list = list->next;
3415   }
3416 
3417   // Print out __kmp_thread_pool and __kmp_team_pool.
3418   __kmp_printf("\n------------------------------\nPools\n----------------------"
3419                "--------\n");
3420   __kmp_print_structure_thread("Thread pool:          ",
3421                                CCAST(kmp_info_t *, __kmp_thread_pool));
3422   __kmp_print_structure_team("Team pool:            ",
3423                              CCAST(kmp_team_t *, __kmp_team_pool));
3424   __kmp_printf("\n");
3425 
3426   // Free team list.
3427   while (list != NULL) {
3428     kmp_team_list_item_t *item = list;
3429     list = list->next;
3430     KMP_INTERNAL_FREE(item);
3431   }
3432 }
3433 
3434 #endif
3435 
3436 //---------------------------------------------------------------------------
3437 //  Stuff for per-thread fast random number generator
3438 //  Table of primes
3439 static const unsigned __kmp_primes[] = {
3440     0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3441     0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3442     0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3443     0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3444     0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3445     0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3446     0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3447     0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3448     0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3449     0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3450     0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3451 
3452 //---------------------------------------------------------------------------
3453 //  __kmp_get_random: Get a random number using a linear congruential method.
3454 unsigned short __kmp_get_random(kmp_info_t *thread) {
3455   unsigned x = thread->th.th_x;
3456   unsigned short r = x >> 16;
3457 
3458   thread->th.th_x = x * thread->th.th_a + 1;
3459 
3460   KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3461                 thread->th.th_info.ds.ds_tid, r));
3462 
3463   return r;
3464 }
3465 //--------------------------------------------------------
3466 // __kmp_init_random: Initialize a random number generator
3467 void __kmp_init_random(kmp_info_t *thread) {
3468   unsigned seed = thread->th.th_info.ds.ds_tid;
3469 
3470   thread->th.th_a =
3471       __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3472   thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3473   KA_TRACE(30,
3474            ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3475 }
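
/* The two routines above form a per-thread linear congruential generator:
   x_{n+1} = a * x_n + 1 (mod 2^32), returning the high 16 bits of the state.
   Standalone sketch with an assumed multiplier taken from __kmp_primes:

       unsigned a = 0x9e3779b1;         // odd multiplier from the table
       unsigned x = (seed + 1) * a + 1; // __kmp_init_random
       unsigned short r = x >> 16;      // value __kmp_get_random returns
       x = x * a + 1;                   // state update for the next call

   Different tids select different table entries (modulo the table size),
   which decorrelates the per-thread streams. */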
3476 
3477 #if KMP_OS_WINDOWS
3478 /* reclaim array entries for root threads that are already dead, returns number
3479  * reclaimed */
3480 static int __kmp_reclaim_dead_roots(void) {
3481   int i, r = 0;
3482 
3483   for (i = 0; i < __kmp_threads_capacity; ++i) {
3484     if (KMP_UBER_GTID(i) &&
3485         !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3486         !__kmp_root[i]
3487              ->r.r_active) { // AC: reclaim only roots died in non-active state
3488       r += __kmp_unregister_root_other_thread(i);
3489     }
3490   }
3491   return r;
3492 }
3493 #endif
3494 
3495 /* This function attempts to create free entries in __kmp_threads and
3496    __kmp_root, and returns the number of free entries generated.
3497 
3498    For Windows* OS static library, the first mechanism used is to reclaim array
3499    entries for root threads that are already dead.
3500 
3501    On all platforms, expansion is attempted on the arrays __kmp_threads and
3502    __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3503    capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3504    threadprivate cache array has been created. Synchronization with
3505    __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3506 
3507    After any dead root reclamation, if the clipping value allows array expansion
3508    to result in the generation of a total of nNeed free slots, the function does
3509    that expansion. If not, nothing is done beyond the possible initial root
3510    thread reclamation.
3511 
3512    If any argument is negative, the behavior is undefined. */
3513 static int __kmp_expand_threads(int nNeed) {
3514   int added = 0;
3515   int minimumRequiredCapacity;
3516   int newCapacity;
3517   kmp_info_t **newThreads;
3518   kmp_root_t **newRoot;
3519 
3520 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3521 // resizing __kmp_threads does not need additional protection if foreign
3522 // threads are present
3523 
3524 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3525   /* only for Windows static library */
3526   /* reclaim array entries for root threads that are already dead */
3527   added = __kmp_reclaim_dead_roots();
3528 
3529   if (nNeed) {
3530     nNeed -= added;
3531     if (nNeed < 0)
3532       nNeed = 0;
3533   }
3534 #endif
3535   if (nNeed <= 0)
3536     return added;
3537 
3538   // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3539   // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3540   // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3541   // > __kmp_max_nth in one of two ways:
3542   //
3543   // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
3544   //    may not be reused by another thread, so we may need to increase
3545   //    __kmp_threads_capacity to __kmp_max_nth + 1.
3546   //
3547   // 2) New foreign root(s) are encountered.  We always register new foreign
3548   //    roots. This may cause a smaller # of threads to be allocated at
3549   //    subsequent parallel regions, but the worker threads hang around (and
3550   //    eventually go to sleep) and need slots in the __kmp_threads[] array.
3551   //
3552   // Anyway, that is the reason for moving the check to see if
3553   // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3554   // instead of having it performed here. -BB
3555 
3556   KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3557 
3558   /* compute expansion headroom to check if we can expand */
3559   if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3560     /* possible expansion too small -- give up */
3561     return added;
3562   }
3563   minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3564 
3565   newCapacity = __kmp_threads_capacity;
3566   do {
3567     newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3568                                                           : __kmp_sys_max_nth;
3569   } while (newCapacity < minimumRequiredCapacity);
3570   newThreads = (kmp_info_t **)__kmp_allocate(
3571       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3572   newRoot =
3573       (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3574   KMP_MEMCPY(newThreads, __kmp_threads,
3575              __kmp_threads_capacity * sizeof(kmp_info_t *));
3576   KMP_MEMCPY(newRoot, __kmp_root,
3577              __kmp_threads_capacity * sizeof(kmp_root_t *));
3578 
3579   kmp_info_t **temp_threads = __kmp_threads;
3580   *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3581   *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3582   __kmp_free(temp_threads);
3583   added += newCapacity - __kmp_threads_capacity;
3584   *(volatile int *)&__kmp_threads_capacity = newCapacity;
3585 
3586   if (newCapacity > __kmp_tp_capacity) {
3587     __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3588     if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3589       __kmp_threadprivate_resize_cache(newCapacity);
3590     } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3591       *(volatile int *)&__kmp_tp_capacity = newCapacity;
3592     }
3593     __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3594   }
3595 
3596   return added;
3597 }
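
/* Expansion example (illustrative numbers): with __kmp_threads_capacity == 32
   and nNeed == 40, minimumRequiredCapacity is 72 and the doubling loop gives

       32 -> 64 -> 128   (clipped to __kmp_sys_max_nth if that is smaller)

   so both arrays are reallocated side by side at capacity 128 and the routine
   reports 96 newly available slots. */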
3598 
3599 /* Register the current thread as a root thread and obtain our gtid. We must
3600    have the __kmp_initz_lock held at this point. The argument is TRUE only if
3601    we are the thread that calls from __kmp_do_serial_initialize(). */
3602 int __kmp_register_root(int initial_thread) {
3603   kmp_info_t *root_thread;
3604   kmp_root_t *root;
3605   int gtid;
3606   int capacity;
3607   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3608   KA_TRACE(20, ("__kmp_register_root: entered\n"));
3609   KMP_MB();
3610 
3611   /* 2007-03-02:
3612      If the initial thread has not invoked the OpenMP RTL yet, and this thread
3613      is not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity" check
3614      does not work as expected -- it may return false (meaning there is at
3615      least one empty slot in the __kmp_threads array), but it is possible that
3616      the only free slot is #0, which is reserved for the initial thread and so
3617      cannot be used for this one. The following code works around this bug.
3618 
3619      However, the right solution seems to be not to reserve slot #0 for the
3620      initial thread, because:
3621      (1) there is no magic in slot #0,
3622      (2) we cannot detect the initial thread reliably (the first thread that
3623          does serial initialization may not be the real initial thread).
3624   */
3625   capacity = __kmp_threads_capacity;
3626   if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3627     --capacity;
3628   }
3629 
3630   /* see if there are too many threads */
3631   if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3632     if (__kmp_tp_cached) {
3633       __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3634                   KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3635                   KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3636     } else {
3637       __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3638                   __kmp_msg_null);
3639     }
3640   }
3641 
3642   /* find an available thread slot */
3643   /* Don't reassign the zero slot, since we need it to be used only by the
3644      initial thread */
3645   for (gtid = (initial_thread ? 0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL;
3646        gtid++)
3647     ;
3648   KA_TRACE(1,
3649            ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3650   KMP_ASSERT(gtid < __kmp_threads_capacity);
3651 
3652   /* update global accounting */
3653   __kmp_all_nth++;
3654   TCW_4(__kmp_nth, __kmp_nth + 1);
3655 
3656   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3657   // numbers of procs, and method #2 (keyed API call) for higher numbers.
3658   if (__kmp_adjust_gtid_mode) {
3659     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3660       if (TCR_4(__kmp_gtid_mode) != 2) {
3661         TCW_4(__kmp_gtid_mode, 2);
3662       }
3663     } else {
3664       if (TCR_4(__kmp_gtid_mode) != 1) {
3665         TCW_4(__kmp_gtid_mode, 1);
3666       }
3667     }
3668   }
3669 
3670 #ifdef KMP_ADJUST_BLOCKTIME
3671   /* Adjust blocktime to zero if necessary            */
3672   /* Middle initialization might not have occurred yet */
3673   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3674     if (__kmp_nth > __kmp_avail_proc) {
3675       __kmp_zero_bt = TRUE;
3676     }
3677   }
3678 #endif /* KMP_ADJUST_BLOCKTIME */
3679 
3680   /* setup this new hierarchy */
3681   if (!(root = __kmp_root[gtid])) {
3682     root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3683     KMP_DEBUG_ASSERT(!root->r.r_root_team);
3684   }
3685 
3686 #if KMP_STATS_ENABLED
3687   // Initialize stats as soon as possible (right after gtid assignment).
3688   __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3689   __kmp_stats_thread_ptr->startLife();
3690   KMP_SET_THREAD_STATE(SERIAL_REGION);
3691   KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3692 #endif
3693   __kmp_initialize_root(root);
3694 
3695   /* setup new root thread structure */
3696   if (root->r.r_uber_thread) {
3697     root_thread = root->r.r_uber_thread;
3698   } else {
3699     root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3700     if (__kmp_storage_map) {
3701       __kmp_print_thread_storage_map(root_thread, gtid);
3702     }
3703     root_thread->th.th_info.ds.ds_gtid = gtid;
3704 #if OMPT_SUPPORT
3705     root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3706 #endif
3707     root_thread->th.th_root = root;
3708     if (__kmp_env_consistency_check) {
3709       root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3710     }
3711 #if USE_FAST_MEMORY
3712     __kmp_initialize_fast_memory(root_thread);
3713 #endif /* USE_FAST_MEMORY */
3714 
3715 #if KMP_USE_BGET
3716     KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3717     __kmp_initialize_bget(root_thread);
3718 #endif
3719     __kmp_init_random(root_thread); // Initialize random number generator
3720   }
3721 
3722   /* setup the serial team held in reserve by the root thread */
3723   if (!root_thread->th.th_serial_team) {
3724     kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3725     KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3726     root_thread->th.th_serial_team = __kmp_allocate_team(
3727         root, 1, 1,
3728 #if OMPT_SUPPORT
3729         ompt_data_none, // root parallel id
3730 #endif
3731         proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3732   }
3733   KMP_ASSERT(root_thread->th.th_serial_team);
3734   KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3735                 root_thread->th.th_serial_team));
3736 
3737   /* drop root_thread into place */
3738   TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3739 
3740   root->r.r_root_team->t.t_threads[0] = root_thread;
3741   root->r.r_hot_team->t.t_threads[0] = root_thread;
3742   root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3743   // AC: the team is created in reserve, not for execution (unused for now).
3744   root_thread->th.th_serial_team->t.t_serialized = 0;
3745   root->r.r_uber_thread = root_thread;
3746 
3747   /* initialize the thread, get it ready to go */
3748   __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3749   TCW_4(__kmp_init_gtid, TRUE);
3750 
3751   /* prepare the master thread for get_gtid() */
3752   __kmp_gtid_set_specific(gtid);
3753 
3754 #if USE_ITT_BUILD
3755   __kmp_itt_thread_name(gtid);
3756 #endif /* USE_ITT_BUILD */
3757 
3758 #ifdef KMP_TDATA_GTID
3759   __kmp_gtid = gtid;
3760 #endif
3761   __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3762   KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3763 
3764   KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3765                 "plain=%u\n",
3766                 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3767                 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3768                 KMP_INIT_BARRIER_STATE));
3769   { // Initialize barrier data.
3770     int b;
3771     for (b = 0; b < bs_last_barrier; ++b) {
3772       root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3773 #if USE_DEBUGGER
3774       root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3775 #endif
3776     }
3777   }
3778   KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3779                    KMP_INIT_BARRIER_STATE);
3780 
3781 #if KMP_AFFINITY_SUPPORTED
3782   root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3783   root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3784   root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3785   root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3786   if (TCR_4(__kmp_init_middle)) {
3787     __kmp_affinity_set_init_mask(gtid, TRUE);
3788   }
3789 #endif /* KMP_AFFINITY_SUPPORTED */
3790   root_thread->th.th_def_allocator = __kmp_def_allocator;
3791   root_thread->th.th_prev_level = 0;
3792   root_thread->th.th_prev_num_threads = 1;
3793 
3794   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3795   tmp->cg_root = root_thread;
3796   tmp->cg_thread_limit = __kmp_cg_max_nth;
3797   tmp->cg_nthreads = 1;
3798   KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3799                  " cg_nthreads init to 1\n",
3800                  root_thread, tmp));
3801   tmp->up = NULL;
3802   root_thread->th.th_cg_roots = tmp;
3803 
3804   __kmp_root_counter++;
3805 
3806 #if OMPT_SUPPORT
3807   if (!initial_thread && ompt_enabled.enabled) {
3808 
3809     kmp_info_t *root_thread = ompt_get_thread();
3810 
3811     ompt_set_thread_state(root_thread, ompt_state_overhead);
3812 
3813     if (ompt_enabled.ompt_callback_thread_begin) {
3814       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3815           ompt_thread_initial, __ompt_get_thread_data_internal());
3816     }
3817     ompt_data_t *task_data;
3818     ompt_data_t *parallel_data;
3819     __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
3820     if (ompt_enabled.ompt_callback_implicit_task) {
3821       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3822           ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3823     }
3824 
3825     ompt_set_thread_state(root_thread, ompt_state_work_serial);
3826   }
3827 #endif
3828 
3829   KMP_MB();
3830   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3831 
3832   return gtid;
3833 }
3834 
3835 #if KMP_NESTED_HOT_TEAMS
3836 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3837                                 const int max_level) {
3838   int i, n, nth;
3839   kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3840   if (!hot_teams || !hot_teams[level].hot_team) {
3841     return 0;
3842   }
3843   KMP_DEBUG_ASSERT(level < max_level);
3844   kmp_team_t *team = hot_teams[level].hot_team;
3845   nth = hot_teams[level].hot_team_nth;
3846   n = nth - 1; // master is not freed
3847   if (level < max_level - 1) {
3848     for (i = 0; i < nth; ++i) {
3849       kmp_info_t *th = team->t.t_threads[i];
3850       n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3851       if (i > 0 && th->th.th_hot_teams) {
3852         __kmp_free(th->th.th_hot_teams);
3853         th->th.th_hot_teams = NULL;
3854       }
3855     }
3856   }
3857   __kmp_free_team(root, team, NULL);
3858   return n;
3859 }
3860 #endif
3861 
3862 // Resets a root thread and clears its root and hot teams.
3863 // Returns the number of __kmp_threads entries directly and indirectly freed.
3864 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3865   kmp_team_t *root_team = root->r.r_root_team;
3866   kmp_team_t *hot_team = root->r.r_hot_team;
3867   int n = hot_team->t.t_nproc;
3868   int i;
3869 
3870   KMP_DEBUG_ASSERT(!root->r.r_active);
3871 
3872   root->r.r_root_team = NULL;
3873   root->r.r_hot_team = NULL;
3874   // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3875   // before the call to __kmp_free_team().
3876   __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3877 #if KMP_NESTED_HOT_TEAMS
3878   if (__kmp_hot_teams_max_level >
3879       0) { // need to free nested hot teams and their threads if any
3880     for (i = 0; i < hot_team->t.t_nproc; ++i) {
3881       kmp_info_t *th = hot_team->t.t_threads[i];
3882       if (__kmp_hot_teams_max_level > 1) {
3883         n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3884       }
3885       if (th->th.th_hot_teams) {
3886         __kmp_free(th->th.th_hot_teams);
3887         th->th.th_hot_teams = NULL;
3888       }
3889     }
3890   }
3891 #endif
3892   __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3893 
3894   // Before we can reap the thread, we need to make certain that all other
3895   // threads in the teams that had this root as ancestor have stopped trying to
3896   // steal tasks.
3897   if (__kmp_tasking_mode != tskm_immediate_exec) {
3898     __kmp_wait_to_unref_task_teams();
3899   }
3900 
3901 #if KMP_OS_WINDOWS
3902   /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3903   KA_TRACE(
3904       10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3905            "\n",
3906            (LPVOID) & (root->r.r_uber_thread->th),
3907            root->r.r_uber_thread->th.th_info.ds.ds_thread));
3908   __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3909 #endif /* KMP_OS_WINDOWS */
3910 
3911 #if OMPT_SUPPORT
3912   ompt_data_t *task_data;
3913   ompt_data_t *parallel_data;
3914   __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
3915   if (ompt_enabled.ompt_callback_implicit_task) {
3916     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3917         ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
3918   }
3919   if (ompt_enabled.ompt_callback_thread_end) {
3920     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3921         &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3922   }
3923 #endif
3924 
3925   TCW_4(__kmp_nth,
3926         __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3927   i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
3928   KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
3929                  " to %d\n",
3930                  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
3931                  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
3932   if (i == 1) {
3933     // need to free contention group structure
3934     KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
3935                      root->r.r_uber_thread->th.th_cg_roots->cg_root);
3936     KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
3937     __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
3938     root->r.r_uber_thread->th.th_cg_roots = NULL;
3939   }
3940   __kmp_reap_thread(root->r.r_uber_thread, 1);
3941 
3942   // We cannot put the root thread into __kmp_thread_pool, so we have to reap
3943   // it instead of freeing it.
3944   root->r.r_uber_thread = NULL;
3945   /* mark root as no longer in use */
3946   root->r.r_begin = FALSE;
3947 
3948   return n;
3949 }
3950 
3951 void __kmp_unregister_root_current_thread(int gtid) {
3952   KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
3953   /* This lock should be OK, since unregister_root_current_thread is never
3954      called during an abort, only during a normal close. Furthermore, if you
3955      hold the forkjoin lock, you should never try to get the initz lock. */
3956   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3957   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
3958     KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
3959                   "exiting T#%d\n",
3960                   gtid));
3961     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3962     return;
3963   }
3964   kmp_root_t *root = __kmp_root[gtid];
3965 
3966   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3967   KMP_ASSERT(KMP_UBER_GTID(gtid));
3968   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3969   KMP_ASSERT(root->r.r_active == FALSE);
3970 
3971   KMP_MB();
3972 
3973   kmp_info_t *thread = __kmp_threads[gtid];
3974   kmp_team_t *team = thread->th.th_team;
3975   kmp_task_team_t *task_team = thread->th.th_task_team;
3976 
3977   // we need to wait for the proxy tasks before finishing the thread
3978   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
3979 #if OMPT_SUPPORT
3980     // the runtime is shutting down so we won't report any events
3981     thread->th.ompt_thread_info.state = ompt_state_undefined;
3982 #endif
3983     __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
3984   }
3985 
3986   __kmp_reset_root(gtid, root);
3987 
3988   KMP_MB();
3989   KC_TRACE(10,
3990            ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
3991 
3992   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3993 }
3994 
3995 #if KMP_OS_WINDOWS
3996 /* __kmp_forkjoin_lock must be already held
3997    Unregisters a root thread that is not the current thread.  Returns the number
3998    of __kmp_threads entries freed as a result. */
3999 static int __kmp_unregister_root_other_thread(int gtid) {
4000   kmp_root_t *root = __kmp_root[gtid];
4001   int r;
4002 
4003   KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4004   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4005   KMP_ASSERT(KMP_UBER_GTID(gtid));
4006   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4007   KMP_ASSERT(root->r.r_active == FALSE);
4008 
4009   r = __kmp_reset_root(gtid, root);
4010   KC_TRACE(10,
4011            ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4012   return r;
4013 }
4014 #endif
4015 
4016 #if KMP_DEBUG
4017 void __kmp_task_info() {
4018 
4019   kmp_int32 gtid = __kmp_entry_gtid();
4020   kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4021   kmp_info_t *this_thr = __kmp_threads[gtid];
4022   kmp_team_t *steam = this_thr->th.th_serial_team;
4023   kmp_team_t *team = this_thr->th.th_team;
4024 
4025   __kmp_printf(
4026       "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4027       "ptask=%p\n",
4028       gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4029       team->t.t_implicit_task_taskdata[tid].td_parent);
4030 }
4031 #endif // KMP_DEBUG
4032 
4033 /* TODO optimize with one big memclr, take out what isn't needed, split
4034    responsibility to workers as much as possible, and delay initialization of
4035    features as much as possible  */
4036 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4037                                   int tid, int gtid) {
4038   /* this_thr->th.th_info.ds.ds_gtid is set up in
4039      __kmp_allocate_thread/create_worker.
4040      this_thr->th.th_serial_team is set up in __kmp_allocate_thread */
4041   kmp_info_t *master = team->t.t_threads[0];
4042   KMP_DEBUG_ASSERT(this_thr != NULL);
4043   KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4044   KMP_DEBUG_ASSERT(team);
4045   KMP_DEBUG_ASSERT(team->t.t_threads);
4046   KMP_DEBUG_ASSERT(team->t.t_dispatch);
4047   KMP_DEBUG_ASSERT(master);
4048   KMP_DEBUG_ASSERT(master->th.th_root);
4049 
4050   KMP_MB();
4051 
4052   TCW_SYNC_PTR(this_thr->th.th_team, team);
4053 
4054   this_thr->th.th_info.ds.ds_tid = tid;
4055   this_thr->th.th_set_nproc = 0;
4056   if (__kmp_tasking_mode != tskm_immediate_exec)
4057     // When tasking is possible, threads are not safe to reap until they are
4058     // done tasking; the flag is set when the tasking code is exited in wait
4059     this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4060   else // no tasking --> always safe to reap
4061     this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4062   this_thr->th.th_set_proc_bind = proc_bind_default;
4063 #if KMP_AFFINITY_SUPPORTED
4064   this_thr->th.th_new_place = this_thr->th.th_current_place;
4065 #endif
4066   this_thr->th.th_root = master->th.th_root;
4067 
4068   /* setup the thread's cache of the team structure */
4069   this_thr->th.th_team_nproc = team->t.t_nproc;
4070   this_thr->th.th_team_master = master;
4071   this_thr->th.th_team_serialized = team->t.t_serialized;
4072   TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4073 
4074   KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4075 
4076   KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4077                 tid, gtid, this_thr, this_thr->th.th_current_task));
4078 
4079   __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4080                            team, tid, TRUE);
4081 
4082   KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4083                 tid, gtid, this_thr, this_thr->th.th_current_task));
4084   // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4085   // __kmp_initialize_team()?
4086 
4087   /* TODO no worksharing in speculative threads */
4088   this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4089 
4090   this_thr->th.th_local.this_construct = 0;
4091 
4092   if (!this_thr->th.th_pri_common) {
4093     this_thr->th.th_pri_common =
4094         (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4095     if (__kmp_storage_map) {
4096       __kmp_print_storage_map_gtid(
4097           gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4098           sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4099     }
4100     this_thr->th.th_pri_head = NULL;
4101   }
4102 
4103   if (this_thr != master && // Master's CG root is initialized elsewhere
4104       this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4105     // Make new thread's CG root same as master's
4106     KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4107     kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4108     if (tmp) {
4109       // worker changes CG, need to check if old CG should be freed
4110       int i = tmp->cg_nthreads--;
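      // Note: i holds the counter value from before the decrement, so i == 1
      // means this thread was the last member of the old contention group.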
4111       KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4112                      " on node %p of thread %p to %d\n",
4113                      this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4114       if (i == 1) {
4115         __kmp_free(tmp); // last thread left CG --> free it
4116       }
4117     }
4118     this_thr->th.th_cg_roots = master->th.th_cg_roots;
4119     // Increment new thread's CG root's counter to add the new thread
4120     this_thr->th.th_cg_roots->cg_nthreads++;
4121     KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4122                    " node %p of thread %p to %d\n",
4123                    this_thr, this_thr->th.th_cg_roots,
4124                    this_thr->th.th_cg_roots->cg_root,
4125                    this_thr->th.th_cg_roots->cg_nthreads));
4126     this_thr->th.th_current_task->td_icvs.thread_limit =
4127         this_thr->th.th_cg_roots->cg_thread_limit;
4128   }
4129 
4130   /* Initialize dynamic dispatch */
4131   {
4132     volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4133     // Use team max_nproc since this will never change for the team.
4134     size_t disp_size =
4135         sizeof(dispatch_private_info_t) *
4136         (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
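    // A team with max_nproc == 1 is serialized and needs only one dispatch
    // buffer; otherwise allocate __kmp_dispatch_num_buffers of them so that
    // several worksharing constructs can be in flight at once.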
4137     KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4138                   team->t.t_max_nproc));
4139     KMP_ASSERT(dispatch);
4140     KMP_DEBUG_ASSERT(team->t.t_dispatch);
4141     KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4142 
4143     dispatch->th_disp_index = 0;
4144     dispatch->th_doacross_buf_idx = 0;
4145     if (!dispatch->th_disp_buffer) {
4146       dispatch->th_disp_buffer =
4147           (dispatch_private_info_t *)__kmp_allocate(disp_size);
4148 
4149       if (__kmp_storage_map) {
4150         __kmp_print_storage_map_gtid(
4151             gtid, &dispatch->th_disp_buffer[0],
4152             &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4153                                           ? 1
4154                                           : __kmp_dispatch_num_buffers],
4155             disp_size, "th_%d.th_dispatch.th_disp_buffer "
4156                        "(team_%d.t_dispatch[%d].th_disp_buffer)",
4157             gtid, team->t.t_id, gtid);
4158       }
4159     } else {
4160       memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4161     }
4162 
4163     dispatch->th_dispatch_pr_current = 0;
4164     dispatch->th_dispatch_sh_current = 0;
4165 
4166     dispatch->th_deo_fcn = 0; /* ORDERED     */
4167     dispatch->th_dxo_fcn = 0; /* END ORDERED */
4168   }
4169 
4170   this_thr->th.th_next_pool = NULL;
4171 
4172   if (!this_thr->th.th_task_state_memo_stack) {
4173     size_t i;
4174     this_thr->th.th_task_state_memo_stack =
4175         (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4176     this_thr->th.th_task_state_top = 0;
4177     this_thr->th.th_task_state_stack_sz = 4;
4178     for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4179          ++i) // zero init the stack
4180       this_thr->th.th_task_state_memo_stack[i] = 0;
4181   }
4182 
4183   KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4184   KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4185 
4186   KMP_MB();
4187 }
4188 
/* Allocate a new thread for the requesting team. This is only called from
   within a forkjoin critical section. We first try to get an available thread
   from the thread pool; if none is available, we fork a new one, assuming we
   are able to create one. That should be assured, since the caller is
   expected to have checked for available capacity first. */
4194 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4195                                   int new_tid) {
4196   kmp_team_t *serial_team;
4197   kmp_info_t *new_thr;
4198   int new_gtid;
4199 
4200   KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4201   KMP_DEBUG_ASSERT(root && team);
4202 #if !KMP_NESTED_HOT_TEAMS
4203   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4204 #endif
4205   KMP_MB();
4206 
4207   /* first, try to get one from the thread pool */
4208   if (__kmp_thread_pool) {
4209     new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4210     __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4211     if (new_thr == __kmp_thread_pool_insert_pt) {
4212       __kmp_thread_pool_insert_pt = NULL;
4213     }
4214     TCW_4(new_thr->th.th_in_pool, FALSE);
4215     __kmp_suspend_initialize_thread(new_thr);
4216     __kmp_lock_suspend_mx(new_thr);
4217     if (new_thr->th.th_active_in_pool == TRUE) {
4218       KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4219       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4220       new_thr->th.th_active_in_pool = FALSE;
4221     }
4222     __kmp_unlock_suspend_mx(new_thr);
4223 
4224     KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4225                   __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4226     KMP_ASSERT(!new_thr->th.th_team);
4227     KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4228 
4229     /* setup the thread structure */
4230     __kmp_initialize_info(new_thr, team, new_tid,
4231                           new_thr->th.th_info.ds.ds_gtid);
4232     KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4233 
4234     TCW_4(__kmp_nth, __kmp_nth + 1);
4235 
4236     new_thr->th.th_task_state = 0;
4237     new_thr->th.th_task_state_top = 0;
4238     new_thr->th.th_task_state_stack_sz = 4;
4239 
4240 #ifdef KMP_ADJUST_BLOCKTIME
4241     /* Adjust blocktime back to zero if necessary */
4242     /* Middle initialization might not have occurred yet */
4243     if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4244       if (__kmp_nth > __kmp_avail_proc) {
4245         __kmp_zero_bt = TRUE;
4246       }
4247     }
4248 #endif /* KMP_ADJUST_BLOCKTIME */
4249 
4250 #if KMP_DEBUG
    // If the thread entered the pool via __kmp_free_thread, wait_flag should
    // not be KMP_BARRIER_PARENT_FLAG.
4253     int b;
4254     kmp_balign_t *balign = new_thr->th.th_bar;
4255     for (b = 0; b < bs_last_barrier; ++b)
4256       KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4257 #endif
4258 
4259     KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4260                   __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4261 
4262     KMP_MB();
4263     return new_thr;
4264   }
4265 
  /* none available, so we'll fork a new one */
4267   KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4268   KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4269 
4270 #if KMP_USE_MONITOR
4271   // If this is the first worker thread the RTL is creating, then also
4272   // launch the monitor thread.  We try to do this as early as possible.
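  // Classic double-checked locking: re-test __kmp_init_monitor after taking
  // __kmp_monitor_lock so that only one thread creates the monitor.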
4273   if (!TCR_4(__kmp_init_monitor)) {
4274     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4275     if (!TCR_4(__kmp_init_monitor)) {
4276       KF_TRACE(10, ("before __kmp_create_monitor\n"));
4277       TCW_4(__kmp_init_monitor, 1);
4278       __kmp_create_monitor(&__kmp_monitor);
4279       KF_TRACE(10, ("after __kmp_create_monitor\n"));
4280 #if KMP_OS_WINDOWS
      // AC: wait until the monitor has started. This is a fix for CQ232808.
      // The reason is that if the library is loaded/unloaded in a loop with
      // small (parallel) work in between, then there is a high probability
      // that the monitor thread starts only after the library shutdown. At
      // shutdown it is too late to cope with the problem, because when the
      // master is in DllMain (process detach) the monitor has no chance to
      // start (it is blocked), and the master has no means to inform the
      // monitor that the library has gone, because all the memory the monitor
      // can access is going to be released/reset.
4290       while (TCR_4(__kmp_init_monitor) < 2) {
4291         KMP_YIELD(TRUE);
4292       }
4293       KF_TRACE(10, ("after monitor thread has started\n"));
4294 #endif
4295     }
4296     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4297   }
4298 #endif
4299 
4300   KMP_MB();
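  // Find the lowest free gtid slot. The scan starts at 1 because slot 0 is
  // normally the initial root thread; capacity was already asserted above, so
  // the debug assert inside the loop should never fire.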
4301   for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
4302     KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4303   }
4304 
4305   /* allocate space for it. */
4306   new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4307 
4308   TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4309 
4310 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4311   // suppress race conditions detection on synchronization flags in debug mode
4312   // this helps to analyze library internals eliminating false positives
4313   __itt_suppress_mark_range(
4314       __itt_suppress_range, __itt_suppress_threading_errors,
4315       &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4316   __itt_suppress_mark_range(
4317       __itt_suppress_range, __itt_suppress_threading_errors,
4318       &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4319 #if KMP_OS_WINDOWS
4320   __itt_suppress_mark_range(
4321       __itt_suppress_range, __itt_suppress_threading_errors,
4322       &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4323 #else
4324   __itt_suppress_mark_range(__itt_suppress_range,
4325                             __itt_suppress_threading_errors,
4326                             &new_thr->th.th_suspend_init_count,
4327                             sizeof(new_thr->th.th_suspend_init_count));
4328 #endif
4329   // TODO: check if we need to also suppress b_arrived flags
4330   __itt_suppress_mark_range(__itt_suppress_range,
4331                             __itt_suppress_threading_errors,
4332                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4333                             sizeof(new_thr->th.th_bar[0].bb.b_go));
4334   __itt_suppress_mark_range(__itt_suppress_range,
4335                             __itt_suppress_threading_errors,
4336                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4337                             sizeof(new_thr->th.th_bar[1].bb.b_go));
4338   __itt_suppress_mark_range(__itt_suppress_range,
4339                             __itt_suppress_threading_errors,
4340                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4341                             sizeof(new_thr->th.th_bar[2].bb.b_go));
4342 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4343   if (__kmp_storage_map) {
4344     __kmp_print_thread_storage_map(new_thr, new_gtid);
4345   }
4346 
4347   // add the reserve serialized team, initialized from the team's master thread
4348   {
4349     kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4350     KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4351     new_thr->th.th_serial_team = serial_team =
4352         (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4353 #if OMPT_SUPPORT
4354                                           ompt_data_none, // root parallel id
4355 #endif
4356                                           proc_bind_default, &r_icvs,
4357                                           0 USE_NESTED_HOT_ARG(NULL));
4358   }
4359   KMP_ASSERT(serial_team);
  serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not
  // for execution (it is unused for now).
4362   serial_team->t.t_threads[0] = new_thr;
4363   KF_TRACE(10,
4364            ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4365             new_thr));
4366 
4367   /* setup the thread structures */
4368   __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4369 
4370 #if USE_FAST_MEMORY
4371   __kmp_initialize_fast_memory(new_thr);
4372 #endif /* USE_FAST_MEMORY */
4373 
4374 #if KMP_USE_BGET
4375   KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4376   __kmp_initialize_bget(new_thr);
4377 #endif
4378 
4379   __kmp_init_random(new_thr); // Initialize random number generator
4380 
4381   /* Initialize these only once when thread is grabbed for a team allocation */
4382   KA_TRACE(20,
4383            ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4384             __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4385 
4386   int b;
4387   kmp_balign_t *balign = new_thr->th.th_bar;
4388   for (b = 0; b < bs_last_barrier; ++b) {
4389     balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4390     balign[b].bb.team = NULL;
4391     balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4392     balign[b].bb.use_oncore_barrier = 0;
4393   }
4394 
4395   new_thr->th.th_spin_here = FALSE;
4396   new_thr->th.th_next_waiting = 0;
4397 #if KMP_OS_UNIX
4398   new_thr->th.th_blocking = false;
4399 #endif
4400 
4401 #if KMP_AFFINITY_SUPPORTED
4402   new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4403   new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4404   new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4405   new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4406 #endif
4407   new_thr->th.th_def_allocator = __kmp_def_allocator;
4408   new_thr->th.th_prev_level = 0;
4409   new_thr->th.th_prev_num_threads = 1;
4410 
4411   TCW_4(new_thr->th.th_in_pool, FALSE);
4412   new_thr->th.th_active_in_pool = FALSE;
4413   TCW_4(new_thr->th.th_active, TRUE);
4414 
4415   /* adjust the global counters */
4416   __kmp_all_nth++;
4417   __kmp_nth++;
4418 
4419   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4420   // numbers of procs, and method #2 (keyed API call) for higher numbers.
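  // Roughly: the stack-address search is cheap for a handful of threads but
  // degrades as the thread table grows, while the keyed (TLS) lookup has a
  // fixed cost, hence the switch once __kmp_tls_gtid_min threads exist.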
4421   if (__kmp_adjust_gtid_mode) {
4422     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4423       if (TCR_4(__kmp_gtid_mode) != 2) {
4424         TCW_4(__kmp_gtid_mode, 2);
4425       }
4426     } else {
4427       if (TCR_4(__kmp_gtid_mode) != 1) {
4428         TCW_4(__kmp_gtid_mode, 1);
4429       }
4430     }
4431   }
4432 
4433 #ifdef KMP_ADJUST_BLOCKTIME
4434   /* Adjust blocktime back to zero if necessary       */
4435   /* Middle initialization might not have occurred yet */
4436   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4437     if (__kmp_nth > __kmp_avail_proc) {
4438       __kmp_zero_bt = TRUE;
4439     }
4440   }
4441 #endif /* KMP_ADJUST_BLOCKTIME */
4442 
4443   /* actually fork it and create the new worker thread */
4444   KF_TRACE(
4445       10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4446   __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4447   KF_TRACE(10,
4448            ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4449 
4450   KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4451                 new_gtid));
4452   KMP_MB();
4453   return new_thr;
4454 }
4455 
4456 /* Reinitialize team for reuse.
   The hot team code calls this routine at every fork barrier, so EPCC barrier
   tests are extremely sensitive to changes in it, esp. writes to the team
4459    struct, which cause a cache invalidation in all threads.
4460    IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4461 static void __kmp_reinitialize_team(kmp_team_t *team,
4462                                     kmp_internal_control_t *new_icvs,
4463                                     ident_t *loc) {
4464   KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4465                 team->t.t_threads[0], team));
4466   KMP_DEBUG_ASSERT(team && new_icvs);
4467   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4468   KMP_CHECK_UPDATE(team->t.t_ident, loc);
4469 
4470   KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4471   // Copy ICVs to the master thread's implicit taskdata
4472   __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4473   copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4474 
4475   KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4476                 team->t.t_threads[0], team));
4477 }
4478 
4479 /* Initialize the team data structure.
4480    This assumes the t_threads and t_max_nproc are already set.
4481    Also, we don't touch the arguments */
4482 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4483                                   kmp_internal_control_t *new_icvs,
4484                                   ident_t *loc) {
4485   KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4486 
4487   /* verify */
4488   KMP_DEBUG_ASSERT(team);
4489   KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4490   KMP_DEBUG_ASSERT(team->t.t_threads);
4491   KMP_MB();
4492 
4493   team->t.t_master_tid = 0; /* not needed */
4494   /* team->t.t_master_bar;        not needed */
4495   team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4496   team->t.t_nproc = new_nproc;
4497 
4498   /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
4499   team->t.t_next_pool = NULL;
4500   /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4501    * up hot team */
4502 
4503   TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4504   team->t.t_invoke = NULL; /* not needed */
4505 
4506   // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
4507   team->t.t_sched.sched = new_icvs->sched.sched;
4508 
4509 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4510   team->t.t_fp_control_saved = FALSE; /* not needed */
4511   team->t.t_x87_fpu_control_word = 0; /* not needed */
4512   team->t.t_mxcsr = 0; /* not needed */
4513 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4514 
4515   team->t.t_construct = 0;
4516 
4517   team->t.t_ordered.dt.t_value = 0;
4518   team->t.t_master_active = FALSE;
4519 
4520 #ifdef KMP_DEBUG
4521   team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4522 #endif
4523 #if KMP_OS_WINDOWS
4524   team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4525 #endif
4526 
4527   team->t.t_control_stack_top = NULL;
4528 
4529   __kmp_reinitialize_team(team, new_icvs, loc);
4530 
4531   KMP_MB();
4532   KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4533 }
4534 
4535 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
/* Sets the full affinity mask for the thread and stores the previous mask in
   *old_mask (if non-NULL); no changes to internal structures. */
4537 static void
4538 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4539   if (KMP_AFFINITY_CAPABLE()) {
4540     int status;
4541     if (old_mask != NULL) {
4542       status = __kmp_get_system_affinity(old_mask, TRUE);
4543       int error = errno;
4544       if (status != 0) {
4545         __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4546                     __kmp_msg_null);
4547       }
4548     }
4549     __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4550   }
4551 }
4552 #endif
4553 
4554 #if KMP_AFFINITY_SUPPORTED
4555 
4556 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4557 // It calculates the worker + master thread's partition based upon the parent
4558 // thread's partition, and binds each worker to a thread in their partition.
4559 // The master thread's partition should already include its current binding.
4560 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4561   // Copy the master thread's place partition to the team struct
4562   kmp_info_t *master_th = team->t.t_threads[0];
4563   KMP_DEBUG_ASSERT(master_th != NULL);
4564   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4565   int first_place = master_th->th.th_first_place;
4566   int last_place = master_th->th.th_last_place;
4567   int masters_place = master_th->th.th_current_place;
4568   team->t.t_first_place = first_place;
4569   team->t.t_last_place = last_place;
4570 
4571   KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4572                 "bound to place %d partition = [%d,%d]\n",
4573                 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4574                 team->t.t_id, masters_place, first_place, last_place));
4575 
4576   switch (proc_bind) {
4577 
4578   case proc_bind_default:
    // Serial teams might have the proc_bind policy set to proc_bind_default.
    // It doesn't matter, as we don't rebind the master thread for any
    // proc_bind policy.
4581     KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4582     break;
4583 
4584   case proc_bind_master: {
4585     int f;
4586     int n_th = team->t.t_nproc;
4587     for (f = 1; f < n_th; f++) {
4588       kmp_info_t *th = team->t.t_threads[f];
4589       KMP_DEBUG_ASSERT(th != NULL);
4590       th->th.th_first_place = first_place;
4591       th->th.th_last_place = last_place;
4592       th->th.th_new_place = masters_place;
4593       if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4594           team->t.t_display_affinity != 1) {
4595         team->t.t_display_affinity = 1;
4596       }
4597 
4598       KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
4599                      "partition = [%d,%d]\n",
4600                      __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4601                      f, masters_place, first_place, last_place));
4602     }
4603   } break;
4604 
4605   case proc_bind_close: {
4606     int f;
4607     int n_th = team->t.t_nproc;
4608     int n_places;
4609     if (first_place <= last_place) {
4610       n_places = last_place - first_place + 1;
4611     } else {
4612       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4613     }
4614     if (n_th <= n_places) {
4615       int place = masters_place;
4616       for (f = 1; f < n_th; f++) {
4617         kmp_info_t *th = team->t.t_threads[f];
4618         KMP_DEBUG_ASSERT(th != NULL);
4619 
4620         if (place == last_place) {
4621           place = first_place;
4622         } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4623           place = 0;
4624         } else {
4625           place++;
4626         }
4627         th->th.th_first_place = first_place;
4628         th->th.th_last_place = last_place;
4629         th->th.th_new_place = place;
4630         if (__kmp_display_affinity && place != th->th.th_current_place &&
4631             team->t.t_display_affinity != 1) {
4632           team->t.t_display_affinity = 1;
4633         }
4634 
4635         KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4636                        "partition = [%d,%d]\n",
4637                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4638                        team->t.t_id, f, place, first_place, last_place));
4639       }
4640     } else {
4641       int S, rem, gap, s_count;
4642       S = n_th / n_places;
4643       s_count = 0;
4644       rem = n_th - (S * n_places);
4645       gap = rem > 0 ? n_places / rem : n_places;
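      // Block distribution: each place gets S = n_th / n_places threads, and
      // the rem = n_th % n_places leftover threads are handed out one per
      // place every `gap` places. Illustrative example: n_th = 10 and
      // n_places = 4 give S = 2, rem = 2, gap = 2, so places receive
      // 3, 2, 3, 2 threads in iteration order starting at the master's place.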
4646       int place = masters_place;
4647       int gap_ct = gap;
4648       for (f = 0; f < n_th; f++) {
4649         kmp_info_t *th = team->t.t_threads[f];
4650         KMP_DEBUG_ASSERT(th != NULL);
4651 
4652         th->th.th_first_place = first_place;
4653         th->th.th_last_place = last_place;
4654         th->th.th_new_place = place;
4655         if (__kmp_display_affinity && place != th->th.th_current_place &&
4656             team->t.t_display_affinity != 1) {
4657           team->t.t_display_affinity = 1;
4658         }
4659         s_count++;
4660 
4661         if ((s_count == S) && rem && (gap_ct == gap)) {
          // do nothing: an extra thread will be added to this place on the
          // next iteration
4663         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4664           // we added an extra thread to this place; move to next place
4665           if (place == last_place) {
4666             place = first_place;
4667           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4668             place = 0;
4669           } else {
4670             place++;
4671           }
4672           s_count = 0;
4673           gap_ct = 1;
4674           rem--;
4675         } else if (s_count == S) { // place full; don't add extra
4676           if (place == last_place) {
4677             place = first_place;
4678           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4679             place = 0;
4680           } else {
4681             place++;
4682           }
4683           gap_ct++;
4684           s_count = 0;
4685         }
4686 
4687         KA_TRACE(100,
4688                  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4689                   "partition = [%d,%d]\n",
4690                   __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4691                   th->th.th_new_place, first_place, last_place));
4692       }
4693       KMP_DEBUG_ASSERT(place == masters_place);
4694     }
4695   } break;
4696 
4697   case proc_bind_spread: {
4698     int f;
4699     int n_th = team->t.t_nproc;
4700     int n_places;
4701     int thidx;
4702     if (first_place <= last_place) {
4703       n_places = last_place - first_place + 1;
4704     } else {
4705       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4706     }
4707     if (n_th <= n_places) {
4708       int place = -1;
4709 
4710       if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4711         int S = n_places / n_th;
4712         int s_count, rem, gap, gap_ct;
4713 
4714         place = masters_place;
4715         rem = n_places - n_th * S;
4716         gap = rem ? n_th / rem : 1;
4717         gap_ct = gap;
4718         thidx = n_th;
4719         if (update_master_only == 1)
4720           thidx = 1;
4721         for (f = 0; f < thidx; f++) {
4722           kmp_info_t *th = team->t.t_threads[f];
4723           KMP_DEBUG_ASSERT(th != NULL);
4724 
4725           th->th.th_first_place = place;
4726           th->th.th_new_place = place;
4727           if (__kmp_display_affinity && place != th->th.th_current_place &&
4728               team->t.t_display_affinity != 1) {
4729             team->t.t_display_affinity = 1;
4730           }
4731           s_count = 1;
4732           while (s_count < S) {
4733             if (place == last_place) {
4734               place = first_place;
4735             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4736               place = 0;
4737             } else {
4738               place++;
4739             }
4740             s_count++;
4741           }
4742           if (rem && (gap_ct == gap)) {
4743             if (place == last_place) {
4744               place = first_place;
4745             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4746               place = 0;
4747             } else {
4748               place++;
4749             }
4750             rem--;
4751             gap_ct = 0;
4752           }
4753           th->th.th_last_place = place;
4754           gap_ct++;
4755 
4756           if (place == last_place) {
4757             place = first_place;
4758           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4759             place = 0;
4760           } else {
4761             place++;
4762           }
4763 
4764           KA_TRACE(100,
4765                    ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4766                     "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4767                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4768                     f, th->th.th_new_place, th->th.th_first_place,
4769                     th->th.th_last_place, __kmp_affinity_num_masks));
4770         }
4771       } else {
        /* With a uniform space of available computation places we can create
           T partitions of roughly P/T places each and put each thread into
           the first place of its partition. */
4775         double current = static_cast<double>(masters_place);
4776         double spacing =
4777             (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
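        // Thread f is given the place interval starting at
        // masters_place + f * spacing and extending spacing places, so the
        // partitions tile the place space roughly evenly; the first place of
        // each interval becomes that thread's binding (th_new_place).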
4778         int first, last;
4779         kmp_info_t *th;
4780 
4781         thidx = n_th + 1;
4782         if (update_master_only == 1)
4783           thidx = 1;
4784         for (f = 0; f < thidx; f++) {
4785           first = static_cast<int>(current);
4786           last = static_cast<int>(current + spacing) - 1;
4787           KMP_DEBUG_ASSERT(last >= first);
4788           if (first >= n_places) {
4789             if (masters_place) {
4790               first -= n_places;
4791               last -= n_places;
4792               if (first == (masters_place + 1)) {
4793                 KMP_DEBUG_ASSERT(f == n_th);
4794                 first--;
4795               }
4796               if (last == masters_place) {
4797                 KMP_DEBUG_ASSERT(f == (n_th - 1));
4798                 last--;
4799               }
4800             } else {
4801               KMP_DEBUG_ASSERT(f == n_th);
4802               first = 0;
4803               last = 0;
4804             }
4805           }
4806           if (last >= n_places) {
4807             last = (n_places - 1);
4808           }
4809           place = first;
4810           current += spacing;
4811           if (f < n_th) {
4812             KMP_DEBUG_ASSERT(0 <= first);
4813             KMP_DEBUG_ASSERT(n_places > first);
4814             KMP_DEBUG_ASSERT(0 <= last);
4815             KMP_DEBUG_ASSERT(n_places > last);
4816             KMP_DEBUG_ASSERT(last_place >= first_place);
4817             th = team->t.t_threads[f];
4818             KMP_DEBUG_ASSERT(th);
4819             th->th.th_first_place = first;
4820             th->th.th_new_place = place;
4821             th->th.th_last_place = last;
4822             if (__kmp_display_affinity && place != th->th.th_current_place &&
4823                 team->t.t_display_affinity != 1) {
4824               team->t.t_display_affinity = 1;
4825             }
4826             KA_TRACE(100,
4827                      ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4828                       "partition = [%d,%d], spacing = %.4f\n",
4829                       __kmp_gtid_from_thread(team->t.t_threads[f]),
4830                       team->t.t_id, f, th->th.th_new_place,
4831                       th->th.th_first_place, th->th.th_last_place, spacing));
4832           }
4833         }
4834       }
4835       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4836     } else {
4837       int S, rem, gap, s_count;
4838       S = n_th / n_places;
4839       s_count = 0;
4840       rem = n_th - (S * n_places);
4841       gap = rem > 0 ? n_places / rem : n_places;
4842       int place = masters_place;
4843       int gap_ct = gap;
4844       thidx = n_th;
4845       if (update_master_only == 1)
4846         thidx = 1;
4847       for (f = 0; f < thidx; f++) {
4848         kmp_info_t *th = team->t.t_threads[f];
4849         KMP_DEBUG_ASSERT(th != NULL);
4850 
4851         th->th.th_first_place = place;
4852         th->th.th_last_place = place;
4853         th->th.th_new_place = place;
4854         if (__kmp_display_affinity && place != th->th.th_current_place &&
4855             team->t.t_display_affinity != 1) {
4856           team->t.t_display_affinity = 1;
4857         }
4858         s_count++;
4859 
4860         if ((s_count == S) && rem && (gap_ct == gap)) {
          // do nothing: an extra thread will be added to this place on the
          // next iteration
4862         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4863           // we added an extra thread to this place; move on to next place
4864           if (place == last_place) {
4865             place = first_place;
4866           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4867             place = 0;
4868           } else {
4869             place++;
4870           }
4871           s_count = 0;
4872           gap_ct = 1;
4873           rem--;
4874         } else if (s_count == S) { // place is full; don't add extra thread
4875           if (place == last_place) {
4876             place = first_place;
4877           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4878             place = 0;
4879           } else {
4880             place++;
4881           }
4882           gap_ct++;
4883           s_count = 0;
4884         }
4885 
4886         KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4887                        "partition = [%d,%d]\n",
4888                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4889                        team->t.t_id, f, th->th.th_new_place,
4890                        th->th.th_first_place, th->th.th_last_place));
4891       }
4892       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4893     }
4894   } break;
4895 
4896   default:
4897     break;
4898   }
4899 
4900   KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4901 }
4902 
4903 #endif // KMP_AFFINITY_SUPPORTED
4904 
4905 /* allocate a new team data structure to use.  take one off of the free pool if
4906    available */
4907 kmp_team_t *
4908 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4909 #if OMPT_SUPPORT
4910                     ompt_data_t ompt_parallel_data,
4911 #endif
4912                     kmp_proc_bind_t new_proc_bind,
4913                     kmp_internal_control_t *new_icvs,
4914                     int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4915   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4916   int f;
4917   kmp_team_t *team;
4918   int use_hot_team = !root->r.r_active;
4919   int level = 0;
4920 
4921   KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4922   KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4923   KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4924   KMP_MB();
4925 
4926 #if KMP_NESTED_HOT_TEAMS
4927   kmp_hot_team_ptr_t *hot_teams;
4928   if (master) {
4929     team = master->th.th_team;
4930     level = team->t.t_active_level;
4931     if (master->th.th_teams_microtask) { // in teams construct?
4932       if (master->th.th_teams_size.nteams > 1 &&
4933           ( // #teams > 1
4934               team->t.t_pkfn ==
4935                   (microtask_t)__kmp_teams_master || // inner fork of the teams
4936               master->th.th_teams_level <
4937                   team->t.t_level)) { // or nested parallel inside the teams
4938         ++level; // not increment if #teams==1, or for outer fork of the teams;
4939         // increment otherwise
4940       }
4941     }
4942     hot_teams = master->th.th_hot_teams;
4943     if (level < __kmp_hot_teams_max_level && hot_teams &&
4944         hot_teams[level].hot_team) {
4945       // hot team has already been allocated for given level
4946       use_hot_team = 1;
4947     } else {
4948       use_hot_team = 0;
4949     }
4950   } else {
4951     // check we won't access uninitialized hot_teams, just in case
4952     KMP_DEBUG_ASSERT(new_nproc == 1);
4953   }
4954 #endif
4955   // Optimization to use a "hot" team
4956   if (use_hot_team && new_nproc > 1) {
4957     KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
4958 #if KMP_NESTED_HOT_TEAMS
4959     team = hot_teams[level].hot_team;
4960 #else
4961     team = root->r.r_hot_team;
4962 #endif
4963 #if KMP_DEBUG
4964     if (__kmp_tasking_mode != tskm_immediate_exec) {
4965       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
4966                     "task_team[1] = %p before reinit\n",
4967                     team->t.t_task_team[0], team->t.t_task_team[1]));
4968     }
4969 #endif
4970 
4971     // Has the number of threads changed?
4972     /* Let's assume the most common case is that the number of threads is
4973        unchanged, and put that case first. */
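    // Three cases follow: t_nproc == new_nproc (reuse the team as-is),
    // t_nproc > new_nproc (shrink, freeing or parking the extra threads), and
    // t_nproc < new_nproc (grow, reusing reserved threads and/or allocating
    // new ones).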
4974     if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4975       KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
4976       // This case can mean that omp_set_num_threads() was called and the hot
4977       // team size was already reduced, so we check the special flag
4978       if (team->t.t_size_changed == -1) {
4979         team->t.t_size_changed = 1;
4980       } else {
4981         KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
4982       }
4983 
4984       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4985       kmp_r_sched_t new_sched = new_icvs->sched;
4986       // set master's schedule as new run-time schedule
4987       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
4988 
4989       __kmp_reinitialize_team(team, new_icvs,
4990                               root->r.r_uber_thread->th.th_ident);
4991 
4992       KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
4993                     team->t.t_threads[0], team));
4994       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
4995 
4996 #if KMP_AFFINITY_SUPPORTED
4997       if ((team->t.t_size_changed == 0) &&
4998           (team->t.t_proc_bind == new_proc_bind)) {
4999         if (new_proc_bind == proc_bind_spread) {
5000           __kmp_partition_places(
5001               team, 1); // add flag to update only master for spread
5002         }
5003         KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5004                        "proc_bind = %d, partition = [%d,%d]\n",
5005                        team->t.t_id, new_proc_bind, team->t.t_first_place,
5006                        team->t.t_last_place));
5007       } else {
5008         KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5009         __kmp_partition_places(team);
5010       }
5011 #else
5012       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5013 #endif /* KMP_AFFINITY_SUPPORTED */
5014     } else if (team->t.t_nproc > new_nproc) {
5015       KA_TRACE(20,
5016                ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5017                 new_nproc));
5018 
5019       team->t.t_size_changed = 1;
5020 #if KMP_NESTED_HOT_TEAMS
5021       if (__kmp_hot_teams_mode == 0) {
        // AC: the saved number of threads should match the team's value in
        // this mode; it can be bigger in mode 1, when the hot team keeps
        // threads in reserve
5024         KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5025         hot_teams[level].hot_team_nth = new_nproc;
5026 #endif // KMP_NESTED_HOT_TEAMS
5027         /* release the extra threads we don't need any more */
5028         for (f = new_nproc; f < team->t.t_nproc; f++) {
5029           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5030           if (__kmp_tasking_mode != tskm_immediate_exec) {
5031             // When decreasing team size, threads no longer in the team should
5032             // unref task team.
5033             team->t.t_threads[f]->th.th_task_team = NULL;
5034           }
5035           __kmp_free_thread(team->t.t_threads[f]);
5036           team->t.t_threads[f] = NULL;
5037         }
5038 #if KMP_NESTED_HOT_TEAMS
5039       } // (__kmp_hot_teams_mode == 0)
5040       else {
5041         // When keeping extra threads in team, switch threads to wait on own
5042         // b_go flag
5043         for (f = new_nproc; f < team->t.t_nproc; ++f) {
5044           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5045           kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5046           for (int b = 0; b < bs_last_barrier; ++b) {
5047             if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5048               balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5049             }
5050             KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5051           }
5052         }
5053       }
5054 #endif // KMP_NESTED_HOT_TEAMS
5055       team->t.t_nproc = new_nproc;
5056       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5057       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5058       __kmp_reinitialize_team(team, new_icvs,
5059                               root->r.r_uber_thread->th.th_ident);
5060 
5061       // Update remaining threads
5062       for (f = 0; f < new_nproc; ++f) {
5063         team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5064       }
5065 
5066       // restore the current task state of the master thread: should be the
5067       // implicit task
5068       KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5069                     team->t.t_threads[0], team));
5070 
5071       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5072 
5073 #ifdef KMP_DEBUG
5074       for (f = 0; f < team->t.t_nproc; f++) {
5075         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5076                          team->t.t_threads[f]->th.th_team_nproc ==
5077                              team->t.t_nproc);
5078       }
5079 #endif
5080 
5081       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5082 #if KMP_AFFINITY_SUPPORTED
5083       __kmp_partition_places(team);
5084 #endif
5085     } else { // team->t.t_nproc < new_nproc
5086 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5087       kmp_affin_mask_t *old_mask;
5088       if (KMP_AFFINITY_CAPABLE()) {
5089         KMP_CPU_ALLOC(old_mask);
5090       }
5091 #endif
5092 
5093       KA_TRACE(20,
5094                ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5095                 new_nproc));
5096 
5097       team->t.t_size_changed = 1;
5098 
5099 #if KMP_NESTED_HOT_TEAMS
5100       int avail_threads = hot_teams[level].hot_team_nth;
5101       if (new_nproc < avail_threads)
5102         avail_threads = new_nproc;
5103       kmp_info_t **other_threads = team->t.t_threads;
5104       for (f = team->t.t_nproc; f < avail_threads; ++f) {
5105         // Adjust barrier data of reserved threads (if any) of the team
5106         // Other data will be set in __kmp_initialize_info() below.
5107         int b;
5108         kmp_balign_t *balign = other_threads[f]->th.th_bar;
5109         for (b = 0; b < bs_last_barrier; ++b) {
5110           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5111           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5112 #if USE_DEBUGGER
5113           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5114 #endif
5115         }
5116       }
5117       if (hot_teams[level].hot_team_nth >= new_nproc) {
        // we have all needed threads in reserve, so no need to allocate any;
        // this is only possible in mode 1, since mode 0 cannot have reserved
        // threads
5120         KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5121         team->t.t_nproc = new_nproc; // just get reserved threads involved
5122       } else {
5123         // we may have some threads in reserve, but not enough
5124         team->t.t_nproc =
5125             hot_teams[level]
5126                 .hot_team_nth; // get reserved threads involved if any
5127         hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5128 #endif // KMP_NESTED_HOT_TEAMS
5129         if (team->t.t_max_nproc < new_nproc) {
5130           /* reallocate larger arrays */
5131           __kmp_reallocate_team_arrays(team, new_nproc);
5132           __kmp_reinitialize_team(team, new_icvs, NULL);
5133         }
5134 
5135 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
        /* Temporarily set the full mask for the master thread before creating
           the workers. The reason is that workers inherit the affinity from
           the master, so if many workers are created quickly on a single core,
           they don't get a chance to set their own affinity for a long time. */
5140         __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5141 #endif
5142 
5143         /* allocate new threads for the hot team */
5144         for (f = team->t.t_nproc; f < new_nproc; f++) {
5145           kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5146           KMP_DEBUG_ASSERT(new_worker);
5147           team->t.t_threads[f] = new_worker;
5148 
5149           KA_TRACE(20,
5150                    ("__kmp_allocate_team: team %d init T#%d arrived: "
5151                     "join=%llu, plain=%llu\n",
5152                     team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5153                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5154                     team->t.t_bar[bs_plain_barrier].b_arrived));
5155 
5156           { // Initialize barrier data for new threads.
5157             int b;
5158             kmp_balign_t *balign = new_worker->th.th_bar;
5159             for (b = 0; b < bs_last_barrier; ++b) {
5160               balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5161               KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5162                                KMP_BARRIER_PARENT_FLAG);
5163 #if USE_DEBUGGER
5164               balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5165 #endif
5166             }
5167           }
5168         }
5169 
5170 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5171         if (KMP_AFFINITY_CAPABLE()) {
5172           /* Restore initial master thread's affinity mask */
5173           __kmp_set_system_affinity(old_mask, TRUE);
5174           KMP_CPU_FREE(old_mask);
5175         }
5176 #endif
5177 #if KMP_NESTED_HOT_TEAMS
5178       } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5179 #endif // KMP_NESTED_HOT_TEAMS
      /* make sure everyone is synchronized */
5181       int old_nproc = team->t.t_nproc; // save old value and use to update only
5182       // new threads below
5183       __kmp_initialize_team(team, new_nproc, new_icvs,
5184                             root->r.r_uber_thread->th.th_ident);
5185 
5186       /* reinitialize the threads */
5187       KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5188       for (f = 0; f < team->t.t_nproc; ++f)
5189         __kmp_initialize_info(team->t.t_threads[f], team, f,
5190                               __kmp_gtid_from_tid(f, team));
5191 
5192       if (level) { // set th_task_state for new threads in nested hot team
5193         // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5194         // only need to set the th_task_state for the new threads. th_task_state
5195         // for master thread will not be accurate until after this in
5196         // __kmp_fork_call(), so we look to the master's memo_stack to get the
5197         // correct value.
5198         for (f = old_nproc; f < team->t.t_nproc; ++f)
5199           team->t.t_threads[f]->th.th_task_state =
5200               team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5201       } else { // set th_task_state for new threads in non-nested hot team
5202         int old_state =
5203             team->t.t_threads[0]->th.th_task_state; // copy master's state
5204         for (f = old_nproc; f < team->t.t_nproc; ++f)
5205           team->t.t_threads[f]->th.th_task_state = old_state;
5206       }
5207 
5208 #ifdef KMP_DEBUG
5209       for (f = 0; f < team->t.t_nproc; ++f) {
5210         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5211                          team->t.t_threads[f]->th.th_team_nproc ==
5212                              team->t.t_nproc);
5213       }
5214 #endif
5215 
5216       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5217 #if KMP_AFFINITY_SUPPORTED
5218       __kmp_partition_places(team);
5219 #endif
5220     } // Check changes in number of threads
5221 
5222     kmp_info_t *master = team->t.t_threads[0];
5223     if (master->th.th_teams_microtask) {
5224       for (f = 1; f < new_nproc; ++f) {
5225         // propagate teams construct specific info to workers
5226         kmp_info_t *thr = team->t.t_threads[f];
5227         thr->th.th_teams_microtask = master->th.th_teams_microtask;
5228         thr->th.th_teams_level = master->th.th_teams_level;
5229         thr->th.th_teams_size = master->th.th_teams_size;
5230       }
5231     }
5232 #if KMP_NESTED_HOT_TEAMS
5233     if (level) {
5234       // Sync barrier state for nested hot teams, not needed for outermost hot
5235       // team.
5236       for (f = 1; f < new_nproc; ++f) {
5237         kmp_info_t *thr = team->t.t_threads[f];
5238         int b;
5239         kmp_balign_t *balign = thr->th.th_bar;
5240         for (b = 0; b < bs_last_barrier; ++b) {
5241           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5242           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5243 #if USE_DEBUGGER
5244           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5245 #endif
5246         }
5247       }
5248     }
5249 #endif // KMP_NESTED_HOT_TEAMS
5250 
5251     /* reallocate space for arguments if necessary */
5252     __kmp_alloc_argv_entries(argc, team, TRUE);
5253     KMP_CHECK_UPDATE(team->t.t_argc, argc);
5254     // The hot team re-uses the previous task team,
5255     // if untouched during the previous release->gather phase.
5256 
5257     KF_TRACE(10, (" hot_team = %p\n", team));
5258 
5259 #if KMP_DEBUG
5260     if (__kmp_tasking_mode != tskm_immediate_exec) {
5261       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5262                     "task_team[1] = %p after reinit\n",
5263                     team->t.t_task_team[0], team->t.t_task_team[1]));
5264     }
5265 #endif
5266 
5267 #if OMPT_SUPPORT
5268     __ompt_team_assign_id(team, ompt_parallel_data);
5269 #endif
5270 
5271     KMP_MB();
5272 
5273     return team;
5274   }
5275 
5276   /* next, let's try to take one from the team pool */
5277   KMP_MB();
5278   for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5279     /* TODO: consider resizing undersized teams instead of reaping them, now
5280        that we have a resizing mechanism */
5281     if (team->t.t_max_nproc >= max_nproc) {
5282       /* take this team from the team pool */
5283       __kmp_team_pool = team->t.t_next_pool;
5284 
5285       /* setup the team for fresh use */
5286       __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5287 
5288       KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5289                     "task_team[1] %p to NULL\n",
5290                     &team->t.t_task_team[0], &team->t.t_task_team[1]));
5291       team->t.t_task_team[0] = NULL;
5292       team->t.t_task_team[1] = NULL;
5293 
5294       /* reallocate space for arguments if necessary */
5295       __kmp_alloc_argv_entries(argc, team, TRUE);
5296       KMP_CHECK_UPDATE(team->t.t_argc, argc);
5297 
5298       KA_TRACE(
5299           20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5300                team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5301       { // Initialize barrier data.
5302         int b;
5303         for (b = 0; b < bs_last_barrier; ++b) {
5304           team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5305 #if USE_DEBUGGER
5306           team->t.t_bar[b].b_master_arrived = 0;
5307           team->t.t_bar[b].b_team_arrived = 0;
5308 #endif
5309         }
5310       }
5311 
5312       team->t.t_proc_bind = new_proc_bind;
5313 
5314       KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5315                     team->t.t_id));
5316 
5317 #if OMPT_SUPPORT
5318       __ompt_team_assign_id(team, ompt_parallel_data);
5319 #endif
5320 
5321       KMP_MB();
5322 
5323       return team;
5324     }
5325 
5326     /* reap team if it is too small, then loop back and check the next one */
    // not sure if this is wise, but it will be redone during the hot-teams
    // rewrite.
5329     /* TODO: Use technique to find the right size hot-team, don't reap them */
5330     team = __kmp_reap_team(team);
5331     __kmp_team_pool = team;
5332   }
5333 
5334   /* nothing available in the pool, no matter, make a new team! */
5335   KMP_MB();
5336   team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5337 
5338   /* and set it up */
5339   team->t.t_max_nproc = max_nproc;
  /* NOTE well: for some reason, allocating one big buffer and dividing it up
     seems to hurt performance a lot on the P4, so let's not use this. */
5342   __kmp_allocate_team_arrays(team, max_nproc);
5343 
5344   KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5345   __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5346 
5347   KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5348                 "%p to NULL\n",
5349                 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5350   team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5351   // memory, no need to duplicate
5352   team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5353   // memory, no need to duplicate
5354 
5355   if (__kmp_storage_map) {
5356     __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5357   }
5358 
5359   /* allocate space for arguments */
5360   __kmp_alloc_argv_entries(argc, team, FALSE);
5361   team->t.t_argc = argc;
5362 
5363   KA_TRACE(20,
5364            ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5365             team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5366   { // Initialize barrier data.
5367     int b;
5368     for (b = 0; b < bs_last_barrier; ++b) {
5369       team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5370 #if USE_DEBUGGER
5371       team->t.t_bar[b].b_master_arrived = 0;
5372       team->t.t_bar[b].b_team_arrived = 0;
5373 #endif
5374     }
5375   }
5376 
5377   team->t.t_proc_bind = new_proc_bind;
5378 
5379 #if OMPT_SUPPORT
5380   __ompt_team_assign_id(team, ompt_parallel_data);
5381   team->t.ompt_serialized_team_info = NULL;
5382 #endif
5383 
5384   KMP_MB();
5385 
5386   KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5387                 team->t.t_id));
5388 
5389   return team;
5390 }
5391 
5392 /* TODO implement hot-teams at all levels */
5393 /* TODO implement lazy thread release on demand (disband request) */
5394 
5395 /* free the team.  return it to the team pool.  release all the threads
5396  * associated with it */
5397 void __kmp_free_team(kmp_root_t *root,
5398                      kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5399   int f;
5400   KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5401                 team->t.t_id));
5402 
5403   /* verify state */
5404   KMP_DEBUG_ASSERT(root);
5405   KMP_DEBUG_ASSERT(team);
5406   KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5407   KMP_DEBUG_ASSERT(team->t.t_threads);
5408 
5409   int use_hot_team = team == root->r.r_hot_team;
5410 #if KMP_NESTED_HOT_TEAMS
5411   int level;
5412   kmp_hot_team_ptr_t *hot_teams;
5413   if (master) {
5414     level = team->t.t_active_level - 1;
5415     if (master->th.th_teams_microtask) { // in teams construct?
5416       if (master->th.th_teams_size.nteams > 1) {
5417         ++level; // level was not increased in teams construct for
5418         // team_of_masters
5419       }
5420       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5421           master->th.th_teams_level == team->t.t_level) {
5422         ++level; // level was not increased in teams construct for
5423         // team_of_workers before the parallel
5424       } // team->t.t_level will be increased inside parallel
5425     }
5426     hot_teams = master->th.th_hot_teams;
5427     if (level < __kmp_hot_teams_max_level) {
5428       KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5429       use_hot_team = 1;
5430     }
5431   }
5432 #endif // KMP_NESTED_HOT_TEAMS
5433 
5434   /* team is done working */
5435   TCW_SYNC_PTR(team->t.t_pkfn,
5436                NULL); // Important for Debugging Support Library.
5437 #if KMP_OS_WINDOWS
5438   team->t.t_copyin_counter = 0; // init counter for possible reuse
5439 #endif
5440   // Do not reset pointer to parent team to NULL for hot teams.
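  // use_hot_team (computed above) selects between the two paths below: a hot
  // team keeps its workers for reuse, while a non-hot team releases them and
  // is returned to __kmp_team_pool.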
5441 
5442   /* if we are a non-hot team, release our threads */
5443   if (!use_hot_team) {
5444     if (__kmp_tasking_mode != tskm_immediate_exec) {
5445       // Wait for threads to reach reapable state
5446       for (f = 1; f < team->t.t_nproc; ++f) {
5447         KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5448         kmp_info_t *th = team->t.t_threads[f];
5449         volatile kmp_uint32 *state = &th->th.th_reap_state;
5450         while (*state != KMP_SAFE_TO_REAP) {
5451 #if KMP_OS_WINDOWS
5452           // On Windows a thread can be killed at any time, check this
5453           DWORD ecode;
5454           if (!__kmp_is_thread_alive(th, &ecode)) {
5455             *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5456             break;
5457           }
5458 #endif
5459           // first check if thread is sleeping
5460           kmp_flag_64 fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5461           if (fl.is_sleeping())
5462             fl.resume(__kmp_gtid_from_thread(th));
5463           KMP_CPU_PAUSE();
5464         }
5465       }
5466 
5467       // Delete task teams
5468       int tt_idx;
5469       for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5470         kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5471         if (task_team != NULL) {
5472           for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5473             KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5474             team->t.t_threads[f]->th.th_task_team = NULL;
5475           }
5476           KA_TRACE(
5477               20,
5478               ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5479                __kmp_get_gtid(), task_team, team->t.t_id));
5480 #if KMP_NESTED_HOT_TEAMS
5481           __kmp_free_task_team(master, task_team);
5482 #endif
5483           team->t.t_task_team[tt_idx] = NULL;
5484         }
5485       }
5486     }
5487 
5488     // Reset pointer to parent team only for non-hot teams.
5489     team->t.t_parent = NULL;
5490     team->t.t_level = 0;
5491     team->t.t_active_level = 0;
5492 
5493     /* free the worker threads */
5494     for (f = 1; f < team->t.t_nproc; ++f) {
5495       KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5496       __kmp_free_thread(team->t.t_threads[f]);
5497       team->t.t_threads[f] = NULL;
5498     }
5499 
5500     /* put the team back in the team pool */
5501     /* TODO limit size of team pool, call reap_team if pool too large */
5502     team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5503     __kmp_team_pool = (volatile kmp_team_t *)team;
5504   } else { // Check if team was created for the masters in a teams construct
5505     // See if first worker is a CG root
5506     KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5507                      team->t.t_threads[1]->th.th_cg_roots);
5508     if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5509       // Clean up the CG root nodes on workers so that this team can be re-used
5510       for (f = 1; f < team->t.t_nproc; ++f) {
5511         kmp_info_t *thr = team->t.t_threads[f];
5512         KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5513                          thr->th.th_cg_roots->cg_root == thr);
5514         // Pop current CG root off list
5515         kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5516         thr->th.th_cg_roots = tmp->up;
5517         KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5518                        " up to node %p. cg_nthreads was %d\n",
5519                        thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5520         int i = tmp->cg_nthreads--;
5521         if (i == 1) {
5522           __kmp_free(tmp); // free CG if we are the last thread in it
5523         }
5524         // Restore current task's thread_limit from CG root
5525         if (thr->th.th_cg_roots)
5526           thr->th.th_current_task->td_icvs.thread_limit =
5527               thr->th.th_cg_roots->cg_thread_limit;
5528       }
5529     }
5530   }
5531 
5532   KMP_MB();
5533 }
5534 
5535 /* reap the team.  destroy it, reclaim all its resources and free its memory */
5536 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5537   kmp_team_t *next_pool = team->t.t_next_pool;
5538 
5539   KMP_DEBUG_ASSERT(team);
5540   KMP_DEBUG_ASSERT(team->t.t_dispatch);
5541   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5542   KMP_DEBUG_ASSERT(team->t.t_threads);
5543   KMP_DEBUG_ASSERT(team->t.t_argv);
5544 
5545   /* TODO clean the threads that are a part of this? */
5546 
5547   /* free stuff */
5548   __kmp_free_team_arrays(team);
5549   if (team->t.t_argv != &team->t.t_inline_argv[0])
5550     __kmp_free((void *)team->t.t_argv);
5551   __kmp_free(team);
5552 
5553   KMP_MB();
5554   return next_pool;
5555 }
5556 
5557 // Free the thread.  Don't reap it, just place it on the pool of available
5558 // threads.
5559 //
5560 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5561 // binding for the affinity mechanism to be useful.
5562 //
5563 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5564 // However, we want to avoid a potential performance problem by always
5565 // scanning through the list to find the correct point at which to insert
5566 // the thread (potential N**2 behavior).  To do this we keep track of the
5567 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5568 // With single-level parallelism, threads will always be added to the tail
5569 // of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
5570 // parallelism, all bets are off and we may need to scan through the entire
5571 // free list.
5572 //
5573 // This change also has a potentially large performance benefit for some
5574 // applications.  Previously, as threads were freed from the hot team, they
5575 // would be placed back on the free list in inverse order.  If the hot team
5576 // grew back to its original size, then the freed threads would be placed
5577 // back on the hot team in reverse order.  This could cause bad cache
5578 // locality problems on programs where the size of the hot team regularly
5579 // grew and shrunk.
5580 //
5581 // Now, for single-level parallelism, the OMP tid is always == gtid.
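//
// Illustrative sketch (not executed): if the pool currently holds threads with
// gtids 2 -> 5 -> 9 and a thread with gtid 7 is freed, the scan below starts
// from __kmp_thread_pool_insert_pt (or from the pool head when that hint
// already points past gtid 7) and links the new entry between 5 and 9,
// keeping the list sorted by gtid.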
5582 void __kmp_free_thread(kmp_info_t *this_th) {
5583   int gtid;
5584   kmp_info_t **scan;
5585 
5586   KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5587                 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5588 
5589   KMP_DEBUG_ASSERT(this_th);
5590 
5591   // When moving a thread to the pool, switch it to waiting on its own b_go
5592   // flag and reset its team pointer to NULL (uninitialized).
5593   int b;
5594   kmp_balign_t *balign = this_th->th.th_bar;
5595   for (b = 0; b < bs_last_barrier; ++b) {
5596     if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5597       balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5598     balign[b].bb.team = NULL;
5599     balign[b].bb.leaf_kids = 0;
5600   }
5601   this_th->th.th_task_state = 0;
5602   this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
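  // A pooled thread no longer references team data, so it is marked safe to
  // reap here; shutdown (__kmp_internal_end) asserts this before reaping.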
5603 
5604   /* put thread back on the free pool */
5605   TCW_PTR(this_th->th.th_team, NULL);
5606   TCW_PTR(this_th->th.th_root, NULL);
5607   TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5608 
5609   while (this_th->th.th_cg_roots) {
5610     this_th->th.th_cg_roots->cg_nthreads--;
5611     KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5612                    " %p of thread  %p to %d\n",
5613                    this_th, this_th->th.th_cg_roots,
5614                    this_th->th.th_cg_roots->cg_root,
5615                    this_th->th.th_cg_roots->cg_nthreads));
5616     kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5617     if (tmp->cg_root == this_th) { // Thread is a cg_root
5618       KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5619       KA_TRACE(
5620           5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5621       this_th->th.th_cg_roots = tmp->up;
5622       __kmp_free(tmp);
5623     } else { // Worker thread
5624       if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5625         __kmp_free(tmp);
5626       }
5627       this_th->th.th_cg_roots = NULL;
5628       break;
5629     }
5630   }
5631 
5632   /* If the implicit task assigned to this thread can be used by other threads,
5633    * then multiple threads may share the data and try to free the task in
5634    * __kmp_reap_thread at exit. This duplicate use of the task data happens
5635    * with higher probability when the hot team is disabled, but it can occur
5636    * even when the hot team is enabled. */
5637   __kmp_free_implicit_task(this_th);
5638   this_th->th.th_current_task = NULL;
5639 
5640   // If the __kmp_thread_pool_insert_pt is already past the new insert
5641   // point, then we need to re-scan the entire list.
5642   gtid = this_th->th.th_info.ds.ds_gtid;
5643   if (__kmp_thread_pool_insert_pt != NULL) {
5644     KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5645     if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5646       __kmp_thread_pool_insert_pt = NULL;
5647     }
5648   }
5649 
5650   // Scan down the list to find the place to insert the thread.
5651   // scan is the address of a link in the list, possibly the address of
5652   // __kmp_thread_pool itself.
5653   //
5654   // In the absence of nested parallelism, the for loop will have 0 iterations.
5655   if (__kmp_thread_pool_insert_pt != NULL) {
5656     scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5657   } else {
5658     scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5659   }
5660   for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5661        scan = &((*scan)->th.th_next_pool))
5662     ;
5663 
5664   // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5665   // to its address.
5666   TCW_PTR(this_th->th.th_next_pool, *scan);
5667   __kmp_thread_pool_insert_pt = *scan = this_th;
5668   KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5669                    (this_th->th.th_info.ds.ds_gtid <
5670                     this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5671   TCW_4(this_th->th.th_in_pool, TRUE);
5672   __kmp_suspend_initialize_thread(this_th);
5673   __kmp_lock_suspend_mx(this_th);
5674   if (this_th->th.th_active == TRUE) {
5675     KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5676     this_th->th.th_active_in_pool = TRUE;
5677   }
5678 #if KMP_DEBUG
5679   else {
5680     KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5681   }
5682 #endif
5683   __kmp_unlock_suspend_mx(this_th);
5684 
5685   TCW_4(__kmp_nth, __kmp_nth - 1);
5686 
5687 #ifdef KMP_ADJUST_BLOCKTIME
5688   /* Adjust blocktime back to user setting or default if necessary */
5689   /* Middle initialization might never have occurred                */
5690   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5691     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5692     if (__kmp_nth <= __kmp_avail_proc) {
5693       __kmp_zero_bt = FALSE;
5694     }
5695   }
5696 #endif /* KMP_ADJUST_BLOCKTIME */
5697 
5698   KMP_MB();
5699 }
5700 
5701 /* ------------------------------------------------------------------------ */
5702 
5703 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5704   int gtid = this_thr->th.th_info.ds.ds_gtid;
5705   /*    void                 *stack_data;*/
5706   kmp_team_t **volatile pteam;
5707 
5708   KMP_MB();
5709   KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5710 
5711   if (__kmp_env_consistency_check) {
5712     this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5713   }
5714 
5715 #if OMPT_SUPPORT
5716   ompt_data_t *thread_data;
5717   if (ompt_enabled.enabled) {
5718     thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5719     *thread_data = ompt_data_none;
5720 
5721     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5722     this_thr->th.ompt_thread_info.wait_id = 0;
5723     this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5724     this_thr->th.ompt_thread_info.parallel_flags = 0;
5725     if (ompt_enabled.ompt_callback_thread_begin) {
5726       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5727           ompt_thread_worker, thread_data);
5728     }
5729     this_thr->th.ompt_thread_info.state = ompt_state_idle;
5730   }
5731 #endif
5732 
5733   /* This is the place where threads wait for work */
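  /* Each iteration: park in the fork barrier until released with a team, run
     the team's microtask via t_invoke, then synchronize in the join barrier.
     The loop exits once g_done signals library shutdown. */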
5734   while (!TCR_4(__kmp_global.g.g_done)) {
5735     KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5736     KMP_MB();
5737 
5738     /* wait for work to do */
5739     KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5740 
5741     /* No tid yet since not part of a team */
5742     __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5743 
5744 #if OMPT_SUPPORT
5745     if (ompt_enabled.enabled) {
5746       this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5747     }
5748 #endif
5749 
5750     pteam = &this_thr->th.th_team;
5751 
5752     /* have we been allocated? */
5753     if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5754       /* we were just woken up, so run our new task */
5755       if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5756         int rc;
5757         KA_TRACE(20,
5758                  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5759                   gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5760                   (*pteam)->t.t_pkfn));
5761 
5762         updateHWFPControl(*pteam);
5763 
5764 #if OMPT_SUPPORT
5765         if (ompt_enabled.enabled) {
5766           this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5767         }
5768 #endif
5769 
5770         rc = (*pteam)->t.t_invoke(gtid);
5771         KMP_ASSERT(rc);
5772 
5773         KMP_MB();
5774         KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5775                       gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5776                       (*pteam)->t.t_pkfn));
5777       }
5778 #if OMPT_SUPPORT
5779       if (ompt_enabled.enabled) {
5780         /* no frame set while outside task */
5781         __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
5782 
5783         this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5784       }
5785 #endif
5786       /* join barrier after parallel region */
5787       __kmp_join_barrier(gtid);
5788     }
5789   }
5790   TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5791 
5792 #if OMPT_SUPPORT
5793   if (ompt_enabled.ompt_callback_thread_end) {
5794     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5795   }
5796 #endif
5797 
5798   this_thr->th.th_task_team = NULL;
5799   /* run the destructors for the threadprivate data for this thread */
5800   __kmp_common_destroy_gtid(gtid);
5801 
5802   KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5803   KMP_MB();
5804   return this_thr;
5805 }
5806 
5807 /* ------------------------------------------------------------------------ */
5808 
5809 void __kmp_internal_end_dest(void *specific_gtid) {
5810 #if KMP_COMPILER_ICC
5811 #pragma warning(push)
5812 #pragma warning(disable : 810) // conversion from "void *" to "int" may lose
5813 // significant bits
5814 #endif
5815   // Make sure no significant bits are lost
5816   int gtid = (kmp_intptr_t)specific_gtid - 1;
5817 #if KMP_COMPILER_ICC
5818 #pragma warning(pop)
5819 #endif
5820 
5821   KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5822   /* NOTE: the gtid is stored as gtid+1 in thread-local storage
5823    * because 0 is reserved for the nothing-stored case */
5824 
5825   __kmp_internal_end_thread(gtid);
5826 }
5827 
5828 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5829 
5830 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5831   __kmp_internal_end_atexit();
5832 }
5833 
5834 #endif
5835 
5836 /* [Windows] josh: when the atexit handler is called, there may still be more
5837    than one thread alive */
5838 void __kmp_internal_end_atexit(void) {
5839   KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5840   /* [Windows]
5841      josh: ideally, we want to completely shutdown the library in this atexit
5842      handler, but stat code that depends on thread specific data for gtid fails
5843      because that data becomes unavailable at some point during the shutdown, so
5844      we call __kmp_internal_end_thread instead. We should eventually remove the
5845      dependency on __kmp_get_specific_gtid in the stat code and use
5846      __kmp_internal_end_library to cleanly shutdown the library.
5847 
5848      // TODO: Can some of this comment about GVS be removed?
5849      I suspect that the offending stat code is executed when the calling thread
5850      tries to clean up a dead root thread's data structures, resulting in GVS
5851      code trying to close the GVS structures for that thread, but since the stat
5852      code uses __kmp_get_specific_gtid to get the gtid with the assumption that
5853      the calling thread is cleaning up itself instead of another thread, it gets
5854      confused. This happens because allowing a thread to unregister and clean up
5855      another thread is a recent modification for addressing an issue.
5856      Based on the current design (20050722), a thread may end up
5857      trying to unregister another thread only if thread death does not trigger
5858      the calling of __kmp_internal_end_thread.  For Linux* OS, there is the
5859      thread specific data destructor function to detect thread death. For
5860      Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5861      is nothing.  Thus, the workaround is applicable only for Windows static
5862      stat library. */
5863   __kmp_internal_end_library(-1);
5864 #if KMP_OS_WINDOWS
5865   __kmp_close_console();
5866 #endif
5867 }
5868 
5869 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5870   // It is assumed __kmp_forkjoin_lock is acquired.
5871 
5872   int gtid;
5873 
5874   KMP_DEBUG_ASSERT(thread != NULL);
5875 
5876   gtid = thread->th.th_info.ds.ds_gtid;
5877 
5878   if (!is_root) {
5879     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5880       /* Assume the threads are at the fork barrier here */
5881       KA_TRACE(
5882           20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5883                gtid));
5884       /* Need release fence here to prevent seg faults for tree forkjoin barrier
5885        * (GEH) */
5886       ANNOTATE_HAPPENS_BEFORE(thread);
5887       kmp_flag_64 flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
5888       __kmp_release_64(&flag);
5889     }
5890 
5891     // Terminate OS thread.
5892     __kmp_reap_worker(thread);
5893 
5894     // The thread was killed asynchronously.  If it was actively
5895     // spinning in the thread pool, decrement the global count.
5896     //
5897     // There is a small timing hole here - if the worker thread was just waking
5898     // up after sleeping in the pool, had reset its th_active_in_pool flag but
5899     // not decremented the global counter __kmp_thread_pool_active_nth yet, then
5900     // the global counter might not get updated.
5901     //
5902     // Currently, this can only happen as the library is unloaded,
5903     // so there are no harmful side effects.
5904     if (thread->th.th_active_in_pool) {
5905       thread->th.th_active_in_pool = FALSE;
5906       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
5907       KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
5908     }
5909   }
5910 
5911   __kmp_free_implicit_task(thread);
5912 
5913 // Free the fast memory for tasking
5914 #if USE_FAST_MEMORY
5915   __kmp_free_fast_memory(thread);
5916 #endif /* USE_FAST_MEMORY */
5917 
5918   __kmp_suspend_uninitialize_thread(thread);
5919 
5920   KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5921   TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5922 
5923   --__kmp_all_nth;
5924 // __kmp_nth was decremented when thread is added to the pool.
5925 
5926 #ifdef KMP_ADJUST_BLOCKTIME
5927   /* Adjust blocktime back to user setting or default if necessary */
5928   /* Middle initialization might never have occurred                */
5929   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5930     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5931     if (__kmp_nth <= __kmp_avail_proc) {
5932       __kmp_zero_bt = FALSE;
5933     }
5934   }
5935 #endif /* KMP_ADJUST_BLOCKTIME */
5936 
5937   /* free the memory being used */
5938   if (__kmp_env_consistency_check) {
5939     if (thread->th.th_cons) {
5940       __kmp_free_cons_stack(thread->th.th_cons);
5941       thread->th.th_cons = NULL;
5942     }
5943   }
5944 
5945   if (thread->th.th_pri_common != NULL) {
5946     __kmp_free(thread->th.th_pri_common);
5947     thread->th.th_pri_common = NULL;
5948   }
5949 
5950   if (thread->th.th_task_state_memo_stack != NULL) {
5951     __kmp_free(thread->th.th_task_state_memo_stack);
5952     thread->th.th_task_state_memo_stack = NULL;
5953   }
5954 
5955 #if KMP_USE_BGET
5956   if (thread->th.th_local.bget_data != NULL) {
5957     __kmp_finalize_bget(thread);
5958   }
5959 #endif
5960 
5961 #if KMP_AFFINITY_SUPPORTED
5962   if (thread->th.th_affin_mask != NULL) {
5963     KMP_CPU_FREE(thread->th.th_affin_mask);
5964     thread->th.th_affin_mask = NULL;
5965   }
5966 #endif /* KMP_AFFINITY_SUPPORTED */
5967 
5968 #if KMP_USE_HIER_SCHED
5969   if (thread->th.th_hier_bar_data != NULL) {
5970     __kmp_free(thread->th.th_hier_bar_data);
5971     thread->th.th_hier_bar_data = NULL;
5972   }
5973 #endif
5974 
5975   __kmp_reap_team(thread->th.th_serial_team);
5976   thread->th.th_serial_team = NULL;
5977   __kmp_free(thread);
5978 
5979   KMP_MB();
5980 
5981 } // __kmp_reap_thread
5982 
5983 static void __kmp_internal_end(void) {
5984   int i;
5985 
5986   /* First, unregister the library */
5987   __kmp_unregister_library();
5988 
5989 #if KMP_OS_WINDOWS
5990   /* In Win static library, we can't tell when a root actually dies, so we
5991      reclaim the data structures for any root threads that have died but not
5992      unregistered themselves, in order to shut down cleanly.
5993      In Win dynamic library we also can't tell when a thread dies.  */
5994   __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
5995 // dead roots
5996 #endif
5997 
5998   for (i = 0; i < __kmp_threads_capacity; i++)
5999     if (__kmp_root[i])
6000       if (__kmp_root[i]->r.r_active)
6001         break;
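  // After the scan, i < __kmp_threads_capacity iff at least one root is still
  // active.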
6002   KMP_MB(); /* Flush all pending memory write invalidates.  */
6003   TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6004 
6005   if (i < __kmp_threads_capacity) {
6006 #if KMP_USE_MONITOR
6007     // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6008     KMP_MB(); /* Flush all pending memory write invalidates.  */
6009 
6010     // Need to check that monitor was initialized before reaping it. If we are
6011     // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6012     // __kmp_monitor will appear to contain valid data, but it is only valid in
6013     // the parent process, not the child.
6014     // New behavior (201008): instead of keying off of the flag
6015     // __kmp_init_parallel, the monitor thread creation is keyed off
6016     // of the new flag __kmp_init_monitor.
6017     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6018     if (TCR_4(__kmp_init_monitor)) {
6019       __kmp_reap_monitor(&__kmp_monitor);
6020       TCW_4(__kmp_init_monitor, 0);
6021     }
6022     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6023     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6024 #endif // KMP_USE_MONITOR
6025   } else {
6026 /* TODO move this to cleanup code */
6027 #ifdef KMP_DEBUG
6028     /* make sure that everything has properly ended */
6029     for (i = 0; i < __kmp_threads_capacity; i++) {
6030       if (__kmp_root[i]) {
6031         //                    KMP_ASSERT( ! KMP_UBER_GTID( i ) );         // AC:
6032         //                    there can be uber threads alive here
6033         KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6034       }
6035     }
6036 #endif
6037 
6038     KMP_MB();
6039 
6040     // Reap the worker threads.
6041     // This is valid for now, but be careful if threads are reaped sooner.
6042     while (__kmp_thread_pool != NULL) { // Loop through all threads in the pool.
6043       // Get the next thread from the pool.
6044       kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6045       __kmp_thread_pool = thread->th.th_next_pool;
6046       // Reap it.
6047       KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6048       thread->th.th_next_pool = NULL;
6049       thread->th.th_in_pool = FALSE;
6050       __kmp_reap_thread(thread, 0);
6051     }
6052     __kmp_thread_pool_insert_pt = NULL;
6053 
6054     // Reap teams.
6055     while (__kmp_team_pool != NULL) { // Loop through all teams in the pool.
6056       // Get the next team from the pool.
6057       kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6058       __kmp_team_pool = team->t.t_next_pool;
6059       // Reap it.
6060       team->t.t_next_pool = NULL;
6061       __kmp_reap_team(team);
6062     }
6063 
6064     __kmp_reap_task_teams();
6065 
6066 #if KMP_OS_UNIX
6067     // Threads that are not reaped should not access any resources since they
6068     // are going to be deallocated soon, so the shutdown sequence should wait
6069     // until all threads either exit the final spin-waiting loop or begin
6070     // sleeping after the given blocktime.
6071     for (i = 0; i < __kmp_threads_capacity; i++) {
6072       kmp_info_t *thr = __kmp_threads[i];
6073       while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6074         KMP_CPU_PAUSE();
6075     }
6076 #endif
6077 
6078     for (i = 0; i < __kmp_threads_capacity; ++i) {
6079       // TBD: Add some checking...
6080       // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6081     }
6082 
6083     /* Make sure all threadprivate destructors get run by joining with all
6084        worker threads before resetting this flag */
6085     TCW_SYNC_4(__kmp_init_common, FALSE);
6086 
6087     KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6088     KMP_MB();
6089 
6090 #if KMP_USE_MONITOR
6091     // See note above: One of the possible fixes for CQ138434 / CQ140126
6092     //
6093     // FIXME: push both code fragments down and CSE them?
6094     // push them into __kmp_cleanup() ?
6095     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6096     if (TCR_4(__kmp_init_monitor)) {
6097       __kmp_reap_monitor(&__kmp_monitor);
6098       TCW_4(__kmp_init_monitor, 0);
6099     }
6100     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6101     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6102 #endif
6103   } /* else !__kmp_global.t_active */
6104   TCW_4(__kmp_init_gtid, FALSE);
6105   KMP_MB(); /* Flush all pending memory write invalidates.  */
6106 
6107   __kmp_cleanup();
6108 #if OMPT_SUPPORT
6109   ompt_fini();
6110 #endif
6111 }
6112 
6113 void __kmp_internal_end_library(int gtid_req) {
6114   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6115   /* this shouldn't be a race condition because __kmp_internal_end() is the
6116      only place to clear __kmp_serial_init */
6117   /* we'll check this later too, after we get the lock */
6118   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6119   // redundant, because the next check will work in any case.
6120   if (__kmp_global.g.g_abort) {
6121     KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6122     /* TODO abort? */
6123     return;
6124   }
6125   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6126     KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6127     return;
6128   }
6129 
6130   KMP_MB(); /* Flush all pending memory write invalidates.  */
6131   /* find out who we are and what we should do */
6132   {
6133     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6134     KA_TRACE(
6135         10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req));
6136     if (gtid == KMP_GTID_SHUTDOWN) {
6137       KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6138                     "already shutdown\n"));
6139       return;
6140     } else if (gtid == KMP_GTID_MONITOR) {
6141       KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6142                     "registered, or system shutdown\n"));
6143       return;
6144     } else if (gtid == KMP_GTID_DNE) {
6145       KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6146                     "shutdown\n"));
6147       /* we don't know who we are, but we may still shutdown the library */
6148     } else if (KMP_UBER_GTID(gtid)) {
6149       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6150       if (__kmp_root[gtid]->r.r_active) {
6151         __kmp_global.g.g_abort = -1;
6152         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6153         __kmp_unregister_library();
6154         KA_TRACE(10,
6155                  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6156                   gtid));
6157         return;
6158       } else {
6159         KA_TRACE(
6160             10,
6161             ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6162         __kmp_unregister_root_current_thread(gtid);
6163       }
6164     } else {
6165 /* worker threads may call this function through the atexit handler, if they
6166  * call exit() */
6167 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6168    TODO: do a thorough shutdown instead */
6169 #ifdef DUMP_DEBUG_ON_EXIT
6170       if (__kmp_debug_buf)
6171         __kmp_dump_debug_buffer();
6172 #endif
6173       // The unregister-library call was added here for the shared-memory (shm)
6174       // path on Linux; without it, stale registration files would be left in
6175       // /dev/shm. Clean up the shared memory file before exiting.
6176       __kmp_unregister_library();
6177       return;
6178     }
6179   }
6180   /* synchronize the termination process */
6181   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6182 
6183   /* have we already finished */
6184   if (__kmp_global.g.g_abort) {
6185     KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6186     /* TODO abort? */
6187     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6188     return;
6189   }
6190   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6191     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6192     return;
6193   }
6194 
6195   /* We need this lock to enforce mutex between this reading of
6196      __kmp_threads_capacity and the writing by __kmp_register_root.
6197      Alternatively, we can use a counter of roots that is atomically updated by
6198      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6199      __kmp_internal_end_*.  */
6200   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6201 
6202   /* now we can safely conduct the actual termination */
6203   __kmp_internal_end();
6204 
6205   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6206   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6207 
6208   KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6209 
6210 #ifdef DUMP_DEBUG_ON_EXIT
6211   if (__kmp_debug_buf)
6212     __kmp_dump_debug_buffer();
6213 #endif
6214 
6215 #if KMP_OS_WINDOWS
6216   __kmp_close_console();
6217 #endif
6218 
6219   __kmp_fini_allocator();
6220 
6221 } // __kmp_internal_end_library
6222 
6223 void __kmp_internal_end_thread(int gtid_req) {
6224   int i;
6225 
6226   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6227   /* this shouldn't be a race condition because __kmp_internal_end() is the
6228    * only place to clear __kmp_serial_init */
6229   /* we'll check this later too, after we get the lock */
6230   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6231   // redundant, because the next check will work in any case.
6232   if (__kmp_global.g.g_abort) {
6233     KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6234     /* TODO abort? */
6235     return;
6236   }
6237   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6238     KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6239     return;
6240   }
6241 
6242   KMP_MB(); /* Flush all pending memory write invalidates.  */
6243 
6244   /* find out who we are and what we should do */
6245   {
6246     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6247     KA_TRACE(10,
6248              ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req));
6249     if (gtid == KMP_GTID_SHUTDOWN) {
6250       KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6251                     "already shutdown\n"));
6252       return;
6253     } else if (gtid == KMP_GTID_MONITOR) {
6254       KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6255                     "registered, or system shutdown\n"));
6256       return;
6257     } else if (gtid == KMP_GTID_DNE) {
6258       KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6259                     "shutdown\n"));
6260       return;
6261       /* we don't know who we are */
6262     } else if (KMP_UBER_GTID(gtid)) {
6263       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6264       if (__kmp_root[gtid]->r.r_active) {
6265         __kmp_global.g.g_abort = -1;
6266         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6267         KA_TRACE(10,
6268                  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6269                   gtid));
6270         return;
6271       } else {
6272         KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6273                       gtid));
6274         __kmp_unregister_root_current_thread(gtid);
6275       }
6276     } else {
6277       /* just a worker thread, let's leave */
6278       KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6279 
6280       if (gtid >= 0) {
6281         __kmp_threads[gtid]->th.th_task_team = NULL;
6282       }
6283 
6284       KA_TRACE(10,
6285                ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6286                 gtid));
6287       return;
6288     }
6289   }
6290 #if KMP_DYNAMIC_LIB
6291   if (__kmp_pause_status != kmp_hard_paused)
6292   // AC: let's not shut down the dynamic library at the exit of an uber thread,
6293   // because it is better to shut down later in the library destructor.
6294   {
6295     KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6296     return;
6297   }
6298 #endif
6299   /* synchronize the termination process */
6300   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6301 
6302   /* have we already finished */
6303   if (__kmp_global.g.g_abort) {
6304     KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6305     /* TODO abort? */
6306     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6307     return;
6308   }
6309   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6310     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6311     return;
6312   }
6313 
6314   /* We need this lock to enforce mutex between this reading of
6315      __kmp_threads_capacity and the writing by __kmp_register_root.
6316      Alternatively, we can use a counter of roots that is atomically updated by
6317      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6318      __kmp_internal_end_*.  */
6319 
6320   /* should we finish the run-time?  are all siblings done? */
6321   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6322 
6323   for (i = 0; i < __kmp_threads_capacity; ++i) {
6324     if (KMP_UBER_GTID(i)) {
6325       KA_TRACE(
6326           10,
6327           ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6328       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6329       __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6330       return;
6331     }
6332   }
6333 
6334   /* now we can safely conduct the actual termination */
6335 
6336   __kmp_internal_end();
6337 
6338   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6339   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6340 
6341   KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6342 
6343 #ifdef DUMP_DEBUG_ON_EXIT
6344   if (__kmp_debug_buf)
6345     __kmp_dump_debug_buffer();
6346 #endif
6347 } // __kmp_internal_end_thread
6348 
6349 // -----------------------------------------------------------------------------
6350 // Library registration stuff.
6351 
6352 static long __kmp_registration_flag = 0;
6353 // Random value used to indicate library initialization.
6354 static char *__kmp_registration_str = NULL;
6355 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
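// The string has the form "<flag address>-<flag value>-<library file>"; it is
// published at startup and parsed back in __kmp_register_library_startup() to
// decide whether another copy of the runtime is still alive.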
6356 
6357 static inline char *__kmp_reg_status_name() {
6358   /* On RHEL 3u5 if linked statically, getpid() returns different values in
6359      each thread. If registration and unregistration go in different threads
6360      (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env
6361      env var cannot be found, because the name will contain a different pid. */
6362 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6363   return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6364                           (int)getuid());
6365 #else
6366   return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6367 #endif
6368 } // __kmp_reg_status_name
6369 
6370 void __kmp_register_library_startup(void) {
6371 
6372   char *name = __kmp_reg_status_name(); // Name of the environment variable.
6373   int done = 0;
6374   union {
6375     double dtime;
6376     long ltime;
6377   } time;
6378 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6379   __kmp_initialize_system_tick();
6380 #endif
6381   __kmp_read_system_time(&time.dtime);
6382   __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
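  // The flag mixes a fixed magic prefix with the low bits of the current time,
  // so concurrently registering instances are very likely to publish distinct
  // values.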
6383   __kmp_registration_str =
6384       __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6385                        __kmp_registration_flag, KMP_LIBRARY_FILE);
6386 
6387   KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6388                 __kmp_registration_str));
6389 
6390   while (!done) {
6391 
6392     char *value = NULL; // Actual value of the environment variable.
6393 
6394 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6395     char *shm_name = __kmp_str_format("/%s", name);
6396     int shm_preexist = 0;
6397     char *data1;
6398     int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6399     if ((fd1 == -1) && (errno == EEXIST)) {
6400       // file didn't open because it already exists.
6401       // try opening existing file
6402       fd1 = shm_open(shm_name, O_RDWR, 0666);
6403       if (fd1 == -1) { // file didn't open
6404         // error out here
6405         __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0),
6406                     __kmp_msg_null);
6407       } else {
6408         // able to open existing file
6409         shm_preexist = 1;
6410       }
6411     } else if (fd1 == -1) { // SHM didn't open; it was due to error other than
6412       // already exists.
6413       // error out here.
6414       __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno),
6415                   __kmp_msg_null);
6416     }
6417     if (shm_preexist == 0) {
6418       // we created SHM now set size
6419       if (ftruncate(fd1, SHM_SIZE) == -1) {
6420         // error occurred while setting the size
6421         __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"),
6422                     KMP_ERR(errno), __kmp_msg_null);
6423       }
6424     }
6425     data1 =
6426         (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
6427     if (data1 == MAP_FAILED) {
6428       // failed to map shared memory
6429       __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno),
6430                   __kmp_msg_null);
6431     }
6432     if (shm_preexist == 0) { // set data to SHM, set value
6433       KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6434     }
6435     // Read value from either what we just wrote or existing file.
6436     value = __kmp_str_format("%s", data1); // read value from SHM
6437     munmap(data1, SHM_SIZE);
6438     close(fd1);
6439 #else // Windows and unix with static library
6440     // Set the environment variable, but do not overwrite it if it already exists.
6441     __kmp_env_set(name, __kmp_registration_str, 0);
6442     // read value to see if it got set
6443     value = __kmp_env_get(name);
6444 #endif
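    // If the value read back matches what we tried to publish, this instance
    // owns the registration; otherwise another runtime copy got there first,
    // and we must work out below whether that copy is alive or dead.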
6445 
6446     if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6447       done = 1; // Ok, environment variable set successfully, exit the loop.
6448     } else {
6449       // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6450       // Check whether it is alive or dead.
6451       int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6452       char *tail = value;
6453       char *flag_addr_str = NULL;
6454       char *flag_val_str = NULL;
6455       char const *file_name = NULL;
6456       __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6457       __kmp_str_split(tail, '-', &flag_val_str, &tail);
6458       file_name = tail;
6459       if (tail != NULL) {
6460         long *flag_addr = 0;
6461         long flag_val = 0;
6462         KMP_SSCANF(flag_addr_str, "%p", RCAST(void**, &flag_addr));
6463         KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6464         if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6465           // First, check whether environment-encoded address is mapped into
6466           // addr space.
6467           // If so, dereference it to see if it still has the right value.
6468           if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6469             neighbor = 1;
6470           } else {
6471             // If not, then we know the other copy of the library is no longer
6472             // running.
6473             neighbor = 2;
6474           }
6475         }
6476       }
6477       switch (neighbor) {
6478       case 0: // Cannot parse environment variable -- neighbor status unknown.
6479         // Assume it is an incompatible format from a future version of the
6480         // library, and assume the other library is alive.
6481         // WARN( ... ); // TODO: Issue a warning.
6482         file_name = "unknown library";
6483         KMP_FALLTHROUGH();
6484       // Attention! Falling through to the next case is intentional.
6485       case 1: { // Neighbor is alive.
6486         // Check it is allowed.
6487         char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6488         if (!__kmp_str_match_true(duplicate_ok)) {
6489           // That's not allowed. Issue fatal error.
6490           __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6491                       KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6492         }
6493         KMP_INTERNAL_FREE(duplicate_ok);
6494         __kmp_duplicate_library_ok = 1;
6495         done = 1; // Exit the loop.
6496       } break;
6497       case 2: { // Neighbor is dead.
6498 
6499 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6500         // close shared memory.
6501         shm_unlink(shm_name); // this removes file in /dev/shm
6502 #else
6503         // Clear the variable and try to register library again.
6504         __kmp_env_unset(name);
6505 #endif
6506       } break;
6507       default: { KMP_DEBUG_ASSERT(0); } break;
6508       }
6509     }
6510     KMP_INTERNAL_FREE((void *)value);
6511 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6512     KMP_INTERNAL_FREE((void *)shm_name);
6513 #endif
6514   } // while
6515   KMP_INTERNAL_FREE((void *)name);
6516 
6517 } // func __kmp_register_library_startup
6518 
6519 void __kmp_unregister_library(void) {
6520 
6521   char *name = __kmp_reg_status_name();
6522   char *value = NULL;
6523 
6524 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6525   char *shm_name = __kmp_str_format("/%s", name);
6526   int fd1 = shm_open(shm_name, O_RDONLY, 0666);
6527   if (fd1 == -1) {
6528     // file did not open. return.
6529     return;
6530   }
6531   char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6532   if (data1 != MAP_FAILED) {
6533     value = __kmp_str_format("%s", data1); // read value from SHM
6534     munmap(data1, SHM_SIZE);
6535   }
6536   close(fd1);
6537 #else
6538   value = __kmp_env_get(name);
6539 #endif
6540 
6541   KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6542   KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6543   if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6544 //  Ok, this is our variable. Delete it.
6545 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6546     shm_unlink(shm_name); // this removes file in /dev/shm
6547 #else
6548     __kmp_env_unset(name);
6549 #endif
6550   }
6551 
6552 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6553   KMP_INTERNAL_FREE(shm_name);
6554 #endif
6555 
6556   KMP_INTERNAL_FREE(__kmp_registration_str);
6557   KMP_INTERNAL_FREE(value);
6558   KMP_INTERNAL_FREE(name);
6559 
6560   __kmp_registration_flag = 0;
6561   __kmp_registration_str = NULL;
6562 
6563 } // __kmp_unregister_library
6564 
6565 // End of Library registration stuff.
6566 // -----------------------------------------------------------------------------
6567 
6568 #if KMP_MIC_SUPPORTED
6569 
6570 static void __kmp_check_mic_type() {
6571   kmp_cpuid_t cpuid_state = {0};
6572   kmp_cpuid_t *cs_p = &cpuid_state;
6573   __kmp_x86_cpuid(1, 0, cs_p);
6574   // We don't support mic1 at the moment
6575   if ((cs_p->eax & 0xff0) == 0xB10) {
6576     __kmp_mic_type = mic2;
6577   } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6578     __kmp_mic_type = mic3;
6579   } else {
6580     __kmp_mic_type = non_mic;
6581   }
6582 }
6583 
6584 #endif /* KMP_MIC_SUPPORTED */
6585 
6586 static void __kmp_do_serial_initialize(void) {
6587   int i, gtid;
6588   int size;
6589 
6590   KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6591 
6592   KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6593   KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6594   KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6595   KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6596   KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
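  // Sanity-check the fixed-width typedefs before anything else relies on them.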
6597 
6598 #if OMPT_SUPPORT
6599   ompt_pre_init();
6600 #endif
6601 
6602   __kmp_validate_locks();
6603 
6604   /* Initialize internal memory allocator */
6605   __kmp_init_allocator();
6606 
6607   /* Register the library startup via an environment variable and check to see
6608      whether another copy of the library is already registered. */
6609 
6610   __kmp_register_library_startup();
6611 
6612   /* TODO reinitialization of library */
6613   if (TCR_4(__kmp_global.g.g_done)) {
6614     KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6615   }
6616 
6617   __kmp_global.g.g_abort = 0;
6618   TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6619 
6620 /* initialize the locks */
6621 #if KMP_USE_ADAPTIVE_LOCKS
6622 #if KMP_DEBUG_ADAPTIVE_LOCKS
6623   __kmp_init_speculative_stats();
6624 #endif
6625 #endif
6626 #if KMP_STATS_ENABLED
6627   __kmp_stats_init();
6628 #endif
6629   __kmp_init_lock(&__kmp_global_lock);
6630   __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6631   __kmp_init_lock(&__kmp_debug_lock);
6632   __kmp_init_atomic_lock(&__kmp_atomic_lock);
6633   __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6634   __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6635   __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6636   __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6637   __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6638   __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6639   __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6640   __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6641   __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6642   __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6643   __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6644   __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6645   __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6646   __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6647 #if KMP_USE_MONITOR
6648   __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6649 #endif
6650   __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6651 
6652   /* conduct initialization and initial setup of configuration */
6653 
6654   __kmp_runtime_initialize();
6655 
6656 #if KMP_MIC_SUPPORTED
6657   __kmp_check_mic_type();
6658 #endif
6659 
6660 // Some global variable initialization moved here from kmp_env_initialize()
6661 #ifdef KMP_DEBUG
6662   kmp_diag = 0;
6663 #endif
6664   __kmp_abort_delay = 0;
6665 
6666   // From __kmp_init_dflt_team_nth()
6667   /* assume the entire machine will be used */
6668   __kmp_dflt_team_nth_ub = __kmp_xproc;
6669   if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6670     __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6671   }
6672   if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6673     __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6674   }
6675   __kmp_max_nth = __kmp_sys_max_nth;
6676   __kmp_cg_max_nth = __kmp_sys_max_nth;
6677   __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6678   if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6679     __kmp_teams_max_nth = __kmp_sys_max_nth;
6680   }
6681 
6682   // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6683   // part
6684   __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6685 #if KMP_USE_MONITOR
6686   __kmp_monitor_wakeups =
6687       KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6688   __kmp_bt_intervals =
6689       KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6690 #endif
6691   // From "KMP_LIBRARY" part of __kmp_env_initialize()
6692   __kmp_library = library_throughput;
6693   // From KMP_SCHEDULE initialization
6694   __kmp_static = kmp_sch_static_balanced;
6695 // AC: do not use analytical here, because it is non-monotonous
6696 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6697 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6698 // need to repeat assignment
6699 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6700 // bit control and barrier method control parts
6701 #if KMP_FAST_REDUCTION_BARRIER
6702 #define kmp_reduction_barrier_gather_bb ((int)1)
6703 #define kmp_reduction_barrier_release_bb ((int)1)
6704 #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6705 #define kmp_reduction_barrier_release_pat bp_hyper_bar
6706 #endif // KMP_FAST_REDUCTION_BARRIER
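  // Seed every barrier type with the default branch bits and gather/release
  // patterns; the reduction barrier (when enabled) gets the tuned hyper/1
  // settings defined just above.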
6707   for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6708     __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6709     __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6710     __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6711     __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6712 #if KMP_FAST_REDUCTION_BARRIER
6713     if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only
6714       // (lin_64): hyper,1
6715       __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6716       __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6717       __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6718       __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6719     }
6720 #endif // KMP_FAST_REDUCTION_BARRIER
6721   }
6722 #if KMP_FAST_REDUCTION_BARRIER
6723 #undef kmp_reduction_barrier_release_pat
6724 #undef kmp_reduction_barrier_gather_pat
6725 #undef kmp_reduction_barrier_release_bb
6726 #undef kmp_reduction_barrier_gather_bb
6727 #endif // KMP_FAST_REDUCTION_BARRIER
6728 #if KMP_MIC_SUPPORTED
6729   if (__kmp_mic_type == mic2) { // KNC
6730     // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6731     __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6732     __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6733         1; // forkjoin release
6734     __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6735     __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6736   }
6737 #if KMP_FAST_REDUCTION_BARRIER
6738   if (__kmp_mic_type == mic2) { // KNC
6739     __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6740     __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6741   }
6742 #endif // KMP_FAST_REDUCTION_BARRIER
6743 #endif // KMP_MIC_SUPPORTED
6744 
6745 // From KMP_CHECKS initialization
6746 #ifdef KMP_DEBUG
6747   __kmp_env_checks = TRUE; /* development versions have the extra checks */
6748 #else
6749   __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6750 #endif
6751 
6752   // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6753   __kmp_foreign_tp = TRUE;
6754 
6755   __kmp_global.g.g_dynamic = FALSE;
6756   __kmp_global.g.g_dynamic_mode = dynamic_default;
6757 
6758   __kmp_env_initialize(NULL);
6759 
6760 // Print all messages in message catalog for testing purposes.
6761 #ifdef KMP_DEBUG
6762   char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6763   if (__kmp_str_match_true(val)) {
6764     kmp_str_buf_t buffer;
6765     __kmp_str_buf_init(&buffer);
6766     __kmp_i18n_dump_catalog(&buffer);
6767     __kmp_printf("%s", buffer.str);
6768     __kmp_str_buf_free(&buffer);
6769   }
6770   __kmp_env_free(&val);
6771 #endif
6772 
6773   __kmp_threads_capacity =
6774       __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6775   // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6776   __kmp_tp_capacity = __kmp_default_tp_capacity(
6777       __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6778 
6779   // If the library is shut down properly, both pools must be NULL. Just in
6780   // case, set them to NULL -- some memory may leak, but subsequent code will
6781   // work even if pools are not freed.
6782   KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6783   KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6784   KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6785   __kmp_thread_pool = NULL;
6786   __kmp_thread_pool_insert_pt = NULL;
6787   __kmp_team_pool = NULL;
6788 
6789   /* Allocate all of the variable sized records */
6790   /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6791    * expandable */
6792   /* Since allocation is cache-aligned, just add extra padding at the end */
6793   size =
6794       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6795       CACHE_LINE;
6796   __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6797   __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6798                                sizeof(kmp_info_t *) * __kmp_threads_capacity);
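  // __kmp_threads and __kmp_root share one cache-aligned allocation; the root
  // pointer array begins immediately after the thread pointer array.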
6799 
6800   /* init thread counts */
6801   KMP_DEBUG_ASSERT(__kmp_all_nth ==
6802                    0); // Asserts fail if the library is reinitializing and
6803   KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
6804   __kmp_all_nth = 0;
6805   __kmp_nth = 0;
6806 
6807   /* setup the uber master thread and hierarchy */
6808   gtid = __kmp_register_root(TRUE);
6809   KA_TRACE(10, ("__kmp_do_serial_initialize  T#%d\n", gtid));
6810   KMP_ASSERT(KMP_UBER_GTID(gtid));
6811   KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6812 
6813   KMP_MB(); /* Flush all pending memory write invalidates.  */
6814 
6815   __kmp_common_initialize();
6816 
6817 #if KMP_OS_UNIX
6818   /* invoke the child fork handler */
6819   __kmp_register_atfork();
6820 #endif
6821 
6822 #if !KMP_DYNAMIC_LIB
6823   {
6824     /* Invoke the exit handler when the program finishes, only for static
6825        library. For dynamic library, we already have _fini and DllMain. */
6826     int rc = atexit(__kmp_internal_end_atexit);
6827     if (rc != 0) {
6828       __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6829                   __kmp_msg_null);
6830     }
6831   }
6832 #endif
6833 
6834 #if KMP_HANDLE_SIGNALS
6835 #if KMP_OS_UNIX
6836   /* NOTE: make sure that this is called before the user installs their own
     signal handlers so that the user handlers are called first. This way they
6838      can return false, not call our handler, avoid terminating the library, and
6839      continue execution where they left off. */
6840   __kmp_install_signals(FALSE);
6841 #endif /* KMP_OS_UNIX */
6842 #if KMP_OS_WINDOWS
6843   __kmp_install_signals(TRUE);
6844 #endif /* KMP_OS_WINDOWS */
6845 #endif
6846 
6847   /* we have finished the serial initialization */
6848   __kmp_init_counter++;
6849 
6850   __kmp_init_serial = TRUE;
6851 
6852   if (__kmp_settings) {
6853     __kmp_env_print();
6854   }
6855 
6856   if (__kmp_display_env || __kmp_display_env_verbose) {
6857     __kmp_env_print_2();
6858   }
6859 
6860 #if OMPT_SUPPORT
6861   ompt_post_init();
6862 #endif
6863 
6864   KMP_MB();
6865 
6866   KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
6867 }
6868 
6869 void __kmp_serial_initialize(void) {
6870   if (__kmp_init_serial) {
6871     return;
6872   }
6873   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6874   if (__kmp_init_serial) {
6875     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6876     return;
6877   }
6878   __kmp_do_serial_initialize();
6879   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6880 }
6881 
6882 static void __kmp_do_middle_initialize(void) {
6883   int i, j;
6884   int prev_dflt_team_nth;
6885 
6886   if (!__kmp_init_serial) {
6887     __kmp_do_serial_initialize();
6888   }
6889 
  KA_TRACE(10, ("__kmp_do_middle_initialize: enter\n"));
6891 
6892   // Save the previous value for the __kmp_dflt_team_nth so that
6893   // we can avoid some reinitialization if it hasn't changed.
6894   prev_dflt_team_nth = __kmp_dflt_team_nth;
6895 
6896 #if KMP_AFFINITY_SUPPORTED
6897   // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6898   // number of cores on the machine.
6899   __kmp_affinity_initialize();
6900 
6901   // Run through the __kmp_threads array and set the affinity mask
6902   // for each root thread that is currently registered with the RTL.
6903   for (i = 0; i < __kmp_threads_capacity; i++) {
6904     if (TCR_PTR(__kmp_threads[i]) != NULL) {
6905       __kmp_affinity_set_init_mask(i, TRUE);
6906     }
6907   }
6908 #endif /* KMP_AFFINITY_SUPPORTED */
6909 
6910   KMP_ASSERT(__kmp_xproc > 0);
6911   if (__kmp_avail_proc == 0) {
6912     __kmp_avail_proc = __kmp_xproc;
6913   }
6914 
6915   // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
6916   // correct them now
6917   j = 0;
6918   while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
6919     __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
6920         __kmp_avail_proc;
6921     j++;
6922   }
6923 
6924   if (__kmp_dflt_team_nth == 0) {
6925 #ifdef KMP_DFLT_NTH_CORES
6926     // Default #threads = #cores
6927     __kmp_dflt_team_nth = __kmp_ncores;
6928     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6929                   "__kmp_ncores (%d)\n",
6930                   __kmp_dflt_team_nth));
6931 #else
6932     // Default #threads = #available OS procs
6933     __kmp_dflt_team_nth = __kmp_avail_proc;
6934     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6935                   "__kmp_avail_proc(%d)\n",
6936                   __kmp_dflt_team_nth));
6937 #endif /* KMP_DFLT_NTH_CORES */
6938   }
6939 
6940   if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
6941     __kmp_dflt_team_nth = KMP_MIN_NTH;
6942   }
6943   if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
6944     __kmp_dflt_team_nth = __kmp_sys_max_nth;
6945   }
6946 
6947   // There's no harm in continuing if the following check fails,
6948   // but it indicates an error in the previous logic.
6949   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
6950 
6951   if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
6952     // Run through the __kmp_threads array and set the num threads icv for each
6953     // root thread that is currently registered with the RTL (which has not
6954     // already explicitly set its nthreads-var with a call to
6955     // omp_set_num_threads()).
6956     for (i = 0; i < __kmp_threads_capacity; i++) {
6957       kmp_info_t *thread = __kmp_threads[i];
6958       if (thread == NULL)
6959         continue;
6960       if (thread->th.th_current_task->td_icvs.nproc != 0)
6961         continue;
6962 
6963       set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
6964     }
6965   }
6966   KA_TRACE(
6967       20,
6968       ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
6969        __kmp_dflt_team_nth));
6970 
6971 #ifdef KMP_ADJUST_BLOCKTIME
6972   /* Adjust blocktime to zero if necessary  now that __kmp_avail_proc is set */
6973   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6974     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6975     if (__kmp_nth > __kmp_avail_proc) {
6976       __kmp_zero_bt = TRUE;
6977     }
6978   }
6979 #endif /* KMP_ADJUST_BLOCKTIME */
6980 
6981   /* we have finished middle initialization */
6982   TCW_SYNC_4(__kmp_init_middle, TRUE);
6983 
6984   KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
6985 }
6986 
6987 void __kmp_middle_initialize(void) {
6988   if (__kmp_init_middle) {
6989     return;
6990   }
6991   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6992   if (__kmp_init_middle) {
6993     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6994     return;
6995   }
6996   __kmp_do_middle_initialize();
6997   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6998 }
6999 
7000 void __kmp_parallel_initialize(void) {
7001   int gtid = __kmp_entry_gtid(); // this might be a new root
7002 
7003   /* synchronize parallel initialization (for sibling) */
7004   if (TCR_4(__kmp_init_parallel))
7005     return;
7006   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7007   if (TCR_4(__kmp_init_parallel)) {
7008     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7009     return;
7010   }
7011 
7012   /* TODO reinitialization after we have already shut down */
7013   if (TCR_4(__kmp_global.g.g_done)) {
7014     KA_TRACE(
7015         10,
7016         ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7017     __kmp_infinite_loop();
7018   }
7019 
7020   /* jc: The lock __kmp_initz_lock is already held, so calling
7021      __kmp_serial_initialize would cause a deadlock.  So we call
7022      __kmp_do_serial_initialize directly. */
7023   if (!__kmp_init_middle) {
7024     __kmp_do_middle_initialize();
7025   }
7026   __kmp_resume_if_hard_paused();
7027 
7028   /* begin initialization */
7029   KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7030   KMP_ASSERT(KMP_UBER_GTID(gtid));
7031 
7032 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7033   // Save the FP control regs.
7034   // Worker threads will set theirs to these values at thread startup.
7035   __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7036   __kmp_store_mxcsr(&__kmp_init_mxcsr);
7037   __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7038 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7039 
7040 #if KMP_OS_UNIX
7041 #if KMP_HANDLE_SIGNALS
7042   /*  must be after __kmp_serial_initialize  */
7043   __kmp_install_signals(TRUE);
7044 #endif
7045 #endif
7046 
7047   __kmp_suspend_initialize();
7048 
7049 #if defined(USE_LOAD_BALANCE)
7050   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7051     __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7052   }
7053 #else
7054   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7055     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7056   }
7057 #endif
7058 
7059   if (__kmp_version) {
7060     __kmp_print_version_2();
7061   }
7062 
7063   /* we have finished parallel initialization */
7064   TCW_SYNC_4(__kmp_init_parallel, TRUE);
7065 
7066   KMP_MB();
7067   KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7068 
7069   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7070 }
7071 
7072 /* ------------------------------------------------------------------------ */
7073 
7074 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7075                                    kmp_team_t *team) {
7076   kmp_disp_t *dispatch;
7077 
7078   KMP_MB();
7079 
7080   /* none of the threads have encountered any constructs, yet. */
7081   this_thr->th.th_local.this_construct = 0;
7082 #if KMP_CACHE_MANAGE
7083   KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7084 #endif /* KMP_CACHE_MANAGE */
7085   dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7086   KMP_DEBUG_ASSERT(dispatch);
7087   KMP_DEBUG_ASSERT(team->t.t_dispatch);
7088   // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7089   // this_thr->th.th_info.ds.ds_tid ] );
7090 
7091   dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7092   dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7093   if (__kmp_env_consistency_check)
7094     __kmp_push_parallel(gtid, team->t.t_ident);
7095 
7096   KMP_MB(); /* Flush all pending memory write invalidates.  */
7097 }
7098 
7099 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7100                                   kmp_team_t *team) {
7101   if (__kmp_env_consistency_check)
7102     __kmp_pop_parallel(gtid, team->t.t_ident);
7103 
7104   __kmp_finish_implicit_task(this_thr);
7105 }
7106 
7107 int __kmp_invoke_task_func(int gtid) {
7108   int rc;
7109   int tid = __kmp_tid_from_gtid(gtid);
7110   kmp_info_t *this_thr = __kmp_threads[gtid];
7111   kmp_team_t *team = this_thr->th.th_team;
7112 
7113   __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7114 #if USE_ITT_BUILD
7115   if (__itt_stack_caller_create_ptr) {
7116     __kmp_itt_stack_callee_enter(
7117         (__itt_caller)
7118             team->t.t_stack_id); // inform ittnotify about entering user's code
7119   }
7120 #endif /* USE_ITT_BUILD */
7121 #if INCLUDE_SSC_MARKS
7122   SSC_MARK_INVOKING();
7123 #endif
7124 
7125 #if OMPT_SUPPORT
7126   void *dummy;
7127   void **exit_frame_p;
7128   ompt_data_t *my_task_data;
7129   ompt_data_t *my_parallel_data;
7130   int ompt_team_size;
7131 
7132   if (ompt_enabled.enabled) {
    exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
                         .ompt_task_info.frame.exit_frame.ptr);
7135   } else {
7136     exit_frame_p = &dummy;
7137   }
7138 
7139   my_task_data =
7140       &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7141   my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7142   if (ompt_enabled.ompt_callback_implicit_task) {
7143     ompt_team_size = team->t.t_nproc;
7144     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7145         ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7146         __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7147     OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7148   }
7149 #endif
7150 
7151 #if KMP_STATS_ENABLED
7152   stats_state_e previous_state = KMP_GET_THREAD_STATE();
7153   if (previous_state == stats_state_e::TEAMS_REGION) {
7154     KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7155   } else {
7156     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7157   }
7158   KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7159 #endif
7160 
7161   rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7162                               tid, (int)team->t.t_argc, (void **)team->t.t_argv
7163 #if OMPT_SUPPORT
7164                               ,
7165                               exit_frame_p
7166 #endif
7167                               );
7168 #if OMPT_SUPPORT
7169   *exit_frame_p = NULL;
  this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7171 #endif
7172 
7173 #if KMP_STATS_ENABLED
7174   if (previous_state == stats_state_e::TEAMS_REGION) {
7175     KMP_SET_THREAD_STATE(previous_state);
7176   }
7177   KMP_POP_PARTITIONED_TIMER();
7178 #endif
7179 
7180 #if USE_ITT_BUILD
7181   if (__itt_stack_caller_create_ptr) {
7182     __kmp_itt_stack_callee_leave(
7183         (__itt_caller)
7184             team->t.t_stack_id); // inform ittnotify about leaving user's code
7185   }
7186 #endif /* USE_ITT_BUILD */
7187   __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7188 
7189   return rc;
7190 }
7191 
7192 void __kmp_teams_master(int gtid) {
  // This routine is called by all master threads in the teams construct
7194   kmp_info_t *thr = __kmp_threads[gtid];
7195   kmp_team_t *team = thr->th.th_team;
7196   ident_t *loc = team->t.t_ident;
7197   thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7198   KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7199   KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7200   KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7201                 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7202 
7203   // This thread is a new CG root.  Set up the proper variables.
7204   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7205   tmp->cg_root = thr; // Make thr the CG root
7206   // Init to thread limit that was stored when league masters were forked
7207   tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7208   tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7209   KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7210                  " cg_nthreads to 1\n",
7211                  thr, tmp));
7212   tmp->up = thr->th.th_cg_roots;
7213   thr->th.th_cg_roots = tmp;
7214 
// Launch the league of teams now, but do not let the workers execute
// (they hang on the fork barrier until the next parallel region)
7217 #if INCLUDE_SSC_MARKS
7218   SSC_MARK_FORKING();
7219 #endif
7220   __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7221                   (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7222                   VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7223 #if INCLUDE_SSC_MARKS
7224   SSC_MARK_JOINING();
7225 #endif
7226   // If the team size was reduced from the limit, set it to the new size
7227   if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7228     thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7229   // AC: last parameter "1" eliminates join barrier which won't work because
7230   // worker threads are in a fork barrier waiting for more parallel regions
7231   __kmp_join_call(loc, gtid
7232 #if OMPT_SUPPORT
7233                   ,
7234                   fork_context_intel
7235 #endif
7236                   ,
7237                   1);
7238 }
7239 
7240 int __kmp_invoke_teams_master(int gtid) {
7241   kmp_info_t *this_thr = __kmp_threads[gtid];
7242   kmp_team_t *team = this_thr->th.th_team;
7243 #if KMP_DEBUG
7244   if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7245     KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7246                      (void *)__kmp_teams_master);
7247 #endif
7248   __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7249 #if OMPT_SUPPORT
7250   int tid = __kmp_tid_from_gtid(gtid);
7251   ompt_data_t *task_data =
7252       &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7253   ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7254   if (ompt_enabled.ompt_callback_implicit_task) {
7255     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7256         ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7257         ompt_task_initial);
7258     OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7259   }
7260 #endif
7261   __kmp_teams_master(gtid);
7262 #if OMPT_SUPPORT
7263   this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7264 #endif
7265   __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7266   return 1;
7267 }
7268 
/* This sets the requested number of threads for the next parallel region
   encountered by this team. Since this should be enclosed in the fork/join
   critical section, it avoids race conditions with asymmetrical nested
   parallelism. */
7273 
7274 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7275   kmp_info_t *thr = __kmp_threads[gtid];
7276 
7277   if (num_threads > 0)
7278     thr->th.th_set_nproc = num_threads;
7279 }
7280 
7281 /* this sets the requested number of teams for the teams region and/or
7282    the number of threads for the next parallel region encountered  */
7283 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7284                           int num_threads) {
7285   kmp_info_t *thr = __kmp_threads[gtid];
7286   KMP_DEBUG_ASSERT(num_teams >= 0);
7287   KMP_DEBUG_ASSERT(num_threads >= 0);
7288 
7289   if (num_teams == 0)
7290     num_teams = 1; // default number of teams is 1.
7291   if (num_teams > __kmp_teams_max_nth) { // if too many teams requested?
7292     if (!__kmp_reserve_warn) {
7293       __kmp_reserve_warn = 1;
7294       __kmp_msg(kmp_ms_warning,
7295                 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7296                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7297     }
7298     num_teams = __kmp_teams_max_nth;
7299   }
7300   // Set number of teams (number of threads in the outer "parallel" of the
7301   // teams)
7302   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7303 
7304   // Remember the number of threads for inner parallel regions
7305   if (!TCR_4(__kmp_init_middle))
7306     __kmp_middle_initialize(); // get internal globals calculated
7307   KMP_DEBUG_ASSERT(__kmp_avail_proc);
7308   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7309   if (num_threads == 0) {
7310     num_threads = __kmp_avail_proc / num_teams;
7311     // adjust num_threads w/o warning as it is not user setting
7312     // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7313     // no thread_limit clause specified -  do not change thread-limit-var ICV
7314     if (num_threads > __kmp_dflt_team_nth) {
7315       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7316     }
7317     if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7318       num_threads = thr->th.th_current_task->td_icvs.thread_limit;
    } // prevent the team size from exceeding thread-limit-var
7320     if (num_teams * num_threads > __kmp_teams_max_nth) {
7321       num_threads = __kmp_teams_max_nth / num_teams;
7322     }
7323   } else {
7324     // This thread will be the master of the league masters
7325     // Store new thread limit; old limit is saved in th_cg_roots list
7326     thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7327     // num_threads = min(num_threads, nthreads-var)
7328     if (num_threads > __kmp_dflt_team_nth) {
7329       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7330     }
7331     if (num_teams * num_threads > __kmp_teams_max_nth) {
7332       int new_threads = __kmp_teams_max_nth / num_teams;
      if (!__kmp_reserve_warn) { // user asked for more threads than
        __kmp_reserve_warn = 1; // KMP_TEAMS_THREAD_LIMIT allows
7335         __kmp_msg(kmp_ms_warning,
7336                   KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7337                   KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7338       }
7339       num_threads = new_threads;
7340     }
7341   }
7342   thr->th.th_teams_size.nth = num_threads;
7343 }
7344 
7345 // Set the proc_bind var to use in the following parallel region.
7346 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7347   kmp_info_t *thr = __kmp_threads[gtid];
7348   thr->th.th_set_proc_bind = proc_bind;
7349 }
7350 
7351 /* Launch the worker threads into the microtask. */
7352 
7353 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7354   kmp_info_t *this_thr = __kmp_threads[gtid];
7355 
7356 #ifdef KMP_DEBUG
7357   int f;
7358 #endif /* KMP_DEBUG */
7359 
7360   KMP_DEBUG_ASSERT(team);
7361   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7362   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7363   KMP_MB(); /* Flush all pending memory write invalidates.  */
7364 
7365   team->t.t_construct = 0; /* no single directives seen yet */
7366   team->t.t_ordered.dt.t_value =
7367       0; /* thread 0 enters the ordered section first */
7368 
7369   /* Reset the identifiers on the dispatch buffer */
7370   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7371   if (team->t.t_max_nproc > 1) {
7372     int i;
7373     for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7374       team->t.t_disp_buffer[i].buffer_index = i;
7375       team->t.t_disp_buffer[i].doacross_buf_idx = i;
7376     }
7377   } else {
7378     team->t.t_disp_buffer[0].buffer_index = 0;
7379     team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7380   }
7381 
7382   KMP_MB(); /* Flush all pending memory write invalidates.  */
7383   KMP_ASSERT(this_thr->th.th_team == team);
7384 
7385 #ifdef KMP_DEBUG
7386   for (f = 0; f < team->t.t_nproc; f++) {
7387     KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7388                      team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7389   }
7390 #endif /* KMP_DEBUG */
7391 
7392   /* release the worker threads so they may begin working */
7393   __kmp_fork_barrier(gtid, 0);
7394 }
7395 
7396 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7397   kmp_info_t *this_thr = __kmp_threads[gtid];
7398 
7399   KMP_DEBUG_ASSERT(team);
7400   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7401   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7402   KMP_MB(); /* Flush all pending memory write invalidates.  */
7403 
7404 /* Join barrier after fork */
7405 
7406 #ifdef KMP_DEBUG
7407   if (__kmp_threads[gtid] &&
7408       __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7409     __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7410                  __kmp_threads[gtid]);
7411     __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7412                  "team->t.t_nproc=%d\n",
7413                  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7414                  team->t.t_nproc);
7415     __kmp_print_structure();
7416   }
7417   KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7418                    __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7419 #endif /* KMP_DEBUG */
7420 
7421   __kmp_join_barrier(gtid); /* wait for everyone */
7422 #if OMPT_SUPPORT
7423   if (ompt_enabled.enabled &&
7424       this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7425     int ds_tid = this_thr->th.th_info.ds.ds_tid;
7426     ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7427     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7428 #if OMPT_OPTIONAL
7429     void *codeptr = NULL;
7430     if (KMP_MASTER_TID(ds_tid) &&
7431         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7432          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7433       codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7434 
7435     if (ompt_enabled.ompt_callback_sync_region_wait) {
7436       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7437           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7438           codeptr);
7439     }
7440     if (ompt_enabled.ompt_callback_sync_region) {
7441       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7442           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7443           codeptr);
7444     }
7445 #endif
7446     if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7447       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_end, NULL, task_data, 0, ds_tid,
          ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7449     }
7450   }
7451 #endif
7452 
7453   KMP_MB(); /* Flush all pending memory write invalidates.  */
7454   KMP_ASSERT(this_thr->th.th_team == team);
7455 }
7456 
7457 /* ------------------------------------------------------------------------ */
7458 
7459 #ifdef USE_LOAD_BALANCE
7460 
// Return the number of worker threads actively spinning in the hot team, if
// we are at the outermost level of parallelism.  Otherwise, return 0.
7463 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7464   int i;
7465   int retval;
7466   kmp_team_t *hot_team;
7467 
7468   if (root->r.r_active) {
7469     return 0;
7470   }
7471   hot_team = root->r.r_hot_team;
7472   if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7473     return hot_team->t.t_nproc - 1; // Don't count master thread
7474   }
7475 
7476   // Skip the master thread - it is accounted for elsewhere.
7477   retval = 0;
7478   for (i = 1; i < hot_team->t.t_nproc; i++) {
7479     if (hot_team->t.t_threads[i]->th.th_active) {
7480       retval++;
7481     }
7482   }
7483   return retval;
7484 }
7485 
7486 // Perform an automatic adjustment to the number of
7487 // threads used by the next parallel region.
7488 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7489   int retval;
7490   int pool_active;
7491   int hot_team_active;
7492   int team_curr_active;
7493   int system_active;
7494 
7495   KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7496                 set_nproc));
7497   KMP_DEBUG_ASSERT(root);
7498   KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7499                        ->th.th_current_task->td_icvs.dynamic == TRUE);
7500   KMP_DEBUG_ASSERT(set_nproc > 1);
7501 
7502   if (set_nproc == 1) {
7503     KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7504     return 1;
7505   }
7506 
7507   // Threads that are active in the thread pool, active in the hot team for this
7508   // particular root (if we are at the outer par level), and the currently
7509   // executing thread (to become the master) are available to add to the new
7510   // team, but are currently contributing to the system load, and must be
7511   // accounted for.
7512   pool_active = __kmp_thread_pool_active_nth;
7513   hot_team_active = __kmp_active_hot_team_nproc(root);
7514   team_curr_active = pool_active + hot_team_active + 1;
7515 
7516   // Check the system load.
7517   system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7518   KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7519                 "hot team active = %d\n",
7520                 system_active, pool_active, hot_team_active));
7521 
7522   if (system_active < 0) {
7523     // There was an error reading the necessary info from /proc, so use the
7524     // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7525     // = dynamic_thread_limit, we shouldn't wind up getting back here.
7526     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7527     KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7528 
7529     // Make this call behave like the thread limit algorithm.
7530     retval = __kmp_avail_proc - __kmp_nth +
7531              (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7532     if (retval > set_nproc) {
7533       retval = set_nproc;
7534     }
7535     if (retval < KMP_MIN_NTH) {
7536       retval = KMP_MIN_NTH;
7537     }
7538 
7539     KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7540                   retval));
7541     return retval;
7542   }
7543 
  // There is a slight delay in the load balance algorithm in detecting new
  // running procs. The real system load at this instant should be at least as
  // large as the number of active OMP threads that are available to add to
  // the team.
7547   if (system_active < team_curr_active) {
7548     system_active = team_curr_active;
7549   }
7550   retval = __kmp_avail_proc - system_active + team_curr_active;
7551   if (retval > set_nproc) {
7552     retval = set_nproc;
7553   }
7554   if (retval < KMP_MIN_NTH) {
7555     retval = KMP_MIN_NTH;
7556   }
7557 
7558   KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7559   return retval;
7560 } // __kmp_load_balance_nproc()
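
/* Worked example (hypothetical numbers): with __kmp_avail_proc == 16,
   pool_active == 2, hot_team_active == 3, and a measured system_active == 10,
   team_curr_active = 2 + 3 + 1 = 6 and retval = 16 - 10 + 6 = 12; the result
   is then clamped into [KMP_MIN_NTH, set_nproc]. */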
7561 
7562 #endif /* USE_LOAD_BALANCE */
7563 
7564 /* ------------------------------------------------------------------------ */
7565 
7566 /* NOTE: this is called with the __kmp_init_lock held */
7567 void __kmp_cleanup(void) {
7568   int f;
7569 
7570   KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7571 
7572   if (TCR_4(__kmp_init_parallel)) {
7573 #if KMP_HANDLE_SIGNALS
7574     __kmp_remove_signals();
7575 #endif
7576     TCW_4(__kmp_init_parallel, FALSE);
7577   }
7578 
7579   if (TCR_4(__kmp_init_middle)) {
7580 #if KMP_AFFINITY_SUPPORTED
7581     __kmp_affinity_uninitialize();
7582 #endif /* KMP_AFFINITY_SUPPORTED */
7583     __kmp_cleanup_hierarchy();
7584     TCW_4(__kmp_init_middle, FALSE);
7585   }
7586 
7587   KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7588 
7589   if (__kmp_init_serial) {
7590     __kmp_runtime_destroy();
7591     __kmp_init_serial = FALSE;
7592   }
7593 
7594   __kmp_cleanup_threadprivate_caches();
7595 
7596   for (f = 0; f < __kmp_threads_capacity; f++) {
7597     if (__kmp_root[f] != NULL) {
7598       __kmp_free(__kmp_root[f]);
7599       __kmp_root[f] = NULL;
7600     }
7601   }
7602   __kmp_free(__kmp_threads);
  // __kmp_threads and __kmp_root were allocated at once, as a single block, so
  // there is no need to free __kmp_root separately.
7605   __kmp_threads = NULL;
7606   __kmp_root = NULL;
7607   __kmp_threads_capacity = 0;
7608 
7609 #if KMP_USE_DYNAMIC_LOCK
7610   __kmp_cleanup_indirect_user_locks();
7611 #else
7612   __kmp_cleanup_user_locks();
7613 #endif
7614 
7615 #if KMP_AFFINITY_SUPPORTED
7616   KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7617   __kmp_cpuinfo_file = NULL;
7618 #endif /* KMP_AFFINITY_SUPPORTED */
7619 
7620 #if KMP_USE_ADAPTIVE_LOCKS
7621 #if KMP_DEBUG_ADAPTIVE_LOCKS
7622   __kmp_print_speculative_stats();
7623 #endif
7624 #endif
7625   KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7626   __kmp_nested_nth.nth = NULL;
7627   __kmp_nested_nth.size = 0;
7628   __kmp_nested_nth.used = 0;
7629   KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7630   __kmp_nested_proc_bind.bind_types = NULL;
7631   __kmp_nested_proc_bind.size = 0;
7632   __kmp_nested_proc_bind.used = 0;
7633   if (__kmp_affinity_format) {
7634     KMP_INTERNAL_FREE(__kmp_affinity_format);
7635     __kmp_affinity_format = NULL;
7636   }
7637 
7638   __kmp_i18n_catclose();
7639 
7640 #if KMP_USE_HIER_SCHED
7641   __kmp_hier_scheds.deallocate();
7642 #endif
7643 
7644 #if KMP_STATS_ENABLED
7645   __kmp_stats_fini();
7646 #endif
7647 
7648   KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7649 }
7650 
7651 /* ------------------------------------------------------------------------ */
7652 
7653 int __kmp_ignore_mppbeg(void) {
7654   char *env;
7655 
7656   if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7657     if (__kmp_str_match_false(env))
7658       return FALSE;
7659   }
  // By default __kmpc_begin() is a no-op.
7661   return TRUE;
7662 }
7663 
7664 int __kmp_ignore_mppend(void) {
7665   char *env;
7666 
7667   if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7668     if (__kmp_str_match_false(env))
7669       return FALSE;
7670   }
  // By default __kmpc_end() is a no-op.
7672   return TRUE;
7673 }
7674 
7675 void __kmp_internal_begin(void) {
7676   int gtid;
7677   kmp_root_t *root;
7678 
7679   /* this is a very important step as it will register new sibling threads
7680      and assign these new uber threads a new gtid */
7681   gtid = __kmp_entry_gtid();
7682   root = __kmp_threads[gtid]->th.th_root;
7683   KMP_ASSERT(KMP_UBER_GTID(gtid));
7684 
7685   if (root->r.r_begin)
7686     return;
7687   __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7688   if (root->r.r_begin) {
7689     __kmp_release_lock(&root->r.r_begin_lock, gtid);
7690     return;
7691   }
7692 
7693   root->r.r_begin = TRUE;
7694 
7695   __kmp_release_lock(&root->r.r_begin_lock, gtid);
7696 }
7697 
7698 /* ------------------------------------------------------------------------ */
7699 
7700 void __kmp_user_set_library(enum library_type arg) {
7701   int gtid;
7702   kmp_root_t *root;
7703   kmp_info_t *thread;
7704 
7705   /* first, make sure we are initialized so we can get our gtid */
7706 
7707   gtid = __kmp_entry_gtid();
7708   thread = __kmp_threads[gtid];
7709 
7710   root = thread->th.th_root;
7711 
7712   KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7713                 library_serial));
7714   if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7715                                   thread */
7716     KMP_WARNING(SetLibraryIncorrectCall);
7717     return;
7718   }
7719 
7720   switch (arg) {
7721   case library_serial:
7722     thread->th.th_set_nproc = 0;
7723     set__nproc(thread, 1);
7724     break;
7725   case library_turnaround:
7726     thread->th.th_set_nproc = 0;
7727     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7728                                            : __kmp_dflt_team_nth_ub);
7729     break;
7730   case library_throughput:
7731     thread->th.th_set_nproc = 0;
7732     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7733                                            : __kmp_dflt_team_nth_ub);
7734     break;
7735   default:
7736     KMP_FATAL(UnknownLibraryType, arg);
7737   }
7738 
7739   __kmp_aux_set_library(arg);
7740 }
7741 
7742 void __kmp_aux_set_stacksize(size_t arg) {
7743   if (!__kmp_init_serial)
7744     __kmp_serial_initialize();
7745 
7746 #if KMP_OS_DARWIN
7747   if (arg & (0x1000 - 1)) {
7748     arg &= ~(0x1000 - 1);
7749     if (arg + 0x1000) /* check for overflow if we round up */
7750       arg += 0x1000;
7751   }
7752 #endif
7753   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7754 
7755   /* only change the default stacksize before the first parallel region */
7756   if (!TCR_4(__kmp_init_parallel)) {
7757     size_t value = arg; /* argument is in bytes */
7758 
7759     if (value < __kmp_sys_min_stksize)
7760       value = __kmp_sys_min_stksize;
7761     else if (value > KMP_MAX_STKSIZE)
7762       value = KMP_MAX_STKSIZE;
7763 
7764     __kmp_stksize = value;
7765 
7766     __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
7767   }
7768 
7769   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7770 }
7771 
7772 /* set the behaviour of the runtime library */
7773 /* TODO this can cause some odd behaviour with sibling parallelism... */
7774 void __kmp_aux_set_library(enum library_type arg) {
7775   __kmp_library = arg;
7776 
7777   switch (__kmp_library) {
7778   case library_serial: {
7779     KMP_INFORM(LibraryIsSerial);
7780   } break;
7781   case library_turnaround:
7782     if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
7783       __kmp_use_yield = 2; // only yield when oversubscribed
7784     break;
7785   case library_throughput:
7786     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
7787       __kmp_dflt_blocktime = 200;
7788     break;
7789   default:
7790     KMP_FATAL(UnknownLibraryType, arg);
7791   }
7792 }
7793 
7794 /* Getting team information common for all team API */
7795 // Returns NULL if not in teams construct
7796 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
7797   kmp_info_t *thr = __kmp_entry_thread();
7798   teams_serialized = 0;
7799   if (thr->th.th_teams_microtask) {
7800     kmp_team_t *team = thr->th.th_team;
7801     int tlevel = thr->th.th_teams_level; // the level of the teams construct
7802     int ii = team->t.t_level;
7803     teams_serialized = team->t.t_serialized;
7804     int level = tlevel + 1;
7805     KMP_DEBUG_ASSERT(ii >= tlevel);
7806     while (ii > level) {
7807       for (teams_serialized = team->t.t_serialized;
7808            (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
7809       }
7810       if (team->t.t_serialized && (!teams_serialized)) {
7811         team = team->t.t_parent;
7812         continue;
7813       }
7814       if (ii > level) {
7815         team = team->t.t_parent;
7816         ii--;
7817       }
7818     }
7819     return team;
7820   }
7821   return NULL;
7822 }
7823 
7824 int __kmp_aux_get_team_num() {
7825   int serialized;
7826   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7827   if (team) {
7828     if (serialized > 1) {
      return 0; // teams region is serialized (1 team of 1 thread).
7830     } else {
7831       return team->t.t_master_tid;
7832     }
7833   }
7834   return 0;
7835 }
7836 
7837 int __kmp_aux_get_num_teams() {
7838   int serialized;
7839   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7840   if (team) {
7841     if (serialized > 1) {
7842       return 1;
7843     } else {
7844       return team->t.t_parent->t.t_nproc;
7845     }
7846   }
7847   return 1;
7848 }
7849 
7850 /* ------------------------------------------------------------------------ */
7851 
7852 /*
7853  * Affinity Format Parser
7854  *
7855  * Field is in form of: %[[[0].]size]type
7856  * % and type are required (%% means print a literal '%')
7857  * type is either single char or long name surrounded by {},
7858  * e.g., N or {num_threads}
7859  * 0 => leading zeros
7860  * . => right justified when size is specified
7861  * by default output is left justified
7862  * size is the *minimum* field length
7863  * All other characters are printed as is
7864  *
7865  * Available field types:
 * t {team_num}          - omp_get_team_num()
 * T {num_teams}         - omp_get_num_teams()
 * L {nesting_level}     - omp_get_level()
 * n {thread_num}        - omp_get_thread_num()
 * N {num_threads}       - omp_get_num_threads()
 * a {ancestor_tnum}     - omp_get_ancestor_thread_num(omp_get_level()-1)
 * H {host}              - name of host machine
 * P {process_id}        - process id (integer)
 * i {native_thread_id}  - native thread identifier (integer)
 * A {thread_affinity}   - comma separated list of integers or integer ranges
 *                         (values of affinity mask)
7875  *
7876  * Implementation-specific field types can be added
7877  * If a type is unknown, print "undefined"
7878 */
7879 
7880 // Structure holding the short name, long name, and corresponding data type
7881 // for snprintf.  A table of these will represent the entire valid keyword
7882 // field types.
7883 typedef struct kmp_affinity_format_field_t {
7884   char short_name; // from spec e.g., L -> thread level
7885   const char *long_name; // from spec thread_level -> thread level
7886   char field_format; // data type for snprintf (typically 'd' or 's'
7887   // for integer or string)
7888 } kmp_affinity_format_field_t;
7889 
7890 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
7891 #if KMP_AFFINITY_SUPPORTED
7892     {'A', "thread_affinity", 's'},
7893 #endif
7894     {'t', "team_num", 'd'},
7895     {'T', "num_teams", 'd'},
7896     {'L', "nesting_level", 'd'},
7897     {'n', "thread_num", 'd'},
7898     {'N', "num_threads", 'd'},
7899     {'a', "ancestor_tnum", 'd'},
7900     {'H', "host", 's'},
7901     {'P', "process_id", 'd'},
7902     {'i', "native_thread_id", 'd'}};
7903 
// Return the number of characters needed to hold the field
7905 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
7906                                             const char **ptr,
7907                                             kmp_str_buf_t *field_buffer) {
7908   int rc, format_index, field_value;
7909   const char *width_left, *width_right;
7910   bool pad_zeros, right_justify, parse_long_name, found_valid_name;
7911   static const int FORMAT_SIZE = 20;
7912   char format[FORMAT_SIZE] = {0};
7913   char absolute_short_name = 0;
7914 
7915   KMP_DEBUG_ASSERT(gtid >= 0);
7916   KMP_DEBUG_ASSERT(th);
7917   KMP_DEBUG_ASSERT(**ptr == '%');
7918   KMP_DEBUG_ASSERT(field_buffer);
7919 
7920   __kmp_str_buf_clear(field_buffer);
7921 
7922   // Skip the initial %
7923   (*ptr)++;
7924 
7925   // Check for %% first
7926   if (**ptr == '%') {
7927     __kmp_str_buf_cat(field_buffer, "%", 1);
7928     (*ptr)++; // skip over the second %
7929     return 1;
7930   }
7931 
7932   // Parse field modifiers if they are present
7933   pad_zeros = false;
7934   if (**ptr == '0') {
7935     pad_zeros = true;
7936     (*ptr)++; // skip over 0
7937   }
7938   right_justify = false;
7939   if (**ptr == '.') {
7940     right_justify = true;
7941     (*ptr)++; // skip over .
7942   }
7943   // Parse width of field: [width_left, width_right)
7944   width_left = width_right = NULL;
7945   if (**ptr >= '0' && **ptr <= '9') {
7946     width_left = *ptr;
7947     SKIP_DIGITS(*ptr);
7948     width_right = *ptr;
7949   }
7950 
7951   // Create the format for KMP_SNPRINTF based on flags parsed above
7952   format_index = 0;
7953   format[format_index++] = '%';
7954   if (!right_justify)
7955     format[format_index++] = '-';
7956   if (pad_zeros)
7957     format[format_index++] = '0';
7958   if (width_left && width_right) {
7959     int i = 0;
    // Only allow field widths of up to 8 digits.
    // This also prevents overflowing the format buffer.
7962     while (i < 8 && width_left < width_right) {
7963       format[format_index++] = *width_left;
7964       width_left++;
7965       i++;
7966     }
7967   }
7968 
7969   // Parse a name (long or short)
7970   // Canonicalize the name into absolute_short_name
7971   found_valid_name = false;
7972   parse_long_name = (**ptr == '{');
7973   if (parse_long_name)
7974     (*ptr)++; // skip initial left brace
7975   for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
7976                              sizeof(__kmp_affinity_format_table[0]);
7977        ++i) {
7978     char short_name = __kmp_affinity_format_table[i].short_name;
7979     const char *long_name = __kmp_affinity_format_table[i].long_name;
7980     char field_format = __kmp_affinity_format_table[i].field_format;
7981     if (parse_long_name) {
7982       int length = KMP_STRLEN(long_name);
7983       if (strncmp(*ptr, long_name, length) == 0) {
7984         found_valid_name = true;
7985         (*ptr) += length; // skip the long name
7986       }
7987     } else if (**ptr == short_name) {
7988       found_valid_name = true;
7989       (*ptr)++; // skip the short name
7990     }
7991     if (found_valid_name) {
7992       format[format_index++] = field_format;
7993       format[format_index++] = '\0';
7994       absolute_short_name = short_name;
7995       break;
7996     }
7997   }
7998   if (parse_long_name) {
7999     if (**ptr != '}') {
8000       absolute_short_name = 0;
8001     } else {
8002       (*ptr)++; // skip over the right brace
8003     }
8004   }
8005 
8006   // Attempt to fill the buffer with the requested
8007   // value using snprintf within __kmp_str_buf_print()
8008   switch (absolute_short_name) {
8009   case 't':
8010     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8011     break;
8012   case 'T':
8013     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8014     break;
8015   case 'L':
8016     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8017     break;
8018   case 'n':
8019     rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8020     break;
8021   case 'H': {
8022     static const int BUFFER_SIZE = 256;
8023     char buf[BUFFER_SIZE];
8024     __kmp_expand_host_name(buf, BUFFER_SIZE);
8025     rc = __kmp_str_buf_print(field_buffer, format, buf);
8026   } break;
8027   case 'P':
8028     rc = __kmp_str_buf_print(field_buffer, format, getpid());
8029     break;
8030   case 'i':
8031     rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8032     break;
8033   case 'N':
8034     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8035     break;
8036   case 'a':
8037     field_value =
8038         __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8039     rc = __kmp_str_buf_print(field_buffer, format, field_value);
8040     break;
8041 #if KMP_AFFINITY_SUPPORTED
8042   case 'A': {
8043     kmp_str_buf_t buf;
8044     __kmp_str_buf_init(&buf);
8045     __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8046     rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8047     __kmp_str_buf_free(&buf);
8048   } break;
8049 #endif
8050   default:
    // According to the spec, if an implementation does not have info for a
    // field type, then "undefined" is printed
8053     rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8054     // Skip the field
8055     if (parse_long_name) {
8056       SKIP_TOKEN(*ptr);
8057       if (**ptr == '}')
8058         (*ptr)++;
8059     } else {
8060       (*ptr)++;
8061     }
8062   }
8063 
8064   KMP_ASSERT(format_index <= FORMAT_SIZE);
8065   return rc;
8066 }
8067 
8068 /*
 * Return the number of characters needed to hold the affinity string
 * (not including the terminating null byte).
 * The resulting string is printed to buffer, which the caller can then
 * handle afterwards.
8073 */
8074 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8075                                   kmp_str_buf_t *buffer) {
8076   const char *parse_ptr;
8077   size_t retval;
8078   const kmp_info_t *th;
8079   kmp_str_buf_t field;
8080 
8081   KMP_DEBUG_ASSERT(buffer);
8082   KMP_DEBUG_ASSERT(gtid >= 0);
8083 
8084   __kmp_str_buf_init(&field);
8085   __kmp_str_buf_clear(buffer);
8086 
8087   th = __kmp_threads[gtid];
8088   retval = 0;
8089 
8090   // If format is NULL or zero-length string, then we use
8091   // affinity-format-var ICV
8092   parse_ptr = format;
8093   if (parse_ptr == NULL || *parse_ptr == '\0') {
8094     parse_ptr = __kmp_affinity_format;
8095   }
8096   KMP_DEBUG_ASSERT(parse_ptr);
8097 
8098   while (*parse_ptr != '\0') {
8099     // Parse a field
8100     if (*parse_ptr == '%') {
8101       // Put field in the buffer
8102       int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8103       __kmp_str_buf_catbuf(buffer, &field);
8104       retval += rc;
8105     } else {
8106       // Put literal character in buffer
8107       __kmp_str_buf_cat(buffer, parse_ptr, 1);
8108       retval++;
8109       parse_ptr++;
8110     }
8111   }
8112   __kmp_str_buf_free(&field);
8113   return retval;
8114 }
8115 
8116 // Displays the affinity string to stdout
8117 void __kmp_aux_display_affinity(int gtid, const char *format) {
8118   kmp_str_buf_t buf;
8119   __kmp_str_buf_init(&buf);
8120   __kmp_aux_capture_affinity(gtid, format, &buf);
8121   __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8122   __kmp_str_buf_free(&buf);
8123 }
8124 
8125 /* ------------------------------------------------------------------------ */
8126 
8127 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8128   int blocktime = arg; /* argument is in milliseconds */
8129 #if KMP_USE_MONITOR
8130   int bt_intervals;
8131 #endif
8132   int bt_set;
8133 
8134   __kmp_save_internal_controls(thread);
8135 
8136   /* Normalize and set blocktime for the teams */
8137   if (blocktime < KMP_MIN_BLOCKTIME)
8138     blocktime = KMP_MIN_BLOCKTIME;
8139   else if (blocktime > KMP_MAX_BLOCKTIME)
8140     blocktime = KMP_MAX_BLOCKTIME;
8141 
8142   set__blocktime_team(thread->th.th_team, tid, blocktime);
8143   set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8144 
8145 #if KMP_USE_MONITOR
8146   /* Calculate and set blocktime intervals for the teams */
8147   bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8148 
8149   set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8150   set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8151 #endif
8152 
  /* Record that blocktime has been explicitly set */
8154   bt_set = TRUE;
8155 
8156   set__bt_set_team(thread->th.th_team, tid, bt_set);
8157   set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8158 #if KMP_USE_MONITOR
8159   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8160                 "bt_intervals=%d, monitor_updates=%d\n",
8161                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8162                 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8163                 __kmp_monitor_wakeups));
8164 #else
8165   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8166                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8167                 thread->th.th_team->t.t_id, tid, blocktime));
8168 #endif
8169 }
8170 
8171 void __kmp_aux_set_defaults(char const *str, int len) {
8172   if (!__kmp_init_serial) {
8173     __kmp_serial_initialize();
8174   }
8175   __kmp_env_initialize(str);
8176 
8177   if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8178     __kmp_env_print();
8179   }
8180 } // __kmp_aux_set_defaults
8181 
8182 /* ------------------------------------------------------------------------ */
8183 /* internal fast reduction routines */
8184 
8185 PACKED_REDUCTION_METHOD_T
8186 __kmp_determine_reduction_method(
8187     ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8188     void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8189     kmp_critical_name *lck) {
8190 
  // Default reduction method: critical construct (lck != NULL, like in
  // current PAROPT).
  // If (reduce_data != NULL && reduce_func != NULL): the tree-reduction
  // method can be selected by the RTL.
  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
  // can be selected by the RTL.
  // Finally, it is up to the OpenMP RTL to decide which method to select
  // among those generated by PAROPT.
8199 
8200   PACKED_REDUCTION_METHOD_T retval;
8201 
8202   int team_size;
8203 
8204   KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8205   KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8206 
8207 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
8208   ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
8209 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8210 
8211   retval = critical_reduce_block;
8212 
  // Another way of getting the team size (with 1 dynamic dereference) is
  // slower.
8214   team_size = __kmp_get_team_num_threads(global_tid);
8215   if (team_size == 1) {
8216 
8217     retval = empty_reduce_block;
8218 
8219   } else {
8220 
8221     int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8222 
8223 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 ||                   \
8224     KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64
8225 
8226 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
8227     KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8228 
8229     int teamsize_cutoff = 4;
8230 
8231 #if KMP_MIC_SUPPORTED
8232     if (__kmp_mic_type != non_mic) {
8233       teamsize_cutoff = 8;
8234     }
8235 #endif
8236     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8237     if (tree_available) {
8238       if (team_size <= teamsize_cutoff) {
8239         if (atomic_available) {
8240           retval = atomic_reduce_block;
8241         }
8242       } else {
8243         retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8244       }
8245     } else if (atomic_available) {
8246       retval = atomic_reduce_block;
8247     }
8248 #else
8249 #error "Unknown or unsupported OS"
8250 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8251        // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8252 
8253 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8254 
8255 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD
8256 
8257     // basic tuning
8258 
8259     if (atomic_available) {
8260       if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8261         retval = atomic_reduce_block;
8262       }
8263     } // otherwise: use critical section
8264 
8265 #elif KMP_OS_DARWIN
8266 
8267     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8268     if (atomic_available && (num_vars <= 3)) {
8269       retval = atomic_reduce_block;
8270     } else if (tree_available) {
8271       if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8272           (reduce_size < (2000 * sizeof(kmp_real64)))) {
8273         retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8274       }
8275     } // otherwise: use critical section
8276 
8277 #else
8278 #error "Unknown or unsupported OS"
8279 #endif
8280 
8281 #else
8282 #error "Unknown or unsupported architecture"
8283 #endif
8284   }
8285 
8286   // KMP_FORCE_REDUCTION
8287 
8288   // If the team is serialized (team_size == 1), ignore the forced reduction
8289   // method and stay with the unsynchronized method (empty_reduce_block)
8290   if (__kmp_force_reduction_method != reduction_method_not_defined &&
8291       team_size != 1) {
8292 
8293     PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8294 
8295     int atomic_available, tree_available;
8296 
8297     switch ((forced_retval = __kmp_force_reduction_method)) {
8298     case critical_reduce_block:
8299       KMP_ASSERT(lck); // lck should be != 0
8300       break;
8301 
8302     case atomic_reduce_block:
8303       atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8304       if (!atomic_available) {
8305         KMP_WARNING(RedMethodNotSupported, "atomic");
8306         forced_retval = critical_reduce_block;
8307       }
8308       break;
8309 
8310     case tree_reduce_block:
8311       tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8312       if (!tree_available) {
8313         KMP_WARNING(RedMethodNotSupported, "tree");
8314         forced_retval = critical_reduce_block;
8315       } else {
8316 #if KMP_FAST_REDUCTION_BARRIER
8317         forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8318 #endif
8319       }
8320       break;
8321 
8322     default:
8323       KMP_ASSERT(0); // "unsupported method specified"
8324     }
8325 
8326     retval = forced_retval;
8327   }
8328 
8329   KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8330 
8331 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
8332 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8333 
8334   return (retval);
8335 }
// This function is for testing the set/get/determine reduce method.
8337 kmp_int32 __kmp_get_reduce_method(void) {
8338   return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8339 }
8340 
8341 // Soft pause sets up threads to ignore blocktime and just go to sleep.
8342 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
8343 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8344 
8345 // Hard pause shuts down the runtime completely.  Resume happens naturally when
8346 // OpenMP is used subsequently.
8347 void __kmp_hard_pause() {
8348   __kmp_pause_status = kmp_hard_paused;
8349   __kmp_internal_end_thread(-1);
8350 }
8351 
8352 // Soft resume sets __kmp_pause_status, and wakes up all threads.
8353 void __kmp_resume_if_soft_paused() {
8354   if (__kmp_pause_status == kmp_soft_paused) {
8355     __kmp_pause_status = kmp_not_paused;
8356 
8357     for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8358       kmp_info_t *thread = __kmp_threads[gtid];
8359       if (thread) { // Wake it if sleeping
8360         kmp_flag_64 fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
8361         if (fl.is_sleeping())
8362           fl.resume(gtid);
8363         else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8364           __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8365         } else { // thread holds the lock and may sleep soon
8366           do { // until either the thread sleeps, or we can get the lock
8367             if (fl.is_sleeping()) {
8368               fl.resume(gtid);
8369               break;
8370             } else if (__kmp_try_suspend_mx(thread)) {
8371               __kmp_unlock_suspend_mx(thread);
8372               break;
8373             }
8374           } while (1);
8375         }
8376       }
8377     }
8378   }
8379 }
8380 
8381 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
8382 // TODO: add warning messages
8383 int __kmp_pause_resource(kmp_pause_status_t level) {
8384   if (level == kmp_not_paused) { // requesting resume
8385     if (__kmp_pause_status == kmp_not_paused) {
8386       // error message about runtime not being paused, so can't resume
8387       return 1;
8388     } else {
8389       KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
8390                        __kmp_pause_status == kmp_hard_paused);
8391       __kmp_pause_status = kmp_not_paused;
8392       return 0;
8393     }
8394   } else if (level == kmp_soft_paused) { // requesting soft pause
8395     if (__kmp_pause_status != kmp_not_paused) {
8396       // error message about already being paused
8397       return 1;
8398     } else {
8399       __kmp_soft_pause();
8400       return 0;
8401     }
8402   } else if (level == kmp_hard_paused) { // requesting hard pause
8403     if (__kmp_pause_status != kmp_not_paused) {
8404       // error message about already being paused
8405       return 1;
8406     } else {
8407       __kmp_hard_pause();
8408       return 0;
8409     }
8410   } else {
8411     // error message about invalid level
8412     return 1;
8413   }
8414 }
8415 
8416 
8417 void __kmp_omp_display_env(int verbose) {
8418   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8419   if (__kmp_init_serial == 0)
8420     __kmp_do_serial_initialize();
8421   __kmp_display_env_impl(!verbose, verbose);
8422   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8423 }
8424