1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
29 #endif
30 
31 #if OMPT_SUPPORT
32 #include "ompt-specific.h"
33 #endif
34 
35 #if OMP_PROFILING_SUPPORT
36 #include "llvm/Support/TimeProfiler.h"
37 static char *ProfileTraceFile = nullptr;
38 #endif
39 
40 /* these are temporary issues to be dealt with */
41 #define KMP_USE_PRCTL 0
42 
43 #if KMP_OS_WINDOWS
44 #include <process.h>
45 #endif
46 
47 #include "tsan_annotations.h"
48 
49 #if KMP_OS_WINDOWS
50 // Windows does not need these include files because it does not use shared memory
51 #else
52 #include <sys/mman.h>
53 #include <sys/stat.h>
54 #include <fcntl.h>
55 #define SHM_SIZE 1024
56 #endif
57 
58 #if defined(KMP_GOMP_COMPAT)
59 char const __kmp_version_alt_comp[] =
60     KMP_VERSION_PREFIX "alternative compiler support: yes";
61 #endif /* defined(KMP_GOMP_COMPAT) */
62 
63 char const __kmp_version_omp_api[] =
64     KMP_VERSION_PREFIX "API version: 5.0 (201611)";
65 
66 #ifdef KMP_DEBUG
67 char const __kmp_version_lock[] =
68     KMP_VERSION_PREFIX "lock type: run time selectable";
69 #endif /* KMP_DEBUG */
70 
71 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
72 
73 /* ------------------------------------------------------------------------ */
74 
75 #if KMP_USE_MONITOR
76 kmp_info_t __kmp_monitor;
77 #endif
78 
79 /* Forward declarations */
80 
81 void __kmp_cleanup(void);
82 
83 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
84                                   int gtid);
85 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
86                                   kmp_internal_control_t *new_icvs,
87                                   ident_t *loc);
88 #if KMP_AFFINITY_SUPPORTED
89 static void __kmp_partition_places(kmp_team_t *team,
90                                    int update_master_only = 0);
91 #endif
92 static void __kmp_do_serial_initialize(void);
93 void __kmp_fork_barrier(int gtid, int tid);
94 void __kmp_join_barrier(int gtid);
95 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
96                           kmp_internal_control_t *new_icvs, ident_t *loc);
97 
98 #ifdef USE_LOAD_BALANCE
99 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
100 #endif
101 
102 static int __kmp_expand_threads(int nNeed);
103 #if KMP_OS_WINDOWS
104 static int __kmp_unregister_root_other_thread(int gtid);
105 #endif
106 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
107 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
108 
109 /* Calculate the identifier of the current thread */
110 /* Fast (and somewhat portable) way to get a unique identifier for the
111    executing thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
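/* Three lookup strategies are used below, fastest first: when __kmp_gtid_mode
   is >= 3 the thread-local __kmp_gtid variable is read directly (KMP_TDATA_GTID),
   when it is >= 2 the keyed-TLS accessor __kmp_gtid_get_specific() is used, and
   otherwise the gtid is recovered by searching the registered threads' recorded
   stack ranges for the current stack address. */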
112 int __kmp_get_global_thread_id() {
113   int i;
114   kmp_info_t **other_threads;
115   size_t stack_data;
116   char *stack_addr;
117   size_t stack_size;
118   char *stack_base;
119 
120   KA_TRACE(
121       1000,
122       ("*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
123        __kmp_nth, __kmp_all_nth));
124 
125   /* JPH - To handle the case where __kmpc_end(0) is called immediately prior
126      to a parallel region, this was made to return KMP_GTID_DNE so the caller is
127      forced to run serial initialization. KMP_GTID_DNE then has to be handled at
128      all call sites, or __kmp_init_gtid must be guaranteed, for this to work. */
129 
130   if (!TCR_4(__kmp_init_gtid))
131     return KMP_GTID_DNE;
132 
133 #ifdef KMP_TDATA_GTID
134   if (TCR_4(__kmp_gtid_mode) >= 3) {
135     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
136     return __kmp_gtid;
137   }
138 #endif
139   if (TCR_4(__kmp_gtid_mode) >= 2) {
140     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
141     return __kmp_gtid_get_specific();
142   }
143   KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
144 
145   stack_addr = (char *)&stack_data;
146   other_threads = __kmp_threads;
147 
148   /* ATT: The code below is a source of potential bugs due to unsynchronized
149      access to __kmp_threads array. For example:
150      1. Current thread loads other_threads[i] to thr and checks it, it is
151         non-NULL.
152      2. Current thread is suspended by OS.
153      3. Another thread unregisters and finishes (debug versions of free()
154         may fill memory with something like 0xEF).
155      4. Current thread is resumed.
156      5. Current thread reads junk from *thr.
157      TODO: Fix it.  --ln  */
158 
159   for (i = 0; i < __kmp_threads_capacity; i++) {
160 
161     kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
162     if (!thr)
163       continue;
164 
165     stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
166     stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
167 
168     /* stack grows down -- search through all of the active threads */
169 
170     if (stack_addr <= stack_base) {
171       size_t stack_diff = stack_base - stack_addr;
172 
173       if (stack_diff <= stack_size) {
174         /* The only way we can be closer than the allocated */
175         /* stack size is if we are running on this thread. */
176         KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
177         return i;
178       }
179     }
180   }
181 
182   /* Fall back to the keyed-TLS get_specific call to try to determine our gtid */
183   KA_TRACE(1000,
184            ("*** __kmp_get_global_thread_id: internal alg. failed to find "
185             "thread, using TLS\n"));
186   i = __kmp_gtid_get_specific();
187 
188   /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
189 
190   /* If we haven't been assigned a gtid, return the error code as-is */
191   if (i < 0)
192     return i;
193 
194   /* dynamically updated stack window for uber threads to avoid get_specific
195      call */
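  /* The recorded stack bounds of a root (uber) thread start as an initial
     guess; if the current stack address falls outside the recorded range, the
     stored ds_stackbase/ds_stacksize are widened below to cover it, so later
     stack-based lookups for this thread succeed without a TLS call. */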
196   if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
197     KMP_FATAL(StackOverflow, i);
198   }
199 
200   stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
201   if (stack_addr > stack_base) {
202     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
203     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
204             other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
205                 stack_base);
206   } else {
207     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
208             stack_base - stack_addr);
209   }
210 
211   /* Reprint stack bounds for ubermaster since they have been refined */
212   if (__kmp_storage_map) {
213     char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
214     char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
215     __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
216                                  other_threads[i]->th.th_info.ds.ds_stacksize,
217                                  "th_%d stack (refinement)", i);
218   }
219   return i;
220 }
221 
222 int __kmp_get_global_thread_id_reg() {
223   int gtid;
224 
225   if (!__kmp_init_serial) {
226     gtid = KMP_GTID_DNE;
227   } else
228 #ifdef KMP_TDATA_GTID
229       if (TCR_4(__kmp_gtid_mode) >= 3) {
230     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
231     gtid = __kmp_gtid;
232   } else
233 #endif
234       if (TCR_4(__kmp_gtid_mode) >= 2) {
235     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
236     gtid = __kmp_gtid_get_specific();
237   } else {
238     KA_TRACE(1000,
239              ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
240     gtid = __kmp_get_global_thread_id();
241   }
242 
243   /* we must be a new uber master sibling thread */
244   if (gtid == KMP_GTID_DNE) {
245     KA_TRACE(10,
246              ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
247               "Registering a new gtid.\n"));
248     __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
249     if (!__kmp_init_serial) {
250       __kmp_do_serial_initialize();
251       gtid = __kmp_gtid_get_specific();
252     } else {
253       gtid = __kmp_register_root(FALSE);
254     }
255     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
256     /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
257   }
258 
259   KMP_DEBUG_ASSERT(gtid >= 0);
260 
261   return gtid;
262 }
263 
264 /* caller must hold forkjoin_lock */
265 void __kmp_check_stack_overlap(kmp_info_t *th) {
266   int f;
267   char *stack_beg = NULL;
268   char *stack_end = NULL;
269   int gtid;
270 
271   KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
272   if (__kmp_storage_map) {
273     stack_end = (char *)th->th.th_info.ds.ds_stackbase;
274     stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
275 
276     gtid = __kmp_gtid_from_thread(th);
277 
278     if (gtid == KMP_GTID_MONITOR) {
279       __kmp_print_storage_map_gtid(
280           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
281           "th_%s stack (%s)", "mon",
282           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
283     } else {
284       __kmp_print_storage_map_gtid(
285           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
286           "th_%d stack (%s)", gtid,
287           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
288     }
289   }
290 
291   /* No point in checking ubermaster threads since they use refinement and
292    * cannot overlap */
293   gtid = __kmp_gtid_from_thread(th);
294   if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
295     KA_TRACE(10,
296              ("__kmp_check_stack_overlap: performing extensive checking\n"));
297     if (stack_beg == NULL) {
298       stack_end = (char *)th->th.th_info.ds.ds_stackbase;
299       stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
300     }
301 
302     for (f = 0; f < __kmp_threads_capacity; f++) {
303       kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
304 
305       if (f_th && f_th != th) {
306         char *other_stack_end =
307             (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
308         char *other_stack_beg =
309             other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
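        /* The stacks overlap if either endpoint of this thread's
           [stack_beg, stack_end) range falls strictly inside the other
           thread's stack range. */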
310         if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
311             (stack_end > other_stack_beg && stack_end < other_stack_end)) {
312 
313           /* Print the other stack values before the abort */
314           if (__kmp_storage_map)
315             __kmp_print_storage_map_gtid(
316                 -1, other_stack_beg, other_stack_end,
317                 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
318                 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
319 
320           __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
321                       __kmp_msg_null);
322         }
323       }
324     }
325   }
326   KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
327 }
328 
329 /* ------------------------------------------------------------------------ */
330 
331 void __kmp_infinite_loop(void) {
332   static int done = FALSE;
333 
334   while (!done) {
335     KMP_YIELD(TRUE);
336   }
337 }
338 
339 #define MAX_MESSAGE 512
340 
341 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
342                                   char const *format, ...) {
343   char buffer[MAX_MESSAGE];
344   va_list ap;
345 
346   va_start(ap, format);
347   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
348                p2, (unsigned long)size, format);
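  /* Formatting happens in two stages: the caller's format string is first
     embedded into the "OMP storage map" prefix above, and the caller's
     variadic arguments are then expanded by __kmp_vprintf() below. */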
349   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
350   __kmp_vprintf(kmp_err, buffer, ap);
351 #if KMP_PRINT_DATA_PLACEMENT
352   int node;
353   if (gtid >= 0) {
354     if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
355       if (__kmp_storage_map_verbose) {
356         node = __kmp_get_host_node(p1);
357         if (node < 0) /* doesn't work, so don't try this next time */
358           __kmp_storage_map_verbose = FALSE;
359         else {
360           char *last;
361           int lastNode;
362           int localProc = __kmp_get_cpu_from_gtid(gtid);
363 
364           const int page_size = KMP_GET_PAGE_SIZE();
365 
366           p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
367           p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
368           if (localProc >= 0)
369             __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
370                                  localProc >> 1);
371           else
372             __kmp_printf_no_lock("  GTID %d\n", gtid);
373 #if KMP_USE_PRCTL
374           /* The more elaborate format is disabled for now because of the prctl
375            * hanging bug. */
376           do {
377             last = p1;
378             lastNode = node;
379             /* This loop collates adjacent pages with the same host node. */
380             do {
381               (char *)p1 += page_size;
382             } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
383             __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
384                                  lastNode);
385           } while (p1 <= p2);
386 #else
387           __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
388                                (char *)p1 + (page_size - 1),
389                                __kmp_get_host_node(p1));
390           if (p1 < p2) {
391             __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
392                                  (char *)p2 + (page_size - 1),
393                                  __kmp_get_host_node(p2));
394           }
395 #endif
396         }
397       }
398     } else
399       __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
400   }
401 #endif /* KMP_PRINT_DATA_PLACEMENT */
402   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
403 }
404 
405 void __kmp_warn(char const *format, ...) {
406   char buffer[MAX_MESSAGE];
407   va_list ap;
408 
409   if (__kmp_generate_warnings == kmp_warnings_off) {
410     return;
411   }
412 
413   va_start(ap, format);
414 
415   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
416   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
417   __kmp_vprintf(kmp_err, buffer, ap);
418   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
419 
420   va_end(ap);
421 }
422 
423 void __kmp_abort_process() {
424   // Later threads may stall here, but that's ok because abort() will kill them.
425   __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
426 
427   if (__kmp_debug_buf) {
428     __kmp_dump_debug_buffer();
429   }
430 
431   if (KMP_OS_WINDOWS) {
432     // Let other threads know of abnormal termination and prevent deadlock
433     // if abort happened during library initialization or shutdown
434     __kmp_global.g.g_abort = SIGABRT;
435 
436     /* On Windows* OS, abort() by default causes a pop-up error box, which
437        stalls nightly testing. Unfortunately, we cannot reliably suppress pop-up
438        error boxes. _set_abort_behavior() works well, but this function is not
439        available in VS7 (this is not a problem for the DLL, but it is a problem
440        for the static OpenMP RTL). SetErrorMode (and so the timelimit utility)
441        does not help, at least in some versions of the MS C RTL.
442 
443        The following sequence seems to be the only way to simulate abort() and
444        avoid the pop-up error box. */
445     raise(SIGABRT);
446     _exit(3); // Just in case, if signal ignored, exit anyway.
447   } else {
448     __kmp_unregister_library();
449     abort();
450   }
451 
452   __kmp_infinite_loop();
453   __kmp_release_bootstrap_lock(&__kmp_exit_lock);
454 
455 } // __kmp_abort_process
456 
457 void __kmp_abort_thread(void) {
458   // TODO: Eliminate g_abort global variable and this function.
459   // In case of abort just call abort(), it will kill all the threads.
460   __kmp_infinite_loop();
461 } // __kmp_abort_thread
462 
463 /* Print out the storage map for the major kmp_info_t thread data structures
464    that are allocated together. */
465 
466 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
467   __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
468                                gtid);
469 
470   __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
471                                sizeof(kmp_desc_t), "th_%d.th_info", gtid);
472 
473   __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
474                                sizeof(kmp_local_t), "th_%d.th_local", gtid);
475 
476   __kmp_print_storage_map_gtid(
477       gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
478       sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
479 
480   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
481                                &thr->th.th_bar[bs_plain_barrier + 1],
482                                sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
483                                gtid);
484 
485   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
486                                &thr->th.th_bar[bs_forkjoin_barrier + 1],
487                                sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
488                                gtid);
489 
490 #if KMP_FAST_REDUCTION_BARRIER
491   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
492                                &thr->th.th_bar[bs_reduction_barrier + 1],
493                                sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
494                                gtid);
495 #endif // KMP_FAST_REDUCTION_BARRIER
496 }
497 
498 /* Print out the storage map for the major kmp_team_t team data structures
499    that are allocated together. */
500 
501 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
502                                          int team_id, int num_thr) {
503   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
504   __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
505                                header, team_id);
506 
507   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
508                                &team->t.t_bar[bs_last_barrier],
509                                sizeof(kmp_balign_team_t) * bs_last_barrier,
510                                "%s_%d.t_bar", header, team_id);
511 
512   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
513                                &team->t.t_bar[bs_plain_barrier + 1],
514                                sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
515                                header, team_id);
516 
517   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
518                                &team->t.t_bar[bs_forkjoin_barrier + 1],
519                                sizeof(kmp_balign_team_t),
520                                "%s_%d.t_bar[forkjoin]", header, team_id);
521 
522 #if KMP_FAST_REDUCTION_BARRIER
523   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
524                                &team->t.t_bar[bs_reduction_barrier + 1],
525                                sizeof(kmp_balign_team_t),
526                                "%s_%d.t_bar[reduction]", header, team_id);
527 #endif // KMP_FAST_REDUCTION_BARRIER
528 
529   __kmp_print_storage_map_gtid(
530       -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
531       sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
532 
533   __kmp_print_storage_map_gtid(
534       -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
535       sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
536 
537   __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
538                                &team->t.t_disp_buffer[num_disp_buff],
539                                sizeof(dispatch_shared_info_t) * num_disp_buff,
540                                "%s_%d.t_disp_buffer", header, team_id);
541 }
542 
543 static void __kmp_init_allocator() { __kmp_init_memkind(); }
544 static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
545 
546 /* ------------------------------------------------------------------------ */
547 
548 #if KMP_DYNAMIC_LIB
549 #if KMP_OS_WINDOWS
550 
551 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
552   //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
553 
554   switch (fdwReason) {
555 
556   case DLL_PROCESS_ATTACH:
557     KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
558 
559     return TRUE;
560 
561   case DLL_PROCESS_DETACH:
562     KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
563 
564     // According to Windows* documentation for DllMain entry point:
565     // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
566     //   lpReserved == NULL when FreeLibrary() is called,
567     //   lpReserved != NULL when the process is terminated.
568     // When FreeLibrary() is called, worker threads remain alive. So the
569     // runtime's state is consistent and executing proper shutdown is OK.
570     // When the process is terminated, worker threads have exited or been
571     // forcefully terminated by the OS and only the shutdown thread remains.
572     // This can leave the runtime in an inconsistent state.
573     // Hence, only attempt proper cleanup when FreeLibrary() is called.
574     // Otherwise, rely on OS to reclaim resources.
575     if (lpReserved == NULL)
576       __kmp_internal_end_library(__kmp_gtid_get_specific());
577 
578     return TRUE;
579 
580   case DLL_THREAD_ATTACH:
581     KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
582 
583     /* If we want to register new sibling threads every time, call
584      * __kmp_get_gtid() here. */
585     return TRUE;
586 
587   case DLL_THREAD_DETACH:
588     KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
589 
590     __kmp_internal_end_thread(__kmp_gtid_get_specific());
591     return TRUE;
592   }
593 
594   return TRUE;
595 }
596 
597 #endif /* KMP_OS_WINDOWS */
598 #endif /* KMP_DYNAMIC_LIB */
599 
600 /* __kmp_parallel_deo -- Wait until it's our turn. */
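/* The ordered construct is implemented with a turn counter held in
   team->t.t_ordered.dt.t_value: for a non-serialized team __kmp_parallel_deo
   spins until the counter equals the calling thread's tid, and
   __kmp_parallel_dxo (below) passes the turn on by setting the counter to
   (tid + 1) % t_nproc. */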
601 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
602   int gtid = *gtid_ref;
603 #ifdef BUILD_PARALLEL_ORDERED
604   kmp_team_t *team = __kmp_team_from_gtid(gtid);
605 #endif /* BUILD_PARALLEL_ORDERED */
606 
607   if (__kmp_env_consistency_check) {
608     if (__kmp_threads[gtid]->th.th_root->r.r_active)
609 #if KMP_USE_DYNAMIC_LOCK
610       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
611 #else
612       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
613 #endif
614   }
615 #ifdef BUILD_PARALLEL_ORDERED
616   if (!team->t.t_serialized) {
617     KMP_MB();
618     KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
619              NULL);
620     KMP_MB();
621   }
622 #endif /* BUILD_PARALLEL_ORDERED */
623 }
624 
625 /* __kmp_parallel_dxo -- Signal the next task. */
626 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
627   int gtid = *gtid_ref;
628 #ifdef BUILD_PARALLEL_ORDERED
629   int tid = __kmp_tid_from_gtid(gtid);
630   kmp_team_t *team = __kmp_team_from_gtid(gtid);
631 #endif /* BUILD_PARALLEL_ORDERED */
632 
633   if (__kmp_env_consistency_check) {
634     if (__kmp_threads[gtid]->th.th_root->r.r_active)
635       __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
636   }
637 #ifdef BUILD_PARALLEL_ORDERED
638   if (!team->t.t_serialized) {
639     KMP_MB(); /* Flush all pending memory write invalidates.  */
640 
641     /* use the tid of the next thread in this team */
642     /* TODO replace with general release procedure */
643     team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
644 
645     KMP_MB(); /* Flush all pending memory write invalidates.  */
646   }
647 #endif /* BUILD_PARALLEL_ORDERED */
648 }
649 
650 /* ------------------------------------------------------------------------ */
651 /* The BARRIER for a SINGLE process section is always explicit   */
652 
653 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
654   int status;
655   kmp_info_t *th;
656   kmp_team_t *team;
657 
658   if (!TCR_4(__kmp_init_parallel))
659     __kmp_parallel_initialize();
660   __kmp_resume_if_soft_paused();
661 
662   th = __kmp_threads[gtid];
663   team = th->th.th_team;
664   status = 0;
665 
666   th->th.th_ident = id_ref;
667 
668   if (team->t.t_serialized) {
669     status = 1;
670   } else {
671     kmp_int32 old_this = th->th.th_local.this_construct;
672 
673     ++th->th.th_local.this_construct;
674     /* try to set team count to thread count--success means thread got the
675        single block */
676     /* TODO: Should this be acquire or release? */
677     if (team->t.t_construct == old_this) {
678       status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
679                                               th->th.th_local.this_construct);
680     }
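    /* Whichever thread first advances t_construct from old_this to its own
       this_construct value wins the compare-and-swap above and executes the
       single block; the others see the updated counter and skip it. */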
681 #if USE_ITT_BUILD
682     if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
683         KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
684         team->t.t_active_level ==
685             1) { // Only report metadata by master of active team at level 1
686       __kmp_itt_metadata_single(id_ref);
687     }
688 #endif /* USE_ITT_BUILD */
689   }
690 
691   if (__kmp_env_consistency_check) {
692     if (status && push_ws) {
693       __kmp_push_workshare(gtid, ct_psingle, id_ref);
694     } else {
695       __kmp_check_workshare(gtid, ct_psingle, id_ref);
696     }
697   }
698 #if USE_ITT_BUILD
699   if (status) {
700     __kmp_itt_single_start(gtid);
701   }
702 #endif /* USE_ITT_BUILD */
703   return status;
704 }
705 
706 void __kmp_exit_single(int gtid) {
707 #if USE_ITT_BUILD
708   __kmp_itt_single_end(gtid);
709 #endif /* USE_ITT_BUILD */
710   if (__kmp_env_consistency_check)
711     __kmp_pop_workshare(gtid, ct_psingle, NULL);
712 }
713 
714 /* Determine whether we can go parallel or must use a serialized parallel
715  * region, and how many threads we can use.
716  * set_nthreads is the number of threads requested for the team.
717  * Returns 1 if we should serialize or only use one thread,
718  * otherwise the number of threads to use.
719  * The forkjoin lock is held by the caller. */
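/* Sketch of the clipping pipeline applied below: the request is first adjusted
   by the active dynamic mode (load balance, thread limit, or random), then
   clipped against __kmp_max_nth (KMP_DEVICE_THREAD_LIMIT), then against the
   contention group's cg_thread_limit (OMP_THREAD_LIMIT), and finally against
   the capacity of the __kmp_threads array, expanding that array if possible. */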
720 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
721                                  int master_tid, int set_nthreads,
722                                  int enter_teams) {
723   int capacity;
724   int new_nthreads;
725   KMP_DEBUG_ASSERT(__kmp_init_serial);
726   KMP_DEBUG_ASSERT(root && parent_team);
727   kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
728 
729   // If dyn-var is set, dynamically adjust the number of desired threads,
730   // according to the method specified by dynamic_mode.
731   new_nthreads = set_nthreads;
732   if (!get__dynamic_2(parent_team, master_tid)) {
733     ;
734   }
735 #ifdef USE_LOAD_BALANCE
736   else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
737     new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
738     if (new_nthreads == 1) {
739       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
740                     "reservation to 1 thread\n",
741                     master_tid));
742       return 1;
743     }
744     if (new_nthreads < set_nthreads) {
745       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
746                     "reservation to %d threads\n",
747                     master_tid, new_nthreads));
748     }
749   }
750 #endif /* USE_LOAD_BALANCE */
751   else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
752     new_nthreads = __kmp_avail_proc - __kmp_nth +
753                    (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
754     if (new_nthreads <= 1) {
755       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
756                     "reservation to 1 thread\n",
757                     master_tid));
758       return 1;
759     }
760     if (new_nthreads < set_nthreads) {
761       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
762                     "reservation to %d threads\n",
763                     master_tid, new_nthreads));
764     } else {
765       new_nthreads = set_nthreads;
766     }
767   } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
768     if (set_nthreads > 2) {
769       new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
770       new_nthreads = (new_nthreads % set_nthreads) + 1;
771       if (new_nthreads == 1) {
772         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
773                       "reservation to 1 thread\n",
774                       master_tid));
775         return 1;
776       }
777       if (new_nthreads < set_nthreads) {
778         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
779                       "reservation to %d threads\n",
780                       master_tid, new_nthreads));
781       }
782     }
783   } else {
784     KMP_ASSERT(0);
785   }
786 
787   // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
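  // The threads being reused (just the master when the root is active, or the
  // whole hot team when it is not) are already counted in __kmp_nth, so they
  // are subtracted here to avoid counting them against the limit twice.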
788   if (__kmp_nth + new_nthreads -
789           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
790       __kmp_max_nth) {
791     int tl_nthreads = __kmp_max_nth - __kmp_nth +
792                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
793     if (tl_nthreads <= 0) {
794       tl_nthreads = 1;
795     }
796 
797     // If dyn-var is false, emit a 1-time warning.
798     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
799       __kmp_reserve_warn = 1;
800       __kmp_msg(kmp_ms_warning,
801                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
802                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
803     }
804     if (tl_nthreads == 1) {
805       KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
806                     "reduced reservation to 1 thread\n",
807                     master_tid));
808       return 1;
809     }
810     KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
811                   "reservation to %d threads\n",
812                   master_tid, tl_nthreads));
813     new_nthreads = tl_nthreads;
814   }
815 
816   // Respect OMP_THREAD_LIMIT
817   int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
818   int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
819   if (cg_nthreads + new_nthreads -
820           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
821       max_cg_threads) {
822     int tl_nthreads = max_cg_threads - cg_nthreads +
823                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
824     if (tl_nthreads <= 0) {
825       tl_nthreads = 1;
826     }
827 
828     // If dyn-var is false, emit a 1-time warning.
829     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
830       __kmp_reserve_warn = 1;
831       __kmp_msg(kmp_ms_warning,
832                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
833                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
834     }
835     if (tl_nthreads == 1) {
836       KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
837                     "reduced reservation to 1 thread\n",
838                     master_tid));
839       return 1;
840     }
841     KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
842                   "reservation to %d threads\n",
843                   master_tid, tl_nthreads));
844     new_nthreads = tl_nthreads;
845   }
846 
847   // Check if the threads array is large enough, or needs expanding.
848   // See comment in __kmp_register_root() about the adjustment if
849   // __kmp_threads[0] == NULL.
850   capacity = __kmp_threads_capacity;
851   if (TCR_PTR(__kmp_threads[0]) == NULL) {
852     --capacity;
853   }
854   if (__kmp_nth + new_nthreads -
855           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
856       capacity) {
857     // Expand the threads array.
858     int slotsRequired = __kmp_nth + new_nthreads -
859                         (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
860                         capacity;
861     int slotsAdded = __kmp_expand_threads(slotsRequired);
862     if (slotsAdded < slotsRequired) {
863       // The threads array was not expanded enough.
864       new_nthreads -= (slotsRequired - slotsAdded);
865       KMP_ASSERT(new_nthreads >= 1);
866 
867       // If dyn-var is false, emit a 1-time warning.
868       if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
869         __kmp_reserve_warn = 1;
870         if (__kmp_tp_cached) {
871           __kmp_msg(kmp_ms_warning,
872                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
873                     KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
874                     KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
875         } else {
876           __kmp_msg(kmp_ms_warning,
877                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
878                     KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
879         }
880       }
881     }
882   }
883 
884 #ifdef KMP_DEBUG
885   if (new_nthreads == 1) {
886     KC_TRACE(10,
887              ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
888               "dead roots and rechecking; requested %d threads\n",
889               __kmp_get_gtid(), set_nthreads));
890   } else {
891     KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
892                   " %d threads\n",
893                   __kmp_get_gtid(), new_nthreads, set_nthreads));
894   }
895 #endif // KMP_DEBUG
896   return new_nthreads;
897 }
898 
899 /* Allocate threads from the thread pool and assign them to the new team. We are
900    assured that there are enough threads available, because we checked on that
901    earlier while holding the forkjoin lock. */
902 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
903                                     kmp_info_t *master_th, int master_gtid) {
904   int i;
905   int use_hot_team;
906 
907   KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
908   KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
909   KMP_MB();
910 
911   /* first, let's setup the master thread */
912   master_th->th.th_info.ds.ds_tid = 0;
913   master_th->th.th_team = team;
914   master_th->th.th_team_nproc = team->t.t_nproc;
915   master_th->th.th_team_master = master_th;
916   master_th->th.th_team_serialized = FALSE;
917   master_th->th.th_dispatch = &team->t.t_dispatch[0];
918 
919 /* make sure we are not the optimized hot team */
920 #if KMP_NESTED_HOT_TEAMS
921   use_hot_team = 0;
922   kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
923   if (hot_teams) { // hot teams array is not allocated if
924     // KMP_HOT_TEAMS_MAX_LEVEL=0
925     int level = team->t.t_active_level - 1; // index in array of hot teams
926     if (master_th->th.th_teams_microtask) { // are we inside the teams?
927       if (master_th->th.th_teams_size.nteams > 1) {
928         ++level; // level was not increased in teams construct for
929         // team_of_masters
930       }
931       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
932           master_th->th.th_teams_level == team->t.t_level) {
933         ++level; // level was not increased in teams construct for
934         // team_of_workers before the parallel
935       } // team->t.t_level will be increased inside parallel
936     }
937     if (level < __kmp_hot_teams_max_level) {
938       if (hot_teams[level].hot_team) {
939         // hot team has already been allocated for given level
940         KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
941         use_hot_team = 1; // the team is ready to use
942       } else {
943         use_hot_team = 0; // AC: threads are not allocated yet
944         hot_teams[level].hot_team = team; // remember new hot team
945         hot_teams[level].hot_team_nth = team->t.t_nproc;
946       }
947     } else {
948       use_hot_team = 0;
949     }
950   }
951 #else
952   use_hot_team = team == root->r.r_hot_team;
953 #endif
954   if (!use_hot_team) {
955 
956     /* install the master thread */
957     team->t.t_threads[0] = master_th;
958     __kmp_initialize_info(master_th, team, 0, master_gtid);
959 
960     /* now, install the worker threads */
961     for (i = 1; i < team->t.t_nproc; i++) {
962 
963       /* fork or reallocate a new thread and install it in team */
964       kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
965       team->t.t_threads[i] = thr;
966       KMP_DEBUG_ASSERT(thr);
967       KMP_DEBUG_ASSERT(thr->th.th_team == team);
968       /* align team and thread arrived states */
969       KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
970                     "T#%d(%d:%d) join =%llu, plain=%llu\n",
971                     __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
972                     __kmp_gtid_from_tid(i, team), team->t.t_id, i,
973                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
974                     team->t.t_bar[bs_plain_barrier].b_arrived));
975       thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
976       thr->th.th_teams_level = master_th->th.th_teams_level;
977       thr->th.th_teams_size = master_th->th.th_teams_size;
978       { // Initialize threads' barrier data.
979         int b;
980         kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
981         for (b = 0; b < bs_last_barrier; ++b) {
982           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
983           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
984 #if USE_DEBUGGER
985           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
986 #endif
987         }
988       }
989     }
990 
991 #if KMP_AFFINITY_SUPPORTED
992     __kmp_partition_places(team);
993 #endif
994   }
995 
996   if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
997     for (i = 0; i < team->t.t_nproc; i++) {
998       kmp_info_t *thr = team->t.t_threads[i];
999       if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1000           thr->th.th_prev_level != team->t.t_level) {
1001         team->t.t_display_affinity = 1;
1002         break;
1003       }
1004     }
1005   }
1006 
1007   KMP_MB();
1008 }
1009 
1010 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1011 // Propagate any changes to the floating point control registers out to the team
1012 // We try to avoid unnecessary writes to the relevant cache line in the team
1013 // structure, so we don't make changes unless they are needed.
1014 inline static void propagateFPControl(kmp_team_t *team) {
1015   if (__kmp_inherit_fp_control) {
1016     kmp_int16 x87_fpu_control_word;
1017     kmp_uint32 mxcsr;
1018 
1019     // Get master values of FPU control flags (both X87 and vector)
1020     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1021     __kmp_store_mxcsr(&mxcsr);
1022     mxcsr &= KMP_X86_MXCSR_MASK;
1023 
1024     // There is no point looking at t_fp_control_saved here.
1025     // If it is TRUE, we still have to update the values if they are different
1026     // from those we now have. If it is FALSE we didn't save anything yet, but
1027     // our objective is the same. We have to ensure that the values in the team
1028     // are the same as those we have.
1029     // So, this code achieves what we need whether or not t_fp_control_saved is
1030     // true. By checking whether the value needs updating we avoid unnecessary
1031     // writes that would put the cache-line into a written state, causing all
1032     // threads in the team to have to read it again.
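    // KMP_CHECK_UPDATE is assumed to expand to a compare-then-assign (roughly
    // "if (dst != src) dst = src;"), so the stores below only touch the team's
    // cache line when a value has actually changed.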
1033     KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1034     KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1035     // Although we don't use this value, other code in the runtime wants to know
1036     // whether it should restore them. So we must ensure it is correct.
1037     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1038   } else {
1039     // Similarly here. Don't write to this cache-line in the team structure
1040     // unless we have to.
1041     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1042   }
1043 }
1044 
1045 // Do the opposite, setting the hardware registers to the updated values from
1046 // the team.
1047 inline static void updateHWFPControl(kmp_team_t *team) {
1048   if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1049     // Only reset the fp control regs if they have been changed in the team by
1050     // the parallel region that we are exiting.
1051     kmp_int16 x87_fpu_control_word;
1052     kmp_uint32 mxcsr;
1053     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1054     __kmp_store_mxcsr(&mxcsr);
1055     mxcsr &= KMP_X86_MXCSR_MASK;
1056 
1057     if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1058       __kmp_clear_x87_fpu_status_word();
1059       __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1060     }
1061 
1062     if (team->t.t_mxcsr != mxcsr) {
1063       __kmp_load_mxcsr(&team->t.t_mxcsr);
1064     }
1065   }
1066 }
1067 #else
1068 #define propagateFPControl(x) ((void)0)
1069 #define updateHWFPControl(x) ((void)0)
1070 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1071 
1072 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1073                                      int realloc); // forward declaration
1074 
1075 /* Run a parallel region that has been serialized, so it runs in a team of only
1076    the single master thread. */
1077 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1078   kmp_info_t *this_thr;
1079   kmp_team_t *serial_team;
1080 
1081   KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1082 
1083   /* Skip all this code for autopar serialized loops since it results in
1084      unacceptable overhead */
1085   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1086     return;
1087 
1088   if (!TCR_4(__kmp_init_parallel))
1089     __kmp_parallel_initialize();
1090   __kmp_resume_if_soft_paused();
1091 
1092   this_thr = __kmp_threads[global_tid];
1093   serial_team = this_thr->th.th_serial_team;
1094 
1095   /* utilize the serialized team held by this thread */
1096   KMP_DEBUG_ASSERT(serial_team);
1097   KMP_MB();
1098 
1099   if (__kmp_tasking_mode != tskm_immediate_exec) {
1100     KMP_DEBUG_ASSERT(
1101         this_thr->th.th_task_team ==
1102         this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1103     KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1104                      NULL);
1105     KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1106                   "team %p, new task_team = NULL\n",
1107                   global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1108     this_thr->th.th_task_team = NULL;
1109   }
1110 
1111   kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1112   if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1113     proc_bind = proc_bind_false;
1114   } else if (proc_bind == proc_bind_default) {
1115     // No proc_bind clause was specified, so use the current value
1116     // of proc-bind-var for this parallel region.
1117     proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1118   }
1119   // Reset for next parallel region
1120   this_thr->th.th_set_proc_bind = proc_bind_default;
1121 
1122 #if OMPT_SUPPORT
1123   ompt_data_t ompt_parallel_data = ompt_data_none;
1124   ompt_data_t *implicit_task_data;
1125   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1126   if (ompt_enabled.enabled &&
1127       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1128 
1129     ompt_task_info_t *parent_task_info;
1130     parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1131 
1132     parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1133     if (ompt_enabled.ompt_callback_parallel_begin) {
1134       int team_size = 1;
1135 
1136       ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1137           &(parent_task_info->task_data), &(parent_task_info->frame),
1138           &ompt_parallel_data, team_size,
1139           ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1140     }
1141   }
1142 #endif // OMPT_SUPPORT
1143 
1144   if (this_thr->th.th_team != serial_team) {
1145     // Nested level will be an index in the nested nthreads array
1146     int level = this_thr->th.th_team->t.t_level;
1147 
1148     if (serial_team->t.t_serialized) {
1149       /* This serial team was already used.
1150          TODO: increase performance by making these locks more specific. */
1151       kmp_team_t *new_team;
1152 
1153       __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1154 
1155       new_team =
1156           __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1157 #if OMPT_SUPPORT
1158                               ompt_parallel_data,
1159 #endif
1160                               proc_bind, &this_thr->th.th_current_task->td_icvs,
1161                               0 USE_NESTED_HOT_ARG(NULL));
1162       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1163       KMP_ASSERT(new_team);
1164 
1165       /* setup new serialized team and install it */
1166       new_team->t.t_threads[0] = this_thr;
1167       new_team->t.t_parent = this_thr->th.th_team;
1168       serial_team = new_team;
1169       this_thr->th.th_serial_team = serial_team;
1170 
1171       KF_TRACE(
1172           10,
1173           ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1174            global_tid, serial_team));
1175 
1176       /* TODO the above breaks the requirement that if we run out of resources,
1177          then we can still guarantee that serialized teams are ok, since we may
1178          need to allocate a new one */
1179     } else {
1180       KF_TRACE(
1181           10,
1182           ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1183            global_tid, serial_team));
1184     }
1185 
1186     /* we have to initialize this serial team */
1187     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1188     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1189     KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1190     serial_team->t.t_ident = loc;
1191     serial_team->t.t_serialized = 1;
1192     serial_team->t.t_nproc = 1;
1193     serial_team->t.t_parent = this_thr->th.th_team;
1194     serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1195     this_thr->th.th_team = serial_team;
1196     serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1197 
1198     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1199                   this_thr->th.th_current_task));
1200     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1201     this_thr->th.th_current_task->td_flags.executing = 0;
1202 
1203     __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1204 
1205     /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1206        implicit task for each serialized task represented by
1207        team->t.t_serialized? */
1208     copy_icvs(&this_thr->th.th_current_task->td_icvs,
1209               &this_thr->th.th_current_task->td_parent->td_icvs);
1210 
1211     // Thread value exists in the nested nthreads array for the next nested
1212     // level
1213     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1214       this_thr->th.th_current_task->td_icvs.nproc =
1215           __kmp_nested_nth.nth[level + 1];
1216     }
1217 
1218     if (__kmp_nested_proc_bind.used &&
1219         (level + 1 < __kmp_nested_proc_bind.used)) {
1220       this_thr->th.th_current_task->td_icvs.proc_bind =
1221           __kmp_nested_proc_bind.bind_types[level + 1];
1222     }
1223 
1224 #if USE_DEBUGGER
1225     serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1226 #endif
1227     this_thr->th.th_info.ds.ds_tid = 0;
1228 
1229     /* set thread cache values */
1230     this_thr->th.th_team_nproc = 1;
1231     this_thr->th.th_team_master = this_thr;
1232     this_thr->th.th_team_serialized = 1;
1233 
1234     serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1235     serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1236     serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1237 
1238     propagateFPControl(serial_team);
1239 
1240     /* check if we need to allocate dispatch buffers stack */
1241     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1242     if (!serial_team->t.t_dispatch->th_disp_buffer) {
1243       serial_team->t.t_dispatch->th_disp_buffer =
1244           (dispatch_private_info_t *)__kmp_allocate(
1245               sizeof(dispatch_private_info_t));
1246     }
1247     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1248 
1249     KMP_MB();
1250 
1251   } else {
1252     /* this serialized team is already being used,
1253      * that's fine, just add another nested level */
1254     KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1255     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1256     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1257     ++serial_team->t.t_serialized;
1258     this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1259 
1260     // Nested level will be an index in the nested nthreads array
1261     int level = this_thr->th.th_team->t.t_level;
1262     // Thread value exists in the nested nthreads array for the next nested
1263     // level
1264     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1265       this_thr->th.th_current_task->td_icvs.nproc =
1266           __kmp_nested_nth.nth[level + 1];
1267     }
1268     serial_team->t.t_level++;
1269     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1270                   "of serial team %p to %d\n",
1271                   global_tid, serial_team, serial_team->t.t_level));
1272 
1273     /* allocate/push dispatch buffers stack */
1274     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1275     {
1276       dispatch_private_info_t *disp_buffer =
1277           (dispatch_private_info_t *)__kmp_allocate(
1278               sizeof(dispatch_private_info_t));
1279       disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1280       serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1281     }
1282     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1283 
1284     KMP_MB();
1285   }
1286   KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1287 
1288   // Perform the display affinity functionality for
1289   // serialized parallel regions
1290   if (__kmp_display_affinity) {
1291     if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1292         this_thr->th.th_prev_num_threads != 1) {
1293       // NULL means use the affinity-format-var ICV
1294       __kmp_aux_display_affinity(global_tid, NULL);
1295       this_thr->th.th_prev_level = serial_team->t.t_level;
1296       this_thr->th.th_prev_num_threads = 1;
1297     }
1298   }
1299 
1300   if (__kmp_env_consistency_check)
1301     __kmp_push_parallel(global_tid, NULL);
1302 #if OMPT_SUPPORT
1303   serial_team->t.ompt_team_info.master_return_address = codeptr;
1304   if (ompt_enabled.enabled &&
1305       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1306     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1307         OMPT_GET_FRAME_ADDRESS(0);
1308 
1309     ompt_lw_taskteam_t lw_taskteam;
1310     __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1311                             &ompt_parallel_data, codeptr);
1312 
1313     __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1314     // Don't use lw_taskteam after linking; its content was swapped.
1315 
1316     /* OMPT implicit task begin */
1317     implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
1318     if (ompt_enabled.ompt_callback_implicit_task) {
1319       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1320           ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1321           OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
1322           ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1323       OMPT_CUR_TASK_INFO(this_thr)->thread_num =
1324           __kmp_tid_from_gtid(global_tid);
1325     }
1326 
1327     /* OMPT state */
1328     this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1329     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1330         OMPT_GET_FRAME_ADDRESS(0);
1331   }
1332 #endif
1333 }
1334 
1335 /* most of the work for a fork */
1336 /* return true if we really went parallel, false if serialized */
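/* call_context records which compiler entry point (Intel or GNU) performed the
   fork, microtask is the outlined body of the parallel region, and ap carries
   the caller's arguments, which are copied into the team's argv below. */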
1337 int __kmp_fork_call(ident_t *loc, int gtid,
1338                     enum fork_context_e call_context, // Intel, GNU, ...
1339                     kmp_int32 argc, microtask_t microtask, launch_t invoker,
1340                     kmp_va_list ap) {
1341   void **argv;
1342   int i;
1343   int master_tid;
1344   int master_this_cons;
1345   kmp_team_t *team;
1346   kmp_team_t *parent_team;
1347   kmp_info_t *master_th;
1348   kmp_root_t *root;
1349   int nthreads;
1350   int master_active;
1351   int master_set_numthreads;
1352   int level;
1353   int active_level;
1354   int teams_level;
1355 #if KMP_NESTED_HOT_TEAMS
1356   kmp_hot_team_ptr_t **p_hot_teams;
1357 #endif
1358   { // KMP_TIME_BLOCK
1359     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1360     KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1361 
1362     KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1363     if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1364       /* Some systems prefer the stack for the root thread(s) to start with */
1365       /* some gap from the parent stack to prevent false sharing. */
1366       void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1367       /* These 2 lines below are so this does not get optimized out */
1368       if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1369         __kmp_stkpadding += (short)((kmp_int64)dummy);
1370     }
1371 
1372     /* initialize if needed */
1373     KMP_DEBUG_ASSERT(
1374         __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1375     if (!TCR_4(__kmp_init_parallel))
1376       __kmp_parallel_initialize();
1377     __kmp_resume_if_soft_paused();
1378 
1379     /* setup current data */
1380     master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1381     // shutdown
1382     parent_team = master_th->th.th_team;
1383     master_tid = master_th->th.th_info.ds.ds_tid;
1384     master_this_cons = master_th->th.th_local.this_construct;
1385     root = master_th->th.th_root;
1386     master_active = root->r.r_active;
1387     master_set_numthreads = master_th->th.th_set_nproc;
1388 
1389 #if OMPT_SUPPORT
1390     ompt_data_t ompt_parallel_data = ompt_data_none;
1391     ompt_data_t *parent_task_data;
1392     ompt_frame_t *ompt_frame;
1393     ompt_data_t *implicit_task_data;
1394     void *return_address = NULL;
1395 
1396     if (ompt_enabled.enabled) {
1397       __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1398                                     NULL, NULL);
1399       return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1400     }
1401 #endif
1402 
1403     // Nested level will be an index in the nested nthreads array
1404     level = parent_team->t.t_level;
1405     // used to launch non-serial teams even if nesting is not allowed
1406     active_level = parent_team->t.t_active_level;
1407     // needed to check nesting inside the teams
1408     teams_level = master_th->th.th_teams_level;
1409 #if KMP_NESTED_HOT_TEAMS
1410     p_hot_teams = &master_th->th.th_hot_teams;
1411     if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1412       *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1413           sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1414       (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1415       // it is either the actual hot team or not needed (when active_level > 0)
1416       (*p_hot_teams)[0].hot_team_nth = 1;
1417     }
1418 #endif
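    // Note: the hot-teams array above is allocated lazily, with one slot per
    // nesting level up to __kmp_hot_teams_max_level; slot 0 refers to the
    // root's hot team, and deeper slots are indexed by active level later in
    // this function.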
1419 
1420 #if OMPT_SUPPORT
1421     if (ompt_enabled.enabled) {
1422       if (ompt_enabled.ompt_callback_parallel_begin) {
1423         int team_size = master_set_numthreads
1424                             ? master_set_numthreads
1425                             : get__nproc_2(parent_team, master_tid);
1426         int flags = OMPT_INVOKER(call_context) |
1427                     ((microtask == (microtask_t)__kmp_teams_master)
1428                          ? ompt_parallel_league
1429                          : ompt_parallel_team);
1430         ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1431             parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1432             return_address);
1433       }
1434       master_th->th.ompt_thread_info.state = ompt_state_overhead;
1435     }
1436 #endif
1437 
1438     master_th->th.th_ident = loc;
1439 
1440     if (master_th->th.th_teams_microtask && ap &&
1441         microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1442       // AC: This is the start of a parallel nested inside a teams construct.
1443       // The team is actual (hot); all workers are ready at the fork barrier.
1444       // No lock is needed to initialize the team a bit, then release the workers.
1445       parent_team->t.t_ident = loc;
1446       __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1447       parent_team->t.t_argc = argc;
1448       argv = (void **)parent_team->t.t_argv;
1449       for (i = argc - 1; i >= 0; --i)
1450         *argv++ = va_arg(kmp_va_deref(ap), void *);
1451       // Increment our nested depth levels, but do not increase the serialization
1452       if (parent_team == master_th->th.th_serial_team) {
1453         // AC: we are in serialized parallel
1454         __kmpc_serialized_parallel(loc, gtid);
1455         KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1456 
1457         if (call_context == fork_context_gnu) {
1458           // AC: need to decrement t_serialized for enquiry functions to work
1459           // correctly, will restore at join time
1460           parent_team->t.t_serialized--;
1461           return TRUE;
1462         }
1463 
1464 #if OMPT_SUPPORT
1465         void *dummy;
1466         void **exit_frame_p;
1467 
1468         ompt_lw_taskteam_t lw_taskteam;
1469 
1470         if (ompt_enabled.enabled) {
1471           __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1472                                   &ompt_parallel_data, return_address);
1473           exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1474 
1475           __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1476           // don't use lw_taskteam after linking. content was swapped
1477 
1478           /* OMPT implicit task begin */
1479           implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1480           if (ompt_enabled.ompt_callback_implicit_task) {
1481             OMPT_CUR_TASK_INFO(master_th)->thread_num =
1482                 __kmp_tid_from_gtid(gtid);
1483             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1484                 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1485                 implicit_task_data, 1,
1486                 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1487           }
1488 
1489           /* OMPT state */
1490           master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1491         } else {
1492           exit_frame_p = &dummy;
1493         }
1494 #endif
1495         // AC: need to decrement t_serialized for enquiry functions to work
1496         // correctly, will restore at join time
1497         parent_team->t.t_serialized--;
1498 
1499         {
1500           KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1501           KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1502           __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1503 #if OMPT_SUPPORT
1504                                  ,
1505                                  exit_frame_p
1506 #endif
1507           );
1508         }
1509 
1510 #if OMPT_SUPPORT
1511         if (ompt_enabled.enabled) {
1512           *exit_frame_p = NULL;
1513           OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1514           if (ompt_enabled.ompt_callback_implicit_task) {
1515             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1516                 ompt_scope_end, NULL, implicit_task_data, 1,
1517                 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1518           }
1519           ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1520           __ompt_lw_taskteam_unlink(master_th);
1521           if (ompt_enabled.ompt_callback_parallel_end) {
1522             ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1523                 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1524                 OMPT_INVOKER(call_context) | ompt_parallel_team,
1525                 return_address);
1526           }
1527           master_th->th.ompt_thread_info.state = ompt_state_overhead;
1528         }
1529 #endif
1530         return TRUE;
1531       }
1532 
1533       parent_team->t.t_pkfn = microtask;
1534       parent_team->t.t_invoke = invoker;
1535       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1536       parent_team->t.t_active_level++;
1537       parent_team->t.t_level++;
1538       parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1539 
1540 #if OMPT_SUPPORT
1541       if (ompt_enabled.enabled) {
1542         ompt_lw_taskteam_t lw_taskteam;
1543         __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1544                                 &ompt_parallel_data, return_address);
1545         __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1546       }
1547 #endif
1548 
1549       /* Change number of threads in the team if requested */
1550       if (master_set_numthreads) { // The parallel has num_threads clause
1551         if (master_set_numthreads < master_th->th.th_teams_size.nth) {
1552           // AC: can only reduce the number of threads dynamically, can't increase
1553           kmp_info_t **other_threads = parent_team->t.t_threads;
1554           parent_team->t.t_nproc = master_set_numthreads;
1555           for (i = 0; i < master_set_numthreads; ++i) {
1556             other_threads[i]->th.th_team_nproc = master_set_numthreads;
1557           }
1558           // Keep extra threads hot in the team for possible next parallels
1559         }
1560         master_th->th.th_set_nproc = 0;
1561       }
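      // Illustrative example (an assumption about typical usage): with
      //   #pragma omp teams thread_limit(8)
      //   #pragma omp parallel num_threads(4)
      // the hot parent team keeps all 8 threads allocated, but t_nproc and
      // each thread's th_team_nproc are trimmed to 4 for this inner parallel.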
1562 
1563 #if USE_DEBUGGER
1564       if (__kmp_debugging) { // Let debugger override number of threads.
1565         int nth = __kmp_omp_num_threads(loc);
1566         if (nth > 0) { // 0 means debugger doesn't want to change num threads
1567           master_set_numthreads = nth;
1568         }
1569       }
1570 #endif
1571 
1572 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1573       if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1574            KMP_ITT_DEBUG) &&
1575           __kmp_forkjoin_frames_mode == 3 &&
1576           parent_team->t.t_active_level == 1 // only report frames at level 1
1577           && master_th->th.th_teams_size.nteams == 1) {
1578         kmp_uint64 tmp_time = __itt_get_timestamp();
1579         master_th->th.th_frame_time = tmp_time;
1580         parent_team->t.t_region_time = tmp_time;
1581       }
1582       if (__itt_stack_caller_create_ptr) {
1583         // create new stack stitching id before entering fork barrier
1584         parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1585       }
1586 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1587 
1588       KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1589                     "master_th=%p, gtid=%d\n",
1590                     root, parent_team, master_th, gtid));
1591       __kmp_internal_fork(loc, gtid, parent_team);
1592       KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1593                     "master_th=%p, gtid=%d\n",
1594                     root, parent_team, master_th, gtid));
1595 
1596       if (call_context == fork_context_gnu)
1597         return TRUE;
1598 
1599       /* Invoke microtask for MASTER thread */
1600       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1601                     parent_team->t.t_id, parent_team->t.t_pkfn));
1602 
1603       if (!parent_team->t.t_invoke(gtid)) {
1604         KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
1605       }
1606       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1607                     parent_team->t.t_id, parent_team->t.t_pkfn));
1608       KMP_MB(); /* Flush all pending memory write invalidates.  */
1609 
1610       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1611 
1612       return TRUE;
1613     } // Parallel closely nested in teams construct
1614 
1615 #if KMP_DEBUG
1616     if (__kmp_tasking_mode != tskm_immediate_exec) {
1617       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1618                        parent_team->t.t_task_team[master_th->th.th_task_state]);
1619     }
1620 #endif
1621 
1622     if (parent_team->t.t_active_level >=
1623         master_th->th.th_current_task->td_icvs.max_active_levels) {
1624       nthreads = 1;
1625     } else {
1626       int enter_teams = ((ap == NULL && active_level == 0) ||
1627                          (ap && teams_level > 0 && teams_level == level));
1628       nthreads =
1629           master_set_numthreads
1630               ? master_set_numthreads
1631               : get__nproc_2(
1632                     parent_team,
1633                     master_tid); // TODO: get nproc directly from current task
1634 
1635       // Check if we need to take the forkjoin lock (no need for a serialized
1636       // parallel outside of a teams construct). This code was moved here from
1637       // __kmp_reserve_threads() to speed up nested serialized parallels.
1638       if (nthreads > 1) {
1639         if ((get__max_active_levels(master_th) == 1 &&
1640              (root->r.r_in_parallel && !enter_teams)) ||
1641             (__kmp_library == library_serial)) {
1642           KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1643                         " threads\n",
1644                         gtid, nthreads));
1645           nthreads = 1;
1646         }
1647       }
1648       if (nthreads > 1) {
1649         /* determine how many new threads we can use */
1650         __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1651         /* AC: If we execute teams from parallel region (on host), then teams
1652            should be created but each can only have 1 thread if nesting is
1653            disabled. If teams called from serial region, then teams and their
1654            threads should be created regardless of the nesting setting. */
1655         nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
1656                                          nthreads, enter_teams);
1657         if (nthreads == 1) {
1658           // Free lock for single thread execution here; for multi-thread
1659           // execution it will be freed later after team of threads created
1660           // and initialized
1661           __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1662         }
1663       }
1664     }
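    // At this point nthreads is the final team size: 1 if the region will be
    // serialized (max_active_levels reached, nesting effectively disabled, or
    // the library running in serial mode), otherwise whatever
    // __kmp_reserve_threads granted while the forkjoin lock was taken above.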
1665     KMP_DEBUG_ASSERT(nthreads > 0);
1666 
1667     // If we temporarily changed the set number of threads then restore it now
1668     master_th->th.th_set_nproc = 0;
1669 
1670     /* create a serialized parallel region? */
1671     if (nthreads == 1) {
1672 /* josh todo: hypothetical question: what do we do for OS X*? */
1673 #if KMP_OS_LINUX &&                                                            \
1674     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1675       void *args[argc];
1676 #else
1677       void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1678 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1679           KMP_ARCH_AARCH64) */
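      // The serialized path builds its own argv copy: a stack VLA where the
      // toolchain supports it (the Linux targets listed above), otherwise an
      // alloca'd array of the same size.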
1680 
1681       KA_TRACE(20,
1682                ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1683 
1684       __kmpc_serialized_parallel(loc, gtid);
1685 
1686       if (call_context == fork_context_intel) {
1687         /* TODO this sucks, use the compiler itself to pass args! :) */
1688         master_th->th.th_serial_team->t.t_ident = loc;
1689         if (!ap) {
1690           // revert change made in __kmpc_serialized_parallel()
1691           master_th->th.th_serial_team->t.t_level--;
1692           // Get args from parent team for teams construct
1693 
1694 #if OMPT_SUPPORT
1695           void *dummy;
1696           void **exit_frame_p;
1697           ompt_task_info_t *task_info;
1698 
1699           ompt_lw_taskteam_t lw_taskteam;
1700 
1701           if (ompt_enabled.enabled) {
1702             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1703                                     &ompt_parallel_data, return_address);
1704 
1705             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1706             // don't use lw_taskteam after linking. content was swapped
1707 
1708             task_info = OMPT_CUR_TASK_INFO(master_th);
1709             exit_frame_p = &(task_info->frame.exit_frame.ptr);
1710             if (ompt_enabled.ompt_callback_implicit_task) {
1711               OMPT_CUR_TASK_INFO(master_th)->thread_num =
1712                   __kmp_tid_from_gtid(gtid);
1713               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1714                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1715                   &(task_info->task_data), 1,
1716                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1717                   ompt_task_implicit);
1718             }
1719 
1720             /* OMPT state */
1721             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1722           } else {
1723             exit_frame_p = &dummy;
1724           }
1725 #endif
1726 
1727           {
1728             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1729             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1730             __kmp_invoke_microtask(microtask, gtid, 0, argc,
1731                                    parent_team->t.t_argv
1732 #if OMPT_SUPPORT
1733                                    ,
1734                                    exit_frame_p
1735 #endif
1736             );
1737           }
1738 
1739 #if OMPT_SUPPORT
1740           if (ompt_enabled.enabled) {
1741             *exit_frame_p = NULL;
1742             if (ompt_enabled.ompt_callback_implicit_task) {
1743               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1744                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1745                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1746                   ompt_task_implicit);
1747             }
1748             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1749             __ompt_lw_taskteam_unlink(master_th);
1750             if (ompt_enabled.ompt_callback_parallel_end) {
1751               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1752                   &ompt_parallel_data, parent_task_data,
1753                   OMPT_INVOKER(call_context) | ompt_parallel_team,
1754                   return_address);
1755             }
1756             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1757           }
1758 #endif
1759         } else if (microtask == (microtask_t)__kmp_teams_master) {
1760           KMP_DEBUG_ASSERT(master_th->th.th_team ==
1761                            master_th->th.th_serial_team);
1762           team = master_th->th.th_team;
1763           // team->t.t_pkfn = microtask;
1764           team->t.t_invoke = invoker;
1765           __kmp_alloc_argv_entries(argc, team, TRUE);
1766           team->t.t_argc = argc;
1767           argv = (void **)team->t.t_argv;
1768           if (ap) {
1769             for (i = argc - 1; i >= 0; --i)
1770               *argv++ = va_arg(kmp_va_deref(ap), void *);
1771           } else {
1772             for (i = 0; i < argc; ++i)
1773               // Get args from parent team for teams construct
1774               argv[i] = parent_team->t.t_argv[i];
1775           }
1776           // AC: revert change made in __kmpc_serialized_parallel()
1777           //     because initial code in teams should have level=0
1778           team->t.t_level--;
1779           // AC: call special invoker for outer "parallel" of teams construct
1780           invoker(gtid);
1781 #if OMPT_SUPPORT
1782           if (ompt_enabled.enabled) {
1783             ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1784             if (ompt_enabled.ompt_callback_implicit_task) {
1785               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1786                   ompt_scope_end, NULL, &(task_info->task_data), 0,
1787                   OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1788             }
1789             if (ompt_enabled.ompt_callback_parallel_end) {
1790               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1791                   &ompt_parallel_data, parent_task_data,
1792                   OMPT_INVOKER(call_context) | ompt_parallel_league,
1793                   return_address);
1794             }
1795             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1796           }
1797 #endif
1798         } else {
1799           argv = args;
1800           for (i = argc - 1; i >= 0; --i)
1801             *argv++ = va_arg(kmp_va_deref(ap), void *);
1802           KMP_MB();
1803 
1804 #if OMPT_SUPPORT
1805           void *dummy;
1806           void **exit_frame_p;
1807           ompt_task_info_t *task_info;
1808 
1809           ompt_lw_taskteam_t lw_taskteam;
1810 
1811           if (ompt_enabled.enabled) {
1812             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1813                                     &ompt_parallel_data, return_address);
1814             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1815             // don't use lw_taskteam after linking. content was swapped
1816             task_info = OMPT_CUR_TASK_INFO(master_th);
1817             exit_frame_p = &(task_info->frame.exit_frame.ptr);
1818 
1819             /* OMPT implicit task begin */
1820             implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1821             if (ompt_enabled.ompt_callback_implicit_task) {
1822               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1823                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1824                   implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1825                   ompt_task_implicit);
1826               OMPT_CUR_TASK_INFO(master_th)->thread_num =
1827                   __kmp_tid_from_gtid(gtid);
1828             }
1829 
1830             /* OMPT state */
1831             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1832           } else {
1833             exit_frame_p = &dummy;
1834           }
1835 #endif
1836 
1837           {
1838             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1839             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1840             __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1841 #if OMPT_SUPPORT
1842                                    ,
1843                                    exit_frame_p
1844 #endif
1845             );
1846           }
1847 
1848 #if OMPT_SUPPORT
1849           if (ompt_enabled.enabled) {
1850             *exit_frame_p = NULL;
1851             if (ompt_enabled.ompt_callback_implicit_task) {
1852               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1853                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1854                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1855                   ompt_task_implicit);
1856             }
1857 
1858             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1859             __ompt_lw_taskteam_unlink(master_th);
1860             if (ompt_enabled.ompt_callback_parallel_end) {
1861               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1862                   &ompt_parallel_data, parent_task_data,
1863                   OMPT_INVOKER(call_context) | ompt_parallel_team,
1864                   return_address);
1865             }
1866             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1867           }
1868 #endif
1869         }
1870       } else if (call_context == fork_context_gnu) {
1871 #if OMPT_SUPPORT
1872         ompt_lw_taskteam_t lwt;
1873         __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1874                                 return_address);
1875 
1876         lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1877         __ompt_lw_taskteam_link(&lwt, master_th, 1);
1878 // don't use lw_taskteam after linking. content was swapped
1879 #endif
1880 
1881         // we were called from GNU native code
1882         KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1883         return FALSE;
1884       } else {
1885         KMP_ASSERT2(call_context < fork_context_last,
1886                     "__kmp_fork_call: unknown fork_context parameter");
1887       }
1888 
1889       KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1890       KMP_MB();
1891       return FALSE;
1892     } // if (nthreads == 1)
1893 
1894     // GEH: only modify the executing flag in the case when not serialized;
1895     //      the serialized case is handled in kmpc_serialized_parallel
1896     KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1897                   "curtask=%p, curtask_max_aclevel=%d\n",
1898                   parent_team->t.t_active_level, master_th,
1899                   master_th->th.th_current_task,
1900                   master_th->th.th_current_task->td_icvs.max_active_levels));
1901     // TODO: GEH - cannot do this assertion because root thread not set up as
1902     // executing
1903     // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1904     master_th->th.th_current_task->td_flags.executing = 0;
1905 
1906     if (!master_th->th.th_teams_microtask || level > teams_level) {
1907       /* Increment our nested depth level */
1908       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1909     }
1910 
1911     // See if we need to make a copy of the ICVs.
1912     int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1913     if ((level + 1 < __kmp_nested_nth.used) &&
1914         (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1915       nthreads_icv = __kmp_nested_nth.nth[level + 1];
1916     } else {
1917       nthreads_icv = 0; // don't update
1918     }
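    // Illustrative example (an assumption about the usual configuration): with
    // OMP_NUM_THREADS=8,2 the nested list has two entries, so a region forked
    // at level 0 propagates nthreads_icv = 2 into its children's ICVs, while a
    // value equal to the current nproc ICV leaves nthreads_icv at 0 ("don't
    // update").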
1919 
1920     // Figure out the proc_bind_policy for the new team.
1921     kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1922     kmp_proc_bind_t proc_bind_icv =
1923         proc_bind_default; // proc_bind_default means don't update
1924     if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1925       proc_bind = proc_bind_false;
1926     } else {
1927       if (proc_bind == proc_bind_default) {
1928         // No proc_bind clause specified; use current proc-bind-var for this
1929         // parallel region
1930         proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1931       }
1932       /* else: The proc_bind policy was specified explicitly on parallel clause.
1933          This overrides proc-bind-var for this parallel region, but does not
1934          change proc-bind-var. */
1935       // Figure the value of proc-bind-var for the child threads.
1936       if ((level + 1 < __kmp_nested_proc_bind.used) &&
1937           (__kmp_nested_proc_bind.bind_types[level + 1] !=
1938            master_th->th.th_current_task->td_icvs.proc_bind)) {
1939         proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1940       }
1941     }
1942 
1943     // Reset for next parallel region
1944     master_th->th.th_set_proc_bind = proc_bind_default;
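    // Illustrative example (an assumption about the usual configuration): with
    // OMP_PROC_BIND=spread,close a top-level parallel without a proc_bind
    // clause binds with "spread", and proc_bind_icv is set to "close" so the
    // children's proc-bind-var is updated for the next nesting level.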
1945 
1946     if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
1947       kmp_internal_control_t new_icvs;
1948       copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
1949       new_icvs.next = NULL;
1950       if (nthreads_icv > 0) {
1951         new_icvs.nproc = nthreads_icv;
1952       }
1953       if (proc_bind_icv != proc_bind_default) {
1954         new_icvs.proc_bind = proc_bind_icv;
1955       }
1956 
1957       /* allocate a new parallel team */
1958       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
1959       team = __kmp_allocate_team(root, nthreads, nthreads,
1960 #if OMPT_SUPPORT
1961                                  ompt_parallel_data,
1962 #endif
1963                                  proc_bind, &new_icvs,
1964                                  argc USE_NESTED_HOT_ARG(master_th));
1965     } else {
1966       /* allocate a new parallel team */
1967       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
1968       team = __kmp_allocate_team(root, nthreads, nthreads,
1969 #if OMPT_SUPPORT
1970                                  ompt_parallel_data,
1971 #endif
1972                                  proc_bind,
1973                                  &master_th->th.th_current_task->td_icvs,
1974                                  argc USE_NESTED_HOT_ARG(master_th));
1975     }
1976     KF_TRACE(
1977         10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
1978 
1979     /* setup the new team */
1980     KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
1981     KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
1982     KMP_CHECK_UPDATE(team->t.t_ident, loc);
1983     KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
1984     KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
1985 #if OMPT_SUPPORT
1986     KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
1987                           return_address);
1988 #endif
1989     KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
1990     // TODO: parent_team->t.t_level == INT_MAX ???
1991     if (!master_th->th.th_teams_microtask || level > teams_level) {
1992       int new_level = parent_team->t.t_level + 1;
1993       KMP_CHECK_UPDATE(team->t.t_level, new_level);
1994       new_level = parent_team->t.t_active_level + 1;
1995       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
1996     } else {
1997       // AC: Do not increase parallel level at start of the teams construct
1998       int new_level = parent_team->t.t_level;
1999       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2000       new_level = parent_team->t.t_active_level;
2001       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2002     }
2003     kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2004     // set master's schedule as new run-time schedule
2005     KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2006 
2007     KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2008     KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2009 
2010     // Update the floating point rounding in the team if required.
2011     propagateFPControl(team);
2012 
2013     if (__kmp_tasking_mode != tskm_immediate_exec) {
2014       // Set master's task team to the team's task team. Unless this is a hot
2015       // team, it should be NULL.
2016       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2017                        parent_team->t.t_task_team[master_th->th.th_task_state]);
2018       KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
2019                     "%p, new task_team %p / team %p\n",
2020                     __kmp_gtid_from_thread(master_th),
2021                     master_th->th.th_task_team, parent_team,
2022                     team->t.t_task_team[master_th->th.th_task_state], team));
2023 
2024       if (active_level || master_th->th.th_task_team) {
2025         // Take a memo of master's task_state
2026         KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2027         if (master_th->th.th_task_state_top >=
2028             master_th->th.th_task_state_stack_sz) { // increase size
2029           kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2030           kmp_uint8 *old_stack, *new_stack;
2031           kmp_uint32 i;
2032           new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2033           for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2034             new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2035           }
2036           for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2037                ++i) { // zero-init rest of stack
2038             new_stack[i] = 0;
2039           }
2040           old_stack = master_th->th.th_task_state_memo_stack;
2041           master_th->th.th_task_state_memo_stack = new_stack;
2042           master_th->th.th_task_state_stack_sz = new_size;
2043           __kmp_free(old_stack);
2044         }
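        // (The memo stack doubles in size when full, so repeated pushes from
        // deep nesting stay amortized O(1).)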
2045         // Store master's task_state on stack
2046         master_th->th
2047             .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2048             master_th->th.th_task_state;
2049         master_th->th.th_task_state_top++;
2050 #if KMP_NESTED_HOT_TEAMS
2051         if (master_th->th.th_hot_teams &&
2052             active_level < __kmp_hot_teams_max_level &&
2053             team == master_th->th.th_hot_teams[active_level].hot_team) {
2054           // Restore master's nested state if nested hot team
2055           master_th->th.th_task_state =
2056               master_th->th
2057                   .th_task_state_memo_stack[master_th->th.th_task_state_top];
2058         } else {
2059 #endif
2060           master_th->th.th_task_state = 0;
2061 #if KMP_NESTED_HOT_TEAMS
2062         }
2063 #endif
2064       }
2065 #if !KMP_NESTED_HOT_TEAMS
2066       KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2067                        (team == root->r.r_hot_team));
2068 #endif
2069     }
2070 
2071     KA_TRACE(
2072         20,
2073         ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2074          gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2075          team->t.t_nproc));
2076     KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2077                      (team->t.t_master_tid == 0 &&
2078                       (team->t.t_parent == root->r.r_root_team ||
2079                        team->t.t_parent->t.t_serialized)));
2080     KMP_MB();
2081 
2082     /* now, setup the arguments */
2083     argv = (void **)team->t.t_argv;
2084     if (ap) {
2085       for (i = argc - 1; i >= 0; --i) {
2086         void *new_argv = va_arg(kmp_va_deref(ap), void *);
2087         KMP_CHECK_UPDATE(*argv, new_argv);
2088         argv++;
2089       }
2090     } else {
2091       for (i = 0; i < argc; ++i) {
2092         // Get args from parent team for teams construct
2093         KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2094       }
2095     }
2096 
2097     /* now actually fork the threads */
2098     KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2099     if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2100       root->r.r_active = TRUE;
2101 
2102     __kmp_fork_team_threads(root, team, master_th, gtid);
2103     __kmp_setup_icv_copy(team, nthreads,
2104                          &master_th->th.th_current_task->td_icvs, loc);
2105 
2106 #if OMPT_SUPPORT
2107     master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2108 #endif
2109 
2110     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2111 
2112 #if USE_ITT_BUILD
2113     if (team->t.t_active_level == 1 // only report frames at level 1
2114         && !master_th->th.th_teams_microtask) { // not in teams construct
2115 #if USE_ITT_NOTIFY
2116       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2117           (__kmp_forkjoin_frames_mode == 3 ||
2118            __kmp_forkjoin_frames_mode == 1)) {
2119         kmp_uint64 tmp_time = 0;
2120         if (__itt_get_timestamp_ptr)
2121           tmp_time = __itt_get_timestamp();
2122         // Internal fork - report frame begin
2123         master_th->th.th_frame_time = tmp_time;
2124         if (__kmp_forkjoin_frames_mode == 3)
2125           team->t.t_region_time = tmp_time;
2126       } else
2127 // only one notification scheme (either "submit" or "forking/joined", not both)
2128 #endif /* USE_ITT_NOTIFY */
2129           if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2130               __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2131         // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2132         __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2133       }
2134     }
2135 #endif /* USE_ITT_BUILD */
2136 
2137     /* now go on and do the work */
2138     KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2139     KMP_MB();
2140     KF_TRACE(10,
2141              ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2142               root, team, master_th, gtid));
2143 
2144 #if USE_ITT_BUILD
2145     if (__itt_stack_caller_create_ptr) {
2146       team->t.t_stack_id =
2147           __kmp_itt_stack_caller_create(); // create new stack stitching id
2148       // before entering fork barrier
2149     }
2150 #endif /* USE_ITT_BUILD */
2151 
2152     // AC: skip __kmp_internal_fork at the teams construct; let only the master
2153     // threads execute
2154     if (ap) {
2155       __kmp_internal_fork(loc, gtid, team);
2156       KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2157                     "master_th=%p, gtid=%d\n",
2158                     root, team, master_th, gtid));
2159     }
2160 
2161     if (call_context == fork_context_gnu) {
2162       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2163       return TRUE;
2164     }
2165 
2166     /* Invoke microtask for MASTER thread */
2167     KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2168                   team->t.t_id, team->t.t_pkfn));
2169   } // END of timer KMP_fork_call block
2170 
2171 #if KMP_STATS_ENABLED
2172   // If beginning a teams construct, then change thread state
2173   stats_state_e previous_state = KMP_GET_THREAD_STATE();
2174   if (!ap) {
2175     KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2176   }
2177 #endif
2178 
2179   if (!team->t.t_invoke(gtid)) {
2180     KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
2181   }
2182 
2183 #if KMP_STATS_ENABLED
2184   // If was beginning of a teams construct, then reset thread state
2185   if (!ap) {
2186     KMP_SET_THREAD_STATE(previous_state);
2187   }
2188 #endif
2189 
2190   KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2191                 team->t.t_id, team->t.t_pkfn));
2192   KMP_MB(); /* Flush all pending memory write invalidates.  */
2193 
2194   KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2195 
2196 #if OMPT_SUPPORT
2197   if (ompt_enabled.enabled) {
2198     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2199   }
2200 #endif
2201 
2202   return TRUE;
2203 }
2204 
2205 #if OMPT_SUPPORT
2206 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2207                                             kmp_team_t *team) {
2208   // restore state outside the region
2209   thread->th.ompt_thread_info.state =
2210       ((team->t.t_serialized) ? ompt_state_work_serial
2211                               : ompt_state_work_parallel);
2212 }
2213 
2214 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2215                                    kmp_team_t *team, ompt_data_t *parallel_data,
2216                                    int flags, void *codeptr) {
2217   ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2218   if (ompt_enabled.ompt_callback_parallel_end) {
2219     ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2220         parallel_data, &(task_info->task_data), flags, codeptr);
2221   }
2222 
2223   task_info->frame.enter_frame = ompt_data_none;
2224   __kmp_join_restore_state(thread, team);
2225 }
2226 #endif
2227 
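/* The counterpart of __kmp_fork_call: runs the join barrier (unless we are
   exiting a teams construct), emits the OMPT implicit-task-end and
   parallel-end events when enabled, frees or shrinks the team, and restores
   the master thread's view of the parent team. */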
2228 void __kmp_join_call(ident_t *loc, int gtid
2229 #if OMPT_SUPPORT
2230                      ,
2231                      enum fork_context_e fork_context
2232 #endif
2233                      ,
2234                      int exit_teams) {
2235   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2236   kmp_team_t *team;
2237   kmp_team_t *parent_team;
2238   kmp_info_t *master_th;
2239   kmp_root_t *root;
2240   int master_active;
2241 
2242   KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2243 
2244   /* setup current data */
2245   master_th = __kmp_threads[gtid];
2246   root = master_th->th.th_root;
2247   team = master_th->th.th_team;
2248   parent_team = team->t.t_parent;
2249 
2250   master_th->th.th_ident = loc;
2251 
2252 #if OMPT_SUPPORT
2253   void *team_microtask = (void *)team->t.t_pkfn;
2254   // For the GOMP interface with a serialized parallel, we need
2255   // __kmpc_end_serialized_parallel to call the hooks for the OMPT
2256   // end-implicit-task and end-parallel events.
2257   if (ompt_enabled.enabled &&
2258       !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2259     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2260   }
2261 #endif
2262 
2263 #if KMP_DEBUG
2264   if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2265     KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2266                   "th_task_team = %p\n",
2267                   __kmp_gtid_from_thread(master_th), team,
2268                   team->t.t_task_team[master_th->th.th_task_state],
2269                   master_th->th.th_task_team));
2270     KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2271                      team->t.t_task_team[master_th->th.th_task_state]);
2272   }
2273 #endif
2274 
2275   if (team->t.t_serialized) {
2276     if (master_th->th.th_teams_microtask) {
2277       // We are in teams construct
2278       int level = team->t.t_level;
2279       int tlevel = master_th->th.th_teams_level;
2280       if (level == tlevel) {
2281         // AC: we haven't incremented it earlier at start of teams construct,
2282         //     so do it here - at the end of teams construct
2283         team->t.t_level++;
2284       } else if (level == tlevel + 1) {
2285         // AC: we are exiting parallel inside teams, need to increment
2286         // serialization in order to restore it in the next call to
2287         // __kmpc_end_serialized_parallel
2288         team->t.t_serialized++;
2289       }
2290     }
2291     __kmpc_end_serialized_parallel(loc, gtid);
2292 
2293 #if OMPT_SUPPORT
2294     if (ompt_enabled.enabled) {
2295       __kmp_join_restore_state(master_th, parent_team);
2296     }
2297 #endif
2298 
2299     return;
2300   }
2301 
2302   master_active = team->t.t_master_active;
2303 
2304   if (!exit_teams) {
2305     // AC: No barrier for internal teams at exit from the teams construct.
2306     //     But there is a barrier for the external team (league).
2307     __kmp_internal_join(loc, gtid, team);
2308   } else {
2309     master_th->th.th_task_state =
2310         0; // AC: no tasking in teams (out of any parallel)
2311   }
2312 
2313   KMP_MB();
2314 
2315 #if OMPT_SUPPORT
2316   ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2317   void *codeptr = team->t.ompt_team_info.master_return_address;
2318 #endif
2319 
2320 #if USE_ITT_BUILD
2321   if (__itt_stack_caller_create_ptr) {
2322     // destroy the stack stitching id after join barrier
2323     __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2324   }
2325   // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2326   if (team->t.t_active_level == 1 &&
2327       (!master_th->th.th_teams_microtask || /* not in teams construct */
2328        master_th->th.th_teams_size.nteams == 1)) {
2329     master_th->th.th_ident = loc;
2330     // only one notification scheme (either "submit" or "forking/joined", not
2331     // both)
2332     if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2333         __kmp_forkjoin_frames_mode == 3)
2334       __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2335                              master_th->th.th_frame_time, 0, loc,
2336                              master_th->th.th_team_nproc, 1);
2337     else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2338              !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2339       __kmp_itt_region_joined(gtid);
2340   } // active_level == 1
2341 #endif /* USE_ITT_BUILD */
2342 
2343   if (master_th->th.th_teams_microtask && !exit_teams &&
2344       team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2345       team->t.t_level == master_th->th.th_teams_level + 1) {
2346 // AC: We need to leave the team structure intact at the end of a parallel
2347 // inside the teams construct, so that the same (hot) team is reused by the
2348 // next parallel; only adjust the nesting levels
2349 #if OMPT_SUPPORT
2350     ompt_data_t ompt_parallel_data = ompt_data_none;
2351     if (ompt_enabled.enabled) {
2352       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2353       if (ompt_enabled.ompt_callback_implicit_task) {
2354         int ompt_team_size = team->t.t_nproc;
2355         ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2356             ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2357             OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2358       }
2359       task_info->frame.exit_frame = ompt_data_none;
2360       task_info->task_data = ompt_data_none;
2361       ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2362       __ompt_lw_taskteam_unlink(master_th);
2363     }
2364 #endif
2365     /* Decrement our nested depth level */
2366     team->t.t_level--;
2367     team->t.t_active_level--;
2368     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2369 
2370     // Restore number of threads in the team if needed. This code relies on
2371     // the proper adjustment of th_teams_size.nth after the fork in
2372     // __kmp_teams_master on each teams master in the case that
2373     // __kmp_reserve_threads reduced it.
2374     if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2375       int old_num = master_th->th.th_team_nproc;
2376       int new_num = master_th->th.th_teams_size.nth;
2377       kmp_info_t **other_threads = team->t.t_threads;
2378       team->t.t_nproc = new_num;
2379       for (int i = 0; i < old_num; ++i) {
2380         other_threads[i]->th.th_team_nproc = new_num;
2381       }
2382       // Adjust the state of the unused threads of the team
2383       for (int i = old_num; i < new_num; ++i) {
2384         // Re-initialize thread's barrier data.
2385         KMP_DEBUG_ASSERT(other_threads[i]);
2386         kmp_balign_t *balign = other_threads[i]->th.th_bar;
2387         for (int b = 0; b < bs_last_barrier; ++b) {
2388           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2389           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2390 #if USE_DEBUGGER
2391           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2392 #endif
2393         }
2394         if (__kmp_tasking_mode != tskm_immediate_exec) {
2395           // Synchronize thread's task state
2396           other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2397         }
2398       }
2399     }
2400 
2401 #if OMPT_SUPPORT
2402     if (ompt_enabled.enabled) {
2403       __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2404                       OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2405     }
2406 #endif
2407 
2408     return;
2409   }
2410 
2411   /* do cleanup and restore the parent team */
2412   master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2413   master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2414 
2415   master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2416 
2417   /* jc: The following lock has instructions with REL and ACQ semantics,
2418      separating the parallel user code called in this parallel region
2419      from the serial user code called after this function returns. */
2420   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2421 
2422   if (!master_th->th.th_teams_microtask ||
2423       team->t.t_level > master_th->th.th_teams_level) {
2424     /* Decrement our nested depth level */
2425     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2426   }
2427   KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2428 
2429 #if OMPT_SUPPORT
2430   if (ompt_enabled.enabled) {
2431     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2432     if (ompt_enabled.ompt_callback_implicit_task) {
2433       int flags = (team_microtask == (void *)__kmp_teams_master)
2434                       ? ompt_task_initial
2435                       : ompt_task_implicit;
2436       int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2437       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2438           ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2439           OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2440     }
2441     task_info->frame.exit_frame = ompt_data_none;
2442     task_info->task_data = ompt_data_none;
2443   }
2444 #endif
2445 
2446   KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2447                 master_th, team));
2448   __kmp_pop_current_task_from_thread(master_th);
2449 
2450 #if KMP_AFFINITY_SUPPORTED
2451   // Restore master thread's partition.
2452   master_th->th.th_first_place = team->t.t_first_place;
2453   master_th->th.th_last_place = team->t.t_last_place;
2454 #endif // KMP_AFFINITY_SUPPORTED
2455   master_th->th.th_def_allocator = team->t.t_def_allocator;
2456 
2457   updateHWFPControl(team);
2458 
2459   if (root->r.r_active != master_active)
2460     root->r.r_active = master_active;
2461 
2462   __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2463                             master_th)); // this will free worker threads
2464 
2465   /* This race was fun to find. Make sure the following stays inside the critical
2466      region; otherwise assertions may fail occasionally, since the old team may be
2467      reallocated and the hierarchy appears inconsistent. It is actually safe to
2468      run and won't cause any bugs, but it will trigger those assertion failures.
2469      It's only one deref & assign, so it might as well stay in the critical region. */
2470   master_th->th.th_team = parent_team;
2471   master_th->th.th_team_nproc = parent_team->t.t_nproc;
2472   master_th->th.th_team_master = parent_team->t.t_threads[0];
2473   master_th->th.th_team_serialized = parent_team->t.t_serialized;
2474 
2475   /* restore serialized team, if need be */
2476   if (parent_team->t.t_serialized &&
2477       parent_team != master_th->th.th_serial_team &&
2478       parent_team != root->r.r_root_team) {
2479     __kmp_free_team(root,
2480                     master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2481     master_th->th.th_serial_team = parent_team;
2482   }
2483 
2484   if (__kmp_tasking_mode != tskm_immediate_exec) {
2485     if (master_th->th.th_task_state_top >
2486         0) { // Restore task state from memo stack
2487       KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2488       // Remember master's state if we re-use this nested hot team
2489       master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2490           master_th->th.th_task_state;
2491       --master_th->th.th_task_state_top; // pop
2492       // Now restore state at this level
2493       master_th->th.th_task_state =
2494           master_th->th
2495               .th_task_state_memo_stack[master_th->th.th_task_state_top];
2496     }
2497     // Copy the task team from the parent team to the master thread
2498     master_th->th.th_task_team =
2499         parent_team->t.t_task_team[master_th->th.th_task_state];
2500     KA_TRACE(20,
2501              ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2502               __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2503               parent_team));
2504   }
2505 
2506   // TODO: GEH - cannot do this assertion because root thread not set up as
2507   // executing
2508   // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2509   master_th->th.th_current_task->td_flags.executing = 1;
2510 
2511   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2512 
2513 #if OMPT_SUPPORT
2514   int flags =
2515       OMPT_INVOKER(fork_context) |
2516       ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2517                                                       : ompt_parallel_team);
2518   if (ompt_enabled.enabled) {
2519     __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2520                     codeptr);
2521   }
2522 #endif
2523 
2524   KMP_MB();
2525   KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2526 }
2527 
2528 /* Check whether we should push an internal control record onto the
2529    serial team stack.  If so, do it.  */
2530 void __kmp_save_internal_controls(kmp_info_t *thread) {
2531 
2532   if (thread->th.th_team != thread->th.th_serial_team) {
2533     return;
2534   }
2535   if (thread->th.th_team->t.t_serialized > 1) {
2536     int push = 0;
2537 
2538     if (thread->th.th_team->t.t_control_stack_top == NULL) {
2539       push = 1;
2540     } else {
2541       if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2542           thread->th.th_team->t.t_serialized) {
2543         push = 1;
2544       }
2545     }
2546     if (push) { /* push a record on the serial team's stack */
2547       kmp_internal_control_t *control =
2548           (kmp_internal_control_t *)__kmp_allocate(
2549               sizeof(kmp_internal_control_t));
2550 
2551       copy_icvs(control, &thread->th.th_current_task->td_icvs);
2552 
2553       control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2554 
2555       control->next = thread->th.th_team->t.t_control_stack_top;
2556       thread->th.th_team->t.t_control_stack_top = control;
2557     }
2558   }
2559 }
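/* Illustrative example (an assumption about typical usage): inside a nested,
   serialized parallel region a call such as

       omp_set_num_threads(3);

   first reaches __kmp_save_internal_controls() via the internal setter below,
   which pushes the current ICVs onto the serial team's control stack (one
   record per serialization level) so the enclosing level's values can be
   restored when the serialized region ends. */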
2560 
2561 /* Changes set_nproc */
2562 void __kmp_set_num_threads(int new_nth, int gtid) {
2563   kmp_info_t *thread;
2564   kmp_root_t *root;
2565 
2566   KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2567   KMP_DEBUG_ASSERT(__kmp_init_serial);
2568 
2569   if (new_nth < 1)
2570     new_nth = 1;
2571   else if (new_nth > __kmp_max_nth)
2572     new_nth = __kmp_max_nth;
2573 
2574   KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2575   thread = __kmp_threads[gtid];
2576   if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2577     return; // nothing to do
2578 
2579   __kmp_save_internal_controls(thread);
2580 
2581   set__nproc(thread, new_nth);
2582 
2583   // If this omp_set_num_threads() call will cause the hot team size to be
2584   // reduced (in the absence of a num_threads clause), then reduce it now,
2585   // rather than waiting for the next parallel region.
2586   root = thread->th.th_root;
2587   if (__kmp_init_parallel && (!root->r.r_active) &&
2588       (root->r.r_hot_team->t.t_nproc > new_nth)
2589 #if KMP_NESTED_HOT_TEAMS
2590       && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2591 #endif
2592   ) {
2593     kmp_team_t *hot_team = root->r.r_hot_team;
2594     int f;
2595 
2596     __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2597 
2598     // Release the extra threads we don't need any more.
2599     for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2600       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2601       if (__kmp_tasking_mode != tskm_immediate_exec) {
2602         // When decreasing the team size, threads no longer in the team should
2603         // unreference the task team.
2604         hot_team->t.t_threads[f]->th.th_task_team = NULL;
2605       }
2606       __kmp_free_thread(hot_team->t.t_threads[f]);
2607       hot_team->t.t_threads[f] = NULL;
2608     }
2609     hot_team->t.t_nproc = new_nth;
2610 #if KMP_NESTED_HOT_TEAMS
2611     if (thread->th.th_hot_teams) {
2612       KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2613       thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2614     }
2615 #endif
2616 
2617     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2618 
2619     // Update the t_nproc field in the threads that are still active.
2620     for (f = 0; f < new_nth; f++) {
2621       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2622       hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2623     }
2624     // Special flag to indicate an omp_set_num_threads() call
2625     hot_team->t.t_size_changed = -1;
2626   }
2627 }
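// Note: __kmp_set_num_threads only changes the nproc ICV of the calling
// thread's current data environment (plus the eager hot-team shrink above);
// it is presumably the internal worker behind the omp_set_num_threads() API
// entry point (an assumption about the API mapping, not verified here).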
2628 
2629 /* Changes max_active_levels */
2630 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2631   kmp_info_t *thread;
2632 
2633   KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2634                 "%d = (%d)\n",
2635                 gtid, max_active_levels));
2636   KMP_DEBUG_ASSERT(__kmp_init_serial);
2637 
2638   // validate max_active_levels
2639   if (max_active_levels < 0) {
2640     KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2641     // We ignore this call if the user has specified a negative value.
2642     // The current setting won't be changed. The last valid setting will be
2643     // used. A warning will be issued (if warnings are allowed as controlled by
2644     // the KMP_WARNINGS env var).
2645     KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2646                   "max_active_levels for thread %d = (%d)\n",
2647                   gtid, max_active_levels));
2648     return;
2649   }
2650   if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2651     // It's OK: max_active_levels is within the valid range
2652     // [0; KMP_MAX_ACTIVE_LEVELS_LIMIT].
2653     // We allow a zero value (implementation-defined behavior).
2654   } else {
2655     KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2656                 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2657     max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2658     // Current upper limit is MAX_INT. (implementation defined behavior)
2659     // If the input exceeds the upper limit, we correct the input to be the
2660     // upper limit. (implementation defined behavior)
2661     // Actually, the flow should never get here as long as the limit is MAX_INT.
2662   }
2663   KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2664                 "max_active_levels for thread %d = (%d)\n",
2665                 gtid, max_active_levels));
2666 
2667   thread = __kmp_threads[gtid];
2668 
2669   __kmp_save_internal_controls(thread);
2670 
2671   set__max_active_levels(thread, max_active_levels);
2672 }
2673 
2674 /* Gets max_active_levels */
2675 int __kmp_get_max_active_levels(int gtid) {
2676   kmp_info_t *thread;
2677 
2678   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2679   KMP_DEBUG_ASSERT(__kmp_init_serial);
2680 
2681   thread = __kmp_threads[gtid];
2682   KMP_DEBUG_ASSERT(thread->th.th_current_task);
2683   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2684                 "curtask_maxaclevel=%d\n",
2685                 gtid, thread->th.th_current_task,
2686                 thread->th.th_current_task->td_icvs.max_active_levels));
2687   return thread->th.th_current_task->td_icvs.max_active_levels;
2688 }
2689 
2690 // nteams-var per-device ICV
2691 void __kmp_set_num_teams(int num_teams) {
2692   if (num_teams > 0)
2693     __kmp_nteams = num_teams;
2694 }
2695 int __kmp_get_max_teams(void) { return __kmp_nteams; }
2696 // teams-thread-limit-var per-device ICV
2697 void __kmp_set_teams_thread_limit(int limit) {
2698   if (limit > 0)
2699     __kmp_teams_thread_limit = limit;
2700 }
2701 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
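// Illustrative sketch (not part of this file): these per-device ICVs back the
// OpenMP 5.1 teams routines (assuming a 5.1-capable compiler and runtime).
// Non-positive arguments leave the current values untouched, matching the
// guards above.
//
//   #include <omp.h>
//   omp_set_num_teams(4);          // __kmp_nteams = 4
//   omp_set_teams_thread_limit(8); // __kmp_teams_thread_limit = 8
//   omp_set_num_teams(0);          // ignored; __kmp_nteams stays 4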
2702 
2703 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2704 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2705 
2706 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2707 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2708   kmp_info_t *thread;
2709   kmp_sched_t orig_kind;
2710   //    kmp_team_t *team;
2711 
2712   KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2713                 gtid, (int)kind, chunk));
2714   KMP_DEBUG_ASSERT(__kmp_init_serial);
2715 
2716   // Check if the kind parameter is valid, correct if needed.
2717   // Valid parameters should fit in one of two intervals - standard or extended:
2718   //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2719   // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
2720   orig_kind = kind;
2721   kind = __kmp_sched_without_mods(kind);
2722 
2723   if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2724       (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2725     // TODO: Hint needs attention in case we change the default schedule.
2726     __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2727               KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2728               __kmp_msg_null);
2729     kind = kmp_sched_default;
2730     chunk = 0; // ignore chunk value in case of bad kind
2731   }
2732 
2733   thread = __kmp_threads[gtid];
2734 
2735   __kmp_save_internal_controls(thread);
2736 
2737   if (kind < kmp_sched_upper_std) {
2738     if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
      // differentiate static chunked vs. unchunked: an invalid chunk indicates
      // the unchunked schedule (which is the default)
2741       thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2742     } else {
2743       thread->th.th_current_task->td_icvs.sched.r_sched_type =
2744           __kmp_sch_map[kind - kmp_sched_lower - 1];
2745     }
2746   } else {
2747     //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2748     //    kmp_sched_lower - 2 ];
2749     thread->th.th_current_task->td_icvs.sched.r_sched_type =
2750         __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2751                       kmp_sched_lower - 2];
2752   }
2753   __kmp_sched_apply_mods_intkind(
2754       orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2755   if (kind == kmp_sched_auto || chunk < 1) {
2756     // ignore parameter chunk for schedule auto
2757     thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2758   } else {
2759     thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2760   }
2761 }
2762 
2763 /* Gets def_sched_var ICV values */
2764 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2765   kmp_info_t *thread;
2766   enum sched_type th_type;
2767 
2768   KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2769   KMP_DEBUG_ASSERT(__kmp_init_serial);
2770 
2771   thread = __kmp_threads[gtid];
2772 
2773   th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2774   switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2775   case kmp_sch_static:
2776   case kmp_sch_static_greedy:
2777   case kmp_sch_static_balanced:
2778     *kind = kmp_sched_static;
2779     __kmp_sched_apply_mods_stdkind(kind, th_type);
    *chunk = 0; // chunk was not set, indicate this fact via a zero value
2781     return;
2782   case kmp_sch_static_chunked:
2783     *kind = kmp_sched_static;
2784     break;
2785   case kmp_sch_dynamic_chunked:
2786     *kind = kmp_sched_dynamic;
2787     break;
2788   case kmp_sch_guided_chunked:
2789   case kmp_sch_guided_iterative_chunked:
2790   case kmp_sch_guided_analytical_chunked:
2791     *kind = kmp_sched_guided;
2792     break;
2793   case kmp_sch_auto:
2794     *kind = kmp_sched_auto;
2795     break;
2796   case kmp_sch_trapezoidal:
2797     *kind = kmp_sched_trapezoidal;
2798     break;
2799 #if KMP_STATIC_STEAL_ENABLED
2800   case kmp_sch_static_steal:
2801     *kind = kmp_sched_static_steal;
2802     break;
2803 #endif
2804   default:
2805     KMP_FATAL(UnknownSchedulingType, th_type);
2806   }
2807 
2808   __kmp_sched_apply_mods_stdkind(kind, th_type);
2809   *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2810 }
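// Illustrative sketch (not part of this file): the standard API pair that maps
// onto __kmp_set_schedule/__kmp_get_schedule, assuming the usual omp_sched_t
// kinds. A chunk of 0 reported for static means "unchunked", matching the
// special case above.
//
//   #include <omp.h>
//   omp_sched_t kind;
//   int chunk;
//   omp_set_schedule(omp_sched_dynamic, 16);
//   omp_get_schedule(&kind, &chunk); // kind == omp_sched_dynamic, chunk == 16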
2811 
2812 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2813 
2814   int ii, dd;
2815   kmp_team_t *team;
2816   kmp_info_t *thr;
2817 
2818   KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2819   KMP_DEBUG_ASSERT(__kmp_init_serial);
2820 
2821   // validate level
2822   if (level == 0)
2823     return 0;
2824   if (level < 0)
2825     return -1;
2826   thr = __kmp_threads[gtid];
2827   team = thr->th.th_team;
2828   ii = team->t.t_level;
2829   if (level > ii)
2830     return -1;
2831 
2832   if (thr->th.th_teams_microtask) {
    // AC: we are in a teams region where multiple nested teams have the same
    // level
    int tlevel = thr->th.th_teams_level; // the level of the teams construct
    if (level <=
        tlevel) { // otherwise the usual algorithm works (does not touch teams)
      KMP_DEBUG_ASSERT(ii >= tlevel);
      // AC: Since we have to walk past the teams league, artificially
      // increase ii
      if (ii == tlevel) {
        ii += 2; // three teams share the same level
      } else {
        ii++; // two teams share the same level
2844       }
2845     }
2846   }
2847 
2848   if (ii == level)
2849     return __kmp_tid_from_gtid(gtid);
2850 
2851   dd = team->t.t_serialized;
2852   level++;
2853   while (ii > level) {
2854     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2855     }
2856     if ((team->t.t_serialized) && (!dd)) {
2857       team = team->t.t_parent;
2858       continue;
2859     }
2860     if (ii > level) {
2861       team = team->t.t_parent;
2862       dd = team->t.t_serialized;
2863       ii--;
2864     }
2865   }
2866 
2867   return (dd > 1) ? (0) : (team->t.t_master_tid);
2868 }
2869 
2870 int __kmp_get_team_size(int gtid, int level) {
2871 
2872   int ii, dd;
2873   kmp_team_t *team;
2874   kmp_info_t *thr;
2875 
2876   KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2877   KMP_DEBUG_ASSERT(__kmp_init_serial);
2878 
2879   // validate level
2880   if (level == 0)
2881     return 1;
2882   if (level < 0)
2883     return -1;
2884   thr = __kmp_threads[gtid];
2885   team = thr->th.th_team;
2886   ii = team->t.t_level;
2887   if (level > ii)
2888     return -1;
2889 
2890   if (thr->th.th_teams_microtask) {
    // AC: we are in a teams region where multiple nested teams have the same
    // level
    int tlevel = thr->th.th_teams_level; // the level of the teams construct
    if (level <=
        tlevel) { // otherwise the usual algorithm works (does not touch teams)
      KMP_DEBUG_ASSERT(ii >= tlevel);
      // AC: Since we have to walk past the teams league, artificially
      // increase ii
      if (ii == tlevel) {
        ii += 2; // three teams share the same level
      } else {
        ii++; // two teams share the same level
2902       }
2903     }
2904   }
2905 
2906   while (ii > level) {
2907     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2908     }
2909     if (team->t.t_serialized && (!dd)) {
2910       team = team->t.t_parent;
2911       continue;
2912     }
2913     if (ii > level) {
2914       team = team->t.t_parent;
2915       ii--;
2916     }
2917   }
2918 
2919   return team->t.t_nproc;
2920 }
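// Illustrative sketch (not part of this file): user-level queries that resolve
// through the level walks above, assuming the standard OpenMP API and that
// nested parallelism is enabled (e.g. via omp_set_max_active_levels(2)).
//
//   #include <omp.h>
//   #pragma omp parallel num_threads(2)
//   #pragma omp parallel num_threads(3)
//   {
//     int outer_tid  = omp_get_ancestor_thread_num(1); // 0 or 1
//     int outer_size = omp_get_team_size(1);           // 2
//   }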
2921 
2922 kmp_r_sched_t __kmp_get_schedule_global() {
  // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
  // independently, so the up-to-date schedule can be obtained here.
2926 
2927   kmp_r_sched_t r_sched;
2928 
  // Create the schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
  // __kmp_guided. __kmp_sched should keep its original value, so that the user
  // can set KMP_SCHEDULE multiple times and thus have different run-time
  // schedules in different roots (even in OMP 2.5).
2933   enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
2934   enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
2935   if (s == kmp_sch_static) {
2936     // replace STATIC with more detailed schedule (balanced or greedy)
2937     r_sched.r_sched_type = __kmp_static;
2938   } else if (s == kmp_sch_guided_chunked) {
2939     // replace GUIDED with more detailed schedule (iterative or analytical)
2940     r_sched.r_sched_type = __kmp_guided;
2941   } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2942     r_sched.r_sched_type = __kmp_sched;
2943   }
2944   SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
2945 
2946   if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
    // __kmp_chunk may be wrong here (if it was never set)
2948     r_sched.chunk = KMP_DEFAULT_CHUNK;
2949   } else {
2950     r_sched.chunk = __kmp_chunk;
2951   }
2952 
2953   return r_sched;
2954 }
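// Illustrative sketch (not part of this file): the globals combined above are
// typically seeded from the environment before first use (assuming the usual
// OMP_SCHEDULE syntax):
//
//   OMP_SCHEDULE="guided,4" ./app  // runtime schedule = guided, chunk = 4
//   OMP_SCHEDULE="static"   ./app  // chunk unset -> KMP_DEFAULT_CHUNK is used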
2955 
/* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
2957    at least argc number of *t_argv entries for the requested team. */
2958 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
2959 
2960   KMP_DEBUG_ASSERT(team);
2961   if (!realloc || argc > team->t.t_max_argc) {
2962 
2963     KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
2964                    "current entries=%d\n",
2965                    team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
2966     /* if previously allocated heap space for args, free them */
2967     if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
2968       __kmp_free((void *)team->t.t_argv);
2969 
2970     if (argc <= KMP_INLINE_ARGV_ENTRIES) {
2971       /* use unused space in the cache line for arguments */
2972       team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
2973       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
2974                      "argv entries\n",
2975                      team->t.t_id, team->t.t_max_argc));
2976       team->t.t_argv = &team->t.t_inline_argv[0];
2977       if (__kmp_storage_map) {
2978         __kmp_print_storage_map_gtid(
2979             -1, &team->t.t_inline_argv[0],
2980             &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
2981             (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
2982             team->t.t_id);
2983       }
2984     } else {
2985       /* allocate space for arguments in the heap */
2986       team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
2987                                ? KMP_MIN_MALLOC_ARGV_ENTRIES
2988                                : 2 * argc;
2989       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
2990                      "argv entries\n",
2991                      team->t.t_id, team->t.t_max_argc));
2992       team->t.t_argv =
2993           (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
2994       if (__kmp_storage_map) {
2995         __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
2996                                      &team->t.t_argv[team->t.t_max_argc],
2997                                      sizeof(void *) * team->t.t_max_argc,
2998                                      "team_%d.t_argv", team->t.t_id);
2999       }
3000     }
3001   }
3002 }
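// Illustrative sketch (not part of this file) of the sizing rule above:
//
//   argc <= KMP_INLINE_ARGV_ENTRIES          -> use t_inline_argv (no heap)
//   argc <= KMP_MIN_MALLOC_ARGV_ENTRIES / 2  -> allocate KMP_MIN_MALLOC_ARGV_ENTRIES
//   otherwise                                -> allocate 2 * argc entries
//
// so a slowly growing argc across forks triggers only O(log argc) reallocations.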
3003 
3004 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3005   int i;
3006   int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3007   team->t.t_threads =
3008       (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3009   team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3010       sizeof(dispatch_shared_info_t) * num_disp_buff);
3011   team->t.t_dispatch =
3012       (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3013   team->t.t_implicit_task_taskdata =
3014       (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3015   team->t.t_max_nproc = max_nth;
3016 
3017   /* setup dispatch buffers */
3018   for (i = 0; i < num_disp_buff; ++i) {
3019     team->t.t_disp_buffer[i].buffer_index = i;
3020     team->t.t_disp_buffer[i].doacross_buf_idx = i;
3021   }
3022 }
3023 
3024 static void __kmp_free_team_arrays(kmp_team_t *team) {
3025   /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3026   int i;
3027   for (i = 0; i < team->t.t_max_nproc; ++i) {
3028     if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3029       __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3030       team->t.t_dispatch[i].th_disp_buffer = NULL;
3031     }
3032   }
3033 #if KMP_USE_HIER_SCHED
3034   __kmp_dispatch_free_hierarchies(team);
3035 #endif
3036   __kmp_free(team->t.t_threads);
3037   __kmp_free(team->t.t_disp_buffer);
3038   __kmp_free(team->t.t_dispatch);
3039   __kmp_free(team->t.t_implicit_task_taskdata);
3040   team->t.t_threads = NULL;
3041   team->t.t_disp_buffer = NULL;
3042   team->t.t_dispatch = NULL;
3043   team->t.t_implicit_task_taskdata = 0;
3044 }
3045 
3046 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3047   kmp_info_t **oldThreads = team->t.t_threads;
3048 
3049   __kmp_free(team->t.t_disp_buffer);
3050   __kmp_free(team->t.t_dispatch);
3051   __kmp_free(team->t.t_implicit_task_taskdata);
3052   __kmp_allocate_team_arrays(team, max_nth);
3053 
3054   KMP_MEMCPY(team->t.t_threads, oldThreads,
3055              team->t.t_nproc * sizeof(kmp_info_t *));
3056 
3057   __kmp_free(oldThreads);
3058 }
3059 
3060 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3061 
3062   kmp_r_sched_t r_sched =
3063       __kmp_get_schedule_global(); // get current state of scheduling globals
3064 
3065   KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3066 
3067   kmp_internal_control_t g_icvs = {
3068     0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3069     (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3070     // adjustment of threads (per thread)
3071     (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3072     // whether blocktime is explicitly set
3073     __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3074 #if KMP_USE_MONITOR
3075     __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3076 // intervals
3077 #endif
3078     __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3079     // next parallel region (per thread)
3080     // (use a max ub on value if __kmp_parallel_initialize not called yet)
3081     __kmp_cg_max_nth, // int thread_limit;
3082     __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3083     // for max_active_levels
3084     r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3085     // {sched,chunk} pair
3086     __kmp_nested_proc_bind.bind_types[0],
3087     __kmp_default_device,
3088     NULL // struct kmp_internal_control *next;
3089   };
3090 
3091   return g_icvs;
3092 }
3093 
3094 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3095 
3096   kmp_internal_control_t gx_icvs;
3097   gx_icvs.serial_nesting_level =
3098       0; // probably =team->t.t_serial like in save_inter_controls
3099   copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3100   gx_icvs.next = NULL;
3101 
3102   return gx_icvs;
3103 }
3104 
3105 static void __kmp_initialize_root(kmp_root_t *root) {
3106   int f;
3107   kmp_team_t *root_team;
3108   kmp_team_t *hot_team;
3109   int hot_team_max_nth;
3110   kmp_r_sched_t r_sched =
3111       __kmp_get_schedule_global(); // get current state of scheduling globals
3112   kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3113   KMP_DEBUG_ASSERT(root);
3114   KMP_ASSERT(!root->r.r_begin);
3115 
3116   /* setup the root state structure */
3117   __kmp_init_lock(&root->r.r_begin_lock);
3118   root->r.r_begin = FALSE;
3119   root->r.r_active = FALSE;
3120   root->r.r_in_parallel = 0;
3121   root->r.r_blocktime = __kmp_dflt_blocktime;
3122 
3123   /* setup the root team for this task */
3124   /* allocate the root team structure */
3125   KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3126 
3127   root_team =
3128       __kmp_allocate_team(root,
3129                           1, // new_nproc
3130                           1, // max_nproc
3131 #if OMPT_SUPPORT
3132                           ompt_data_none, // root parallel id
3133 #endif
3134                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3135                           0 // argc
3136                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3137       );
3138 #if USE_DEBUGGER
3139   // Non-NULL value should be assigned to make the debugger display the root
3140   // team.
3141   TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3142 #endif
3143 
3144   KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3145 
3146   root->r.r_root_team = root_team;
3147   root_team->t.t_control_stack_top = NULL;
3148 
3149   /* initialize root team */
3150   root_team->t.t_threads[0] = NULL;
3151   root_team->t.t_nproc = 1;
3152   root_team->t.t_serialized = 1;
3153   // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3154   root_team->t.t_sched.sched = r_sched.sched;
3155   KA_TRACE(
3156       20,
3157       ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3158        root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3159 
3160   /* setup the  hot team for this task */
3161   /* allocate the hot team structure */
3162   KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3163 
3164   hot_team =
3165       __kmp_allocate_team(root,
3166                           1, // new_nproc
3167                           __kmp_dflt_team_nth_ub * 2, // max_nproc
3168 #if OMPT_SUPPORT
3169                           ompt_data_none, // root parallel id
3170 #endif
3171                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3172                           0 // argc
3173                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3174       );
3175   KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3176 
3177   root->r.r_hot_team = hot_team;
3178   root_team->t.t_control_stack_top = NULL;
3179 
3180   /* first-time initialization */
3181   hot_team->t.t_parent = root_team;
3182 
3183   /* initialize hot team */
3184   hot_team_max_nth = hot_team->t.t_max_nproc;
3185   for (f = 0; f < hot_team_max_nth; ++f) {
3186     hot_team->t.t_threads[f] = NULL;
3187   }
3188   hot_team->t.t_nproc = 1;
3189   // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3190   hot_team->t.t_sched.sched = r_sched.sched;
3191   hot_team->t.t_size_changed = 0;
3192 }
3193 
3194 #ifdef KMP_DEBUG
3195 
3196 typedef struct kmp_team_list_item {
3197   kmp_team_p const *entry;
3198   struct kmp_team_list_item *next;
3199 } kmp_team_list_item_t;
3200 typedef kmp_team_list_item_t *kmp_team_list_t;
3201 
3202 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3203     kmp_team_list_t list, // List of teams.
3204     kmp_team_p const *team // Team to add.
3205 ) {
3206 
3207   // List must terminate with item where both entry and next are NULL.
3208   // Team is added to the list only once.
3209   // List is sorted in ascending order by team id.
3210   // Team id is *not* a key.
3211 
3212   kmp_team_list_t l;
3213 
3214   KMP_DEBUG_ASSERT(list != NULL);
3215   if (team == NULL) {
3216     return;
3217   }
3218 
3219   __kmp_print_structure_team_accum(list, team->t.t_parent);
3220   __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3221 
3222   // Search list for the team.
3223   l = list;
3224   while (l->next != NULL && l->entry != team) {
3225     l = l->next;
3226   }
3227   if (l->next != NULL) {
3228     return; // Team has been added before, exit.
3229   }
3230 
3231   // Team is not found. Search list again for insertion point.
3232   l = list;
3233   while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3234     l = l->next;
3235   }
3236 
3237   // Insert team.
3238   {
3239     kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3240         sizeof(kmp_team_list_item_t));
3241     *item = *l;
3242     l->entry = team;
3243     l->next = item;
3244   }
3245 }
3246 
3247 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team
3248 
3249 ) {
3250   __kmp_printf("%s", title);
3251   if (team != NULL) {
3252     __kmp_printf("%2x %p\n", team->t.t_id, team);
3253   } else {
3254     __kmp_printf(" - (nil)\n");
3255   }
3256 }
3257 
3258 static void __kmp_print_structure_thread(char const *title,
3259                                          kmp_info_p const *thread) {
3260   __kmp_printf("%s", title);
3261   if (thread != NULL) {
3262     __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3263   } else {
3264     __kmp_printf(" - (nil)\n");
3265   }
3266 }
3267 
3268 void __kmp_print_structure(void) {
3269 
3270   kmp_team_list_t list;
3271 
3272   // Initialize list of teams.
3273   list =
3274       (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3275   list->entry = NULL;
3276   list->next = NULL;
3277 
3278   __kmp_printf("\n------------------------------\nGlobal Thread "
3279                "Table\n------------------------------\n");
3280   {
3281     int gtid;
3282     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3283       __kmp_printf("%2d", gtid);
3284       if (__kmp_threads != NULL) {
3285         __kmp_printf(" %p", __kmp_threads[gtid]);
3286       }
3287       if (__kmp_root != NULL) {
3288         __kmp_printf(" %p", __kmp_root[gtid]);
3289       }
3290       __kmp_printf("\n");
3291     }
3292   }
3293 
3294   // Print out __kmp_threads array.
3295   __kmp_printf("\n------------------------------\nThreads\n--------------------"
3296                "----------\n");
3297   if (__kmp_threads != NULL) {
3298     int gtid;
3299     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3300       kmp_info_t const *thread = __kmp_threads[gtid];
3301       if (thread != NULL) {
3302         __kmp_printf("GTID %2d %p:\n", gtid, thread);
3303         __kmp_printf("    Our Root:        %p\n", thread->th.th_root);
3304         __kmp_print_structure_team("    Our Team:     ", thread->th.th_team);
3305         __kmp_print_structure_team("    Serial Team:  ",
3306                                    thread->th.th_serial_team);
3307         __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
3308         __kmp_print_structure_thread("    Master:       ",
3309                                      thread->th.th_team_master);
3310         __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
3311         __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
3312         __kmp_printf("    Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3313         __kmp_print_structure_thread("    Next in pool: ",
3314                                      thread->th.th_next_pool);
3315         __kmp_printf("\n");
3316         __kmp_print_structure_team_accum(list, thread->th.th_team);
3317         __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3318       }
3319     }
3320   } else {
3321     __kmp_printf("Threads array is not allocated.\n");
3322   }
3323 
3324   // Print out __kmp_root array.
3325   __kmp_printf("\n------------------------------\nUbers\n----------------------"
3326                "--------\n");
3327   if (__kmp_root != NULL) {
3328     int gtid;
3329     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3330       kmp_root_t const *root = __kmp_root[gtid];
3331       if (root != NULL) {
3332         __kmp_printf("GTID %2d %p:\n", gtid, root);
3333         __kmp_print_structure_team("    Root Team:    ", root->r.r_root_team);
3334         __kmp_print_structure_team("    Hot Team:     ", root->r.r_hot_team);
3335         __kmp_print_structure_thread("    Uber Thread:  ",
3336                                      root->r.r_uber_thread);
3337         __kmp_printf("    Active?:      %2d\n", root->r.r_active);
3338         __kmp_printf("    In Parallel:  %2d\n",
3339                      KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3340         __kmp_printf("\n");
3341         __kmp_print_structure_team_accum(list, root->r.r_root_team);
3342         __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3343       }
3344     }
3345   } else {
3346     __kmp_printf("Ubers array is not allocated.\n");
3347   }
3348 
3349   __kmp_printf("\n------------------------------\nTeams\n----------------------"
3350                "--------\n");
3351   while (list->next != NULL) {
3352     kmp_team_p const *team = list->entry;
3353     int i;
3354     __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3355     __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
3356     __kmp_printf("    Master TID:       %2d\n", team->t.t_master_tid);
3357     __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
3358     __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
3359     __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
3360     for (i = 0; i < team->t.t_nproc; ++i) {
3361       __kmp_printf("    Thread %2d:      ", i);
3362       __kmp_print_structure_thread("", team->t.t_threads[i]);
3363     }
3364     __kmp_print_structure_team("    Next in pool:     ", team->t.t_next_pool);
3365     __kmp_printf("\n");
3366     list = list->next;
3367   }
3368 
3369   // Print out __kmp_thread_pool and __kmp_team_pool.
3370   __kmp_printf("\n------------------------------\nPools\n----------------------"
3371                "--------\n");
3372   __kmp_print_structure_thread("Thread pool:          ",
3373                                CCAST(kmp_info_t *, __kmp_thread_pool));
3374   __kmp_print_structure_team("Team pool:            ",
3375                              CCAST(kmp_team_t *, __kmp_team_pool));
3376   __kmp_printf("\n");
3377 
3378   // Free team list.
3379   while (list != NULL) {
3380     kmp_team_list_item_t *item = list;
3381     list = list->next;
3382     KMP_INTERNAL_FREE(item);
3383   }
3384 }
3385 
3386 #endif
3387 
3388 //---------------------------------------------------------------------------
3389 //  Stuff for per-thread fast random number generator
3390 //  Table of primes
3391 static const unsigned __kmp_primes[] = {
3392     0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3393     0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3394     0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3395     0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3396     0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3397     0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3398     0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3399     0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3400     0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3401     0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3402     0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3403 
3404 //---------------------------------------------------------------------------
3405 //  __kmp_get_random: Get a random number using a linear congruential method.
3406 unsigned short __kmp_get_random(kmp_info_t *thread) {
3407   unsigned x = thread->th.th_x;
3408   unsigned short r = (unsigned short)(x >> 16);
3409 
3410   thread->th.th_x = x * thread->th.th_a + 1;
3411 
3412   KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3413                 thread->th.th_info.ds.ds_tid, r));
3414 
3415   return r;
3416 }
3417 //--------------------------------------------------------
3418 // __kmp_init_random: Initialize a random number generator
3419 void __kmp_init_random(kmp_info_t *thread) {
3420   unsigned seed = thread->th.th_info.ds.ds_tid;
3421 
3422   thread->th.th_a =
3423       __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3424   thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3425   KA_TRACE(30,
3426            ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3427 }
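// Illustrative sketch (not part of this file): the generator above is a plain
// linear congruential recurrence, x_{n+1} = a * x_n + 1 (mod 2^32), with the
// per-thread multiplier a taken from __kmp_primes and the high 16 bits of x
// returned as the sample.
//
//   unsigned seed = tid;             // per-thread seed, as in __kmp_init_random
//   unsigned a = 0x9e3779b1;         // any entry of __kmp_primes
//   unsigned x = (seed + 1) * a + 1; // same seeding as th_x above
//   for (int i = 0; i < 3; ++i) {
//     unsigned short r = (unsigned short)(x >> 16); // what __kmp_get_random returns
//     x = x * a + 1;                                // same update as th_x above
//   }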
3428 
3429 #if KMP_OS_WINDOWS
3430 /* reclaim array entries for root threads that are already dead, returns number
3431  * reclaimed */
3432 static int __kmp_reclaim_dead_roots(void) {
3433   int i, r = 0;
3434 
3435   for (i = 0; i < __kmp_threads_capacity; ++i) {
3436     if (KMP_UBER_GTID(i) &&
3437         !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3438         !__kmp_root[i]
3439              ->r.r_active) { // AC: reclaim only roots died in non-active state
3440       r += __kmp_unregister_root_other_thread(i);
3441     }
3442   }
3443   return r;
3444 }
3445 #endif
3446 
3447 /* This function attempts to create free entries in __kmp_threads and
3448    __kmp_root, and returns the number of free entries generated.
3449 
3450    For Windows* OS static library, the first mechanism used is to reclaim array
3451    entries for root threads that are already dead.
3452 
   On all platforms, expansion is attempted on the arrays __kmp_threads and
3454    __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3455    capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3456    threadprivate cache array has been created. Synchronization with
3457    __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3458 
3459    After any dead root reclamation, if the clipping value allows array expansion
3460    to result in the generation of a total of nNeed free slots, the function does
3461    that expansion. If not, nothing is done beyond the possible initial root
3462    thread reclamation.
3463 
3464    If any argument is negative, the behavior is undefined. */
3465 static int __kmp_expand_threads(int nNeed) {
3466   int added = 0;
3467   int minimumRequiredCapacity;
3468   int newCapacity;
3469   kmp_info_t **newThreads;
3470   kmp_root_t **newRoot;
3471 
3472   // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3473   // resizing __kmp_threads does not need additional protection if foreign
3474   // threads are present
3475 
3476 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3477   /* only for Windows static library */
3478   /* reclaim array entries for root threads that are already dead */
3479   added = __kmp_reclaim_dead_roots();
3480 
3481   if (nNeed) {
3482     nNeed -= added;
3483     if (nNeed < 0)
3484       nNeed = 0;
3485   }
3486 #endif
3487   if (nNeed <= 0)
3488     return added;
3489 
3490   // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3491   // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3492   // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3493   // > __kmp_max_nth in one of two ways:
3494   //
3495   // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
3496   //    may not be reused by another thread, so we may need to increase
3497   //    __kmp_threads_capacity to __kmp_max_nth + 1.
3498   //
3499   // 2) New foreign root(s) are encountered.  We always register new foreign
3500   //    roots. This may cause a smaller # of threads to be allocated at
3501   //    subsequent parallel regions, but the worker threads hang around (and
3502   //    eventually go to sleep) and need slots in the __kmp_threads[] array.
3503   //
3504   // Anyway, that is the reason for moving the check to see if
3505   // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3506   // instead of having it performed here. -BB
3507 
3508   KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3509 
3510   /* compute expansion headroom to check if we can expand */
3511   if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3512     /* possible expansion too small -- give up */
3513     return added;
3514   }
3515   minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3516 
3517   newCapacity = __kmp_threads_capacity;
3518   do {
3519     newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3520                                                           : __kmp_sys_max_nth;
3521   } while (newCapacity < minimumRequiredCapacity);
3522   newThreads = (kmp_info_t **)__kmp_allocate(
3523       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3524   newRoot =
3525       (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3526   KMP_MEMCPY(newThreads, __kmp_threads,
3527              __kmp_threads_capacity * sizeof(kmp_info_t *));
3528   KMP_MEMCPY(newRoot, __kmp_root,
3529              __kmp_threads_capacity * sizeof(kmp_root_t *));
3530 
3531   kmp_info_t **temp_threads = __kmp_threads;
3532   *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3533   *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3534   __kmp_free(temp_threads);
3535   added += newCapacity - __kmp_threads_capacity;
3536   *(volatile int *)&__kmp_threads_capacity = newCapacity;
3537 
3538   if (newCapacity > __kmp_tp_capacity) {
3539     __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3540     if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3541       __kmp_threadprivate_resize_cache(newCapacity);
3542     } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3543       *(volatile int *)&__kmp_tp_capacity = newCapacity;
3544     }
3545     __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3546   }
3547 
3548   return added;
3549 }
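// Illustrative sketch (not part of this file) of the growth loop above:
// capacity doubles until it covers the request and is clipped at
// __kmp_sys_max_nth. For example, with capacity 32, sys max 256 and nNeed 70:
//
//   32 -> 64 -> 128   (128 >= 32 + 70, stop);  added = 128 - 32 = 96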
3550 
3551 /* Register the current thread as a root thread and obtain our gtid. We must
3552    have the __kmp_initz_lock held at this point. Argument TRUE only if are the
3553    thread that calls from __kmp_do_serial_initialize() */
3554 int __kmp_register_root(int initial_thread) {
3555   kmp_info_t *root_thread;
3556   kmp_root_t *root;
3557   int gtid;
3558   int capacity;
3559   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3560   KA_TRACE(20, ("__kmp_register_root: entered\n"));
3561   KMP_MB();
3562 
3563   /* 2007-03-02:
     If the initial thread has not invoked the OpenMP RTL yet, and this thread
     is not the initial one, the "__kmp_all_nth >= __kmp_threads_capacity"
     condition does not work as expected -- it may return false (meaning there
     is at least one empty slot in the __kmp_threads array), but it is possible
     that the only free slot is #0, which is reserved for the initial thread
     and so cannot be used for this one. The following code works around this
     bug.

     However, the right solution seems to be not to reserve slot #0 for the
     initial thread, because:
     (1) there is no magic in slot #0,
     (2) we cannot detect the initial thread reliably (the first thread that
        performs serial initialization may not be the real initial thread).
3576   */
3577   capacity = __kmp_threads_capacity;
3578   if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3579     --capacity;
3580   }
3581 
3582   /* see if there are too many threads */
3583   if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3584     if (__kmp_tp_cached) {
3585       __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3586                   KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3587                   KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3588     } else {
3589       __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3590                   __kmp_msg_null);
3591     }
3592   }
3593 
3594   // When hidden helper task is enabled, __kmp_threads is organized as follows:
3595   // 0: initial thread, also a regular OpenMP thread.
3596   // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3597   // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3598   // regular OpenMP threads.
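  // Illustrative sketch (not part of this file): with, for example,
  // __kmp_hidden_helper_threads_num == 8 the gtid space is laid out as
  //   [0]          initial thread (also a regular OpenMP thread)
  //   [1 .. 8]     hidden helper threads
  //   [9 .. cap)   regular OpenMP threads and foreign roots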
3599   if (TCR_4(__kmp_init_hidden_helper_threads)) {
3600     // Find an available thread slot for hidden helper thread. Slots for hidden
3601     // helper threads start from 1 to __kmp_hidden_helper_threads_num.
3602     for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3603                    gtid <= __kmp_hidden_helper_threads_num;
3604          gtid++)
3605       ;
3606     KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3607     KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3608                  "hidden helper thread: T#%d\n",
3609                  gtid));
3610   } else {
3611     /* find an available thread slot */
    // Don't reassign the zero slot since we need that to be used only by the
    // initial thread. Slots for hidden helper threads should also be skipped.
3614     if (initial_thread && __kmp_threads[0] == NULL) {
3615       gtid = 0;
3616     } else {
3617       for (gtid = __kmp_hidden_helper_threads_num + 1;
3618            TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3619         ;
3620     }
3621     KA_TRACE(
3622         1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3623     KMP_ASSERT(gtid < __kmp_threads_capacity);
3624   }
3625 
3626   /* update global accounting */
3627   __kmp_all_nth++;
3628   TCW_4(__kmp_nth, __kmp_nth + 1);
3629 
3630   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3631   // numbers of procs, and method #2 (keyed API call) for higher numbers.
3632   if (__kmp_adjust_gtid_mode) {
3633     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3634       if (TCR_4(__kmp_gtid_mode) != 2) {
3635         TCW_4(__kmp_gtid_mode, 2);
3636       }
3637     } else {
3638       if (TCR_4(__kmp_gtid_mode) != 1) {
3639         TCW_4(__kmp_gtid_mode, 1);
3640       }
3641     }
3642   }
3643 
3644 #ifdef KMP_ADJUST_BLOCKTIME
3645   /* Adjust blocktime to zero if necessary            */
3646   /* Middle initialization might not have occurred yet */
3647   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3648     if (__kmp_nth > __kmp_avail_proc) {
3649       __kmp_zero_bt = TRUE;
3650     }
3651   }
3652 #endif /* KMP_ADJUST_BLOCKTIME */
3653 
3654   /* setup this new hierarchy */
3655   if (!(root = __kmp_root[gtid])) {
3656     root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3657     KMP_DEBUG_ASSERT(!root->r.r_root_team);
3658   }
3659 
3660 #if KMP_STATS_ENABLED
3661   // Initialize stats as soon as possible (right after gtid assignment).
3662   __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3663   __kmp_stats_thread_ptr->startLife();
3664   KMP_SET_THREAD_STATE(SERIAL_REGION);
3665   KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3666 #endif
3667   __kmp_initialize_root(root);
3668 
3669   /* setup new root thread structure */
3670   if (root->r.r_uber_thread) {
3671     root_thread = root->r.r_uber_thread;
3672   } else {
3673     root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3674     if (__kmp_storage_map) {
3675       __kmp_print_thread_storage_map(root_thread, gtid);
3676     }
3677     root_thread->th.th_info.ds.ds_gtid = gtid;
3678 #if OMPT_SUPPORT
3679     root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3680 #endif
3681     root_thread->th.th_root = root;
3682     if (__kmp_env_consistency_check) {
3683       root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3684     }
3685 #if USE_FAST_MEMORY
3686     __kmp_initialize_fast_memory(root_thread);
3687 #endif /* USE_FAST_MEMORY */
3688 
3689 #if KMP_USE_BGET
3690     KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3691     __kmp_initialize_bget(root_thread);
3692 #endif
3693     __kmp_init_random(root_thread); // Initialize random number generator
3694   }
3695 
3696   /* setup the serial team held in reserve by the root thread */
3697   if (!root_thread->th.th_serial_team) {
3698     kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3699     KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3700     root_thread->th.th_serial_team = __kmp_allocate_team(
3701         root, 1, 1,
3702 #if OMPT_SUPPORT
3703         ompt_data_none, // root parallel id
3704 #endif
3705         proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3706   }
3707   KMP_ASSERT(root_thread->th.th_serial_team);
3708   KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3709                 root_thread->th.th_serial_team));
3710 
3711   /* drop root_thread into place */
3712   TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3713 
3714   root->r.r_root_team->t.t_threads[0] = root_thread;
3715   root->r.r_hot_team->t.t_threads[0] = root_thread;
3716   root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
  // AC: the team is created in reserve, not for execution (unused for now).
3718   root_thread->th.th_serial_team->t.t_serialized = 0;
3719   root->r.r_uber_thread = root_thread;
3720 
3721   /* initialize the thread, get it ready to go */
3722   __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3723   TCW_4(__kmp_init_gtid, TRUE);
3724 
3725   /* prepare the master thread for get_gtid() */
3726   __kmp_gtid_set_specific(gtid);
3727 
3728 #if USE_ITT_BUILD
3729   __kmp_itt_thread_name(gtid);
3730 #endif /* USE_ITT_BUILD */
3731 
3732 #ifdef KMP_TDATA_GTID
3733   __kmp_gtid = gtid;
3734 #endif
3735   __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3736   KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3737 
3738   KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3739                 "plain=%u\n",
3740                 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3741                 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3742                 KMP_INIT_BARRIER_STATE));
3743   { // Initialize barrier data.
3744     int b;
3745     for (b = 0; b < bs_last_barrier; ++b) {
3746       root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3747 #if USE_DEBUGGER
3748       root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3749 #endif
3750     }
3751   }
3752   KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3753                    KMP_INIT_BARRIER_STATE);
3754 
3755 #if KMP_AFFINITY_SUPPORTED
3756   root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3757   root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3758   root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3759   root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3760   if (TCR_4(__kmp_init_middle)) {
3761     __kmp_affinity_set_init_mask(gtid, TRUE);
3762   }
3763 #endif /* KMP_AFFINITY_SUPPORTED */
3764   root_thread->th.th_def_allocator = __kmp_def_allocator;
3765   root_thread->th.th_prev_level = 0;
3766   root_thread->th.th_prev_num_threads = 1;
3767 
3768   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3769   tmp->cg_root = root_thread;
3770   tmp->cg_thread_limit = __kmp_cg_max_nth;
3771   tmp->cg_nthreads = 1;
3772   KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3773                  " cg_nthreads init to 1\n",
3774                  root_thread, tmp));
3775   tmp->up = NULL;
3776   root_thread->th.th_cg_roots = tmp;
3777 
3778   __kmp_root_counter++;
3779 
3780 #if OMPT_SUPPORT
3781   if (!initial_thread && ompt_enabled.enabled) {
3782 
3783     kmp_info_t *root_thread = ompt_get_thread();
3784 
3785     ompt_set_thread_state(root_thread, ompt_state_overhead);
3786 
3787     if (ompt_enabled.ompt_callback_thread_begin) {
3788       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3789           ompt_thread_initial, __ompt_get_thread_data_internal());
3790     }
3791     ompt_data_t *task_data;
3792     ompt_data_t *parallel_data;
3793     __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
3794                                   NULL);
3795     if (ompt_enabled.ompt_callback_implicit_task) {
3796       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3797           ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3798     }
3799 
3800     ompt_set_thread_state(root_thread, ompt_state_work_serial);
3801   }
3802 #endif
3803 
3804   KMP_MB();
3805   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3806 
3807   return gtid;
3808 }
3809 
3810 #if KMP_NESTED_HOT_TEAMS
3811 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3812                                 const int max_level) {
3813   int i, n, nth;
3814   kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3815   if (!hot_teams || !hot_teams[level].hot_team) {
3816     return 0;
3817   }
3818   KMP_DEBUG_ASSERT(level < max_level);
3819   kmp_team_t *team = hot_teams[level].hot_team;
3820   nth = hot_teams[level].hot_team_nth;
3821   n = nth - 1; // master is not freed
3822   if (level < max_level - 1) {
3823     for (i = 0; i < nth; ++i) {
3824       kmp_info_t *th = team->t.t_threads[i];
3825       n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3826       if (i > 0 && th->th.th_hot_teams) {
3827         __kmp_free(th->th.th_hot_teams);
3828         th->th.th_hot_teams = NULL;
3829       }
3830     }
3831   }
3832   __kmp_free_team(root, team, NULL);
3833   return n;
3834 }
3835 #endif
3836 
// Resets a root thread and clears its root and hot teams.
3838 // Returns the number of __kmp_threads entries directly and indirectly freed.
3839 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3840   kmp_team_t *root_team = root->r.r_root_team;
3841   kmp_team_t *hot_team = root->r.r_hot_team;
3842   int n = hot_team->t.t_nproc;
3843   int i;
3844 
3845   KMP_DEBUG_ASSERT(!root->r.r_active);
3846 
3847   root->r.r_root_team = NULL;
3848   root->r.r_hot_team = NULL;
3849   // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3850   // before call to __kmp_free_team().
3851   __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3852 #if KMP_NESTED_HOT_TEAMS
3853   if (__kmp_hot_teams_max_level >
3854       0) { // need to free nested hot teams and their threads if any
3855     for (i = 0; i < hot_team->t.t_nproc; ++i) {
3856       kmp_info_t *th = hot_team->t.t_threads[i];
3857       if (__kmp_hot_teams_max_level > 1) {
3858         n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3859       }
3860       if (th->th.th_hot_teams) {
3861         __kmp_free(th->th.th_hot_teams);
3862         th->th.th_hot_teams = NULL;
3863       }
3864     }
3865   }
3866 #endif
3867   __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3868 
3869   // Before we can reap the thread, we need to make certain that all other
3870   // threads in the teams that had this root as ancestor have stopped trying to
3871   // steal tasks.
3872   if (__kmp_tasking_mode != tskm_immediate_exec) {
3873     __kmp_wait_to_unref_task_teams();
3874   }
3875 
3876 #if KMP_OS_WINDOWS
3877   /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3878   KA_TRACE(
3879       10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3880            "\n",
3881            (LPVOID) & (root->r.r_uber_thread->th),
3882            root->r.r_uber_thread->th.th_info.ds.ds_thread));
3883   __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3884 #endif /* KMP_OS_WINDOWS */
3885 
3886 #if OMPT_SUPPORT
3887   ompt_data_t *task_data;
3888   ompt_data_t *parallel_data;
3889   __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
3890                                 NULL);
3891   if (ompt_enabled.ompt_callback_implicit_task) {
3892     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3893         ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
3894   }
3895   if (ompt_enabled.ompt_callback_thread_end) {
3896     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3897         &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3898   }
3899 #endif
3900 
3901   TCW_4(__kmp_nth,
3902         __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3903   i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
3904   KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
3905                  " to %d\n",
3906                  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
3907                  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
3908   if (i == 1) {
3909     // need to free contention group structure
3910     KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
3911                      root->r.r_uber_thread->th.th_cg_roots->cg_root);
3912     KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
3913     __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
3914     root->r.r_uber_thread->th.th_cg_roots = NULL;
3915   }
3916   __kmp_reap_thread(root->r.r_uber_thread, 1);
3917 
  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
  // it instead of freeing it.
3920   root->r.r_uber_thread = NULL;
3921   /* mark root as no longer in use */
3922   root->r.r_begin = FALSE;
3923 
3924   return n;
3925 }
3926 
3927 void __kmp_unregister_root_current_thread(int gtid) {
3928   KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
  /* This lock should be OK, since unregister_root_current_thread is never
     called during an abort, only during a normal close. Furthermore, if you
     have the forkjoin lock, you should never try to get the initz lock. */
3932   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3933   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
3934     KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
3935                   "exiting T#%d\n",
3936                   gtid));
3937     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3938     return;
3939   }
3940   kmp_root_t *root = __kmp_root[gtid];
3941 
3942   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3943   KMP_ASSERT(KMP_UBER_GTID(gtid));
3944   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3945   KMP_ASSERT(root->r.r_active == FALSE);
3946 
3947   KMP_MB();
3948 
3949   kmp_info_t *thread = __kmp_threads[gtid];
3950   kmp_team_t *team = thread->th.th_team;
3951   kmp_task_team_t *task_team = thread->th.th_task_team;
3952 
3953   // we need to wait for the proxy tasks before finishing the thread
3954   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
3955 #if OMPT_SUPPORT
3956     // the runtime is shutting down so we won't report any events
3957     thread->th.ompt_thread_info.state = ompt_state_undefined;
3958 #endif
3959     __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
3960   }
3961 
3962   __kmp_reset_root(gtid, root);
3963 
3964   KMP_MB();
3965   KC_TRACE(10,
3966            ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
3967 
3968   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3969 }
3970 
3971 #if KMP_OS_WINDOWS
3972 /* __kmp_forkjoin_lock must be already held
3973    Unregisters a root thread that is not the current thread.  Returns the number
3974    of __kmp_threads entries freed as a result. */
3975 static int __kmp_unregister_root_other_thread(int gtid) {
3976   kmp_root_t *root = __kmp_root[gtid];
3977   int r;
3978 
3979   KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
3980   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3981   KMP_ASSERT(KMP_UBER_GTID(gtid));
3982   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3983   KMP_ASSERT(root->r.r_active == FALSE);
3984 
3985   r = __kmp_reset_root(gtid, root);
3986   KC_TRACE(10,
3987            ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
3988   return r;
3989 }
3990 #endif
3991 
3992 #if KMP_DEBUG
3993 void __kmp_task_info() {
3994 
3995   kmp_int32 gtid = __kmp_entry_gtid();
3996   kmp_int32 tid = __kmp_tid_from_gtid(gtid);
3997   kmp_info_t *this_thr = __kmp_threads[gtid];
3998   kmp_team_t *steam = this_thr->th.th_serial_team;
3999   kmp_team_t *team = this_thr->th.th_team;
4000 
4001   __kmp_printf(
4002       "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4003       "ptask=%p\n",
4004       gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4005       team->t.t_implicit_task_taskdata[tid].td_parent);
4006 }
4007 #endif // KMP_DEBUG
4008 
4009 /* TODO optimize with one big memclr, take out what isn't needed, split
4010    responsibility to workers as much as possible, and delay initialization of
4011    features as much as possible  */
4012 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4013                                   int tid, int gtid) {
4014   /* this_thr->th.th_info.ds.ds_gtid is setup in
4015      kmp_allocate_thread/create_worker.
4016      this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4017   kmp_info_t *master = team->t.t_threads[0];
4018   KMP_DEBUG_ASSERT(this_thr != NULL);
4019   KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4020   KMP_DEBUG_ASSERT(team);
4021   KMP_DEBUG_ASSERT(team->t.t_threads);
4022   KMP_DEBUG_ASSERT(team->t.t_dispatch);
4023   KMP_DEBUG_ASSERT(master);
4024   KMP_DEBUG_ASSERT(master->th.th_root);
4025 
4026   KMP_MB();
4027 
4028   TCW_SYNC_PTR(this_thr->th.th_team, team);
4029 
4030   this_thr->th.th_info.ds.ds_tid = tid;
4031   this_thr->th.th_set_nproc = 0;
4032   if (__kmp_tasking_mode != tskm_immediate_exec)
4033     // When tasking is possible, threads are not safe to reap until they are
4034     // done tasking; this will be set when tasking code is exited in wait
4035     this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4036   else // no tasking --> always safe to reap
4037     this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4038   this_thr->th.th_set_proc_bind = proc_bind_default;
4039 #if KMP_AFFINITY_SUPPORTED
4040   this_thr->th.th_new_place = this_thr->th.th_current_place;
4041 #endif
4042   this_thr->th.th_root = master->th.th_root;
4043 
4044   /* setup the thread's cache of the team structure */
4045   this_thr->th.th_team_nproc = team->t.t_nproc;
4046   this_thr->th.th_team_master = master;
4047   this_thr->th.th_team_serialized = team->t.t_serialized;
4048   TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4049 
4050   KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4051 
4052   KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4053                 tid, gtid, this_thr, this_thr->th.th_current_task));
4054 
4055   __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4056                            team, tid, TRUE);
4057 
4058   KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4059                 tid, gtid, this_thr, this_thr->th.th_current_task));
4060   // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4061   // __kmp_initialize_team()?
4062 
4063   /* TODO no worksharing in speculative threads */
4064   this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4065 
4066   this_thr->th.th_local.this_construct = 0;
4067 
4068   if (!this_thr->th.th_pri_common) {
4069     this_thr->th.th_pri_common =
4070         (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4071     if (__kmp_storage_map) {
4072       __kmp_print_storage_map_gtid(
4073           gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4074           sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4075     }
4076     this_thr->th.th_pri_head = NULL;
4077   }
4078 
4079   if (this_thr != master && // Master's CG root is initialized elsewhere
4080       this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4081     // Make new thread's CG root same as master's
4082     KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4083     kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4084     if (tmp) {
4085       // worker changes CG, need to check if old CG should be freed
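      // (post-decrement below: i receives the old count, so i == 1 means this
      // thread was the last member of the old contention group)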
4086       int i = tmp->cg_nthreads--;
4087       KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4088                      " on node %p of thread %p to %d\n",
4089                      this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4090       if (i == 1) {
4091         __kmp_free(tmp); // last thread left CG --> free it
4092       }
4093     }
4094     this_thr->th.th_cg_roots = master->th.th_cg_roots;
4095     // Increment new thread's CG root's counter to add the new thread
4096     this_thr->th.th_cg_roots->cg_nthreads++;
4097     KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4098                    " node %p of thread %p to %d\n",
4099                    this_thr, this_thr->th.th_cg_roots,
4100                    this_thr->th.th_cg_roots->cg_root,
4101                    this_thr->th.th_cg_roots->cg_nthreads));
4102     this_thr->th.th_current_task->td_icvs.thread_limit =
4103         this_thr->th.th_cg_roots->cg_thread_limit;
4104   }
4105 
4106   /* Initialize dynamic dispatch */
4107   {
4108     volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4109     // Use team max_nproc since this will never change for the team.
4110     size_t disp_size =
4111         sizeof(dispatch_private_info_t) *
4112         (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4113     KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4114                   team->t.t_max_nproc));
4115     KMP_ASSERT(dispatch);
4116     KMP_DEBUG_ASSERT(team->t.t_dispatch);
4117     KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4118 
4119     dispatch->th_disp_index = 0;
4120     dispatch->th_doacross_buf_idx = 0;
4121     if (!dispatch->th_disp_buffer) {
4122       dispatch->th_disp_buffer =
4123           (dispatch_private_info_t *)__kmp_allocate(disp_size);
4124 
4125       if (__kmp_storage_map) {
4126         __kmp_print_storage_map_gtid(
4127             gtid, &dispatch->th_disp_buffer[0],
4128             &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4129                                           ? 1
4130                                           : __kmp_dispatch_num_buffers],
4131             disp_size,
4132             "th_%d.th_dispatch.th_disp_buffer "
4133             "(team_%d.t_dispatch[%d].th_disp_buffer)",
4134             gtid, team->t.t_id, gtid);
4135       }
4136     } else {
4137       memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4138     }
4139 
4140     dispatch->th_dispatch_pr_current = 0;
4141     dispatch->th_dispatch_sh_current = 0;
4142 
4143     dispatch->th_deo_fcn = 0; /* ORDERED     */
4144     dispatch->th_dxo_fcn = 0; /* END ORDERED */
4145   }
4146 
4147   this_thr->th.th_next_pool = NULL;
4148 
4149   if (!this_thr->th.th_task_state_memo_stack) {
4150     size_t i;
4151     this_thr->th.th_task_state_memo_stack =
4152         (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4153     this_thr->th.th_task_state_top = 0;
4154     this_thr->th.th_task_state_stack_sz = 4;
4155     for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4156          ++i) // zero init the stack
4157       this_thr->th.th_task_state_memo_stack[i] = 0;
4158   }
4159 
4160   KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4161   KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4162 
4163   KMP_MB();
4164 }
4165 
4166 /* allocate a new thread for the requesting team. this is only called from
4167    within a forkjoin critical section. we will first try to get an available
4168    thread from the thread pool. if none is available, we will fork a new one
4169    assuming we are able to create a new one. this should be assured, as the
4170    caller should check on this first. */
4171 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4172                                   int new_tid) {
4173   kmp_team_t *serial_team;
4174   kmp_info_t *new_thr;
4175   int new_gtid;
4176 
4177   KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4178   KMP_DEBUG_ASSERT(root && team);
4179 #if !KMP_NESTED_HOT_TEAMS
4180   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4181 #endif
4182   KMP_MB();
4183 
4184   /* first, try to get one from the thread pool */
4185   if (__kmp_thread_pool) {
4186     new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4187     __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4188     if (new_thr == __kmp_thread_pool_insert_pt) {
4189       __kmp_thread_pool_insert_pt = NULL;
4190     }
4191     TCW_4(new_thr->th.th_in_pool, FALSE);
4192     __kmp_suspend_initialize_thread(new_thr);
4193     __kmp_lock_suspend_mx(new_thr);
4194     if (new_thr->th.th_active_in_pool == TRUE) {
4195       KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4196       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4197       new_thr->th.th_active_in_pool = FALSE;
4198     }
4199     __kmp_unlock_suspend_mx(new_thr);
4200 
4201     KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4202                   __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4203     KMP_ASSERT(!new_thr->th.th_team);
4204     KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4205 
4206     /* setup the thread structure */
4207     __kmp_initialize_info(new_thr, team, new_tid,
4208                           new_thr->th.th_info.ds.ds_gtid);
4209     KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4210 
4211     TCW_4(__kmp_nth, __kmp_nth + 1);
4212 
4213     new_thr->th.th_task_state = 0;
4214     new_thr->th.th_task_state_top = 0;
4215     new_thr->th.th_task_state_stack_sz = 4;
4216 
4217 #ifdef KMP_ADJUST_BLOCKTIME
4218     /* Adjust blocktime back to zero if necessary */
4219     /* Middle initialization might not have occurred yet */
4220     if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4221       if (__kmp_nth > __kmp_avail_proc) {
4222         __kmp_zero_bt = TRUE;
4223       }
4224     }
4225 #endif /* KMP_ADJUST_BLOCKTIME */
4226 
4227 #if KMP_DEBUG
4228     // If thread entered pool via __kmp_free_thread, wait_flag should !=
4229     // KMP_BARRIER_PARENT_FLAG.
4230     int b;
4231     kmp_balign_t *balign = new_thr->th.th_bar;
4232     for (b = 0; b < bs_last_barrier; ++b)
4233       KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4234 #endif
4235 
4236     KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4237                   __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4238 
4239     KMP_MB();
4240     return new_thr;
4241   }
4242 
  /* no, we'll fork a new one */
4244   KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4245   KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4246 
4247 #if KMP_USE_MONITOR
4248   // If this is the first worker thread the RTL is creating, then also
4249   // launch the monitor thread.  We try to do this as early as possible.
4250   if (!TCR_4(__kmp_init_monitor)) {
4251     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4252     if (!TCR_4(__kmp_init_monitor)) {
4253       KF_TRACE(10, ("before __kmp_create_monitor\n"));
4254       TCW_4(__kmp_init_monitor, 1);
4255       __kmp_create_monitor(&__kmp_monitor);
4256       KF_TRACE(10, ("after __kmp_create_monitor\n"));
4257 #if KMP_OS_WINDOWS
4258       // AC: wait until monitor has started. This is a fix for CQ232808.
4259       // The reason is that if the library is loaded/unloaded in a loop with
      // small (parallel) work in between, then there is a high probability
      // that the monitor thread starts only after the library shutdown. At
      // shutdown it is too late to cope with the problem, because when the
      // master is in DllMain (process detach) the monitor has no chance to
      // start (it is blocked), and the master has no means to inform the
      // monitor that the library has gone, because all the memory which the
      // monitor can access is going to be released/reset.
4267       while (TCR_4(__kmp_init_monitor) < 2) {
4268         KMP_YIELD(TRUE);
4269       }
4270       KF_TRACE(10, ("after monitor thread has started\n"));
4271 #endif
4272     }
4273     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4274   }
4275 #endif
4276 
4277   KMP_MB();
4278 
4279   {
4280     int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4281                              ? 1
4282                              : __kmp_hidden_helper_threads_num + 1;
4283 
4284     for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4285          ++new_gtid) {
4286       KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4287     }
4288 
4289     if (TCR_4(__kmp_init_hidden_helper_threads)) {
4290       KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4291     }
4292   }
4293 
4294   /* allocate space for it. */
4295   new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4296 
4297   TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4298 
4299 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
  // suppress race condition detection on synchronization flags in debug mode
  // this helps to analyze library internals by eliminating false positives
4302   __itt_suppress_mark_range(
4303       __itt_suppress_range, __itt_suppress_threading_errors,
4304       &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4305   __itt_suppress_mark_range(
4306       __itt_suppress_range, __itt_suppress_threading_errors,
4307       &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4308 #if KMP_OS_WINDOWS
4309   __itt_suppress_mark_range(
4310       __itt_suppress_range, __itt_suppress_threading_errors,
4311       &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4312 #else
4313   __itt_suppress_mark_range(__itt_suppress_range,
4314                             __itt_suppress_threading_errors,
4315                             &new_thr->th.th_suspend_init_count,
4316                             sizeof(new_thr->th.th_suspend_init_count));
4317 #endif
4318   // TODO: check if we need to also suppress b_arrived flags
4319   __itt_suppress_mark_range(__itt_suppress_range,
4320                             __itt_suppress_threading_errors,
4321                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4322                             sizeof(new_thr->th.th_bar[0].bb.b_go));
4323   __itt_suppress_mark_range(__itt_suppress_range,
4324                             __itt_suppress_threading_errors,
4325                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4326                             sizeof(new_thr->th.th_bar[1].bb.b_go));
4327   __itt_suppress_mark_range(__itt_suppress_range,
4328                             __itt_suppress_threading_errors,
4329                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4330                             sizeof(new_thr->th.th_bar[2].bb.b_go));
4331 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4332   if (__kmp_storage_map) {
4333     __kmp_print_thread_storage_map(new_thr, new_gtid);
4334   }
4335 
4336   // add the reserve serialized team, initialized from the team's master thread
4337   {
4338     kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4339     KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4340     new_thr->th.th_serial_team = serial_team =
4341         (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4342 #if OMPT_SUPPORT
4343                                           ompt_data_none, // root parallel id
4344 #endif
4345                                           proc_bind_default, &r_icvs,
4346                                           0 USE_NESTED_HOT_ARG(NULL));
4347   }
4348   KMP_ASSERT(serial_team);
  serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not
  // for execution (it is unused for now).
4351   serial_team->t.t_threads[0] = new_thr;
4352   KF_TRACE(10,
4353            ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4354             new_thr));
4355 
4356   /* setup the thread structures */
4357   __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4358 
4359 #if USE_FAST_MEMORY
4360   __kmp_initialize_fast_memory(new_thr);
4361 #endif /* USE_FAST_MEMORY */
4362 
4363 #if KMP_USE_BGET
4364   KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4365   __kmp_initialize_bget(new_thr);
4366 #endif
4367 
4368   __kmp_init_random(new_thr); // Initialize random number generator
4369 
4370   /* Initialize these only once when thread is grabbed for a team allocation */
4371   KA_TRACE(20,
4372            ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4373             __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4374 
4375   int b;
4376   kmp_balign_t *balign = new_thr->th.th_bar;
4377   for (b = 0; b < bs_last_barrier; ++b) {
4378     balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4379     balign[b].bb.team = NULL;
4380     balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4381     balign[b].bb.use_oncore_barrier = 0;
4382   }
4383 
4384   new_thr->th.th_spin_here = FALSE;
4385   new_thr->th.th_next_waiting = 0;
4386 #if KMP_OS_UNIX
4387   new_thr->th.th_blocking = false;
4388 #endif
4389 
4390 #if KMP_AFFINITY_SUPPORTED
4391   new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4392   new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4393   new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4394   new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4395 #endif
4396   new_thr->th.th_def_allocator = __kmp_def_allocator;
4397   new_thr->th.th_prev_level = 0;
4398   new_thr->th.th_prev_num_threads = 1;
4399 
4400   TCW_4(new_thr->th.th_in_pool, FALSE);
4401   new_thr->th.th_active_in_pool = FALSE;
4402   TCW_4(new_thr->th.th_active, TRUE);
4403 
4404   /* adjust the global counters */
4405   __kmp_all_nth++;
4406   __kmp_nth++;
4407 
  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for
  // low thread counts, and method #2 (keyed API call) for higher counts.
4410   if (__kmp_adjust_gtid_mode) {
4411     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4412       if (TCR_4(__kmp_gtid_mode) != 2) {
4413         TCW_4(__kmp_gtid_mode, 2);
4414       }
4415     } else {
4416       if (TCR_4(__kmp_gtid_mode) != 1) {
4417         TCW_4(__kmp_gtid_mode, 1);
4418       }
4419     }
4420   }
4421 
4422 #ifdef KMP_ADJUST_BLOCKTIME
4423   /* Adjust blocktime back to zero if necessary       */
4424   /* Middle initialization might not have occurred yet */
4425   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4426     if (__kmp_nth > __kmp_avail_proc) {
4427       __kmp_zero_bt = TRUE;
4428     }
4429   }
4430 #endif /* KMP_ADJUST_BLOCKTIME */
4431 
4432   /* actually fork it and create the new worker thread */
4433   KF_TRACE(
4434       10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4435   __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4436   KF_TRACE(10,
4437            ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4438 
4439   KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4440                 new_gtid));
4441   KMP_MB();
4442   return new_thr;
4443 }
4444 
4445 /* Reinitialize team for reuse.
   The hot team code calls this routine at every fork barrier, so the EPCC
   barrier tests are extremely sensitive to changes in it, esp. writes to the
   team struct, which cause a cache invalidation in all threads.
4449    IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4450 static void __kmp_reinitialize_team(kmp_team_t *team,
4451                                     kmp_internal_control_t *new_icvs,
4452                                     ident_t *loc) {
4453   KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4454                 team->t.t_threads[0], team));
4455   KMP_DEBUG_ASSERT(team && new_icvs);
4456   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4457   KMP_CHECK_UPDATE(team->t.t_ident, loc);
4458 
4459   KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4460   // Copy ICVs to the master thread's implicit taskdata
4461   __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4462   copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4463 
4464   KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4465                 team->t.t_threads[0], team));
4466 }
4467 
4468 /* Initialize the team data structure.
4469    This assumes the t_threads and t_max_nproc are already set.
4470    Also, we don't touch the arguments */
4471 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4472                                   kmp_internal_control_t *new_icvs,
4473                                   ident_t *loc) {
4474   KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4475 
4476   /* verify */
4477   KMP_DEBUG_ASSERT(team);
4478   KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4479   KMP_DEBUG_ASSERT(team->t.t_threads);
4480   KMP_MB();
4481 
4482   team->t.t_master_tid = 0; /* not needed */
4483   /* team->t.t_master_bar;        not needed */
4484   team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4485   team->t.t_nproc = new_nproc;
4486 
4487   /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
4488   team->t.t_next_pool = NULL;
4489   /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4490    * up hot team */
4491 
4492   TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4493   team->t.t_invoke = NULL; /* not needed */
4494 
4495   // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
4496   team->t.t_sched.sched = new_icvs->sched.sched;
4497 
4498 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4499   team->t.t_fp_control_saved = FALSE; /* not needed */
4500   team->t.t_x87_fpu_control_word = 0; /* not needed */
4501   team->t.t_mxcsr = 0; /* not needed */
4502 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4503 
4504   team->t.t_construct = 0;
4505 
4506   team->t.t_ordered.dt.t_value = 0;
4507   team->t.t_master_active = FALSE;
4508 
4509 #ifdef KMP_DEBUG
4510   team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4511 #endif
4512 #if KMP_OS_WINDOWS
4513   team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4514 #endif
4515 
4516   team->t.t_control_stack_top = NULL;
4517 
4518   __kmp_reinitialize_team(team, new_icvs, loc);
4519 
4520   KMP_MB();
4521   KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4522 }
4523 
4524 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
4525 /* Sets full mask for thread and returns old mask, no changes to structures. */
4526 static void
4527 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4528   if (KMP_AFFINITY_CAPABLE()) {
4529     int status;
4530     if (old_mask != NULL) {
4531       status = __kmp_get_system_affinity(old_mask, TRUE);
4532       int error = errno;
4533       if (status != 0) {
4534         __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4535                     __kmp_msg_null);
4536       }
4537     }
4538     __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4539   }
4540 }
4541 #endif
4542 
4543 #if KMP_AFFINITY_SUPPORTED
4544 
4545 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4546 // It calculates the worker + master thread's partition based upon the parent
4547 // thread's partition, and binds each worker to a thread in their partition.
4548 // The master thread's partition should already include its current binding.
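//
// Illustrative example: if the master's partition is places [2,5] and the
// master is currently bound to place 3, then under proc_bind_master every
// worker is assigned place 3, while under proc_bind_close (with enough places)
// the workers are assigned places 4, 5, 2, ... wrapping around within [2,5].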
4549 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4550   // Copy the master thread's place partition to the team struct
4551   kmp_info_t *master_th = team->t.t_threads[0];
4552   KMP_DEBUG_ASSERT(master_th != NULL);
4553   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4554   int first_place = master_th->th.th_first_place;
4555   int last_place = master_th->th.th_last_place;
4556   int masters_place = master_th->th.th_current_place;
4557   team->t.t_first_place = first_place;
4558   team->t.t_last_place = last_place;
4559 
4560   KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4561                 "bound to place %d partition = [%d,%d]\n",
4562                 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4563                 team->t.t_id, masters_place, first_place, last_place));
4564 
4565   switch (proc_bind) {
4566 
4567   case proc_bind_default:
    // serial teams might have the proc_bind policy set to proc_bind_default.
    // It doesn't matter, as we don't rebind the master thread for any policy
4570     KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4571     break;
4572 
4573   case proc_bind_master: {
4574     int f;
4575     int n_th = team->t.t_nproc;
4576     for (f = 1; f < n_th; f++) {
4577       kmp_info_t *th = team->t.t_threads[f];
4578       KMP_DEBUG_ASSERT(th != NULL);
4579       th->th.th_first_place = first_place;
4580       th->th.th_last_place = last_place;
4581       th->th.th_new_place = masters_place;
4582       if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4583           team->t.t_display_affinity != 1) {
4584         team->t.t_display_affinity = 1;
4585       }
4586 
4587       KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
4588                      "partition = [%d,%d]\n",
4589                      __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4590                      f, masters_place, first_place, last_place));
4591     }
4592   } break;
4593 
4594   case proc_bind_close: {
4595     int f;
4596     int n_th = team->t.t_nproc;
4597     int n_places;
4598     if (first_place <= last_place) {
4599       n_places = last_place - first_place + 1;
4600     } else {
4601       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4602     }
4603     if (n_th <= n_places) {
4604       int place = masters_place;
4605       for (f = 1; f < n_th; f++) {
4606         kmp_info_t *th = team->t.t_threads[f];
4607         KMP_DEBUG_ASSERT(th != NULL);
4608 
4609         if (place == last_place) {
4610           place = first_place;
4611         } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4612           place = 0;
4613         } else {
4614           place++;
4615         }
4616         th->th.th_first_place = first_place;
4617         th->th.th_last_place = last_place;
4618         th->th.th_new_place = place;
4619         if (__kmp_display_affinity && place != th->th.th_current_place &&
4620             team->t.t_display_affinity != 1) {
4621           team->t.t_display_affinity = 1;
4622         }
4623 
4624         KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4625                        "partition = [%d,%d]\n",
4626                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4627                        team->t.t_id, f, place, first_place, last_place));
4628       }
4629     } else {
4630       int S, rem, gap, s_count;
4631       S = n_th / n_places;
4632       s_count = 0;
4633       rem = n_th - (S * n_places);
4634       gap = rem > 0 ? n_places / rem : n_places;
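      // Worked example (illustrative): n_th = 10 threads over n_places = 4
      // places gives S = 2, rem = 2, gap = 2; the loop below then assigns
      // 3, 2, 3, 2 threads to the places in order, starting at the master's
      // place, and finishes with 'place' back at masters_place.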
4635       int place = masters_place;
4636       int gap_ct = gap;
4637       for (f = 0; f < n_th; f++) {
4638         kmp_info_t *th = team->t.t_threads[f];
4639         KMP_DEBUG_ASSERT(th != NULL);
4640 
4641         th->th.th_first_place = first_place;
4642         th->th.th_last_place = last_place;
4643         th->th.th_new_place = place;
4644         if (__kmp_display_affinity && place != th->th.th_current_place &&
4645             team->t.t_display_affinity != 1) {
4646           team->t.t_display_affinity = 1;
4647         }
4648         s_count++;
4649 
4650         if ((s_count == S) && rem && (gap_ct == gap)) {
4651           // do nothing, add an extra thread to place on next iteration
4652         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4653           // we added an extra thread to this place; move to next place
4654           if (place == last_place) {
4655             place = first_place;
4656           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4657             place = 0;
4658           } else {
4659             place++;
4660           }
4661           s_count = 0;
4662           gap_ct = 1;
4663           rem--;
4664         } else if (s_count == S) { // place full; don't add extra
4665           if (place == last_place) {
4666             place = first_place;
4667           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4668             place = 0;
4669           } else {
4670             place++;
4671           }
4672           gap_ct++;
4673           s_count = 0;
4674         }
4675 
4676         KA_TRACE(100,
4677                  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4678                   "partition = [%d,%d]\n",
4679                   __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4680                   th->th.th_new_place, first_place, last_place));
4681       }
4682       KMP_DEBUG_ASSERT(place == masters_place);
4683     }
4684   } break;
4685 
4686   case proc_bind_spread: {
4687     int f;
4688     int n_th = team->t.t_nproc;
4689     int n_places;
4690     int thidx;
4691     if (first_place <= last_place) {
4692       n_places = last_place - first_place + 1;
4693     } else {
4694       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4695     }
4696     if (n_th <= n_places) {
4697       int place = -1;
4698 
4699       if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4700         int S = n_places / n_th;
4701         int s_count, rem, gap, gap_ct;
4702 
4703         place = masters_place;
4704         rem = n_places - n_th * S;
4705         gap = rem ? n_th / rem : 1;
4706         gap_ct = gap;
4707         thidx = n_th;
4708         if (update_master_only == 1)
4709           thidx = 1;
4710         for (f = 0; f < thidx; f++) {
4711           kmp_info_t *th = team->t.t_threads[f];
4712           KMP_DEBUG_ASSERT(th != NULL);
4713 
4714           th->th.th_first_place = place;
4715           th->th.th_new_place = place;
4716           if (__kmp_display_affinity && place != th->th.th_current_place &&
4717               team->t.t_display_affinity != 1) {
4718             team->t.t_display_affinity = 1;
4719           }
4720           s_count = 1;
4721           while (s_count < S) {
4722             if (place == last_place) {
4723               place = first_place;
4724             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4725               place = 0;
4726             } else {
4727               place++;
4728             }
4729             s_count++;
4730           }
4731           if (rem && (gap_ct == gap)) {
4732             if (place == last_place) {
4733               place = first_place;
4734             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4735               place = 0;
4736             } else {
4737               place++;
4738             }
4739             rem--;
4740             gap_ct = 0;
4741           }
4742           th->th.th_last_place = place;
4743           gap_ct++;
4744 
4745           if (place == last_place) {
4746             place = first_place;
4747           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4748             place = 0;
4749           } else {
4750             place++;
4751           }
4752 
4753           KA_TRACE(100,
4754                    ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4755                     "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4756                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4757                     f, th->th.th_new_place, th->th.th_first_place,
4758                     th->th.th_last_place, __kmp_affinity_num_masks));
4759         }
4760       } else {
        /* Given a uniform space of available computation places, we can
           create T partitions of round(P/T) size and put each thread into the
           first place of its partition. */
4764         double current = static_cast<double>(masters_place);
4765         double spacing =
4766             (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
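        // Worked example (illustrative): masters_place = 0, n_places = 8,
        // n_th = 3 gives spacing = 3.0, so the partitions computed below are
        // [0,2], [3,5] and [6,7] (the last clipped to n_places - 1), and each
        // thread is bound to the first place of its partition.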
4767         int first, last;
4768         kmp_info_t *th;
4769 
4770         thidx = n_th + 1;
4771         if (update_master_only == 1)
4772           thidx = 1;
4773         for (f = 0; f < thidx; f++) {
4774           first = static_cast<int>(current);
4775           last = static_cast<int>(current + spacing) - 1;
4776           KMP_DEBUG_ASSERT(last >= first);
4777           if (first >= n_places) {
4778             if (masters_place) {
4779               first -= n_places;
4780               last -= n_places;
4781               if (first == (masters_place + 1)) {
4782                 KMP_DEBUG_ASSERT(f == n_th);
4783                 first--;
4784               }
4785               if (last == masters_place) {
4786                 KMP_DEBUG_ASSERT(f == (n_th - 1));
4787                 last--;
4788               }
4789             } else {
4790               KMP_DEBUG_ASSERT(f == n_th);
4791               first = 0;
4792               last = 0;
4793             }
4794           }
4795           if (last >= n_places) {
4796             last = (n_places - 1);
4797           }
4798           place = first;
4799           current += spacing;
4800           if (f < n_th) {
4801             KMP_DEBUG_ASSERT(0 <= first);
4802             KMP_DEBUG_ASSERT(n_places > first);
4803             KMP_DEBUG_ASSERT(0 <= last);
4804             KMP_DEBUG_ASSERT(n_places > last);
4805             KMP_DEBUG_ASSERT(last_place >= first_place);
4806             th = team->t.t_threads[f];
4807             KMP_DEBUG_ASSERT(th);
4808             th->th.th_first_place = first;
4809             th->th.th_new_place = place;
4810             th->th.th_last_place = last;
4811             if (__kmp_display_affinity && place != th->th.th_current_place &&
4812                 team->t.t_display_affinity != 1) {
4813               team->t.t_display_affinity = 1;
4814             }
4815             KA_TRACE(100,
4816                      ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4817                       "partition = [%d,%d], spacing = %.4f\n",
4818                       __kmp_gtid_from_thread(team->t.t_threads[f]),
4819                       team->t.t_id, f, th->th.th_new_place,
4820                       th->th.th_first_place, th->th.th_last_place, spacing));
4821           }
4822         }
4823       }
4824       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4825     } else {
4826       int S, rem, gap, s_count;
4827       S = n_th / n_places;
4828       s_count = 0;
4829       rem = n_th - (S * n_places);
4830       gap = rem > 0 ? n_places / rem : n_places;
4831       int place = masters_place;
4832       int gap_ct = gap;
4833       thidx = n_th;
4834       if (update_master_only == 1)
4835         thidx = 1;
4836       for (f = 0; f < thidx; f++) {
4837         kmp_info_t *th = team->t.t_threads[f];
4838         KMP_DEBUG_ASSERT(th != NULL);
4839 
4840         th->th.th_first_place = place;
4841         th->th.th_last_place = place;
4842         th->th.th_new_place = place;
4843         if (__kmp_display_affinity && place != th->th.th_current_place &&
4844             team->t.t_display_affinity != 1) {
4845           team->t.t_display_affinity = 1;
4846         }
4847         s_count++;
4848 
4849         if ((s_count == S) && rem && (gap_ct == gap)) {
4850           // do nothing, add an extra thread to place on next iteration
4851         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4852           // we added an extra thread to this place; move on to next place
4853           if (place == last_place) {
4854             place = first_place;
4855           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4856             place = 0;
4857           } else {
4858             place++;
4859           }
4860           s_count = 0;
4861           gap_ct = 1;
4862           rem--;
4863         } else if (s_count == S) { // place is full; don't add extra thread
4864           if (place == last_place) {
4865             place = first_place;
4866           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4867             place = 0;
4868           } else {
4869             place++;
4870           }
4871           gap_ct++;
4872           s_count = 0;
4873         }
4874 
4875         KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4876                        "partition = [%d,%d]\n",
4877                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4878                        team->t.t_id, f, th->th.th_new_place,
4879                        th->th.th_first_place, th->th.th_last_place));
4880       }
4881       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4882     }
4883   } break;
4884 
4885   default:
4886     break;
4887   }
4888 
4889   KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4890 }
4891 
4892 #endif // KMP_AFFINITY_SUPPORTED
4893 
4894 /* allocate a new team data structure to use.  take one off of the free pool if
4895    available */
4896 kmp_team_t *
4897 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4898 #if OMPT_SUPPORT
4899                     ompt_data_t ompt_parallel_data,
4900 #endif
4901                     kmp_proc_bind_t new_proc_bind,
4902                     kmp_internal_control_t *new_icvs,
4903                     int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4904   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4905   int f;
4906   kmp_team_t *team;
4907   int use_hot_team = !root->r.r_active;
4908   int level = 0;
4909 
4910   KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4911   KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4912   KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4913   KMP_MB();
4914 
4915 #if KMP_NESTED_HOT_TEAMS
4916   kmp_hot_team_ptr_t *hot_teams;
4917   if (master) {
4918     team = master->th.th_team;
4919     level = team->t.t_active_level;
4920     if (master->th.th_teams_microtask) { // in teams construct?
4921       if (master->th.th_teams_size.nteams > 1 &&
4922           ( // #teams > 1
4923               team->t.t_pkfn ==
4924                   (microtask_t)__kmp_teams_master || // inner fork of the teams
4925               master->th.th_teams_level <
4926                   team->t.t_level)) { // or nested parallel inside the teams
        ++level; // not incremented if #teams==1, or for the outer fork of the
        // teams; incremented otherwise
4929       }
4930     }
4931     hot_teams = master->th.th_hot_teams;
4932     if (level < __kmp_hot_teams_max_level && hot_teams &&
4933         hot_teams[level].hot_team) {
4934       // hot team has already been allocated for given level
4935       use_hot_team = 1;
4936     } else {
4937       use_hot_team = 0;
4938     }
4939   } else {
4940     // check we won't access uninitialized hot_teams, just in case
4941     KMP_DEBUG_ASSERT(new_nproc == 1);
4942   }
4943 #endif
4944   // Optimization to use a "hot" team
4945   if (use_hot_team && new_nproc > 1) {
4946     KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
4947 #if KMP_NESTED_HOT_TEAMS
4948     team = hot_teams[level].hot_team;
4949 #else
4950     team = root->r.r_hot_team;
4951 #endif
4952 #if KMP_DEBUG
4953     if (__kmp_tasking_mode != tskm_immediate_exec) {
4954       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
4955                     "task_team[1] = %p before reinit\n",
4956                     team->t.t_task_team[0], team->t.t_task_team[1]));
4957     }
4958 #endif
4959 
4960     // Has the number of threads changed?
4961     /* Let's assume the most common case is that the number of threads is
4962        unchanged, and put that case first. */
4963     if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4964       KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
4965       // This case can mean that omp_set_num_threads() was called and the hot
4966       // team size was already reduced, so we check the special flag
4967       if (team->t.t_size_changed == -1) {
4968         team->t.t_size_changed = 1;
4969       } else {
4970         KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
4971       }
4972 
4973       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4974       kmp_r_sched_t new_sched = new_icvs->sched;
4975       // set master's schedule as new run-time schedule
4976       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
4977 
4978       __kmp_reinitialize_team(team, new_icvs,
4979                               root->r.r_uber_thread->th.th_ident);
4980 
4981       KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
4982                     team->t.t_threads[0], team));
4983       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
4984 
4985 #if KMP_AFFINITY_SUPPORTED
4986       if ((team->t.t_size_changed == 0) &&
4987           (team->t.t_proc_bind == new_proc_bind)) {
4988         if (new_proc_bind == proc_bind_spread) {
4989           __kmp_partition_places(
4990               team, 1); // add flag to update only master for spread
4991         }
4992         KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
4993                        "proc_bind = %d, partition = [%d,%d]\n",
4994                        team->t.t_id, new_proc_bind, team->t.t_first_place,
4995                        team->t.t_last_place));
4996       } else {
4997         KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4998         __kmp_partition_places(team);
4999       }
5000 #else
5001       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5002 #endif /* KMP_AFFINITY_SUPPORTED */
5003     } else if (team->t.t_nproc > new_nproc) {
5004       KA_TRACE(20,
5005                ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5006                 new_nproc));
5007 
5008       team->t.t_size_changed = 1;
5009 #if KMP_NESTED_HOT_TEAMS
5010       if (__kmp_hot_teams_mode == 0) {
5011         // AC: saved number of threads should correspond to team's value in this
5012         // mode, can be bigger in mode 1, when hot team has threads in reserve
5013         KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5014         hot_teams[level].hot_team_nth = new_nproc;
5015 #endif // KMP_NESTED_HOT_TEAMS
5016         /* release the extra threads we don't need any more */
5017         for (f = new_nproc; f < team->t.t_nproc; f++) {
5018           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5019           if (__kmp_tasking_mode != tskm_immediate_exec) {
5020             // When decreasing team size, threads no longer in the team should
5021             // unref task team.
5022             team->t.t_threads[f]->th.th_task_team = NULL;
5023           }
5024           __kmp_free_thread(team->t.t_threads[f]);
5025           team->t.t_threads[f] = NULL;
5026         }
5027 #if KMP_NESTED_HOT_TEAMS
5028       } // (__kmp_hot_teams_mode == 0)
5029       else {
5030         // When keeping extra threads in team, switch threads to wait on own
5031         // b_go flag
5032         for (f = new_nproc; f < team->t.t_nproc; ++f) {
5033           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5034           kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5035           for (int b = 0; b < bs_last_barrier; ++b) {
5036             if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5037               balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5038             }
5039             KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5040           }
5041         }
5042       }
5043 #endif // KMP_NESTED_HOT_TEAMS
5044       team->t.t_nproc = new_nproc;
5045       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5046       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5047       __kmp_reinitialize_team(team, new_icvs,
5048                               root->r.r_uber_thread->th.th_ident);
5049 
5050       // Update remaining threads
5051       for (f = 0; f < new_nproc; ++f) {
5052         team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5053       }
5054 
5055       // restore the current task state of the master thread: should be the
5056       // implicit task
5057       KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5058                     team->t.t_threads[0], team));
5059 
5060       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5061 
5062 #ifdef KMP_DEBUG
5063       for (f = 0; f < team->t.t_nproc; f++) {
5064         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5065                          team->t.t_threads[f]->th.th_team_nproc ==
5066                              team->t.t_nproc);
5067       }
5068 #endif
5069 
5070       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5071 #if KMP_AFFINITY_SUPPORTED
5072       __kmp_partition_places(team);
5073 #endif
5074     } else { // team->t.t_nproc < new_nproc
5075 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5076       kmp_affin_mask_t *old_mask;
5077       if (KMP_AFFINITY_CAPABLE()) {
5078         KMP_CPU_ALLOC(old_mask);
5079       }
5080 #endif
5081 
5082       KA_TRACE(20,
5083                ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5084                 new_nproc));
5085 
5086       team->t.t_size_changed = 1;
5087 
5088 #if KMP_NESTED_HOT_TEAMS
5089       int avail_threads = hot_teams[level].hot_team_nth;
5090       if (new_nproc < avail_threads)
5091         avail_threads = new_nproc;
5092       kmp_info_t **other_threads = team->t.t_threads;
5093       for (f = team->t.t_nproc; f < avail_threads; ++f) {
5094         // Adjust barrier data of reserved threads (if any) of the team
5095         // Other data will be set in __kmp_initialize_info() below.
5096         int b;
5097         kmp_balign_t *balign = other_threads[f]->th.th_bar;
5098         for (b = 0; b < bs_last_barrier; ++b) {
5099           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5100           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5101 #if USE_DEBUGGER
5102           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5103 #endif
5104         }
5105       }
5106       if (hot_teams[level].hot_team_nth >= new_nproc) {
5107         // we have all needed threads in reserve, no need to allocate any
        // this is only possible in mode 1; mode 0 cannot have reserved threads
5109         KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5110         team->t.t_nproc = new_nproc; // just get reserved threads involved
5111       } else {
5112         // we may have some threads in reserve, but not enough
5113         team->t.t_nproc =
5114             hot_teams[level]
5115                 .hot_team_nth; // get reserved threads involved if any
5116         hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5117 #endif // KMP_NESTED_HOT_TEAMS
5118         if (team->t.t_max_nproc < new_nproc) {
5119           /* reallocate larger arrays */
5120           __kmp_reallocate_team_arrays(team, new_nproc);
5121           __kmp_reinitialize_team(team, new_icvs, NULL);
5122         }
5123 
5124 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5125         /* Temporarily set full mask for master thread before creation of
           workers. The reason is that workers inherit the affinity from the
           master, so if a lot of workers are created on a single core quickly,
           they don't get a chance to set their own affinity for a long time. */
5129         __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5130 #endif
5131 
5132         /* allocate new threads for the hot team */
5133         for (f = team->t.t_nproc; f < new_nproc; f++) {
5134           kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5135           KMP_DEBUG_ASSERT(new_worker);
5136           team->t.t_threads[f] = new_worker;
5137 
5138           KA_TRACE(20,
5139                    ("__kmp_allocate_team: team %d init T#%d arrived: "
5140                     "join=%llu, plain=%llu\n",
5141                     team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5142                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5143                     team->t.t_bar[bs_plain_barrier].b_arrived));
5144 
5145           { // Initialize barrier data for new threads.
5146             int b;
5147             kmp_balign_t *balign = new_worker->th.th_bar;
5148             for (b = 0; b < bs_last_barrier; ++b) {
5149               balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5150               KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5151                                KMP_BARRIER_PARENT_FLAG);
5152 #if USE_DEBUGGER
5153               balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5154 #endif
5155             }
5156           }
5157         }
5158 
5159 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5160         if (KMP_AFFINITY_CAPABLE()) {
5161           /* Restore initial master thread's affinity mask */
5162           __kmp_set_system_affinity(old_mask, TRUE);
5163           KMP_CPU_FREE(old_mask);
5164         }
5165 #endif
5166 #if KMP_NESTED_HOT_TEAMS
5167       } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5168 #endif // KMP_NESTED_HOT_TEAMS
      /* make sure everyone is synchronized */
5170       int old_nproc = team->t.t_nproc; // save old value and use to update only
5171       // new threads below
5172       __kmp_initialize_team(team, new_nproc, new_icvs,
5173                             root->r.r_uber_thread->th.th_ident);
5174 
5175       /* reinitialize the threads */
5176       KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5177       for (f = 0; f < team->t.t_nproc; ++f)
5178         __kmp_initialize_info(team->t.t_threads[f], team, f,
5179                               __kmp_gtid_from_tid(f, team));
5180 
5181       if (level) { // set th_task_state for new threads in nested hot team
5182         // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5183         // only need to set the th_task_state for the new threads. th_task_state
5184         // for master thread will not be accurate until after this in
5185         // __kmp_fork_call(), so we look to the master's memo_stack to get the
5186         // correct value.
5187         for (f = old_nproc; f < team->t.t_nproc; ++f)
5188           team->t.t_threads[f]->th.th_task_state =
5189               team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5190       } else { // set th_task_state for new threads in non-nested hot team
5191         kmp_uint8 old_state =
5192             team->t.t_threads[0]->th.th_task_state; // copy master's state
5193         for (f = old_nproc; f < team->t.t_nproc; ++f)
5194           team->t.t_threads[f]->th.th_task_state = old_state;
5195       }
5196 
5197 #ifdef KMP_DEBUG
5198       for (f = 0; f < team->t.t_nproc; ++f) {
5199         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5200                          team->t.t_threads[f]->th.th_team_nproc ==
5201                              team->t.t_nproc);
5202       }
5203 #endif
5204 
5205       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5206 #if KMP_AFFINITY_SUPPORTED
5207       __kmp_partition_places(team);
5208 #endif
5209     } // Check changes in number of threads
5210 
5211     kmp_info_t *master = team->t.t_threads[0];
5212     if (master->th.th_teams_microtask) {
5213       for (f = 1; f < new_nproc; ++f) {
5214         // propagate teams construct specific info to workers
5215         kmp_info_t *thr = team->t.t_threads[f];
5216         thr->th.th_teams_microtask = master->th.th_teams_microtask;
5217         thr->th.th_teams_level = master->th.th_teams_level;
5218         thr->th.th_teams_size = master->th.th_teams_size;
5219       }
5220     }
5221 #if KMP_NESTED_HOT_TEAMS
5222     if (level) {
5223       // Sync barrier state for nested hot teams, not needed for outermost hot
5224       // team.
5225       for (f = 1; f < new_nproc; ++f) {
5226         kmp_info_t *thr = team->t.t_threads[f];
5227         int b;
5228         kmp_balign_t *balign = thr->th.th_bar;
5229         for (b = 0; b < bs_last_barrier; ++b) {
5230           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5231           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5232 #if USE_DEBUGGER
5233           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5234 #endif
5235         }
5236       }
5237     }
5238 #endif // KMP_NESTED_HOT_TEAMS
5239 
5240     /* reallocate space for arguments if necessary */
5241     __kmp_alloc_argv_entries(argc, team, TRUE);
5242     KMP_CHECK_UPDATE(team->t.t_argc, argc);
5243     // The hot team re-uses the previous task team,
5244     // if untouched during the previous release->gather phase.
5245 
5246     KF_TRACE(10, (" hot_team = %p\n", team));
5247 
5248 #if KMP_DEBUG
5249     if (__kmp_tasking_mode != tskm_immediate_exec) {
5250       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5251                     "task_team[1] = %p after reinit\n",
5252                     team->t.t_task_team[0], team->t.t_task_team[1]));
5253     }
5254 #endif
5255 
5256 #if OMPT_SUPPORT
5257     __ompt_team_assign_id(team, ompt_parallel_data);
5258 #endif
5259 
5260     KMP_MB();
5261 
5262     return team;
5263   }
5264 
5265   /* next, let's try to take one from the team pool */
5266   KMP_MB();
5267   for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5268     /* TODO: consider resizing undersized teams instead of reaping them, now
5269        that we have a resizing mechanism */
5270     if (team->t.t_max_nproc >= max_nproc) {
5271       /* take this team from the team pool */
5272       __kmp_team_pool = team->t.t_next_pool;
5273 
5274       /* setup the team for fresh use */
5275       __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5276 
5277       KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5278                     "task_team[1] %p to NULL\n",
5279                     &team->t.t_task_team[0], &team->t.t_task_team[1]));
5280       team->t.t_task_team[0] = NULL;
5281       team->t.t_task_team[1] = NULL;
5282 
5283       /* reallocate space for arguments if necessary */
5284       __kmp_alloc_argv_entries(argc, team, TRUE);
5285       KMP_CHECK_UPDATE(team->t.t_argc, argc);
5286 
5287       KA_TRACE(
5288           20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5289                team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5290       { // Initialize barrier data.
5291         int b;
5292         for (b = 0; b < bs_last_barrier; ++b) {
5293           team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5294 #if USE_DEBUGGER
5295           team->t.t_bar[b].b_master_arrived = 0;
5296           team->t.t_bar[b].b_team_arrived = 0;
5297 #endif
5298         }
5299       }
5300 
5301       team->t.t_proc_bind = new_proc_bind;
5302 
5303       KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5304                     team->t.t_id));
5305 
5306 #if OMPT_SUPPORT
5307       __ompt_team_assign_id(team, ompt_parallel_data);
5308 #endif
5309 
5310       KMP_MB();
5311 
5312       return team;
5313     }
5314 
5315     /* reap team if it is too small, then loop back and check the next one */
    // not sure if this is wise, but it will be redone during the hot-teams
    // rewrite.
5318     /* TODO: Use technique to find the right size hot-team, don't reap them */
5319     team = __kmp_reap_team(team);
5320     __kmp_team_pool = team;
5321   }
5322 
5323   /* nothing available in the pool, no matter, make a new team! */
5324   KMP_MB();
5325   team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5326 
5327   /* and set it up */
5328   team->t.t_max_nproc = max_nproc;
  /* NOTE well, for some reason allocating one big buffer and dividing it up
     seems to really hurt performance a lot on the P4, so let's not use this */
5331   __kmp_allocate_team_arrays(team, max_nproc);
5332 
5333   KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5334   __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5335 
5336   KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5337                 "%p to NULL\n",
5338                 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5339   team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5340   // memory, no need to duplicate
5341   team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5342   // memory, no need to duplicate
5343 
5344   if (__kmp_storage_map) {
5345     __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5346   }
5347 
5348   /* allocate space for arguments */
5349   __kmp_alloc_argv_entries(argc, team, FALSE);
5350   team->t.t_argc = argc;
5351 
5352   KA_TRACE(20,
5353            ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5354             team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5355   { // Initialize barrier data.
5356     int b;
5357     for (b = 0; b < bs_last_barrier; ++b) {
5358       team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5359 #if USE_DEBUGGER
5360       team->t.t_bar[b].b_master_arrived = 0;
5361       team->t.t_bar[b].b_team_arrived = 0;
5362 #endif
5363     }
5364   }
5365 
5366   team->t.t_proc_bind = new_proc_bind;
5367 
5368 #if OMPT_SUPPORT
5369   __ompt_team_assign_id(team, ompt_parallel_data);
5370   team->t.ompt_serialized_team_info = NULL;
5371 #endif
5372 
5373   KMP_MB();
5374 
5375   KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5376                 team->t.t_id));
5377 
5378   return team;
5379 }
5380 
5381 /* TODO implement hot-teams at all levels */
5382 /* TODO implement lazy thread release on demand (disband request) */
5383 
5384 /* free the team.  return it to the team pool.  release all the threads
5385  * associated with it */
5386 void __kmp_free_team(kmp_root_t *root,
5387                      kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5388   int f;
5389   KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5390                 team->t.t_id));
5391 
5392   /* verify state */
5393   KMP_DEBUG_ASSERT(root);
5394   KMP_DEBUG_ASSERT(team);
5395   KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5396   KMP_DEBUG_ASSERT(team->t.t_threads);
5397 
5398   int use_hot_team = team == root->r.r_hot_team;
5399 #if KMP_NESTED_HOT_TEAMS
5400   int level;
5401   kmp_hot_team_ptr_t *hot_teams;
5402   if (master) {
5403     level = team->t.t_active_level - 1;
5404     if (master->th.th_teams_microtask) { // in teams construct?
5405       if (master->th.th_teams_size.nteams > 1) {
5406         ++level; // level was not increased in teams construct for
5407         // team_of_masters
5408       }
5409       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5410           master->th.th_teams_level == team->t.t_level) {
5411         ++level; // level was not increased in teams construct for
5412         // team_of_workers before the parallel
5413       } // team->t.t_level will be increased inside parallel
5414     }
5415     hot_teams = master->th.th_hot_teams;
5416     if (level < __kmp_hot_teams_max_level) {
5417       KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5418       use_hot_team = 1;
5419     }
5420   }
5421 #endif // KMP_NESTED_HOT_TEAMS
5422 
5423   /* team is done working */
5424   TCW_SYNC_PTR(team->t.t_pkfn,
5425                NULL); // Important for Debugging Support Library.
5426 #if KMP_OS_WINDOWS
5427   team->t.t_copyin_counter = 0; // init counter for possible reuse
5428 #endif
5429   // Do not reset pointer to parent team to NULL for hot teams.
5430 
5431   /* if we are non-hot team, release our threads */
5432   if (!use_hot_team) {
5433     if (__kmp_tasking_mode != tskm_immediate_exec) {
5434       // Wait for threads to reach reapable state
5435       for (f = 1; f < team->t.t_nproc; ++f) {
5436         KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5437         kmp_info_t *th = team->t.t_threads[f];
5438         volatile kmp_uint32 *state = &th->th.th_reap_state;
5439         while (*state != KMP_SAFE_TO_REAP) {
5440 #if KMP_OS_WINDOWS
5441           // On Windows a thread can be killed at any time, check this
5442           DWORD ecode;
5443           if (!__kmp_is_thread_alive(th, &ecode)) {
5444             *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5445             break;
5446           }
5447 #endif
5448           // first check if thread is sleeping
5449           kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5450           if (fl.is_sleeping())
5451             fl.resume(__kmp_gtid_from_thread(th));
5452           KMP_CPU_PAUSE();
5453         }
5454       }
5455 
5456       // Delete task teams
5457       int tt_idx;
5458       for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5459         kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5460         if (task_team != NULL) {
5461           for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5462             KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5463             team->t.t_threads[f]->th.th_task_team = NULL;
5464           }
5465           KA_TRACE(
5466               20,
5467               ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5468                __kmp_get_gtid(), task_team, team->t.t_id));
5469 #if KMP_NESTED_HOT_TEAMS
5470           __kmp_free_task_team(master, task_team);
5471 #endif
5472           team->t.t_task_team[tt_idx] = NULL;
5473         }
5474       }
5475     }
5476 
5477     // Reset pointer to parent team only for non-hot teams.
5478     team->t.t_parent = NULL;
5479     team->t.t_level = 0;
5480     team->t.t_active_level = 0;
5481 
5482     /* free the worker threads */
5483     for (f = 1; f < team->t.t_nproc; ++f) {
5484       KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5485       __kmp_free_thread(team->t.t_threads[f]);
5486       team->t.t_threads[f] = NULL;
5487     }
5488 
5489     /* put the team back in the team pool */
5490     /* TODO limit size of team pool, call reap_team if pool too large */
5491     team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5492     __kmp_team_pool = (volatile kmp_team_t *)team;
5493   } else { // Check if team was created for the masters in a teams construct
5494     // See if first worker is a CG root
5495     KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5496                      team->t.t_threads[1]->th.th_cg_roots);
5497     if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5498       // Clean up the CG root nodes on workers so that this team can be re-used
5499       for (f = 1; f < team->t.t_nproc; ++f) {
5500         kmp_info_t *thr = team->t.t_threads[f];
5501         KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5502                          thr->th.th_cg_roots->cg_root == thr);
5503         // Pop current CG root off list
5504         kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5505         thr->th.th_cg_roots = tmp->up;
5506         KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5507                        " up to node %p. cg_nthreads was %d\n",
5508                        thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5509         int i = tmp->cg_nthreads--;
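        // cg_nthreads-- returns the old count, so i == 1 means this thread was
        // the last member of the contention group.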
5510         if (i == 1) {
5511           __kmp_free(tmp); // free CG if we are the last thread in it
5512         }
5513         // Restore current task's thread_limit from CG root
5514         if (thr->th.th_cg_roots)
5515           thr->th.th_current_task->td_icvs.thread_limit =
5516               thr->th.th_cg_roots->cg_thread_limit;
5517       }
5518     }
5519   }
5520 
5521   KMP_MB();
5522 }
5523 
5524 /* reap the team.  destroy it, reclaim all its resources and free its memory */
5525 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5526   kmp_team_t *next_pool = team->t.t_next_pool;
5527 
5528   KMP_DEBUG_ASSERT(team);
5529   KMP_DEBUG_ASSERT(team->t.t_dispatch);
5530   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5531   KMP_DEBUG_ASSERT(team->t.t_threads);
5532   KMP_DEBUG_ASSERT(team->t.t_argv);
5533 
5534   /* TODO clean the threads that are a part of this? */
5535 
5536   /* free stuff */
5537   __kmp_free_team_arrays(team);
5538   if (team->t.t_argv != &team->t.t_inline_argv[0])
5539     __kmp_free((void *)team->t.t_argv);
5540   __kmp_free(team);
5541 
5542   KMP_MB();
5543   return next_pool;
5544 }
5545 
5546 // Free the thread.  Don't reap it, just place it on the pool of available
5547 // threads.
5548 //
5549 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5550 // binding for the affinity mechanism to be useful.
5551 //
5552 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5553 // However, we want to avoid a potential performance problem by always
5554 // scanning through the list to find the correct point at which to insert
5555 // the thread (potential N**2 behavior).  To do this we keep track of the
5556 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5557 // With single-level parallelism, threads will always be added to the tail
5558 // of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
5559 // parallelism, all bets are off and we may need to scan through the entire
5560 // free list.
5561 //
5562 // This change also has a potentially large performance benefit, for some
5563 // applications.  Previously, as threads were freed from the hot team, they
5564 // would be placed back on the free list in inverse order.  If the hot team
5565 // grew back to its original size, then the freed threads would be placed
5566 // back on the hot team in reverse order.  This could cause bad cache
5567 // locality problems on programs where the size of the hot team regularly
5568 // grew and shrunk.
5569 //
5570 // Now, for single-level parallelism, the OMP tid is always == gtid.
5571 void __kmp_free_thread(kmp_info_t *this_th) {
5572   int gtid;
5573   kmp_info_t **scan;
5574 
5575   KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5576                 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5577 
5578   KMP_DEBUG_ASSERT(this_th);
5579 
5580   // When moving thread to pool, switch thread to wait on own b_go flag, and
5581   // set its team pointer to NULL (uninitialized).
5582   int b;
5583   kmp_balign_t *balign = this_th->th.th_bar;
5584   for (b = 0; b < bs_last_barrier; ++b) {
5585     if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5586       balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5587     balign[b].bb.team = NULL;
5588     balign[b].bb.leaf_kids = 0;
5589   }
5590   this_th->th.th_task_state = 0;
5591   this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5592 
5593   /* put thread back on the free pool */
5594   TCW_PTR(this_th->th.th_team, NULL);
5595   TCW_PTR(this_th->th.th_root, NULL);
5596   TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5597 
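  // Walk this thread's contention group (CG) root list: drop this thread from
  // each node, free a node once it has no members left, and stop at the first
  // node that this thread does not own as cg_root.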
5598   while (this_th->th.th_cg_roots) {
5599     this_th->th.th_cg_roots->cg_nthreads--;
5600     KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5601                    " %p of thread  %p to %d\n",
5602                    this_th, this_th->th.th_cg_roots,
5603                    this_th->th.th_cg_roots->cg_root,
5604                    this_th->th.th_cg_roots->cg_nthreads));
5605     kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5606     if (tmp->cg_root == this_th) { // Thread is a cg_root
5607       KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5608       KA_TRACE(
5609           5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5610       this_th->th.th_cg_roots = tmp->up;
5611       __kmp_free(tmp);
5612     } else { // Worker thread
5613       if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5614         __kmp_free(tmp);
5615       }
5616       this_th->th.th_cg_roots = NULL;
5617       break;
5618     }
5619   }
5620 
5621   /* If the implicit task assigned to this thread can be used by other threads,
5622    * multiple threads may share its data and try to free the task in
5623    * __kmp_reap_thread at exit. This duplicate use of the task data is more
5624    * likely when the hot team is disabled, but it can occur even when the hot
5625    * team is enabled. */
5626   __kmp_free_implicit_task(this_th);
5627   this_th->th.th_current_task = NULL;
5628 
5629   // If the __kmp_thread_pool_insert_pt is already past the new insert
5630   // point, then we need to re-scan the entire list.
5631   gtid = this_th->th.th_info.ds.ds_gtid;
5632   if (__kmp_thread_pool_insert_pt != NULL) {
5633     KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5634     if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5635       __kmp_thread_pool_insert_pt = NULL;
5636     }
5637   }
5638 
5639   // Scan down the list to find the place to insert the thread.
5640   // scan is the address of a link in the list, possibly the address of
5641   // __kmp_thread_pool itself.
5642   //
5643   // In the absence of nested parallelism, the for loop will have 0 iterations.
5644   if (__kmp_thread_pool_insert_pt != NULL) {
5645     scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5646   } else {
5647     scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5648   }
5649   for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5650        scan = &((*scan)->th.th_next_pool))
5651     ;
5652 
5653   // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5654   // to its address.
5655   TCW_PTR(this_th->th.th_next_pool, *scan);
5656   __kmp_thread_pool_insert_pt = *scan = this_th;
5657   KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5658                    (this_th->th.th_info.ds.ds_gtid <
5659                     this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5660   TCW_4(this_th->th.th_in_pool, TRUE);
5661   __kmp_suspend_initialize_thread(this_th);
5662   __kmp_lock_suspend_mx(this_th);
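  // A thread that is still marked active keeps spinning while it sits in the
  // pool, so account for it in __kmp_thread_pool_active_nth.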
5663   if (this_th->th.th_active == TRUE) {
5664     KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5665     this_th->th.th_active_in_pool = TRUE;
5666   }
5667 #if KMP_DEBUG
5668   else {
5669     KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5670   }
5671 #endif
5672   __kmp_unlock_suspend_mx(this_th);
5673 
5674   TCW_4(__kmp_nth, __kmp_nth - 1);
5675 
5676 #ifdef KMP_ADJUST_BLOCKTIME
5677   /* Adjust blocktime back to user setting or default if necessary */
5678   /* Middle initialization might never have occurred                */
5679   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5680     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5681     if (__kmp_nth <= __kmp_avail_proc) {
5682       __kmp_zero_bt = FALSE;
5683     }
5684   }
5685 #endif /* KMP_ADJUST_BLOCKTIME */
5686 
5687   KMP_MB();
5688 }
5689 
5690 /* ------------------------------------------------------------------------ */
5691 
5692 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5693 #if OMP_PROFILING_SUPPORT
5694   ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
5695   // TODO: add a configuration option for time granularity
5696   if (ProfileTraceFile)
5697     llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
5698 #endif
5699 
5700   int gtid = this_thr->th.th_info.ds.ds_gtid;
5701   /*    void                 *stack_data;*/
5702   kmp_team_t **volatile pteam;
5703 
5704   KMP_MB();
5705   KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5706 
5707   if (__kmp_env_consistency_check) {
5708     this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5709   }
5710 
5711 #if OMPT_SUPPORT
5712   ompt_data_t *thread_data;
5713   if (ompt_enabled.enabled) {
5714     thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5715     *thread_data = ompt_data_none;
5716 
5717     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5718     this_thr->th.ompt_thread_info.wait_id = 0;
5719     this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5720     this_thr->th.ompt_thread_info.parallel_flags = 0;
5721     if (ompt_enabled.ompt_callback_thread_begin) {
5722       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5723           ompt_thread_worker, thread_data);
5724     }
5725     this_thr->th.ompt_thread_info.state = ompt_state_idle;
5726   }
5727 #endif
5728 
5729   /* This is the place where threads wait for work */
5730   while (!TCR_4(__kmp_global.g.g_done)) {
5731     KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5732     KMP_MB();
5733 
5734     /* wait for work to do */
5735     KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5736 
5737     /* No tid yet since not part of a team */
5738     __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5739 
5740 #if OMPT_SUPPORT
5741     if (ompt_enabled.enabled) {
5742       this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5743     }
5744 #endif
5745 
5746     pteam = &this_thr->th.th_team;
5747 
5748     /* have we been allocated? */
5749     if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5750       /* we were just woken up, so run our new task */
5751       if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5752         int rc;
5753         KA_TRACE(20,
5754                  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5755                   gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5756                   (*pteam)->t.t_pkfn));
5757 
5758         updateHWFPControl(*pteam);
5759 
5760 #if OMPT_SUPPORT
5761         if (ompt_enabled.enabled) {
5762           this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5763         }
5764 #endif
5765 
5766         rc = (*pteam)->t.t_invoke(gtid);
5767         KMP_ASSERT(rc);
5768 
5769         KMP_MB();
5770         KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5771                       gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5772                       (*pteam)->t.t_pkfn));
5773       }
5774 #if OMPT_SUPPORT
5775       if (ompt_enabled.enabled) {
5776         /* no frame set while outside task */
5777         __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
5778 
5779         this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5780       }
5781 #endif
5782       /* join barrier after parallel region */
5783       __kmp_join_barrier(gtid);
5784     }
5785   }
5786   TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5787 
5788 #if OMPT_SUPPORT
5789   if (ompt_enabled.ompt_callback_thread_end) {
5790     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5791   }
5792 #endif
5793 
5794   this_thr->th.th_task_team = NULL;
5795   /* run the destructors for the threadprivate data for this thread */
5796   __kmp_common_destroy_gtid(gtid);
5797 
5798   KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5799   KMP_MB();
5800 
5801 #if OMP_PROFILING_SUPPORT
5802   llvm::timeTraceProfilerFinishThread();
5803 #endif
5804   return this_thr;
5805 }
5806 
5807 /* ------------------------------------------------------------------------ */
5808 
5809 void __kmp_internal_end_dest(void *specific_gtid) {
5810   // Make sure no significant bits are lost
5811   int gtid;
5812   __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
5813 
5814   KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5815   /* NOTE: the gtid is stored as gtid+1 in the thread-local storage
5816    * because 0 is reserved for the nothing-stored case */
5817 
5818   __kmp_internal_end_thread(gtid);
5819 }
5820 
5821 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5822 
5823 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5824   __kmp_internal_end_atexit();
5825 }
5826 
5827 #endif
5828 
5829 /* [Windows] josh: when the atexit handler is called, there may still be more
5830    than one thread alive */
5831 void __kmp_internal_end_atexit(void) {
5832   KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5833   /* [Windows]
5834      josh: ideally, we want to completely shut down the library in this atexit
5835      handler, but stat code that depends on thread specific data for gtid fails
5836      because that data becomes unavailable at some point during the shutdown, so
5837      we call __kmp_internal_end_thread instead. We should eventually remove the
5838      dependency on __kmp_get_specific_gtid in the stat code and use
5839      __kmp_internal_end_library to cleanly shut down the library.
5840 
5841      // TODO: Can some of this comment about GVS be removed?
5842      I suspect that the offending stat code is executed when the calling thread
5843      tries to clean up a dead root thread's data structures, resulting in GVS
5844      code trying to close the GVS structures for that thread, but since the stat
5845      code uses __kmp_get_specific_gtid to get the gtid with the assumption that
5846      the calling thread is cleaning up itself instead of another thread, it gets
5847      confused. This happens because allowing a thread to unregister and clean up
5848      another thread is a recent modification for addressing an issue.
5849      Based on the current design (20050722), a thread may end up
5850      trying to unregister another thread only if thread death does not trigger
5851      the calling of __kmp_internal_end_thread.  For Linux* OS, there is the
5852      thread specific data destructor function to detect thread death. For
5853      Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5854      is nothing.  Thus, the workaround is applicable only for Windows static
5855      stat library. */
5856   __kmp_internal_end_library(-1);
5857 #if KMP_OS_WINDOWS
5858   __kmp_close_console();
5859 #endif
5860 }
5861 
5862 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5863   // It is assumed __kmp_forkjoin_lock is acquired.
5864 
5865   int gtid;
5866 
5867   KMP_DEBUG_ASSERT(thread != NULL);
5868 
5869   gtid = thread->th.th_info.ds.ds_gtid;
5870 
5871   if (!is_root) {
5872     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5873       /* Assume the threads are at the fork barrier here */
5874       KA_TRACE(
5875           20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5876                gtid));
5877       /* Need release fence here to prevent seg faults for tree forkjoin barrier
5878        * (GEH) */
5879       ANNOTATE_HAPPENS_BEFORE(thread);
5880       kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
5881                          thread);
5882       __kmp_release_64(&flag);
5883     }
5884 
5885     // Terminate OS thread.
5886     __kmp_reap_worker(thread);
5887 
5888     // The thread was killed asynchronously.  If it was actively
5889     // spinning in the thread pool, decrement the global count.
5890     //
5891     // There is a small timing hole here - if the worker thread was just waking
5892     // up after sleeping in the pool, had reset its th_active_in_pool flag but
5893     // not decremented the global counter __kmp_thread_pool_active_nth yet, then
5894     // the global counter might not get updated.
5895     //
5896     // Currently, this can only happen as the library is unloaded,
5897     // so there are no harmful side effects.
5898     if (thread->th.th_active_in_pool) {
5899       thread->th.th_active_in_pool = FALSE;
5900       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
5901       KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
5902     }
5903   }
5904 
5905   __kmp_free_implicit_task(thread);
5906 
5907 // Free the fast memory for tasking
5908 #if USE_FAST_MEMORY
5909   __kmp_free_fast_memory(thread);
5910 #endif /* USE_FAST_MEMORY */
5911 
5912   __kmp_suspend_uninitialize_thread(thread);
5913 
5914   KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5915   TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5916 
5917   --__kmp_all_nth;
5918   // __kmp_nth was decremented when thread is added to the pool.
5919 
5920 #ifdef KMP_ADJUST_BLOCKTIME
5921   /* Adjust blocktime back to user setting or default if necessary */
5922   /* Middle initialization might never have occurred                */
5923   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5924     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5925     if (__kmp_nth <= __kmp_avail_proc) {
5926       __kmp_zero_bt = FALSE;
5927     }
5928   }
5929 #endif /* KMP_ADJUST_BLOCKTIME */
5930 
5931   /* free the memory being used */
5932   if (__kmp_env_consistency_check) {
5933     if (thread->th.th_cons) {
5934       __kmp_free_cons_stack(thread->th.th_cons);
5935       thread->th.th_cons = NULL;
5936     }
5937   }
5938 
5939   if (thread->th.th_pri_common != NULL) {
5940     __kmp_free(thread->th.th_pri_common);
5941     thread->th.th_pri_common = NULL;
5942   }
5943 
5944   if (thread->th.th_task_state_memo_stack != NULL) {
5945     __kmp_free(thread->th.th_task_state_memo_stack);
5946     thread->th.th_task_state_memo_stack = NULL;
5947   }
5948 
5949 #if KMP_USE_BGET
5950   if (thread->th.th_local.bget_data != NULL) {
5951     __kmp_finalize_bget(thread);
5952   }
5953 #endif
5954 
5955 #if KMP_AFFINITY_SUPPORTED
5956   if (thread->th.th_affin_mask != NULL) {
5957     KMP_CPU_FREE(thread->th.th_affin_mask);
5958     thread->th.th_affin_mask = NULL;
5959   }
5960 #endif /* KMP_AFFINITY_SUPPORTED */
5961 
5962 #if KMP_USE_HIER_SCHED
5963   if (thread->th.th_hier_bar_data != NULL) {
5964     __kmp_free(thread->th.th_hier_bar_data);
5965     thread->th.th_hier_bar_data = NULL;
5966   }
5967 #endif
5968 
5969   __kmp_reap_team(thread->th.th_serial_team);
5970   thread->th.th_serial_team = NULL;
5971   __kmp_free(thread);
5972 
5973   KMP_MB();
5974 
5975 } // __kmp_reap_thread
5976 
5977 static void __kmp_internal_end(void) {
5978   int i;
5979 
5980   /* First, unregister the library */
5981   __kmp_unregister_library();
5982 
5983 #if KMP_OS_WINDOWS
5984   /* In Win static library, we can't tell when a root actually dies, so we
5985      reclaim the data structures for any root threads that have died but not
5986      unregistered themselves, in order to shut down cleanly.
5987      In Win dynamic library we also can't tell when a thread dies.  */
5988   __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
5989 // dead roots
5990 #endif
5991 
5992   for (i = 0; i < __kmp_threads_capacity; i++)
5993     if (__kmp_root[i])
5994       if (__kmp_root[i]->r.r_active)
5995         break;
5996   KMP_MB(); /* Flush all pending memory write invalidates.  */
5997   TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
5998 
5999   if (i < __kmp_threads_capacity) {
6000 #if KMP_USE_MONITOR
6001     // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6002     KMP_MB(); /* Flush all pending memory write invalidates.  */
6003 
6004     // Need to check that monitor was initialized before reaping it. If we are
6005     // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6006     // __kmp_monitor will appear to contain valid data, but it is only valid in
6007     // the parent process, not the child.
6008     // New behavior (201008): instead of keying off of the flag
6009     // __kmp_init_parallel, the monitor thread creation is keyed off
6010     // of the new flag __kmp_init_monitor.
6011     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6012     if (TCR_4(__kmp_init_monitor)) {
6013       __kmp_reap_monitor(&__kmp_monitor);
6014       TCW_4(__kmp_init_monitor, 0);
6015     }
6016     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6017     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6018 #endif // KMP_USE_MONITOR
6019   } else {
6020 /* TODO move this to cleanup code */
6021 #ifdef KMP_DEBUG
6022     /* make sure that everything has properly ended */
6023     for (i = 0; i < __kmp_threads_capacity; i++) {
6024       if (__kmp_root[i]) {
6025         //                    KMP_ASSERT( ! KMP_UBER_GTID( i ) );         // AC:
6026         //                    there can be uber threads alive here
6027         KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6028       }
6029     }
6030 #endif
6031 
6032     KMP_MB();
6033 
6034     // Reap the worker threads.
6035     // This is valid for now, but be careful if threads are reaped sooner.
6036     while (__kmp_thread_pool != NULL) { // Loop through the threads in the pool.
6037       // Get the next thread from the pool.
6038       kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6039       __kmp_thread_pool = thread->th.th_next_pool;
6040       // Reap it.
6041       KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6042       thread->th.th_next_pool = NULL;
6043       thread->th.th_in_pool = FALSE;
6044       __kmp_reap_thread(thread, 0);
6045     }
6046     __kmp_thread_pool_insert_pt = NULL;
6047 
6048     // Reap teams.
6049     while (__kmp_team_pool != NULL) { // Loop through the teams in the pool.
6050       // Get the next team from the pool.
6051       kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6052       __kmp_team_pool = team->t.t_next_pool;
6053       // Reap it.
6054       team->t.t_next_pool = NULL;
6055       __kmp_reap_team(team);
6056     }
6057 
6058     __kmp_reap_task_teams();
6059 
6060 #if KMP_OS_UNIX
6061     // Threads that are not reaped should not access any resources since they
6062     // are going to be deallocated soon, so the shutdown sequence should wait
6063     // until all threads either exit the final spin-waiting loop or begin
6064     // sleeping after the given blocktime.
6065     for (i = 0; i < __kmp_threads_capacity; i++) {
6066       kmp_info_t *thr = __kmp_threads[i];
6067       while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6068         KMP_CPU_PAUSE();
6069     }
6070 #endif
6071 
6072     for (i = 0; i < __kmp_threads_capacity; ++i) {
6073       // TBD: Add some checking...
6074       // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6075     }
6076 
6077     /* Make sure all threadprivate destructors get run by joining with all
6078        worker threads before resetting this flag */
6079     TCW_SYNC_4(__kmp_init_common, FALSE);
6080 
6081     KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6082     KMP_MB();
6083 
6084 #if KMP_USE_MONITOR
6085     // See note above: One of the possible fixes for CQ138434 / CQ140126
6086     //
6087     // FIXME: push both code fragments down and CSE them?
6088     // push them into __kmp_cleanup() ?
6089     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6090     if (TCR_4(__kmp_init_monitor)) {
6091       __kmp_reap_monitor(&__kmp_monitor);
6092       TCW_4(__kmp_init_monitor, 0);
6093     }
6094     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6095     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6096 #endif
6097   } /* else !__kmp_global.t_active */
6098   TCW_4(__kmp_init_gtid, FALSE);
6099   KMP_MB(); /* Flush all pending memory write invalidates.  */
6100 
6101   __kmp_cleanup();
6102 #if OMPT_SUPPORT
6103   ompt_fini();
6104 #endif
6105 }
6106 
6107 void __kmp_internal_end_library(int gtid_req) {
6108   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6109   /* this shouldn't be a race condition because __kmp_internal_end() is the
6110      only place to clear __kmp_serial_init */
6111   /* we'll check this later too, after we get the lock */
6112   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6113   // redundant, because the next check will work in any case.
6114   if (__kmp_global.g.g_abort) {
6115     KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6116     /* TODO abort? */
6117     return;
6118   }
6119   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6120     KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6121     return;
6122   }
6123 
6124   KMP_MB(); /* Flush all pending memory write invalidates.  */
6125   /* find out who we are and what we should do */
6126   {
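    // A negative gtid_req means the caller does not know its gtid; fall back to
    // the thread-specific value.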
6127     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6128     KA_TRACE(
6129         10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req));
6130     if (gtid == KMP_GTID_SHUTDOWN) {
6131       KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6132                     "already shutdown\n"));
6133       return;
6134     } else if (gtid == KMP_GTID_MONITOR) {
6135       KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6136                     "registered, or system shutdown\n"));
6137       return;
6138     } else if (gtid == KMP_GTID_DNE) {
6139       KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6140                     "shutdown\n"));
6141       /* we don't know who we are, but we may still shutdown the library */
6142     } else if (KMP_UBER_GTID(gtid)) {
6143       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6144       if (__kmp_root[gtid]->r.r_active) {
6145         __kmp_global.g.g_abort = -1;
6146         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6147         __kmp_unregister_library();
6148         KA_TRACE(10,
6149                  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6150                   gtid));
6151         return;
6152       } else {
6153         KA_TRACE(
6154             10,
6155             ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6156         __kmp_unregister_root_current_thread(gtid);
6157       }
6158     } else {
6159 /* worker threads may call this function through the atexit handler, if they
6160  * call exit() */
6161 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6162    TODO: do a thorough shutdown instead */
6163 #ifdef DUMP_DEBUG_ON_EXIT
6164       if (__kmp_debug_buf)
6165         __kmp_dump_debug_buffer();
6166 #endif
6167       // An unregister-library call was added here when we switched to shm on
6168       // Linux; without it, lots of files would be left in /dev/shm.
6169       // Clean up the shared memory file before exiting.
6170       __kmp_unregister_library();
6171       return;
6172     }
6173   }
6174   /* synchronize the termination process */
6175   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6176 
6177   /* have we already finished */
6178   if (__kmp_global.g.g_abort) {
6179     KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6180     /* TODO abort? */
6181     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6182     return;
6183   }
6184   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6185     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6186     return;
6187   }
6188 
6189   /* We need this lock to enforce mutex between this reading of
6190      __kmp_threads_capacity and the writing by __kmp_register_root.
6191      Alternatively, we can use a counter of roots that is atomically updated by
6192      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6193      __kmp_internal_end_*.  */
6194   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6195 
6196   /* now we can safely conduct the actual termination */
6197   __kmp_internal_end();
6198 
6199   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6200   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6201 
6202   KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6203 
6204 #ifdef DUMP_DEBUG_ON_EXIT
6205   if (__kmp_debug_buf)
6206     __kmp_dump_debug_buffer();
6207 #endif
6208 
6209 #if KMP_OS_WINDOWS
6210   __kmp_close_console();
6211 #endif
6212 
6213   __kmp_fini_allocator();
6214 
6215 } // __kmp_internal_end_library
6216 
6217 void __kmp_internal_end_thread(int gtid_req) {
6218   int i;
6219 
6220   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6221   /* this shouldn't be a race condition because __kmp_internal_end() is the
6222    * only place to clear __kmp_serial_init */
6223   /* we'll check this later too, after we get the lock */
6224   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6225   // redundant, because the next check will work in any case.
6226   if (__kmp_global.g.g_abort) {
6227     KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6228     /* TODO abort? */
6229     return;
6230   }
6231   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6232     KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6233     return;
6234   }
6235 
6236   // If hidden helper team has been initialized, we need to deinit it
6237   if (TCR_4(__kmp_init_hidden_helper)) {
6238     TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6239     // First release the main thread to let it continue its work
6240     __kmp_hidden_helper_main_thread_release();
6241     // Wait until the hidden helper team has been destroyed
6242     __kmp_hidden_helper_threads_deinitz_wait();
6243   }
6244 
6245   KMP_MB(); /* Flush all pending memory write invalidates.  */
6246 
6247   /* find out who we are and what we should do */
6248   {
6249     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6250     KA_TRACE(10,
6251              ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req));
6252     if (gtid == KMP_GTID_SHUTDOWN) {
6253       KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6254                     "already shutdown\n"));
6255       return;
6256     } else if (gtid == KMP_GTID_MONITOR) {
6257       KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6258                     "registered, or system shutdown\n"));
6259       return;
6260     } else if (gtid == KMP_GTID_DNE) {
6261       KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6262                     "shutdown\n"));
6263       return;
6264       /* we don't know who we are */
6265     } else if (KMP_UBER_GTID(gtid)) {
6266       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6267       if (__kmp_root[gtid]->r.r_active) {
6268         __kmp_global.g.g_abort = -1;
6269         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6270         KA_TRACE(10,
6271                  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6272                   gtid));
6273         return;
6274       } else {
6275         KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6276                       gtid));
6277         __kmp_unregister_root_current_thread(gtid);
6278       }
6279     } else {
6280       /* just a worker thread, let's leave */
6281       KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6282 
6283       if (gtid >= 0) {
6284         __kmp_threads[gtid]->th.th_task_team = NULL;
6285       }
6286 
6287       KA_TRACE(10,
6288                ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6289                 gtid));
6290       return;
6291     }
6292   }
6293 #if KMP_DYNAMIC_LIB
6294   if (__kmp_pause_status != kmp_hard_paused)
6295   // AC: let's not shut down the dynamic library at uber thread exit, because
6296   // it is better to shut down later in the library destructor.
6297   {
6298     KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6299     return;
6300   }
6301 #endif
6302   /* synchronize the termination process */
6303   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6304 
6305   /* have we already finished */
6306   if (__kmp_global.g.g_abort) {
6307     KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6308     /* TODO abort? */
6309     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6310     return;
6311   }
6312   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6313     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6314     return;
6315   }
6316 
6317   /* We need this lock to enforce mutex between this reading of
6318      __kmp_threads_capacity and the writing by __kmp_register_root.
6319      Alternatively, we can use a counter of roots that is atomically updated by
6320      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6321      __kmp_internal_end_*.  */
6322 
6323   /* should we finish the run-time?  are all siblings done? */
6324   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6325 
6326   for (i = 0; i < __kmp_threads_capacity; ++i) {
6327     if (KMP_UBER_GTID(i)) {
6328       KA_TRACE(
6329           10,
6330           ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6331       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6332       __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6333       return;
6334     }
6335   }
6336 
6337   /* now we can safely conduct the actual termination */
6338 
6339   __kmp_internal_end();
6340 
6341   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6342   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6343 
6344   KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6345 
6346 #ifdef DUMP_DEBUG_ON_EXIT
6347   if (__kmp_debug_buf)
6348     __kmp_dump_debug_buffer();
6349 #endif
6350 } // __kmp_internal_end_thread
6351 
6352 // -----------------------------------------------------------------------------
6353 // Library registration stuff.
6354 
6355 static long __kmp_registration_flag = 0;
6356 // Random value used to indicate library initialization.
6357 static char *__kmp_registration_str = NULL;
6358 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6359 
6360 static inline char *__kmp_reg_status_name() {
6361 /* On RHEL 3u5, if linked statically, getpid() returns different values in
6362    each thread. If registration and unregistration happen in different threads
6363    (omp_misc_other_root_exit.cpp test case), the registered_lib_env env var
6364    cannot be found, because its name will contain a different pid. */
6365 // macOS* complains about name being too long with additional getuid()
6366 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6367   return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6368                           (int)getuid());
6369 #else
6370   return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6371 #endif
6372 } // __kmp_reg_status_name
6373 
6374 void __kmp_register_library_startup(void) {
6375 
6376   char *name = __kmp_reg_status_name(); // Name of the environment variable.
6377   int done = 0;
6378   union {
6379     double dtime;
6380     long ltime;
6381   } time;
6382 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6383   __kmp_initialize_system_tick();
6384 #endif
6385   __kmp_read_system_time(&time.dtime);
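  // Build a quasi-unique flag value: a fixed 0xCAFE tag in the upper bits
  // combined with the low 16 bits of the current system time.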
6386   __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6387   __kmp_registration_str =
6388       __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6389                        __kmp_registration_flag, KMP_LIBRARY_FILE);
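  // The registration string has the form "<flag address>-<flag value>-<library
  // file>" (e.g. something like "0x7f0a12345678-cafe1234-libomp.so"); a second
  // copy of the runtime parses these '-'-separated fields below to decide
  // whether the copy that registered first is still alive.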
6390 
6391   KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6392                 __kmp_registration_str));
6393 
6394   while (!done) {
6395 
6396     char *value = NULL; // Actual value of the environment variable.
6397 
6398 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6399     char *shm_name = __kmp_str_format("/%s", name);
6400     int shm_preexist = 0;
6401     char *data1;
6402     int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6403     if ((fd1 == -1) && (errno == EEXIST)) {
6404       // file didn't open because it already exists.
6405       // try opening existing file
6406       fd1 = shm_open(shm_name, O_RDWR, 0666);
6407       if (fd1 == -1) { // file didn't open
6408         // error out here
6409         __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0),
6410                     __kmp_msg_null);
6411       } else {
6412         // able to open existing file
6413         shm_preexist = 1;
6414       }
6415     } else if (fd1 == -1) { // shm_open failed for a reason other than the
6416       // file already existing.
6417       // Error out here.
6418       __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno),
6419                   __kmp_msg_null);
6420     }
6421     if (shm_preexist == 0) {
6422       // we created the SHM; now set its size
6423       if (ftruncate(fd1, SHM_SIZE) == -1) {
6424         // error occurred while setting the size
6425         __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"),
6426                     KMP_ERR(errno), __kmp_msg_null);
6427       }
6428     }
6429     data1 =
6430         (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
6431     if (data1 == MAP_FAILED) {
6432       // failed to map shared memory
6433       __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno),
6434                   __kmp_msg_null);
6435     }
6436     if (shm_preexist == 0) { // newly created SHM: write the value into it
6437       KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6438     }
6439     // Read value from either what we just wrote or existing file.
6440     value = __kmp_str_format("%s", data1); // read value from SHM
6441     munmap(data1, SHM_SIZE);
6442     close(fd1);
6443 #else // Windows and unix with static library
6444     // Set the environment variable, but do not overwrite it if it exists.
6445     __kmp_env_set(name, __kmp_registration_str, 0);
6446     // read value to see if it got set
6447     value = __kmp_env_get(name);
6448 #endif
6449 
6450     if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6451       done = 1; // Ok, environment variable set successfully, exit the loop.
6452     } else {
6453       // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6454       // Check whether it is alive or dead.
6455       int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6456       char *tail = value;
6457       char *flag_addr_str = NULL;
6458       char *flag_val_str = NULL;
6459       char const *file_name = NULL;
6460       __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6461       __kmp_str_split(tail, '-', &flag_val_str, &tail);
6462       file_name = tail;
6463       if (tail != NULL) {
6464         long *flag_addr = 0;
6465         long flag_val = 0;
6466         KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6467         KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6468         if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6469           // First, check whether environment-encoded address is mapped into
6470           // addr space.
6471           // If so, dereference it to see if it still has the right value.
6472           if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6473             neighbor = 1;
6474           } else {
6475             // If not, then we know the other copy of the library is no longer
6476             // running.
6477             neighbor = 2;
6478           }
6479         }
6480       }
6481       switch (neighbor) {
6482       case 0: // Cannot parse environment variable -- neighbor status unknown.
6483         // Assume it is the incompatible format of a future version of the
6484         // library. Assume the other library is alive.
6485         // WARN( ... ); // TODO: Issue a warning.
6486         file_name = "unknown library";
6487         KMP_FALLTHROUGH();
6488       // Attention! Falling through to the next case. That's intentional.
6489       case 1: { // Neighbor is alive.
6490         // Check it is allowed.
6491         char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6492         if (!__kmp_str_match_true(duplicate_ok)) {
6493           // That's not allowed. Issue fatal error.
6494           __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6495                       KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6496         }
6497         KMP_INTERNAL_FREE(duplicate_ok);
6498         __kmp_duplicate_library_ok = 1;
6499         done = 1; // Exit the loop.
6500       } break;
6501       case 2: { // Neighbor is dead.
6502 
6503 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6504         // close shared memory.
6505         shm_unlink(shm_name); // this removes file in /dev/shm
6506 #else
6507         // Clear the variable and try to register library again.
6508         __kmp_env_unset(name);
6509 #endif
6510       } break;
6511       default: {
6512         KMP_DEBUG_ASSERT(0);
6513       } break;
6514       }
6515     }
6516     KMP_INTERNAL_FREE((void *)value);
6517 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6518     KMP_INTERNAL_FREE((void *)shm_name);
6519 #endif
6520   } // while
6521   KMP_INTERNAL_FREE((void *)name);
6522 
6523 } // func __kmp_register_library_startup
6524 
6525 void __kmp_unregister_library(void) {
6526 
6527   char *name = __kmp_reg_status_name();
6528   char *value = NULL;
6529 
6530 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6531   char *shm_name = __kmp_str_format("/%s", name);
6532   int fd1 = shm_open(shm_name, O_RDONLY, 0666);
6533   if (fd1 == -1) {
6534     // file did not open. return.
6535     return;
6536   }
6537   char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6538   if (data1 != MAP_FAILED) {
6539     value = __kmp_str_format("%s", data1); // read value from SHM
6540     munmap(data1, SHM_SIZE);
6541   }
6542   close(fd1);
6543 #else
6544   value = __kmp_env_get(name);
6545 #endif
6546 
6547   KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6548   KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6549   if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6550 //  Ok, this is our variable. Delete it.
6551 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6552     shm_unlink(shm_name); // this removes file in /dev/shm
6553 #else
6554     __kmp_env_unset(name);
6555 #endif
6556   }
6557 
6558 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6559   KMP_INTERNAL_FREE(shm_name);
6560 #endif
6561 
6562   KMP_INTERNAL_FREE(__kmp_registration_str);
6563   KMP_INTERNAL_FREE(value);
6564   KMP_INTERNAL_FREE(name);
6565 
6566   __kmp_registration_flag = 0;
6567   __kmp_registration_str = NULL;
6568 
6569 } // __kmp_unregister_library
6570 
6571 // End of Library registration stuff.
6572 // -----------------------------------------------------------------------------
6573 
6574 #if KMP_MIC_SUPPORTED
6575 
6576 static void __kmp_check_mic_type() {
6577   kmp_cpuid_t cpuid_state = {0};
6578   kmp_cpuid_t *cs_p = &cpuid_state;
6579   __kmp_x86_cpuid(1, 0, cs_p);
6580   // We don't support mic1 at the moment
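  // CPUID leaf 1 reports family/model/stepping in EAX; the masks below pick
  // out the family and (extended) model bits to tell Knights Corner (mic2)
  // apart from Knights Landing (mic3).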
6581   if ((cs_p->eax & 0xff0) == 0xB10) {
6582     __kmp_mic_type = mic2;
6583   } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6584     __kmp_mic_type = mic3;
6585   } else {
6586     __kmp_mic_type = non_mic;
6587   }
6588 }
6589 
6590 #endif /* KMP_MIC_SUPPORTED */
6591 
6592 #if KMP_HAVE_UMWAIT
6593 static void __kmp_user_level_mwait_init() {
6594   struct kmp_cpuid buf;
6595   __kmp_x86_cpuid(7, 0, &buf);
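  // CPUID leaf 7, sub-leaf 0: ECX bit 5 is the WAITPKG feature flag
  // (UMONITOR/UMWAIT/TPAUSE).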
6596   __kmp_umwait_enabled = ((buf.ecx >> 5) & 1) && __kmp_user_level_mwait;
6597   KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
6598                 __kmp_umwait_enabled));
6599 }
6600 #elif KMP_HAVE_MWAIT
6601 #ifndef AT_INTELPHIUSERMWAIT
6602 // Spurious, non-existent value that should always fail to return anything.
6603 // Will be replaced with the correct value once it is known.
6604 #define AT_INTELPHIUSERMWAIT 10000
6605 #endif
6606 // The getauxval() function is available in RHEL7 and SLES12. If a system
6607 // with an earlier OS is used to build the RTL, we'll use the following
6608 // internal function when the entry is not found.
6609 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
6610 unsigned long getauxval(unsigned long) { return 0; }
6611 
6612 static void __kmp_user_level_mwait_init() {
6613   // When getauxval() and the correct value of AT_INTELPHIUSERMWAIT are
6614   // available, use them to find out whether user-level mwait is enabled.
6615   // Otherwise, forcibly set __kmp_mwait_enabled=TRUE on Intel MIC if the
6616   // environment variable KMP_USER_LEVEL_MWAIT was set to TRUE.
6617   if (__kmp_mic_type == mic3) {
6618     unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
6619     if ((res & 0x1) || __kmp_user_level_mwait) {
6620       __kmp_mwait_enabled = TRUE;
6621       if (__kmp_user_level_mwait) {
6622         KMP_INFORM(EnvMwaitWarn);
6623       }
6624     } else {
6625       __kmp_mwait_enabled = FALSE;
6626     }
6627   }
6628   KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
6629                 "__kmp_mwait_enabled = %d\n",
6630                 __kmp_mic_type, __kmp_mwait_enabled));
6631 }
6632 #endif /* KMP_HAVE_UMWAIT */
6633 
6634 static void __kmp_do_serial_initialize(void) {
6635   int i, gtid;
6636   size_t size;
6637 
6638   KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6639 
6640   KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6641   KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6642   KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6643   KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6644   KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6645 
6646 #if OMPT_SUPPORT
6647   ompt_pre_init();
6648 #endif
6649 
6650   __kmp_validate_locks();
6651 
6652   /* Initialize internal memory allocator */
6653   __kmp_init_allocator();
6654 
6655   /* Register the library startup via an environment variable and check to see
6656      whether another copy of the library is already registered. */
6657 
6658   __kmp_register_library_startup();
6659 
6660   /* TODO reinitialization of library */
6661   if (TCR_4(__kmp_global.g.g_done)) {
6662     KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6663   }
6664 
6665   __kmp_global.g.g_abort = 0;
6666   TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6667 
6668 /* initialize the locks */
6669 #if KMP_USE_ADAPTIVE_LOCKS
6670 #if KMP_DEBUG_ADAPTIVE_LOCKS
6671   __kmp_init_speculative_stats();
6672 #endif
6673 #endif
6674 #if KMP_STATS_ENABLED
6675   __kmp_stats_init();
6676 #endif
6677   __kmp_init_lock(&__kmp_global_lock);
6678   __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6679   __kmp_init_lock(&__kmp_debug_lock);
6680   __kmp_init_atomic_lock(&__kmp_atomic_lock);
6681   __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6682   __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6683   __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6684   __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6685   __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6686   __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6687   __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6688   __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6689   __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6690   __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6691   __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6692   __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6693   __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6694   __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6695 #if KMP_USE_MONITOR
6696   __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6697 #endif
6698   __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6699 
6700   /* conduct initialization and initial setup of configuration */
6701 
6702   __kmp_runtime_initialize();
6703 
6704 #if KMP_MIC_SUPPORTED
6705   __kmp_check_mic_type();
6706 #endif
6707 
6708 // Some global variable initialization moved here from kmp_env_initialize()
6709 #ifdef KMP_DEBUG
6710   kmp_diag = 0;
6711 #endif
6712   __kmp_abort_delay = 0;
6713 
6714   // From __kmp_init_dflt_team_nth()
6715   /* assume the entire machine will be used */
6716   __kmp_dflt_team_nth_ub = __kmp_xproc;
6717   if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6718     __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6719   }
6720   if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6721     __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6722   }
6723   __kmp_max_nth = __kmp_sys_max_nth;
6724   __kmp_cg_max_nth = __kmp_sys_max_nth;
6725   __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6726   if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6727     __kmp_teams_max_nth = __kmp_sys_max_nth;
6728   }
6729 
6730   // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6731   // part
6732   __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6733 #if KMP_USE_MONITOR
6734   __kmp_monitor_wakeups =
6735       KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6736   __kmp_bt_intervals =
6737       KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6738 #endif
6739   // From "KMP_LIBRARY" part of __kmp_env_initialize()
6740   __kmp_library = library_throughput;
6741   // From KMP_SCHEDULE initialization
6742   __kmp_static = kmp_sch_static_balanced;
6743 // AC: do not use analytical here, because it is non-monotonous
6744 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6745 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6746 // need to repeat assignment
6747 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6748 // bit control and barrier method control parts
6749 #if KMP_FAST_REDUCTION_BARRIER
6750 #define kmp_reduction_barrier_gather_bb ((int)1)
6751 #define kmp_reduction_barrier_release_bb ((int)1)
6752 #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6753 #define kmp_reduction_barrier_release_pat bp_hyper_bar
6754 #endif // KMP_FAST_REDUCTION_BARRIER
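  // Give every barrier type the default gather/release branching factors and
  // patterns; the reduction barrier is then special-cased when fast reduction
  // barriers are enabled.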
6755   for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6756     __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6757     __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6758     __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6759     __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6760 #if KMP_FAST_REDUCTION_BARRIER
6761     if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
6762       // lin_64 ): hyper,1
6763       __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6764       __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6765       __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6766       __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6767     }
6768 #endif // KMP_FAST_REDUCTION_BARRIER
6769   }
6770 #if KMP_FAST_REDUCTION_BARRIER
6771 #undef kmp_reduction_barrier_release_pat
6772 #undef kmp_reduction_barrier_gather_pat
6773 #undef kmp_reduction_barrier_release_bb
6774 #undef kmp_reduction_barrier_gather_bb
6775 #endif // KMP_FAST_REDUCTION_BARRIER
6776 #if KMP_MIC_SUPPORTED
6777   if (__kmp_mic_type == mic2) { // KNC
6778     // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6779     __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6780     __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6781         1; // forkjoin release
6782     __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6783     __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6784   }
6785 #if KMP_FAST_REDUCTION_BARRIER
6786   if (__kmp_mic_type == mic2) { // KNC
6787     __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6788     __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6789   }
6790 #endif // KMP_FAST_REDUCTION_BARRIER
6791 #endif // KMP_MIC_SUPPORTED
6792 
6793 // From KMP_CHECKS initialization
6794 #ifdef KMP_DEBUG
6795   __kmp_env_checks = TRUE; /* development versions have the extra checks */
6796 #else
6797   __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6798 #endif
6799 
6800   // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6801   __kmp_foreign_tp = TRUE;
6802 
6803   __kmp_global.g.g_dynamic = FALSE;
6804   __kmp_global.g.g_dynamic_mode = dynamic_default;
6805 
6806   __kmp_env_initialize(NULL);
6807 
6808 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
6809   __kmp_user_level_mwait_init();
6810 #endif
6811 // Print all messages in message catalog for testing purposes.
6812 #ifdef KMP_DEBUG
6813   char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6814   if (__kmp_str_match_true(val)) {
6815     kmp_str_buf_t buffer;
6816     __kmp_str_buf_init(&buffer);
6817     __kmp_i18n_dump_catalog(&buffer);
6818     __kmp_printf("%s", buffer.str);
6819     __kmp_str_buf_free(&buffer);
6820   }
6821   __kmp_env_free(&val);
6822 #endif
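// Illustrative usage (debug builds only; exact accepted values follow
// __kmp_str_match_true()): launching a program with
//
//   KMP_DUMP_CATALOG=1
//
// makes the runtime print the entire i18n message catalog during serial
// initialization, which is handy for checking catalog consistency.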
6823 
6824   __kmp_threads_capacity =
6825       __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6826   // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6827   __kmp_tp_capacity = __kmp_default_tp_capacity(
6828       __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6829 
  // If the library was shut down properly, these pools must be NULL. Just in
  // case, set them to NULL -- some memory may leak, but subsequent code will
  // still work even if the pools were not freed.
6833   KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6834   KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6835   KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6836   __kmp_thread_pool = NULL;
6837   __kmp_thread_pool_insert_pt = NULL;
6838   __kmp_team_pool = NULL;
6839 
6840   /* Allocate all of the variable sized records */
6841   /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6842    * expandable */
6843   /* Since allocation is cache-aligned, just add extra padding at the end */
6844   size =
6845       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6846       CACHE_LINE;
6847   __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6848   __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6849                                sizeof(kmp_info_t *) * __kmp_threads_capacity);
6850 
6851   /* init thread counts */
6852   KMP_DEBUG_ASSERT(__kmp_all_nth ==
6853                    0); // Asserts fail if the library is reinitializing and
6854   KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
6855   __kmp_all_nth = 0;
6856   __kmp_nth = 0;
6857 
6858   /* setup the uber master thread and hierarchy */
6859   gtid = __kmp_register_root(TRUE);
6860   KA_TRACE(10, ("__kmp_do_serial_initialize  T#%d\n", gtid));
6861   KMP_ASSERT(KMP_UBER_GTID(gtid));
6862   KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6863 
6864   KMP_MB(); /* Flush all pending memory write invalidates.  */
6865 
6866   __kmp_common_initialize();
6867 
6868 #if KMP_OS_UNIX
6869   /* invoke the child fork handler */
6870   __kmp_register_atfork();
6871 #endif
6872 
6873 #if !KMP_DYNAMIC_LIB
6874   {
6875     /* Invoke the exit handler when the program finishes, only for static
6876        library. For dynamic library, we already have _fini and DllMain. */
6877     int rc = atexit(__kmp_internal_end_atexit);
6878     if (rc != 0) {
6879       __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6880                   __kmp_msg_null);
6881     }
6882   }
6883 #endif
6884 
6885 #if KMP_HANDLE_SIGNALS
6886 #if KMP_OS_UNIX
6887   /* NOTE: make sure that this is called before the user installs their own
6888      signal handlers so that the user handlers are called first. this way they
6889      can return false, not call our handler, avoid terminating the library, and
6890      continue execution where they left off. */
6891   __kmp_install_signals(FALSE);
6892 #endif /* KMP_OS_UNIX */
6893 #if KMP_OS_WINDOWS
6894   __kmp_install_signals(TRUE);
6895 #endif /* KMP_OS_WINDOWS */
6896 #endif
6897 
6898   /* we have finished the serial initialization */
6899   __kmp_init_counter++;
6900 
6901   __kmp_init_serial = TRUE;
6902 
6903   if (__kmp_settings) {
6904     __kmp_env_print();
6905   }
6906 
6907   if (__kmp_display_env || __kmp_display_env_verbose) {
6908     __kmp_env_print_2();
6909   }
6910 
6911 #if OMPT_SUPPORT
6912   ompt_post_init();
6913 #endif
6914 
6915   KMP_MB();
6916 
6917   KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
6918 }
6919 
6920 void __kmp_serial_initialize(void) {
6921   if (__kmp_init_serial) {
6922     return;
6923   }
6924   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6925   if (__kmp_init_serial) {
6926     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6927     return;
6928   }
6929   __kmp_do_serial_initialize();
6930   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6931 }
6932 
6933 static void __kmp_do_middle_initialize(void) {
6934   int i, j;
6935   int prev_dflt_team_nth;
6936 
6937   if (!__kmp_init_serial) {
6938     __kmp_do_serial_initialize();
6939   }
6940 
6941   KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
6942 
6943   // Save the previous value for the __kmp_dflt_team_nth so that
6944   // we can avoid some reinitialization if it hasn't changed.
6945   prev_dflt_team_nth = __kmp_dflt_team_nth;
6946 
6947 #if KMP_AFFINITY_SUPPORTED
6948   // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6949   // number of cores on the machine.
6950   __kmp_affinity_initialize();
6951 
6952   // Run through the __kmp_threads array and set the affinity mask
6953   // for each root thread that is currently registered with the RTL.
6954   for (i = 0; i < __kmp_threads_capacity; i++) {
6955     if (TCR_PTR(__kmp_threads[i]) != NULL) {
6956       __kmp_affinity_set_init_mask(i, TRUE);
6957     }
6958   }
6959 #endif /* KMP_AFFINITY_SUPPORTED */
6960 
6961   KMP_ASSERT(__kmp_xproc > 0);
6962   if (__kmp_avail_proc == 0) {
6963     __kmp_avail_proc = __kmp_xproc;
6964   }
6965 
6966   // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
6967   // correct them now
6968   j = 0;
6969   while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
6970     __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
6971         __kmp_avail_proc;
6972     j++;
6973   }
6974 
6975   if (__kmp_dflt_team_nth == 0) {
6976 #ifdef KMP_DFLT_NTH_CORES
6977     // Default #threads = #cores
6978     __kmp_dflt_team_nth = __kmp_ncores;
6979     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6980                   "__kmp_ncores (%d)\n",
6981                   __kmp_dflt_team_nth));
6982 #else
6983     // Default #threads = #available OS procs
6984     __kmp_dflt_team_nth = __kmp_avail_proc;
6985     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6986                   "__kmp_avail_proc(%d)\n",
6987                   __kmp_dflt_team_nth));
6988 #endif /* KMP_DFLT_NTH_CORES */
6989   }
6990 
6991   if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
6992     __kmp_dflt_team_nth = KMP_MIN_NTH;
6993   }
6994   if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
6995     __kmp_dflt_team_nth = __kmp_sys_max_nth;
6996   }
6997 
6998   // There's no harm in continuing if the following check fails,
6999   // but it indicates an error in the previous logic.
7000   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7001 
7002   if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7003     // Run through the __kmp_threads array and set the num threads icv for each
7004     // root thread that is currently registered with the RTL (which has not
7005     // already explicitly set its nthreads-var with a call to
7006     // omp_set_num_threads()).
7007     for (i = 0; i < __kmp_threads_capacity; i++) {
7008       kmp_info_t *thread = __kmp_threads[i];
7009       if (thread == NULL)
7010         continue;
7011       if (thread->th.th_current_task->td_icvs.nproc != 0)
7012         continue;
7013 
7014       set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7015     }
7016   }
7017   KA_TRACE(
7018       20,
7019       ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7020        __kmp_dflt_team_nth));
7021 
7022 #ifdef KMP_ADJUST_BLOCKTIME
7023   /* Adjust blocktime to zero if necessary  now that __kmp_avail_proc is set */
7024   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7025     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7026     if (__kmp_nth > __kmp_avail_proc) {
7027       __kmp_zero_bt = TRUE;
7028     }
7029   }
7030 #endif /* KMP_ADJUST_BLOCKTIME */
7031 
7032   /* we have finished middle initialization */
7033   TCW_SYNC_4(__kmp_init_middle, TRUE);
7034 
7035   KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7036 }
7037 
7038 void __kmp_middle_initialize(void) {
7039   if (__kmp_init_middle) {
7040     return;
7041   }
7042   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7043   if (__kmp_init_middle) {
7044     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7045     return;
7046   }
7047   __kmp_do_middle_initialize();
7048   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7049 }
7050 
7051 void __kmp_parallel_initialize(void) {
7052   int gtid = __kmp_entry_gtid(); // this might be a new root
7053 
7054   /* synchronize parallel initialization (for sibling) */
7055   if (TCR_4(__kmp_init_parallel))
7056     return;
7057   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7058   if (TCR_4(__kmp_init_parallel)) {
7059     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7060     return;
7061   }
7062 
7063   /* TODO reinitialization after we have already shut down */
7064   if (TCR_4(__kmp_global.g.g_done)) {
7065     KA_TRACE(
7066         10,
7067         ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7068     __kmp_infinite_loop();
7069   }
7070 
  /* jc: The lock __kmp_initz_lock is already held, so calling
     __kmp_middle_initialize would cause a deadlock.  So we call
     __kmp_do_middle_initialize directly. */
7074   if (!__kmp_init_middle) {
7075     __kmp_do_middle_initialize();
7076   }
7077   __kmp_resume_if_hard_paused();
7078 
7079   /* begin initialization */
7080   KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7081   KMP_ASSERT(KMP_UBER_GTID(gtid));
7082 
7083 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7084   // Save the FP control regs.
7085   // Worker threads will set theirs to these values at thread startup.
7086   __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7087   __kmp_store_mxcsr(&__kmp_init_mxcsr);
7088   __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7089 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7090 
7091 #if KMP_OS_UNIX
7092 #if KMP_HANDLE_SIGNALS
7093   /*  must be after __kmp_serial_initialize  */
7094   __kmp_install_signals(TRUE);
7095 #endif
7096 #endif
7097 
7098   __kmp_suspend_initialize();
7099 
7100 #if defined(USE_LOAD_BALANCE)
7101   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7102     __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7103   }
7104 #else
7105   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7106     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7107   }
7108 #endif
7109 
7110   if (__kmp_version) {
7111     __kmp_print_version_2();
7112   }
7113 
7114   /* we have finished parallel initialization */
7115   TCW_SYNC_4(__kmp_init_parallel, TRUE);
7116 
7117   KMP_MB();
7118   KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7119 
7120   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7121 }
7122 
7123 void __kmp_hidden_helper_initialize() {
7124   if (TCR_4(__kmp_init_hidden_helper))
7125     return;
7126 
7127   // __kmp_parallel_initialize is required before we initialize hidden helper
7128   if (!TCR_4(__kmp_init_parallel))
7129     __kmp_parallel_initialize();
7130 
  // Double check. Note that this double check should not be placed before
  // __kmp_parallel_initialize as it will cause a deadlock.
7133   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7134   if (TCR_4(__kmp_init_hidden_helper)) {
7135     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7136     return;
7137   }
7138 
7139   // Set the count of hidden helper tasks to be executed to zero
7140   KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7141 
7142   // Set the global variable indicating that we're initializing hidden helper
7143   // team/threads
7144   TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7145 
7146   // Platform independent initialization
7147   __kmp_do_initialize_hidden_helper_threads();
7148 
7149   // Wait here for the finish of initialization of hidden helper teams
7150   __kmp_hidden_helper_threads_initz_wait();
7151 
7152   // We have finished hidden helper initialization
7153   TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7154 
7155   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7156 }
7157 
7158 /* ------------------------------------------------------------------------ */
7159 
7160 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7161                                    kmp_team_t *team) {
7162   kmp_disp_t *dispatch;
7163 
7164   KMP_MB();
7165 
7166   /* none of the threads have encountered any constructs, yet. */
7167   this_thr->th.th_local.this_construct = 0;
7168 #if KMP_CACHE_MANAGE
7169   KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7170 #endif /* KMP_CACHE_MANAGE */
7171   dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7172   KMP_DEBUG_ASSERT(dispatch);
7173   KMP_DEBUG_ASSERT(team->t.t_dispatch);
7174   // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7175   // this_thr->th.th_info.ds.ds_tid ] );
7176 
7177   dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7178   dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7179   if (__kmp_env_consistency_check)
7180     __kmp_push_parallel(gtid, team->t.t_ident);
7181 
7182   KMP_MB(); /* Flush all pending memory write invalidates.  */
7183 }
7184 
7185 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7186                                   kmp_team_t *team) {
7187   if (__kmp_env_consistency_check)
7188     __kmp_pop_parallel(gtid, team->t.t_ident);
7189 
7190   __kmp_finish_implicit_task(this_thr);
7191 }
7192 
7193 int __kmp_invoke_task_func(int gtid) {
7194   int rc;
7195   int tid = __kmp_tid_from_gtid(gtid);
7196   kmp_info_t *this_thr = __kmp_threads[gtid];
7197   kmp_team_t *team = this_thr->th.th_team;
7198 
7199   __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7200 #if USE_ITT_BUILD
7201   if (__itt_stack_caller_create_ptr) {
7202     __kmp_itt_stack_callee_enter(
7203         (__itt_caller)
7204             team->t.t_stack_id); // inform ittnotify about entering user's code
7205   }
7206 #endif /* USE_ITT_BUILD */
7207 #if INCLUDE_SSC_MARKS
7208   SSC_MARK_INVOKING();
7209 #endif
7210 
7211 #if OMPT_SUPPORT
7212   void *dummy;
7213   void **exit_frame_p;
7214   ompt_data_t *my_task_data;
7215   ompt_data_t *my_parallel_data;
7216   int ompt_team_size;
7217 
7218   if (ompt_enabled.enabled) {
7219     exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
7220                          .ompt_task_info.frame.exit_frame.ptr);
7221   } else {
7222     exit_frame_p = &dummy;
7223   }
7224 
7225   my_task_data =
7226       &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7227   my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7228   if (ompt_enabled.ompt_callback_implicit_task) {
7229     ompt_team_size = team->t.t_nproc;
7230     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7231         ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7232         __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7233     OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7234   }
7235 #endif
7236 
7237 #if KMP_STATS_ENABLED
7238   stats_state_e previous_state = KMP_GET_THREAD_STATE();
7239   if (previous_state == stats_state_e::TEAMS_REGION) {
7240     KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7241   } else {
7242     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7243   }
7244   KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7245 #endif
7246 
7247   rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7248                               tid, (int)team->t.t_argc, (void **)team->t.t_argv
7249 #if OMPT_SUPPORT
7250                               ,
7251                               exit_frame_p
7252 #endif
7253   );
7254 #if OMPT_SUPPORT
7255   *exit_frame_p = NULL;
7256   this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7257 #endif
7258 
7259 #if KMP_STATS_ENABLED
7260   if (previous_state == stats_state_e::TEAMS_REGION) {
7261     KMP_SET_THREAD_STATE(previous_state);
7262   }
7263   KMP_POP_PARTITIONED_TIMER();
7264 #endif
7265 
7266 #if USE_ITT_BUILD
7267   if (__itt_stack_caller_create_ptr) {
7268     __kmp_itt_stack_callee_leave(
7269         (__itt_caller)
7270             team->t.t_stack_id); // inform ittnotify about leaving user's code
7271   }
7272 #endif /* USE_ITT_BUILD */
7273   __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7274 
7275   return rc;
7276 }
7277 
7278 void __kmp_teams_master(int gtid) {
  // This routine is called by all master threads in the teams construct
7280   kmp_info_t *thr = __kmp_threads[gtid];
7281   kmp_team_t *team = thr->th.th_team;
7282   ident_t *loc = team->t.t_ident;
7283   thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7284   KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7285   KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7286   KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7287                 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7288 
7289   // This thread is a new CG root.  Set up the proper variables.
7290   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7291   tmp->cg_root = thr; // Make thr the CG root
7292   // Init to thread limit that was stored when league masters were forked
7293   tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7294   tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7295   KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7296                  " cg_nthreads to 1\n",
7297                  thr, tmp));
7298   tmp->up = thr->th.th_cg_roots;
7299   thr->th.th_cg_roots = tmp;
7300 
// Launch the league of teams now, but do not let workers execute
// (they hang on the fork barrier until the next parallel region)
7303 #if INCLUDE_SSC_MARKS
7304   SSC_MARK_FORKING();
7305 #endif
7306   __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7307                   (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7308                   VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7309 #if INCLUDE_SSC_MARKS
7310   SSC_MARK_JOINING();
7311 #endif
7312   // If the team size was reduced from the limit, set it to the new size
7313   if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7314     thr->th.th_teams_size.nth = thr->th.th_team_nproc;
  // AC: last parameter "1" eliminates the join barrier, which won't work
  // because worker threads are in a fork barrier waiting for more parallel
  // regions
7317   __kmp_join_call(loc, gtid
7318 #if OMPT_SUPPORT
7319                   ,
7320                   fork_context_intel
7321 #endif
7322                   ,
7323                   1);
7324 }
7325 
7326 int __kmp_invoke_teams_master(int gtid) {
7327   kmp_info_t *this_thr = __kmp_threads[gtid];
7328   kmp_team_t *team = this_thr->th.th_team;
7329 #if KMP_DEBUG
7330   if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7331     KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7332                      (void *)__kmp_teams_master);
7333 #endif
7334   __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7335 #if OMPT_SUPPORT
7336   int tid = __kmp_tid_from_gtid(gtid);
7337   ompt_data_t *task_data =
7338       &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7339   ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7340   if (ompt_enabled.ompt_callback_implicit_task) {
7341     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7342         ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7343         ompt_task_initial);
7344     OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7345   }
7346 #endif
7347   __kmp_teams_master(gtid);
7348 #if OMPT_SUPPORT
7349   this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7350 #endif
7351   __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7352   return 1;
7353 }
7354 
/* This sets the requested number of threads for the next parallel region
   encountered by this team. Since this should be enclosed in the forkjoin
   critical section, it should avoid race conditions with asymmetrical nested
   parallelism. */
7359 
7360 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7361   kmp_info_t *thr = __kmp_threads[gtid];
7362 
7363   if (num_threads > 0)
7364     thr->th.th_set_nproc = num_threads;
7365 }
7366 
7367 static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
7368                                     int num_threads) {
7369   KMP_DEBUG_ASSERT(thr);
7370   // Remember the number of threads for inner parallel regions
7371   if (!TCR_4(__kmp_init_middle))
7372     __kmp_middle_initialize(); // get internal globals calculated
7373   KMP_DEBUG_ASSERT(__kmp_avail_proc);
7374   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7375 
7376   if (num_threads == 0) {
7377     if (__kmp_teams_thread_limit > 0) {
7378       num_threads = __kmp_teams_thread_limit;
7379     } else {
7380       num_threads = __kmp_avail_proc / num_teams;
7381     }
    // Adjust num_threads w/o warning as it is not a user setting.
    // num_threads = min(num_threads, nthreads-var, thread-limit-var)
    // No thread_limit clause specified - do not change thread-limit-var ICV.
7385     if (num_threads > __kmp_dflt_team_nth) {
7386       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7387     }
7388     if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7389       num_threads = thr->th.th_current_task->td_icvs.thread_limit;
    } // prevent team size from exceeding thread-limit-var
7391     if (num_teams * num_threads > __kmp_teams_max_nth) {
7392       num_threads = __kmp_teams_max_nth / num_teams;
7393     }
7394     if (num_threads == 0) {
7395       num_threads = 1;
7396     }
7397   } else {
7398     // This thread will be the master of the league masters
7399     // Store new thread limit; old limit is saved in th_cg_roots list
7400     thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7401     // num_threads = min(num_threads, nthreads-var)
7402     if (num_threads > __kmp_dflt_team_nth) {
7403       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7404     }
7405     if (num_teams * num_threads > __kmp_teams_max_nth) {
7406       int new_threads = __kmp_teams_max_nth / num_teams;
7407       if (new_threads == 0) {
7408         new_threads = 1;
7409       }
7410       if (new_threads != num_threads) {
7411         if (!__kmp_reserve_warn) { // user asked for too many threads
7412           __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7413           __kmp_msg(kmp_ms_warning,
7414                     KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7415                     KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7416         }
7417       }
7418       num_threads = new_threads;
7419     }
7420   }
7421   thr->th.th_teams_size.nth = num_threads;
7422 }
7423 
7424 /* this sets the requested number of teams for the teams region and/or
7425    the number of threads for the next parallel region encountered  */
7426 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7427                           int num_threads) {
7428   kmp_info_t *thr = __kmp_threads[gtid];
7429   KMP_DEBUG_ASSERT(num_teams >= 0);
7430   KMP_DEBUG_ASSERT(num_threads >= 0);
7431 
7432   if (num_teams == 0) {
7433     if (__kmp_nteams > 0) {
7434       num_teams = __kmp_nteams;
7435     } else {
7436       num_teams = 1; // default number of teams is 1.
7437     }
7438   }
7439   if (num_teams > __kmp_teams_max_nth) { // if too many teams requested?
7440     if (!__kmp_reserve_warn) {
7441       __kmp_reserve_warn = 1;
7442       __kmp_msg(kmp_ms_warning,
7443                 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7444                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7445     }
7446     num_teams = __kmp_teams_max_nth;
7447   }
7448   // Set number of teams (number of threads in the outer "parallel" of the
7449   // teams)
7450   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7451 
7452   __kmp_push_thread_limit(thr, num_teams, num_threads);
7453 }
7454 
7455 /* This sets the requested number of teams for the teams region and/or
7456    the number of threads for the next parallel region encountered  */
7457 void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
7458                              int num_teams_ub, int num_threads) {
7459   kmp_info_t *thr = __kmp_threads[gtid];
7460   KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
7461   KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
7462   KMP_DEBUG_ASSERT(num_threads >= 0);
7463 
7464   if (num_teams_lb > num_teams_ub) {
7465     __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
7466                 KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
7467   }
7468 
  int num_teams = 1; // default number of teams is 1.
7470 
7471   if (num_teams_lb == 0 && num_teams_ub > 0)
7472     num_teams_lb = num_teams_ub;
7473 
7474   if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
7475     num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
7476     if (num_teams > __kmp_teams_max_nth) {
7477       if (!__kmp_reserve_warn) {
7478         __kmp_reserve_warn = 1;
7479         __kmp_msg(kmp_ms_warning,
7480                   KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7481                   KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7482       }
7483       num_teams = __kmp_teams_max_nth;
7484     }
7485   } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
7486     num_teams = num_teams_ub;
7487   } else { // num_teams_lb <= num_teams <= num_teams_ub
7488     if (num_threads == 0) {
7489       if (num_teams_ub > __kmp_teams_max_nth) {
7490         num_teams = num_teams_lb;
7491       } else {
7492         num_teams = num_teams_ub;
7493       }
7494     } else {
7495       num_teams = (num_threads > __kmp_teams_max_nth)
7496                       ? num_teams
7497                       : __kmp_teams_max_nth / num_threads;
7498       if (num_teams < num_teams_lb) {
7499         num_teams = num_teams_lb;
7500       } else if (num_teams > num_teams_ub) {
7501         num_teams = num_teams_ub;
7502       }
7503     }
7504   }
7505   // Set number of teams (number of threads in the outer "parallel" of the
7506   // teams)
7507   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7508 
7509   __kmp_push_thread_limit(thr, num_teams, num_threads);
7510 }
7511 
7512 // Set the proc_bind var to use in the following parallel region.
7513 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7514   kmp_info_t *thr = __kmp_threads[gtid];
7515   thr->th.th_set_proc_bind = proc_bind;
7516 }
7517 
7518 /* Launch the worker threads into the microtask. */
7519 
7520 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7521   kmp_info_t *this_thr = __kmp_threads[gtid];
7522 
7523 #ifdef KMP_DEBUG
7524   int f;
7525 #endif /* KMP_DEBUG */
7526 
7527   KMP_DEBUG_ASSERT(team);
7528   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7529   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7530   KMP_MB(); /* Flush all pending memory write invalidates.  */
7531 
7532   team->t.t_construct = 0; /* no single directives seen yet */
7533   team->t.t_ordered.dt.t_value =
7534       0; /* thread 0 enters the ordered section first */
7535 
7536   /* Reset the identifiers on the dispatch buffer */
7537   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7538   if (team->t.t_max_nproc > 1) {
7539     int i;
7540     for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7541       team->t.t_disp_buffer[i].buffer_index = i;
7542       team->t.t_disp_buffer[i].doacross_buf_idx = i;
7543     }
7544   } else {
7545     team->t.t_disp_buffer[0].buffer_index = 0;
7546     team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7547   }
7548 
7549   KMP_MB(); /* Flush all pending memory write invalidates.  */
7550   KMP_ASSERT(this_thr->th.th_team == team);
7551 
7552 #ifdef KMP_DEBUG
7553   for (f = 0; f < team->t.t_nproc; f++) {
7554     KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7555                      team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7556   }
7557 #endif /* KMP_DEBUG */
7558 
7559   /* release the worker threads so they may begin working */
7560   __kmp_fork_barrier(gtid, 0);
7561 }
7562 
7563 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7564   kmp_info_t *this_thr = __kmp_threads[gtid];
7565 
7566   KMP_DEBUG_ASSERT(team);
7567   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7568   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7569   KMP_MB(); /* Flush all pending memory write invalidates.  */
7570 
7571   /* Join barrier after fork */
7572 
7573 #ifdef KMP_DEBUG
7574   if (__kmp_threads[gtid] &&
7575       __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7576     __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7577                  __kmp_threads[gtid]);
7578     __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7579                  "team->t.t_nproc=%d\n",
7580                  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7581                  team->t.t_nproc);
7582     __kmp_print_structure();
7583   }
7584   KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7585                    __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7586 #endif /* KMP_DEBUG */
7587 
7588   __kmp_join_barrier(gtid); /* wait for everyone */
7589 #if OMPT_SUPPORT
7590   if (ompt_enabled.enabled &&
7591       this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7592     int ds_tid = this_thr->th.th_info.ds.ds_tid;
7593     ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7594     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7595 #if OMPT_OPTIONAL
7596     void *codeptr = NULL;
7597     if (KMP_MASTER_TID(ds_tid) &&
7598         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7599          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7600       codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7601 
7602     if (ompt_enabled.ompt_callback_sync_region_wait) {
7603       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7604           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7605           codeptr);
7606     }
7607     if (ompt_enabled.ompt_callback_sync_region) {
7608       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7609           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7610           codeptr);
7611     }
7612 #endif
7613     if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7614       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7615           ompt_scope_end, NULL, task_data, 0, ds_tid,
7616           ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7617     }
7618   }
7619 #endif
7620 
7621   KMP_MB(); /* Flush all pending memory write invalidates.  */
7622   KMP_ASSERT(this_thr->th.th_team == team);
7623 }
7624 
7625 /* ------------------------------------------------------------------------ */
7626 
7627 #ifdef USE_LOAD_BALANCE
7628 
7629 // Return the worker threads actively spinning in the hot team, if we
7630 // are at the outermost level of parallelism.  Otherwise, return 0.
7631 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7632   int i;
7633   int retval;
7634   kmp_team_t *hot_team;
7635 
7636   if (root->r.r_active) {
7637     return 0;
7638   }
7639   hot_team = root->r.r_hot_team;
7640   if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7641     return hot_team->t.t_nproc - 1; // Don't count master thread
7642   }
7643 
7644   // Skip the master thread - it is accounted for elsewhere.
7645   retval = 0;
7646   for (i = 1; i < hot_team->t.t_nproc; i++) {
7647     if (hot_team->t.t_threads[i]->th.th_active) {
7648       retval++;
7649     }
7650   }
7651   return retval;
7652 }
7653 
7654 // Perform an automatic adjustment to the number of
7655 // threads used by the next parallel region.
7656 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7657   int retval;
7658   int pool_active;
7659   int hot_team_active;
7660   int team_curr_active;
7661   int system_active;
7662 
7663   KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7664                 set_nproc));
7665   KMP_DEBUG_ASSERT(root);
7666   KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7667                        ->th.th_current_task->td_icvs.dynamic == TRUE);
7668   KMP_DEBUG_ASSERT(set_nproc > 1);
7669 
7670   if (set_nproc == 1) {
7671     KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7672     return 1;
7673   }
7674 
  // Threads that are active in the thread pool, active in the hot team for
  // this particular root (if we are at the outer par level), and the currently
  // executing thread (to become the master) are available to add to the new
  // team, but are currently contributing to the system load, and must be
  // accounted for.
7680   pool_active = __kmp_thread_pool_active_nth;
7681   hot_team_active = __kmp_active_hot_team_nproc(root);
7682   team_curr_active = pool_active + hot_team_active + 1;
7683 
7684   // Check the system load.
7685   system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7686   KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7687                 "hot team active = %d\n",
7688                 system_active, pool_active, hot_team_active));
7689 
7690   if (system_active < 0) {
7691     // There was an error reading the necessary info from /proc, so use the
7692     // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7693     // = dynamic_thread_limit, we shouldn't wind up getting back here.
7694     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7695     KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7696 
7697     // Make this call behave like the thread limit algorithm.
7698     retval = __kmp_avail_proc - __kmp_nth +
7699              (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7700     if (retval > set_nproc) {
7701       retval = set_nproc;
7702     }
7703     if (retval < KMP_MIN_NTH) {
7704       retval = KMP_MIN_NTH;
7705     }
7706 
7707     KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7708                   retval));
7709     return retval;
7710   }
7711 
  // There is a slight delay in the load balance algorithm in detecting new
  // running procs. The real system load at this instant should be at least as
  // large as the number of active OMP threads available to add to the team.
7715   if (system_active < team_curr_active) {
7716     system_active = team_curr_active;
7717   }
7718   retval = __kmp_avail_proc - system_active + team_curr_active;
7719   if (retval > set_nproc) {
7720     retval = set_nproc;
7721   }
7722   if (retval < KMP_MIN_NTH) {
7723     retval = KMP_MIN_NTH;
7724   }
7725 
7726   KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7727   return retval;
7728 } // __kmp_load_balance_nproc()
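// Worked example (illustrative values only): with __kmp_avail_proc == 8, one
// active thread in the pool, and two active workers in the hot team,
// team_curr_active == 1 + 2 + 1 == 4. If __kmp_get_load_balance() reports
// system_active == 5, then
//
//   retval = 8 - 5 + 4 = 7
//
// which is then clipped to set_nproc (and raised to KMP_MIN_NTH if needed), so
// a request for 8 threads yields 7 under this load.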
7729 
7730 #endif /* USE_LOAD_BALANCE */
7731 
7732 /* ------------------------------------------------------------------------ */
7733 
7734 /* NOTE: this is called with the __kmp_init_lock held */
7735 void __kmp_cleanup(void) {
7736   int f;
7737 
7738   KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7739 
7740   if (TCR_4(__kmp_init_parallel)) {
7741 #if KMP_HANDLE_SIGNALS
7742     __kmp_remove_signals();
7743 #endif
7744     TCW_4(__kmp_init_parallel, FALSE);
7745   }
7746 
7747   if (TCR_4(__kmp_init_middle)) {
7748 #if KMP_AFFINITY_SUPPORTED
7749     __kmp_affinity_uninitialize();
7750 #endif /* KMP_AFFINITY_SUPPORTED */
7751     __kmp_cleanup_hierarchy();
7752     TCW_4(__kmp_init_middle, FALSE);
7753   }
7754 
7755   KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7756 
7757   if (__kmp_init_serial) {
7758     __kmp_runtime_destroy();
7759     __kmp_init_serial = FALSE;
7760   }
7761 
7762   __kmp_cleanup_threadprivate_caches();
7763 
7764   for (f = 0; f < __kmp_threads_capacity; f++) {
7765     if (__kmp_root[f] != NULL) {
7766       __kmp_free(__kmp_root[f]);
7767       __kmp_root[f] = NULL;
7768     }
7769   }
7770   __kmp_free(__kmp_threads);
  // __kmp_threads and __kmp_root were allocated at once, as a single block, so
  // there is no need to free __kmp_root separately.
7773   __kmp_threads = NULL;
7774   __kmp_root = NULL;
7775   __kmp_threads_capacity = 0;
7776 
7777 #if KMP_USE_DYNAMIC_LOCK
7778   __kmp_cleanup_indirect_user_locks();
7779 #else
7780   __kmp_cleanup_user_locks();
7781 #endif
7782 
7783 #if KMP_AFFINITY_SUPPORTED
7784   KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7785   __kmp_cpuinfo_file = NULL;
7786 #endif /* KMP_AFFINITY_SUPPORTED */
7787 
7788 #if KMP_USE_ADAPTIVE_LOCKS
7789 #if KMP_DEBUG_ADAPTIVE_LOCKS
7790   __kmp_print_speculative_stats();
7791 #endif
7792 #endif
7793   KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7794   __kmp_nested_nth.nth = NULL;
7795   __kmp_nested_nth.size = 0;
7796   __kmp_nested_nth.used = 0;
7797   KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7798   __kmp_nested_proc_bind.bind_types = NULL;
7799   __kmp_nested_proc_bind.size = 0;
7800   __kmp_nested_proc_bind.used = 0;
7801   if (__kmp_affinity_format) {
7802     KMP_INTERNAL_FREE(__kmp_affinity_format);
7803     __kmp_affinity_format = NULL;
7804   }
7805 
7806   __kmp_i18n_catclose();
7807 
7808 #if KMP_USE_HIER_SCHED
7809   __kmp_hier_scheds.deallocate();
7810 #endif
7811 
7812 #if KMP_STATS_ENABLED
7813   __kmp_stats_fini();
7814 #endif
7815 
7816   KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7817 }
7818 
7819 /* ------------------------------------------------------------------------ */
7820 
7821 int __kmp_ignore_mppbeg(void) {
7822   char *env;
7823 
7824   if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7825     if (__kmp_str_match_false(env))
7826       return FALSE;
7827   }
  // By default __kmpc_begin() is a no-op.
7829   return TRUE;
7830 }
7831 
7832 int __kmp_ignore_mppend(void) {
7833   char *env;
7834 
7835   if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7836     if (__kmp_str_match_false(env))
7837       return FALSE;
7838   }
  // By default __kmpc_end() is a no-op.
7840   return TRUE;
7841 }
7842 
7843 void __kmp_internal_begin(void) {
7844   int gtid;
7845   kmp_root_t *root;
7846 
7847   /* this is a very important step as it will register new sibling threads
7848      and assign these new uber threads a new gtid */
7849   gtid = __kmp_entry_gtid();
7850   root = __kmp_threads[gtid]->th.th_root;
7851   KMP_ASSERT(KMP_UBER_GTID(gtid));
7852 
7853   if (root->r.r_begin)
7854     return;
7855   __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7856   if (root->r.r_begin) {
7857     __kmp_release_lock(&root->r.r_begin_lock, gtid);
7858     return;
7859   }
7860 
7861   root->r.r_begin = TRUE;
7862 
7863   __kmp_release_lock(&root->r.r_begin_lock, gtid);
7864 }
7865 
7866 /* ------------------------------------------------------------------------ */
7867 
7868 void __kmp_user_set_library(enum library_type arg) {
7869   int gtid;
7870   kmp_root_t *root;
7871   kmp_info_t *thread;
7872 
7873   /* first, make sure we are initialized so we can get our gtid */
7874 
7875   gtid = __kmp_entry_gtid();
7876   thread = __kmp_threads[gtid];
7877 
7878   root = thread->th.th_root;
7879 
7880   KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7881                 library_serial));
7882   if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7883                                   thread */
7884     KMP_WARNING(SetLibraryIncorrectCall);
7885     return;
7886   }
7887 
7888   switch (arg) {
7889   case library_serial:
7890     thread->th.th_set_nproc = 0;
7891     set__nproc(thread, 1);
7892     break;
7893   case library_turnaround:
7894     thread->th.th_set_nproc = 0;
7895     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7896                                            : __kmp_dflt_team_nth_ub);
7897     break;
7898   case library_throughput:
7899     thread->th.th_set_nproc = 0;
7900     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7901                                            : __kmp_dflt_team_nth_ub);
7902     break;
7903   default:
7904     KMP_FATAL(UnknownLibraryType, arg);
7905   }
7906 
7907   __kmp_aux_set_library(arg);
7908 }
7909 
7910 void __kmp_aux_set_stacksize(size_t arg) {
7911   if (!__kmp_init_serial)
7912     __kmp_serial_initialize();
7913 
7914 #if KMP_OS_DARWIN
7915   if (arg & (0x1000 - 1)) {
7916     arg &= ~(0x1000 - 1);
7917     if (arg + 0x1000) /* check for overflow if we round up */
7918       arg += 0x1000;
7919   }
7920 #endif
7921   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7922 
7923   /* only change the default stacksize before the first parallel region */
7924   if (!TCR_4(__kmp_init_parallel)) {
7925     size_t value = arg; /* argument is in bytes */
7926 
7927     if (value < __kmp_sys_min_stksize)
7928       value = __kmp_sys_min_stksize;
7929     else if (value > KMP_MAX_STKSIZE)
7930       value = KMP_MAX_STKSIZE;
7931 
7932     __kmp_stksize = value;
7933 
7934     __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
7935   }
7936 
7937   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7938 }
7939 
/* Set the behavior of the runtime library. */
/* TODO: this can cause some odd behavior with sibling parallelism... */
7942 void __kmp_aux_set_library(enum library_type arg) {
7943   __kmp_library = arg;
7944 
7945   switch (__kmp_library) {
7946   case library_serial: {
7947     KMP_INFORM(LibraryIsSerial);
7948   } break;
7949   case library_turnaround:
7950     if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
7951       __kmp_use_yield = 2; // only yield when oversubscribed
7952     break;
7953   case library_throughput:
7954     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
7955       __kmp_dflt_blocktime = 200;
7956     break;
7957   default:
7958     KMP_FATAL(UnknownLibraryType, arg);
7959   }
7960 }
7961 
7962 /* Getting team information common for all team API */
7963 // Returns NULL if not in teams construct
7964 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
7965   kmp_info_t *thr = __kmp_entry_thread();
7966   teams_serialized = 0;
7967   if (thr->th.th_teams_microtask) {
7968     kmp_team_t *team = thr->th.th_team;
7969     int tlevel = thr->th.th_teams_level; // the level of the teams construct
7970     int ii = team->t.t_level;
7971     teams_serialized = team->t.t_serialized;
7972     int level = tlevel + 1;
7973     KMP_DEBUG_ASSERT(ii >= tlevel);
7974     while (ii > level) {
7975       for (teams_serialized = team->t.t_serialized;
7976            (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
7977       }
7978       if (team->t.t_serialized && (!teams_serialized)) {
7979         team = team->t.t_parent;
7980         continue;
7981       }
7982       if (ii > level) {
7983         team = team->t.t_parent;
7984         ii--;
7985       }
7986     }
7987     return team;
7988   }
7989   return NULL;
7990 }
7991 
7992 int __kmp_aux_get_team_num() {
7993   int serialized;
7994   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7995   if (team) {
7996     if (serialized > 1) {
7997       return 0; // teams region is serialized ( 1 team of 1 thread ).
7998     } else {
7999       return team->t.t_master_tid;
8000     }
8001   }
8002   return 0;
8003 }
8004 
8005 int __kmp_aux_get_num_teams() {
8006   int serialized;
8007   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8008   if (team) {
8009     if (serialized > 1) {
8010       return 1;
8011     } else {
8012       return team->t.t_parent->t.t_nproc;
8013     }
8014   }
8015   return 1;
8016 }
8017 
8018 /* ------------------------------------------------------------------------ */
8019 
8020 /*
8021  * Affinity Format Parser
8022  *
8023  * Field is in form of: %[[[0].]size]type
8024  * % and type are required (%% means print a literal '%')
8025  * type is either single char or long name surrounded by {},
8026  * e.g., N or {num_threads}
8027  * 0 => leading zeros
8028  * . => right justified when size is specified
8029  * by default output is left justified
8030  * size is the *minimum* field length
8031  * All other characters are printed as is
8032  *
8033  * Available field types:
 * t {team_num}          - omp_get_team_num()
 * T {num_teams}         - omp_get_num_teams()
 * L {nesting_level}     - omp_get_level()
 * n {thread_num}        - omp_get_thread_num()
 * N {num_threads}       - omp_get_num_threads()
 * a {ancestor_tnum}     - omp_get_ancestor_thread_num(omp_get_level()-1)
 * H {host}              - name of host machine
 * P {process_id}        - process id (integer)
 * i {native_thread_id}  - native thread identifier (integer)
 * A {thread_affinity}   - comma separated list of integers or integer ranges
 *                         (values of affinity mask)
8043  *
8044  * Implementation-specific field types can be added
8045  * If a type is unknown, print "undefined"
8046  */
8047 
8048 // Structure holding the short name, long name, and corresponding data type
8049 // for snprintf.  A table of these will represent the entire valid keyword
8050 // field types.
8051 typedef struct kmp_affinity_format_field_t {
8052   char short_name; // from spec e.g., L -> thread level
8053   const char *long_name; // from spec thread_level -> thread level
8054   char field_format; // data type for snprintf (typically 'd' or 's'
8055   // for integer or string)
8056 } kmp_affinity_format_field_t;
8057 
8058 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
8059 #if KMP_AFFINITY_SUPPORTED
8060     {'A', "thread_affinity", 's'},
8061 #endif
8062     {'t', "team_num", 'd'},
8063     {'T', "num_teams", 'd'},
8064     {'L', "nesting_level", 'd'},
8065     {'n', "thread_num", 'd'},
8066     {'N', "num_threads", 'd'},
8067     {'a', "ancestor_tnum", 'd'},
8068     {'H', "host", 's'},
8069     {'P', "process_id", 'd'},
8070     {'i', "native_thread_id", 'd'}};
8071 
8072 // Return the number of characters it takes to hold field
8073 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8074                                             const char **ptr,
8075                                             kmp_str_buf_t *field_buffer) {
8076   int rc, format_index, field_value;
8077   const char *width_left, *width_right;
8078   bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8079   static const int FORMAT_SIZE = 20;
8080   char format[FORMAT_SIZE] = {0};
8081   char absolute_short_name = 0;
8082 
8083   KMP_DEBUG_ASSERT(gtid >= 0);
8084   KMP_DEBUG_ASSERT(th);
8085   KMP_DEBUG_ASSERT(**ptr == '%');
8086   KMP_DEBUG_ASSERT(field_buffer);
8087 
8088   __kmp_str_buf_clear(field_buffer);
8089 
8090   // Skip the initial %
8091   (*ptr)++;
8092 
8093   // Check for %% first
8094   if (**ptr == '%') {
8095     __kmp_str_buf_cat(field_buffer, "%", 1);
8096     (*ptr)++; // skip over the second %
8097     return 1;
8098   }
8099 
8100   // Parse field modifiers if they are present
8101   pad_zeros = false;
8102   if (**ptr == '0') {
8103     pad_zeros = true;
8104     (*ptr)++; // skip over 0
8105   }
8106   right_justify = false;
8107   if (**ptr == '.') {
8108     right_justify = true;
8109     (*ptr)++; // skip over .
8110   }
8111   // Parse width of field: [width_left, width_right)
8112   width_left = width_right = NULL;
8113   if (**ptr >= '0' && **ptr <= '9') {
8114     width_left = *ptr;
8115     SKIP_DIGITS(*ptr);
8116     width_right = *ptr;
8117   }
8118 
8119   // Create the format for KMP_SNPRINTF based on flags parsed above
8120   format_index = 0;
8121   format[format_index++] = '%';
8122   if (!right_justify)
8123     format[format_index++] = '-';
8124   if (pad_zeros)
8125     format[format_index++] = '0';
8126   if (width_left && width_right) {
8127     int i = 0;
8128     // Only allow 8 digit number widths.
8129     // This also prevents overflowing format variable
8130     while (i < 8 && width_left < width_right) {
8131       format[format_index++] = *width_left;
8132       width_left++;
8133       i++;
8134     }
8135   }
8136 
8137   // Parse a name (long or short)
8138   // Canonicalize the name into absolute_short_name
8139   found_valid_name = false;
8140   parse_long_name = (**ptr == '{');
8141   if (parse_long_name)
8142     (*ptr)++; // skip initial left brace
8143   for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8144                              sizeof(__kmp_affinity_format_table[0]);
8145        ++i) {
8146     char short_name = __kmp_affinity_format_table[i].short_name;
8147     const char *long_name = __kmp_affinity_format_table[i].long_name;
8148     char field_format = __kmp_affinity_format_table[i].field_format;
8149     if (parse_long_name) {
8150       size_t length = KMP_STRLEN(long_name);
8151       if (strncmp(*ptr, long_name, length) == 0) {
8152         found_valid_name = true;
8153         (*ptr) += length; // skip the long name
8154       }
8155     } else if (**ptr == short_name) {
8156       found_valid_name = true;
8157       (*ptr)++; // skip the short name
8158     }
8159     if (found_valid_name) {
8160       format[format_index++] = field_format;
8161       format[format_index++] = '\0';
8162       absolute_short_name = short_name;
8163       break;
8164     }
8165   }
8166   if (parse_long_name) {
8167     if (**ptr != '}') {
8168       absolute_short_name = 0;
8169     } else {
8170       (*ptr)++; // skip over the right brace
8171     }
8172   }
8173 
8174   // Attempt to fill the buffer with the requested
8175   // value using snprintf within __kmp_str_buf_print()
8176   switch (absolute_short_name) {
8177   case 't':
8178     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8179     break;
8180   case 'T':
8181     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8182     break;
8183   case 'L':
8184     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8185     break;
8186   case 'n':
8187     rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8188     break;
8189   case 'H': {
8190     static const int BUFFER_SIZE = 256;
8191     char buf[BUFFER_SIZE];
8192     __kmp_expand_host_name(buf, BUFFER_SIZE);
8193     rc = __kmp_str_buf_print(field_buffer, format, buf);
8194   } break;
8195   case 'P':
8196     rc = __kmp_str_buf_print(field_buffer, format, getpid());
8197     break;
8198   case 'i':
8199     rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8200     break;
8201   case 'N':
8202     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8203     break;
8204   case 'a':
8205     field_value =
8206         __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8207     rc = __kmp_str_buf_print(field_buffer, format, field_value);
8208     break;
8209 #if KMP_AFFINITY_SUPPORTED
8210   case 'A': {
8211     kmp_str_buf_t buf;
8212     __kmp_str_buf_init(&buf);
8213     __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8214     rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8215     __kmp_str_buf_free(&buf);
8216   } break;
8217 #endif
8218   default:
    // According to the spec, if an implementation does not have info for the
    // field type, then "undefined" is printed.
8221     rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8222     // Skip the field
8223     if (parse_long_name) {
8224       SKIP_TOKEN(*ptr);
8225       if (**ptr == '}')
8226         (*ptr)++;
8227     } else {
8228       (*ptr)++;
8229     }
8230   }
8231 
8232   KMP_ASSERT(format_index <= FORMAT_SIZE);
8233   return rc;
8234 }
8235 
8236 /*
8237  * Return number of characters needed to hold the affinity string
8238  * (not including null byte character)
8239  * The resultant string is printed to buffer, which the caller can then
8240  * handle afterwards
8241  */
8242 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8243                                   kmp_str_buf_t *buffer) {
8244   const char *parse_ptr;
8245   size_t retval;
8246   const kmp_info_t *th;
8247   kmp_str_buf_t field;
8248 
8249   KMP_DEBUG_ASSERT(buffer);
8250   KMP_DEBUG_ASSERT(gtid >= 0);
8251 
8252   __kmp_str_buf_init(&field);
8253   __kmp_str_buf_clear(buffer);
8254 
8255   th = __kmp_threads[gtid];
8256   retval = 0;
8257 
8258   // If format is NULL or zero-length string, then we use
8259   // affinity-format-var ICV
8260   parse_ptr = format;
8261   if (parse_ptr == NULL || *parse_ptr == '\0') {
8262     parse_ptr = __kmp_affinity_format;
8263   }
8264   KMP_DEBUG_ASSERT(parse_ptr);
8265 
8266   while (*parse_ptr != '\0') {
8267     // Parse a field
8268     if (*parse_ptr == '%') {
8269       // Put field in the buffer
8270       int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8271       __kmp_str_buf_catbuf(buffer, &field);
8272       retval += rc;
8273     } else {
8274       // Put literal character in buffer
8275       __kmp_str_buf_cat(buffer, parse_ptr, 1);
8276       retval++;
8277       parse_ptr++;
8278     }
8279   }
8280   __kmp_str_buf_free(&field);
8281   return retval;
8282 }
8283 
8284 // Displays the affinity string to stdout
8285 void __kmp_aux_display_affinity(int gtid, const char *format) {
8286   kmp_str_buf_t buf;
8287   __kmp_str_buf_init(&buf);
8288   __kmp_aux_capture_affinity(gtid, format, &buf);
8289   __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8290   __kmp_str_buf_free(&buf);
8291 }
8292 
8293 /* ------------------------------------------------------------------------ */
8294 
8295 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8296   int blocktime = arg; /* argument is in milliseconds */
8297 #if KMP_USE_MONITOR
8298   int bt_intervals;
8299 #endif
8300   kmp_int8 bt_set;
8301 
8302   __kmp_save_internal_controls(thread);
8303 
8304   /* Normalize and set blocktime for the teams */
8305   if (blocktime < KMP_MIN_BLOCKTIME)
8306     blocktime = KMP_MIN_BLOCKTIME;
8307   else if (blocktime > KMP_MAX_BLOCKTIME)
8308     blocktime = KMP_MAX_BLOCKTIME;
8309 
8310   set__blocktime_team(thread->th.th_team, tid, blocktime);
8311   set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8312 
8313 #if KMP_USE_MONITOR
8314   /* Calculate and set blocktime intervals for the teams */
8315   bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8316 
8317   set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8318   set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8319 #endif
8320 
8321   /* Set whether blocktime has been set to "TRUE" */
8322   bt_set = TRUE;
8323 
8324   set__bt_set_team(thread->th.th_team, tid, bt_set);
8325   set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8326 #if KMP_USE_MONITOR
8327   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8328                 "bt_intervals=%d, monitor_updates=%d\n",
8329                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8330                 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8331                 __kmp_monitor_wakeups));
8332 #else
8333   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8334                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8335                 thread->th.th_team->t.t_id, tid, blocktime));
8336 #endif
8337 }
8338 
8339 void __kmp_aux_set_defaults(char const *str, size_t len) {
8340   if (!__kmp_init_serial) {
8341     __kmp_serial_initialize();
8342   }
8343   __kmp_env_initialize(str);
8344 
8345   if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8346     __kmp_env_print();
8347   }
8348 } // __kmp_aux_set_defaults
8349 
8350 /* ------------------------------------------------------------------------ */
8351 /* internal fast reduction routines */
8352 
8353 PACKED_REDUCTION_METHOD_T
8354 __kmp_determine_reduction_method(
8355     ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8356     void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8357     kmp_critical_name *lck) {
8358 
  // Default reduction method: critical construct (lck != NULL, as in current
  // PAROPT).
  // If (reduce_data != NULL && reduce_func != NULL): the tree-reduction method
  // can be selected by the RTL.
  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
  // can be selected by the RTL.
  // Finally, it is up to the OpenMP RTL to decide which of the methods
  // generated by PAROPT to select.
8367 
8368   PACKED_REDUCTION_METHOD_T retval;
8369 
8370   int team_size;
8371 
8372   KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8373   KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8374 
8375 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
8376   ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
8377 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8378 
8379   retval = critical_reduce_block;
8380 
  // an alternative way of getting the team size (with one extra dynamic
  // dereference) is slower
8382   team_size = __kmp_get_team_num_threads(global_tid);
8383   if (team_size == 1) {
8384 
8385     retval = empty_reduce_block;
8386 
8387   } else {
8388 
8389     int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8390 
8391 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 ||                   \
8392     KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64
8393 
8394 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
8395     KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8396 
8397     int teamsize_cutoff = 4;
8398 
8399 #if KMP_MIC_SUPPORTED
8400     if (__kmp_mic_type != non_mic) {
8401       teamsize_cutoff = 8;
8402     }
8403 #endif
8404     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8405     if (tree_available) {
8406       if (team_size <= teamsize_cutoff) {
8407         if (atomic_available) {
8408           retval = atomic_reduce_block;
8409         }
8410       } else {
8411         retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8412       }
8413     } else if (atomic_available) {
8414       retval = atomic_reduce_block;
8415     }
8416 #else
8417 #error "Unknown or unsupported OS"
8418 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8419        // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8420 
8421 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8422 
8423 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD
8424 
8425     // basic tuning
8426 
8427     if (atomic_available) {
8428       if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8429         retval = atomic_reduce_block;
8430       }
8431     } // otherwise: use critical section
8432 
8433 #elif KMP_OS_DARWIN
8434 
8435     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8436     if (atomic_available && (num_vars <= 3)) {
8437       retval = atomic_reduce_block;
8438     } else if (tree_available) {
8439       if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8440           (reduce_size < (2000 * sizeof(kmp_real64)))) {
8441         retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8442       }
8443     } // otherwise: use critical section
8444 
8445 #else
8446 #error "Unknown or unsupported OS"
8447 #endif
8448 
8449 #else
8450 #error "Unknown or unsupported architecture"
8451 #endif
8452   }
8453 
8454   // KMP_FORCE_REDUCTION
8455 
8456   // If the team is serialized (team_size == 1), ignore the forced reduction
8457   // method and stay with the unsynchronized method (empty_reduce_block)
8458   if (__kmp_force_reduction_method != reduction_method_not_defined &&
8459       team_size != 1) {
8460 
8461     PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8462 
8463     int atomic_available, tree_available;
8464 
8465     switch ((forced_retval = __kmp_force_reduction_method)) {
8466     case critical_reduce_block:
8467       KMP_ASSERT(lck); // lck should be != 0
8468       break;
8469 
8470     case atomic_reduce_block:
8471       atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8472       if (!atomic_available) {
8473         KMP_WARNING(RedMethodNotSupported, "atomic");
8474         forced_retval = critical_reduce_block;
8475       }
8476       break;
8477 
8478     case tree_reduce_block:
8479       tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8480       if (!tree_available) {
8481         KMP_WARNING(RedMethodNotSupported, "tree");
8482         forced_retval = critical_reduce_block;
8483       } else {
8484 #if KMP_FAST_REDUCTION_BARRIER
8485         forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8486 #endif
8487       }
8488       break;
8489 
8490     default:
8491       KMP_ASSERT(0); // "unsupported method specified"
8492     }
8493 
8494     retval = forced_retval;
8495   }
8496 
8497   KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8498 
8499 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
8500 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8501 
8502   return (retval);
8503 }
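
// Illustrative only: the selection above is driven from the compiler-generated
// reduction entry points in kmp_csupport.cpp, which pass the reduce_data /
// reduce_func / lck arguments through. A hedged sketch of the usual call shape:
//
//   // emitted by the compiler for, e.g., "#pragma omp parallel reduction(+:x)"
//   // kmp_int32 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
//   //                                kmp_int32 num_vars, size_t reduce_size,
//   //                                void *reduce_data,
//   //                                void (*reduce_func)(void *, void *),
//   //                                kmp_critical_name *lck);
//
// Those entry points call __kmp_determine_reduction_method() and record the
// packed result in th_local.packed_reduction_method (see
// __kmp_get_reduce_method() below).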
// This function is used for testing the set/get/determine reduce method
// machinery; the shift drops the barrier-type bits packed into the low byte.
8505 kmp_int32 __kmp_get_reduce_method(void) {
8506   return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8507 }
8508 
8509 // Soft pause sets up threads to ignore blocktime and just go to sleep.
8510 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
8511 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8512 
8513 // Hard pause shuts down the runtime completely.  Resume happens naturally when
8514 // OpenMP is used subsequently.
8515 void __kmp_hard_pause() {
8516   __kmp_pause_status = kmp_hard_paused;
8517   __kmp_internal_end_thread(-1);
8518 }
8519 
8520 // Soft resume sets __kmp_pause_status, and wakes up all threads.
8521 void __kmp_resume_if_soft_paused() {
8522   if (__kmp_pause_status == kmp_soft_paused) {
8523     __kmp_pause_status = kmp_not_paused;
8524 
8525     for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8526       kmp_info_t *thread = __kmp_threads[gtid];
8527       if (thread) { // Wake it if sleeping
8528         kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
8529                          thread);
8530         if (fl.is_sleeping())
8531           fl.resume(gtid);
8532         else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8533           __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8534         } else { // thread holds the lock and may sleep soon
8535           do { // until either the thread sleeps, or we can get the lock
8536             if (fl.is_sleeping()) {
8537               fl.resume(gtid);
8538               break;
8539             } else if (__kmp_try_suspend_mx(thread)) {
8540               __kmp_unlock_suspend_mx(thread);
8541               break;
8542             }
8543           } while (1);
8544         }
8545       }
8546     }
8547   }
8548 }
8549 
8550 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
8551 // TODO: add warning messages
8552 int __kmp_pause_resource(kmp_pause_status_t level) {
8553   if (level == kmp_not_paused) { // requesting resume
8554     if (__kmp_pause_status == kmp_not_paused) {
8555       // error message about runtime not being paused, so can't resume
8556       return 1;
8557     } else {
8558       KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
8559                        __kmp_pause_status == kmp_hard_paused);
8560       __kmp_pause_status = kmp_not_paused;
8561       return 0;
8562     }
8563   } else if (level == kmp_soft_paused) { // requesting soft pause
8564     if (__kmp_pause_status != kmp_not_paused) {
8565       // error message about already being paused
8566       return 1;
8567     } else {
8568       __kmp_soft_pause();
8569       return 0;
8570     }
8571   } else if (level == kmp_hard_paused) { // requesting hard pause
8572     if (__kmp_pause_status != kmp_not_paused) {
8573       // error message about already being paused
8574       return 1;
8575     } else {
8576       __kmp_hard_pause();
8577       return 0;
8578     }
8579   } else {
8580     // error message about invalid level
8581     return 1;
8582   }
8583 }
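
// Illustrative only: __kmp_pause_resource backs the OpenMP 5.0 pause API,
// reached through __kmpc_pause_resource. A minimal user-level sketch (not
// compiled here):
//
//   #include <omp.h>
//   #pragma omp parallel
//   { /* ... */ }
//   // Release idle resources on all devices; 0 indicates success.
//   int rc = omp_pause_resource_all(omp_pause_soft);
//   // A hard pause shuts the runtime down; it re-initializes on next use.
//   rc = omp_pause_resource_all(omp_pause_hard);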
8584 
8585 void __kmp_omp_display_env(int verbose) {
8586   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8587   if (__kmp_init_serial == 0)
8588     __kmp_do_serial_initialize();
8589   __kmp_display_env_impl(!verbose, verbose);
8590   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8591 }
8592 
// Globals and functions for hidden helper tasks
8594 kmp_info_t **__kmp_hidden_helper_threads;
8595 kmp_info_t *__kmp_hidden_helper_main_thread;
8596 kmp_int32 __kmp_hidden_helper_threads_num = 8;
8597 std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
8598 #if KMP_OS_LINUX
8599 kmp_int32 __kmp_enable_hidden_helper = TRUE;
8600 #else
8601 kmp_int32 __kmp_enable_hidden_helper = FALSE;
8602 #endif
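
// Illustrative only: these defaults are normally overridden via environment
// variables parsed in kmp_settings.cpp (at the time of writing,
// LIBOMP_USE_HIDDEN_HELPER_TASK and LIBOMP_NUM_HIDDEN_HELPER_THREADS; check
// that file for the authoritative names). Hidden helper threads serve hidden
// helper tasks, e.g. the bookkeeping tasks generated for asynchronous target
// constructs:
//
//   // #pragma omp target nowait
//   // { /* offloaded region; its helper task may run on a hidden helper */ }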
8603 
8604 namespace {
8605 std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
8606 
8607 void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
  // This is an explicit synchronization of all hidden helper threads. It
  // covers the case where a regular thread pushes a hidden helper task to a
  // hidden helper thread that has not been woken even once since the main
  // thread released the workers after creating the team.
8612   KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
8613   while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
8614          __kmp_hidden_helper_threads_num)
8615     ;
8616 
8617   // If main thread, then wait for signal
8618   if (__kmpc_master(nullptr, *gtid)) {
8619     // First, unset the initial state and release the initial thread
8620     TCW_4(__kmp_init_hidden_helper_threads, FALSE);
8621     __kmp_hidden_helper_initz_release();
8622     __kmp_hidden_helper_main_thread_wait();
8623     // Now wake up all worker threads
8624     for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
8625       __kmp_hidden_helper_worker_thread_signal();
8626     }
8627   }
8628 }
8629 } // namespace
8630 
8631 void __kmp_hidden_helper_threads_initz_routine() {
8632   // Create a new root for hidden helper team/threads
8633   const int gtid = __kmp_register_root(TRUE);
8634   __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
8635   __kmp_hidden_helper_threads = &__kmp_threads[gtid];
8636   __kmp_hidden_helper_main_thread->th.th_set_nproc =
8637       __kmp_hidden_helper_threads_num;
8638 
8639   KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
8640 
8641   __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
8642 
8643   // Set the initialization flag to FALSE
8644   TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
8645 
8646   __kmp_hidden_helper_threads_deinitz_release();
8647 }
8648