1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
29 #endif
30 
31 #if OMPT_SUPPORT
32 #include "ompt-specific.h"
33 #endif
34 
35 #if OMP_PROFILING_SUPPORT
36 #include "llvm/Support/TimeProfiler.h"
37 static char *ProfileTraceFile = nullptr;
38 #endif
39 
40 /* these are temporary issues to be dealt with */
41 #define KMP_USE_PRCTL 0
42 
43 #if KMP_OS_WINDOWS
44 #include <process.h>
45 #endif
46 
47 #include "tsan_annotations.h"
48 
49 #if KMP_OS_WINDOWS
// Windows does not need these include files as it doesn't use shared memory.
51 #else
52 #include <sys/mman.h>
53 #include <sys/stat.h>
54 #include <fcntl.h>
55 #define SHM_SIZE 1024
56 #endif
57 
58 #if defined(KMP_GOMP_COMPAT)
59 char const __kmp_version_alt_comp[] =
60     KMP_VERSION_PREFIX "alternative compiler support: yes";
61 #endif /* defined(KMP_GOMP_COMPAT) */
62 
63 char const __kmp_version_omp_api[] =
64     KMP_VERSION_PREFIX "API version: 5.0 (201611)";
65 
66 #ifdef KMP_DEBUG
67 char const __kmp_version_lock[] =
68     KMP_VERSION_PREFIX "lock type: run time selectable";
69 #endif /* KMP_DEBUG */
70 
71 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
72 
73 /* ------------------------------------------------------------------------ */
74 
75 #if KMP_USE_MONITOR
76 kmp_info_t __kmp_monitor;
77 #endif
78 
79 /* Forward declarations */
80 
81 void __kmp_cleanup(void);
82 
83 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
84                                   int gtid);
85 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
86                                   kmp_internal_control_t *new_icvs,
87                                   ident_t *loc);
88 #if KMP_AFFINITY_SUPPORTED
89 static void __kmp_partition_places(kmp_team_t *team,
90                                    int update_master_only = 0);
91 #endif
92 static void __kmp_do_serial_initialize(void);
93 void __kmp_fork_barrier(int gtid, int tid);
94 void __kmp_join_barrier(int gtid);
95 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
96                           kmp_internal_control_t *new_icvs, ident_t *loc);
97 
98 #ifdef USE_LOAD_BALANCE
99 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
100 #endif
101 
102 static int __kmp_expand_threads(int nNeed);
103 #if KMP_OS_WINDOWS
104 static int __kmp_unregister_root_other_thread(int gtid);
105 #endif
106 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
107 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
108 
109 /* Calculate the identifier of the current thread */
/* A fast (and somewhat portable) way to get the unique identifier of the
   executing thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
112 int __kmp_get_global_thread_id() {
113   int i;
114   kmp_info_t **other_threads;
115   size_t stack_data;
116   char *stack_addr;
117   size_t stack_size;
118   char *stack_base;
119 
120   KA_TRACE(
121       1000,
122       ("*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
123        __kmp_nth, __kmp_all_nth));
124 
125   /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
126      a parallel region, made it return KMP_GTID_DNE to force serial_initialize
127      by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
128      __kmp_init_gtid for this to work. */
129 
130   if (!TCR_4(__kmp_init_gtid))
131     return KMP_GTID_DNE;
132 
133 #ifdef KMP_TDATA_GTID
134   if (TCR_4(__kmp_gtid_mode) >= 3) {
135     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
136     return __kmp_gtid;
137   }
138 #endif
139   if (TCR_4(__kmp_gtid_mode) >= 2) {
140     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
141     return __kmp_gtid_get_specific();
142   }
143   KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
144 
145   stack_addr = (char *)&stack_data;
146   other_threads = __kmp_threads;
147 
148   /* ATT: The code below is a source of potential bugs due to unsynchronized
149      access to __kmp_threads array. For example:
150      1. Current thread loads other_threads[i] to thr and checks it, it is
151         non-NULL.
152      2. Current thread is suspended by OS.
153      3. Another thread unregisters and finishes (debug versions of free()
154         may fill memory with something like 0xEF).
155      4. Current thread is resumed.
156      5. Current thread reads junk from *thr.
157      TODO: Fix it.  --ln  */
158 
159   for (i = 0; i < __kmp_threads_capacity; i++) {
160 
161     kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
162     if (!thr)
163       continue;
164 
165     stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
166     stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
167 
168     /* stack grows down -- search through all of the active threads */
169 
170     if (stack_addr <= stack_base) {
171       size_t stack_diff = stack_base - stack_addr;
172 
173       if (stack_diff <= stack_size) {
174         /* The only way we can be closer than the allocated */
175         /* stack size is if we are running on this thread. */
176         KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
177         return i;
178       }
179     }
180   }
181 
182   /* get specific to try and determine our gtid */
183   KA_TRACE(1000,
184            ("*** __kmp_get_global_thread_id: internal alg. failed to find "
185             "thread, using TLS\n"));
186   i = __kmp_gtid_get_specific();
187 
188   /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
189 
  /* if we haven't been assigned a gtid, then return the (negative) code */
191   if (i < 0)
192     return i;
193 
194   /* dynamically updated stack window for uber threads to avoid get_specific
195      call */
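  /* If this thread's recorded stack window is not allowed to grow, an address
     outside the current bounds cannot be explained by lazy refinement, so
     treat it as a stack overflow. */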
196   if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
197     KMP_FATAL(StackOverflow, i);
198   }
199 
200   stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
201   if (stack_addr > stack_base) {
202     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
203     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
204             other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
205                 stack_base);
206   } else {
207     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
208             stack_base - stack_addr);
209   }
210 
211   /* Reprint stack bounds for ubermaster since they have been refined */
212   if (__kmp_storage_map) {
213     char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
214     char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
215     __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
216                                  other_threads[i]->th.th_info.ds.ds_stacksize,
217                                  "th_%d stack (refinement)", i);
218   }
219   return i;
220 }
221 
222 int __kmp_get_global_thread_id_reg() {
223   int gtid;
224 
225   if (!__kmp_init_serial) {
226     gtid = KMP_GTID_DNE;
227   } else
228 #ifdef KMP_TDATA_GTID
229       if (TCR_4(__kmp_gtid_mode) >= 3) {
230     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
231     gtid = __kmp_gtid;
232   } else
233 #endif
234       if (TCR_4(__kmp_gtid_mode) >= 2) {
235     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
236     gtid = __kmp_gtid_get_specific();
237   } else {
238     KA_TRACE(1000,
239              ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
240     gtid = __kmp_get_global_thread_id();
241   }
242 
243   /* we must be a new uber master sibling thread */
244   if (gtid == KMP_GTID_DNE) {
245     KA_TRACE(10,
246              ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
247               "Registering a new gtid.\n"));
248     __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
249     if (!__kmp_init_serial) {
250       __kmp_do_serial_initialize();
251       gtid = __kmp_gtid_get_specific();
252     } else {
253       gtid = __kmp_register_root(FALSE);
254     }
255     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
256     /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
257   }
258 
259   KMP_DEBUG_ASSERT(gtid >= 0);
260 
261   return gtid;
262 }
263 
264 /* caller must hold forkjoin_lock */
265 void __kmp_check_stack_overlap(kmp_info_t *th) {
266   int f;
267   char *stack_beg = NULL;
268   char *stack_end = NULL;
269   int gtid;
270 
271   KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
272   if (__kmp_storage_map) {
273     stack_end = (char *)th->th.th_info.ds.ds_stackbase;
274     stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
275 
276     gtid = __kmp_gtid_from_thread(th);
277 
278     if (gtid == KMP_GTID_MONITOR) {
279       __kmp_print_storage_map_gtid(
280           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
281           "th_%s stack (%s)", "mon",
282           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
283     } else {
284       __kmp_print_storage_map_gtid(
285           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
286           "th_%d stack (%s)", gtid,
287           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
288     }
289   }
290 
291   /* No point in checking ubermaster threads since they use refinement and
292    * cannot overlap */
293   gtid = __kmp_gtid_from_thread(th);
294   if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
295     KA_TRACE(10,
296              ("__kmp_check_stack_overlap: performing extensive checking\n"));
297     if (stack_beg == NULL) {
298       stack_end = (char *)th->th.th_info.ds.ds_stackbase;
299       stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
300     }
301 
302     for (f = 0; f < __kmp_threads_capacity; f++) {
303       kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
304 
305       if (f_th && f_th != th) {
306         char *other_stack_end =
307             (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
308         char *other_stack_beg =
309             other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
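        // The stacks overlap if either end of this thread's stack falls
        // strictly inside the other thread's stack.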
310         if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
311             (stack_end > other_stack_beg && stack_end < other_stack_end)) {
312 
313           /* Print the other stack values before the abort */
314           if (__kmp_storage_map)
315             __kmp_print_storage_map_gtid(
316                 -1, other_stack_beg, other_stack_end,
317                 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
318                 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
319 
320           __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
321                       __kmp_msg_null);
322         }
323       }
324     }
325   }
326   KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
327 }
328 
329 /* ------------------------------------------------------------------------ */
330 
331 void __kmp_infinite_loop(void) {
332   static int done = FALSE;
333 
334   while (!done) {
335     KMP_YIELD(TRUE);
336   }
337 }
338 
339 #define MAX_MESSAGE 512
340 
341 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
342                                   char const *format, ...) {
343   char buffer[MAX_MESSAGE];
344   va_list ap;
345 
346   va_start(ap, format);
347   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
348                p2, (unsigned long)size, format);
349   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
350   __kmp_vprintf(kmp_err, buffer, ap);
351 #if KMP_PRINT_DATA_PLACEMENT
352   int node;
353   if (gtid >= 0) {
354     if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
355       if (__kmp_storage_map_verbose) {
356         node = __kmp_get_host_node(p1);
357         if (node < 0) /* doesn't work, so don't try this next time */
358           __kmp_storage_map_verbose = FALSE;
359         else {
360           char *last;
361           int lastNode;
362           int localProc = __kmp_get_cpu_from_gtid(gtid);
363 
364           const int page_size = KMP_GET_PAGE_SIZE();
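          // Round both pointers down to page boundaries so that the host-node
          // lookups below are done per page.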
365 
366           p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
367           p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
368           if (localProc >= 0)
369             __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
370                                  localProc >> 1);
371           else
372             __kmp_printf_no_lock("  GTID %d\n", gtid);
373 #if KMP_USE_PRCTL
374           /* The more elaborate format is disabled for now because of the prctl
375            * hanging bug. */
376           do {
377             last = p1;
378             lastNode = node;
379             /* This loop collates adjacent pages with the same host node. */
380             do {
381               (char *)p1 += page_size;
382             } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
383             __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
384                                  lastNode);
385           } while (p1 <= p2);
386 #else
387           __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
388                                (char *)p1 + (page_size - 1),
389                                __kmp_get_host_node(p1));
390           if (p1 < p2) {
391             __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
392                                  (char *)p2 + (page_size - 1),
393                                  __kmp_get_host_node(p2));
394           }
395 #endif
396         }
397       }
398     } else
399       __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
400   }
401 #endif /* KMP_PRINT_DATA_PLACEMENT */
  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
  va_end(ap);
403 }
404 
405 void __kmp_warn(char const *format, ...) {
406   char buffer[MAX_MESSAGE];
407   va_list ap;
408 
409   if (__kmp_generate_warnings == kmp_warnings_off) {
410     return;
411   }
412 
413   va_start(ap, format);
414 
415   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
416   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
417   __kmp_vprintf(kmp_err, buffer, ap);
418   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
419 
420   va_end(ap);
421 }
422 
423 void __kmp_abort_process() {
424   // Later threads may stall here, but that's ok because abort() will kill them.
425   __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
426 
427   if (__kmp_debug_buf) {
428     __kmp_dump_debug_buffer();
429   }
430 
431   if (KMP_OS_WINDOWS) {
432     // Let other threads know of abnormal termination and prevent deadlock
433     // if abort happened during library initialization or shutdown
434     __kmp_global.g.g_abort = SIGABRT;
435 
    /* On Windows* OS, abort() by default causes a pop-up error box, which
       stalls nightly testing. Unfortunately, we cannot reliably suppress these
       pop-up error boxes. _set_abort_behavior() works well, but this function
       is not available in VS7 (this is not a problem for a DLL, but it is a
       problem for a static OpenMP RTL). SetErrorMode (and so the timelimit
       utility) does not help, at least in some versions of the MS C RTL.

       It seems the following sequence is the only way to simulate abort() and
       avoid the pop-up error box. */
445     raise(SIGABRT);
446     _exit(3); // Just in case, if signal ignored, exit anyway.
447   } else {
448     __kmp_unregister_library();
449     abort();
450   }
451 
452   __kmp_infinite_loop();
453   __kmp_release_bootstrap_lock(&__kmp_exit_lock);
454 
455 } // __kmp_abort_process
456 
457 void __kmp_abort_thread(void) {
458   // TODO: Eliminate g_abort global variable and this function.
459   // In case of abort just call abort(), it will kill all the threads.
460   __kmp_infinite_loop();
461 } // __kmp_abort_thread
462 
463 /* Print out the storage map for the major kmp_info_t thread data structures
464    that are allocated together. */
465 
466 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
467   __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
468                                gtid);
469 
470   __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
471                                sizeof(kmp_desc_t), "th_%d.th_info", gtid);
472 
473   __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
474                                sizeof(kmp_local_t), "th_%d.th_local", gtid);
475 
476   __kmp_print_storage_map_gtid(
477       gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
478       sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
479 
480   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
481                                &thr->th.th_bar[bs_plain_barrier + 1],
482                                sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
483                                gtid);
484 
485   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
486                                &thr->th.th_bar[bs_forkjoin_barrier + 1],
487                                sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
488                                gtid);
489 
490 #if KMP_FAST_REDUCTION_BARRIER
491   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
492                                &thr->th.th_bar[bs_reduction_barrier + 1],
493                                sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
494                                gtid);
495 #endif // KMP_FAST_REDUCTION_BARRIER
496 }
497 
498 /* Print out the storage map for the major kmp_team_t team data structures
499    that are allocated together. */
500 
501 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
502                                          int team_id, int num_thr) {
503   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
504   __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
505                                header, team_id);
506 
507   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
508                                &team->t.t_bar[bs_last_barrier],
509                                sizeof(kmp_balign_team_t) * bs_last_barrier,
510                                "%s_%d.t_bar", header, team_id);
511 
512   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
513                                &team->t.t_bar[bs_plain_barrier + 1],
514                                sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
515                                header, team_id);
516 
517   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
518                                &team->t.t_bar[bs_forkjoin_barrier + 1],
519                                sizeof(kmp_balign_team_t),
520                                "%s_%d.t_bar[forkjoin]", header, team_id);
521 
522 #if KMP_FAST_REDUCTION_BARRIER
523   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
524                                &team->t.t_bar[bs_reduction_barrier + 1],
525                                sizeof(kmp_balign_team_t),
526                                "%s_%d.t_bar[reduction]", header, team_id);
527 #endif // KMP_FAST_REDUCTION_BARRIER
528 
529   __kmp_print_storage_map_gtid(
530       -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
531       sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
532 
533   __kmp_print_storage_map_gtid(
534       -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
535       sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
536 
537   __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
538                                &team->t.t_disp_buffer[num_disp_buff],
539                                sizeof(dispatch_shared_info_t) * num_disp_buff,
540                                "%s_%d.t_disp_buffer", header, team_id);
541 }
542 
543 static void __kmp_init_allocator() { __kmp_init_memkind(); }
544 static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
545 
546 /* ------------------------------------------------------------------------ */
547 
548 #if KMP_DYNAMIC_LIB
549 #if KMP_OS_WINDOWS
550 
551 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
552   //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
553 
554   switch (fdwReason) {
555 
556   case DLL_PROCESS_ATTACH:
557     KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
558 
559     return TRUE;
560 
561   case DLL_PROCESS_DETACH:
562     KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
563 
564     // According to Windows* documentation for DllMain entry point:
565     // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
566     //   lpReserved == NULL when FreeLibrary() is called,
567     //   lpReserved != NULL when the process is terminated.
568     // When FreeLibrary() is called, worker threads remain alive. So the
569     // runtime's state is consistent and executing proper shutdown is OK.
570     // When the process is terminated, worker threads have exited or been
571     // forcefully terminated by the OS and only the shutdown thread remains.
572     // This can leave the runtime in an inconsistent state.
573     // Hence, only attempt proper cleanup when FreeLibrary() is called.
574     // Otherwise, rely on OS to reclaim resources.
575     if (lpReserved == NULL)
576       __kmp_internal_end_library(__kmp_gtid_get_specific());
577 
578     return TRUE;
579 
580   case DLL_THREAD_ATTACH:
581     KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
582 
    /* If we wanted to register new sibling threads every time, we could call
     * __kmp_get_gtid() here. */
585     return TRUE;
586 
587   case DLL_THREAD_DETACH:
588     KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
589 
590     __kmp_internal_end_thread(__kmp_gtid_get_specific());
591     return TRUE;
592   }
593 
594   return TRUE;
595 }
596 
597 #endif /* KMP_OS_WINDOWS */
598 #endif /* KMP_DYNAMIC_LIB */
599 
600 /* __kmp_parallel_deo -- Wait until it's our turn. */
601 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
602   int gtid = *gtid_ref;
603 #ifdef BUILD_PARALLEL_ORDERED
604   kmp_team_t *team = __kmp_team_from_gtid(gtid);
605 #endif /* BUILD_PARALLEL_ORDERED */
606 
607   if (__kmp_env_consistency_check) {
608     if (__kmp_threads[gtid]->th.th_root->r.r_active)
609 #if KMP_USE_DYNAMIC_LOCK
610       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
611 #else
612       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
613 #endif
614   }
615 #ifdef BUILD_PARALLEL_ORDERED
616   if (!team->t.t_serialized) {
617     KMP_MB();
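    /* Spin until the team's ordered ticket equals this thread's id in the
       team, i.e. until it is our turn to enter the ordered region. */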
618     KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
619              NULL);
620     KMP_MB();
621   }
622 #endif /* BUILD_PARALLEL_ORDERED */
623 }
624 
625 /* __kmp_parallel_dxo -- Signal the next task. */
626 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
627   int gtid = *gtid_ref;
628 #ifdef BUILD_PARALLEL_ORDERED
629   int tid = __kmp_tid_from_gtid(gtid);
630   kmp_team_t *team = __kmp_team_from_gtid(gtid);
631 #endif /* BUILD_PARALLEL_ORDERED */
632 
633   if (__kmp_env_consistency_check) {
634     if (__kmp_threads[gtid]->th.th_root->r.r_active)
635       __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
636   }
637 #ifdef BUILD_PARALLEL_ORDERED
638   if (!team->t.t_serialized) {
639     KMP_MB(); /* Flush all pending memory write invalidates.  */
640 
641     /* use the tid of the next thread in this team */
642     /* TODO replace with general release procedure */
643     team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
644 
645     KMP_MB(); /* Flush all pending memory write invalidates.  */
646   }
647 #endif /* BUILD_PARALLEL_ORDERED */
648 }
649 
650 /* ------------------------------------------------------------------------ */
651 /* The BARRIER for a SINGLE process section is always explicit   */
652 
653 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
654   int status;
655   kmp_info_t *th;
656   kmp_team_t *team;
657 
658   if (!TCR_4(__kmp_init_parallel))
659     __kmp_parallel_initialize();
660   __kmp_resume_if_soft_paused();
661 
662   th = __kmp_threads[gtid];
663   team = th->th.th_team;
664   status = 0;
665 
666   th->th.th_ident = id_ref;
667 
668   if (team->t.t_serialized) {
669     status = 1;
670   } else {
671     kmp_int32 old_this = th->th.th_local.this_construct;
672 
673     ++th->th.th_local.this_construct;
674     /* try to set team count to thread count--success means thread got the
675        single block */
676     /* TODO: Should this be acquire or release? */
677     if (team->t.t_construct == old_this) {
678       status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
679                                               th->th.th_local.this_construct);
680     }
681 #if USE_ITT_BUILD
682     if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
683         KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
684         team->t.t_active_level ==
685             1) { // Only report metadata by master of active team at level 1
686       __kmp_itt_metadata_single(id_ref);
687     }
688 #endif /* USE_ITT_BUILD */
689   }
690 
691   if (__kmp_env_consistency_check) {
692     if (status && push_ws) {
693       __kmp_push_workshare(gtid, ct_psingle, id_ref);
694     } else {
695       __kmp_check_workshare(gtid, ct_psingle, id_ref);
696     }
697   }
698 #if USE_ITT_BUILD
699   if (status) {
700     __kmp_itt_single_start(gtid);
701   }
702 #endif /* USE_ITT_BUILD */
703   return status;
704 }
705 
706 void __kmp_exit_single(int gtid) {
707 #if USE_ITT_BUILD
708   __kmp_itt_single_end(gtid);
709 #endif /* USE_ITT_BUILD */
710   if (__kmp_env_consistency_check)
711     __kmp_pop_workshare(gtid, ct_psingle, NULL);
712 }
713 
/* Determine whether we can go parallel or must use a serialized parallel
 * region, and how many threads we can use.
 * set_nthreads is the number of threads requested for the team.
 * Returns 1 if we should serialize or use only one thread,
 * otherwise the number of threads to use.
 * The forkjoin lock is held by the caller. */
720 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
721                                  int master_tid, int set_nthreads,
722                                  int enter_teams) {
723   int capacity;
724   int new_nthreads;
725   KMP_DEBUG_ASSERT(__kmp_init_serial);
726   KMP_DEBUG_ASSERT(root && parent_team);
727   kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
728 
729   // If dyn-var is set, dynamically adjust the number of desired threads,
730   // according to the method specified by dynamic_mode.
731   new_nthreads = set_nthreads;
732   if (!get__dynamic_2(parent_team, master_tid)) {
733     ;
734   }
735 #ifdef USE_LOAD_BALANCE
736   else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
737     new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
738     if (new_nthreads == 1) {
739       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
740                     "reservation to 1 thread\n",
741                     master_tid));
742       return 1;
743     }
744     if (new_nthreads < set_nthreads) {
745       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
746                     "reservation to %d threads\n",
747                     master_tid, new_nthreads));
748     }
749   }
750 #endif /* USE_LOAD_BALANCE */
751   else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
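    // Cap the reservation at the number of available processors not already
    // in use; the master (or, for an inactive root, the existing hot team) is
    // counted as reusable.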
752     new_nthreads = __kmp_avail_proc - __kmp_nth +
753                    (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
754     if (new_nthreads <= 1) {
755       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
756                     "reservation to 1 thread\n",
757                     master_tid));
758       return 1;
759     }
760     if (new_nthreads < set_nthreads) {
761       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
762                     "reservation to %d threads\n",
763                     master_tid, new_nthreads));
764     } else {
765       new_nthreads = set_nthreads;
766     }
767   } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
768     if (set_nthreads > 2) {
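      // Pick a pseudo-random team size in the range [1, set_nthreads].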
769       new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
770       new_nthreads = (new_nthreads % set_nthreads) + 1;
771       if (new_nthreads == 1) {
772         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
773                       "reservation to 1 thread\n",
774                       master_tid));
775         return 1;
776       }
777       if (new_nthreads < set_nthreads) {
778         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
779                       "reservation to %d threads\n",
780                       master_tid, new_nthreads));
781       }
782     }
783   } else {
784     KMP_ASSERT(0);
785   }
786 
787   // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
788   if (__kmp_nth + new_nthreads -
789           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
790       __kmp_max_nth) {
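    // Compute how many threads the device-level limit actually leaves us,
    // again counting the master (or the idle hot team) as reusable.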
791     int tl_nthreads = __kmp_max_nth - __kmp_nth +
792                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
793     if (tl_nthreads <= 0) {
794       tl_nthreads = 1;
795     }
796 
797     // If dyn-var is false, emit a 1-time warning.
798     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
799       __kmp_reserve_warn = 1;
800       __kmp_msg(kmp_ms_warning,
801                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
802                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
803     }
804     if (tl_nthreads == 1) {
805       KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
806                     "reduced reservation to 1 thread\n",
807                     master_tid));
808       return 1;
809     }
810     KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
811                   "reservation to %d threads\n",
812                   master_tid, tl_nthreads));
813     new_nthreads = tl_nthreads;
814   }
815 
816   // Respect OMP_THREAD_LIMIT
817   int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
818   int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
819   if (cg_nthreads + new_nthreads -
820           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
821       max_cg_threads) {
822     int tl_nthreads = max_cg_threads - cg_nthreads +
823                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
824     if (tl_nthreads <= 0) {
825       tl_nthreads = 1;
826     }
827 
828     // If dyn-var is false, emit a 1-time warning.
829     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
830       __kmp_reserve_warn = 1;
831       __kmp_msg(kmp_ms_warning,
832                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
833                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
834     }
835     if (tl_nthreads == 1) {
836       KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
837                     "reduced reservation to 1 thread\n",
838                     master_tid));
839       return 1;
840     }
841     KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
842                   "reservation to %d threads\n",
843                   master_tid, tl_nthreads));
844     new_nthreads = tl_nthreads;
845   }
846 
847   // Check if the threads array is large enough, or needs expanding.
848   // See comment in __kmp_register_root() about the adjustment if
849   // __kmp_threads[0] == NULL.
850   capacity = __kmp_threads_capacity;
851   if (TCR_PTR(__kmp_threads[0]) == NULL) {
852     --capacity;
853   }
854   if (__kmp_nth + new_nthreads -
855           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
856       capacity) {
857     // Expand the threads array.
858     int slotsRequired = __kmp_nth + new_nthreads -
859                         (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
860                         capacity;
861     int slotsAdded = __kmp_expand_threads(slotsRequired);
862     if (slotsAdded < slotsRequired) {
863       // The threads array was not expanded enough.
864       new_nthreads -= (slotsRequired - slotsAdded);
865       KMP_ASSERT(new_nthreads >= 1);
866 
867       // If dyn-var is false, emit a 1-time warning.
868       if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
869         __kmp_reserve_warn = 1;
870         if (__kmp_tp_cached) {
871           __kmp_msg(kmp_ms_warning,
872                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
873                     KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
874                     KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
875         } else {
876           __kmp_msg(kmp_ms_warning,
877                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
878                     KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
879         }
880       }
881     }
882   }
883 
884 #ifdef KMP_DEBUG
885   if (new_nthreads == 1) {
886     KC_TRACE(10,
887              ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
888               "dead roots and rechecking; requested %d threads\n",
889               __kmp_get_gtid(), set_nthreads));
890   } else {
891     KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
892                   " %d threads\n",
893                   __kmp_get_gtid(), new_nthreads, set_nthreads));
894   }
895 #endif // KMP_DEBUG
896   return new_nthreads;
897 }
898 
/* Allocate threads from the thread pool and assign them to the new team. We are
   assured that there are enough threads available, because we checked that
   earlier while holding the forkjoin lock. */
902 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
903                                     kmp_info_t *master_th, int master_gtid) {
904   int i;
905   int use_hot_team;
906 
907   KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
908   KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
909   KMP_MB();
910 
911   /* first, let's setup the master thread */
912   master_th->th.th_info.ds.ds_tid = 0;
913   master_th->th.th_team = team;
914   master_th->th.th_team_nproc = team->t.t_nproc;
915   master_th->th.th_team_master = master_th;
916   master_th->th.th_team_serialized = FALSE;
917   master_th->th.th_dispatch = &team->t.t_dispatch[0];
918 
919 /* make sure we are not the optimized hot team */
920 #if KMP_NESTED_HOT_TEAMS
921   use_hot_team = 0;
922   kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
923   if (hot_teams) { // hot teams array is not allocated if
924     // KMP_HOT_TEAMS_MAX_LEVEL=0
925     int level = team->t.t_active_level - 1; // index in array of hot teams
926     if (master_th->th.th_teams_microtask) { // are we inside the teams?
927       if (master_th->th.th_teams_size.nteams > 1) {
928         ++level; // level was not increased in teams construct for
929         // team_of_masters
930       }
931       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
932           master_th->th.th_teams_level == team->t.t_level) {
933         ++level; // level was not increased in teams construct for
934         // team_of_workers before the parallel
935       } // team->t.t_level will be increased inside parallel
936     }
937     if (level < __kmp_hot_teams_max_level) {
938       if (hot_teams[level].hot_team) {
939         // hot team has already been allocated for given level
940         KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
941         use_hot_team = 1; // the team is ready to use
942       } else {
943         use_hot_team = 0; // AC: threads are not allocated yet
944         hot_teams[level].hot_team = team; // remember new hot team
945         hot_teams[level].hot_team_nth = team->t.t_nproc;
946       }
947     } else {
948       use_hot_team = 0;
949     }
950   }
951 #else
952   use_hot_team = team == root->r.r_hot_team;
953 #endif
954   if (!use_hot_team) {
955 
956     /* install the master thread */
957     team->t.t_threads[0] = master_th;
958     __kmp_initialize_info(master_th, team, 0, master_gtid);
959 
960     /* now, install the worker threads */
961     for (i = 1; i < team->t.t_nproc; i++) {
962 
963       /* fork or reallocate a new thread and install it in team */
964       kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
965       team->t.t_threads[i] = thr;
966       KMP_DEBUG_ASSERT(thr);
967       KMP_DEBUG_ASSERT(thr->th.th_team == team);
968       /* align team and thread arrived states */
969       KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
970                     "T#%d(%d:%d) join =%llu, plain=%llu\n",
971                     __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
972                     __kmp_gtid_from_tid(i, team), team->t.t_id, i,
973                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
974                     team->t.t_bar[bs_plain_barrier].b_arrived));
975       thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
976       thr->th.th_teams_level = master_th->th.th_teams_level;
977       thr->th.th_teams_size = master_th->th.th_teams_size;
978       { // Initialize threads' barrier data.
979         int b;
980         kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
981         for (b = 0; b < bs_last_barrier; ++b) {
982           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
983           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
984 #if USE_DEBUGGER
985           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
986 #endif
987         }
988       }
989     }
990 
991 #if KMP_AFFINITY_SUPPORTED
992     __kmp_partition_places(team);
993 #endif
994   }
995 
996   if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
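    // Mark the team for affinity display if any thread's nesting level or team
    // size has changed since its previous parallel region.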
997     for (i = 0; i < team->t.t_nproc; i++) {
998       kmp_info_t *thr = team->t.t_threads[i];
999       if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1000           thr->th.th_prev_level != team->t.t_level) {
1001         team->t.t_display_affinity = 1;
1002         break;
1003       }
1004     }
1005   }
1006 
1007   KMP_MB();
1008 }
1009 
1010 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
// Propagate any changes to the floating point control registers out to the
// team. We try to avoid unnecessary writes to the relevant cache line in the
// team structure, so we don't make changes unless they are needed.
1014 inline static void propagateFPControl(kmp_team_t *team) {
1015   if (__kmp_inherit_fp_control) {
1016     kmp_int16 x87_fpu_control_word;
1017     kmp_uint32 mxcsr;
1018 
1019     // Get master values of FPU control flags (both X87 and vector)
1020     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1021     __kmp_store_mxcsr(&mxcsr);
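    // Apply the runtime's MXCSR mask so that only the bits the runtime cares
    // about are compared and stored below.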
1022     mxcsr &= KMP_X86_MXCSR_MASK;
1023 
1024     // There is no point looking at t_fp_control_saved here.
1025     // If it is TRUE, we still have to update the values if they are different
1026     // from those we now have. If it is FALSE we didn't save anything yet, but
1027     // our objective is the same. We have to ensure that the values in the team
1028     // are the same as those we have.
1029     // So, this code achieves what we need whether or not t_fp_control_saved is
1030     // true. By checking whether the value needs updating we avoid unnecessary
1031     // writes that would put the cache-line into a written state, causing all
1032     // threads in the team to have to read it again.
1033     KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1034     KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1035     // Although we don't use this value, other code in the runtime wants to know
1036     // whether it should restore them. So we must ensure it is correct.
1037     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1038   } else {
1039     // Similarly here. Don't write to this cache-line in the team structure
1040     // unless we have to.
1041     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1042   }
1043 }
1044 
1045 // Do the opposite, setting the hardware registers to the updated values from
1046 // the team.
1047 inline static void updateHWFPControl(kmp_team_t *team) {
1048   if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
    // Only reset the fp control regs if they have been changed in the
    // parallel region that we are exiting.
1051     kmp_int16 x87_fpu_control_word;
1052     kmp_uint32 mxcsr;
1053     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1054     __kmp_store_mxcsr(&mxcsr);
1055     mxcsr &= KMP_X86_MXCSR_MASK;
1056 
1057     if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1058       __kmp_clear_x87_fpu_status_word();
1059       __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1060     }
1061 
1062     if (team->t.t_mxcsr != mxcsr) {
1063       __kmp_load_mxcsr(&team->t.t_mxcsr);
1064     }
1065   }
1066 }
1067 #else
1068 #define propagateFPControl(x) ((void)0)
1069 #define updateHWFPControl(x) ((void)0)
1070 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1071 
1072 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1073                                      int realloc); // forward declaration
1074 
/* Run a parallel region that has been serialized, so it runs only in a team
   consisting of the single master thread. */
1077 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1078   kmp_info_t *this_thr;
1079   kmp_team_t *serial_team;
1080 
1081   KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1082 
1083   /* Skip all this code for autopar serialized loops since it results in
1084      unacceptable overhead */
1085   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1086     return;
1087 
1088   if (!TCR_4(__kmp_init_parallel))
1089     __kmp_parallel_initialize();
1090   __kmp_resume_if_soft_paused();
1091 
1092   this_thr = __kmp_threads[global_tid];
1093   serial_team = this_thr->th.th_serial_team;
1094 
1095   /* utilize the serialized team held by this thread */
1096   KMP_DEBUG_ASSERT(serial_team);
1097   KMP_MB();
1098 
1099   if (__kmp_tasking_mode != tskm_immediate_exec) {
1100     KMP_DEBUG_ASSERT(
1101         this_thr->th.th_task_team ==
1102         this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1103     KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1104                      NULL);
1105     KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1106                   "team %p, new task_team = NULL\n",
1107                   global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1108     this_thr->th.th_task_team = NULL;
1109   }
1110 
1111   kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1112   if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
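    // The proc-bind-var ICV is false, so binding is disabled regardless of
    // any proc_bind clause.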
1113     proc_bind = proc_bind_false;
1114   } else if (proc_bind == proc_bind_default) {
1115     // No proc_bind clause was specified, so use the current value
1116     // of proc-bind-var for this parallel region.
1117     proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1118   }
1119   // Reset for next parallel region
1120   this_thr->th.th_set_proc_bind = proc_bind_default;
1121 
1122 #if OMPT_SUPPORT
1123   ompt_data_t ompt_parallel_data = ompt_data_none;
1124   ompt_data_t *implicit_task_data;
1125   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1126   if (ompt_enabled.enabled &&
1127       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1128 
1129     ompt_task_info_t *parent_task_info;
1130     parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1131 
1132     parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1133     if (ompt_enabled.ompt_callback_parallel_begin) {
1134       int team_size = 1;
1135 
1136       ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1137           &(parent_task_info->task_data), &(parent_task_info->frame),
1138           &ompt_parallel_data, team_size,
1139           ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1140     }
1141   }
1142 #endif // OMPT_SUPPORT
1143 
1144   if (this_thr->th.th_team != serial_team) {
1145     // Nested level will be an index in the nested nthreads array
1146     int level = this_thr->th.th_team->t.t_level;
1147 
1148     if (serial_team->t.t_serialized) {
      /* this serial team was already used
         TODO increase performance by making the locking more specific */
1151       kmp_team_t *new_team;
1152 
1153       __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1154 
1155       new_team =
1156           __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1157 #if OMPT_SUPPORT
1158                               ompt_parallel_data,
1159 #endif
1160                               proc_bind, &this_thr->th.th_current_task->td_icvs,
1161                               0 USE_NESTED_HOT_ARG(NULL));
1162       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1163       KMP_ASSERT(new_team);
1164 
1165       /* setup new serialized team and install it */
1166       new_team->t.t_threads[0] = this_thr;
1167       new_team->t.t_parent = this_thr->th.th_team;
1168       serial_team = new_team;
1169       this_thr->th.th_serial_team = serial_team;
1170 
1171       KF_TRACE(
1172           10,
1173           ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1174            global_tid, serial_team));
1175 
1176       /* TODO the above breaks the requirement that if we run out of resources,
1177          then we can still guarantee that serialized teams are ok, since we may
1178          need to allocate a new one */
1179     } else {
1180       KF_TRACE(
1181           10,
1182           ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1183            global_tid, serial_team));
1184     }
1185 
1186     /* we have to initialize this serial team */
1187     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1188     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1189     KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1190     serial_team->t.t_ident = loc;
1191     serial_team->t.t_serialized = 1;
1192     serial_team->t.t_nproc = 1;
1193     serial_team->t.t_parent = this_thr->th.th_team;
1194     serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1195     this_thr->th.th_team = serial_team;
1196     serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1197 
1198     KF_TRACE(10, ("__kmpc_serialized_parallel: T#d curtask=%p\n", global_tid,
1199                   this_thr->th.th_current_task));
1200     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1201     this_thr->th.th_current_task->td_flags.executing = 0;
1202 
1203     __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1204 
1205     /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1206        implicit task for each serialized task represented by
1207        team->t.t_serialized? */
1208     copy_icvs(&this_thr->th.th_current_task->td_icvs,
1209               &this_thr->th.th_current_task->td_parent->td_icvs);
1210 
1211     // Thread value exists in the nested nthreads array for the next nested
1212     // level
1213     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1214       this_thr->th.th_current_task->td_icvs.nproc =
1215           __kmp_nested_nth.nth[level + 1];
1216     }
1217 
1218     if (__kmp_nested_proc_bind.used &&
1219         (level + 1 < __kmp_nested_proc_bind.used)) {
1220       this_thr->th.th_current_task->td_icvs.proc_bind =
1221           __kmp_nested_proc_bind.bind_types[level + 1];
1222     }
1223 
1224 #if USE_DEBUGGER
1225     serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1226 #endif
1227     this_thr->th.th_info.ds.ds_tid = 0;
1228 
1229     /* set thread cache values */
1230     this_thr->th.th_team_nproc = 1;
1231     this_thr->th.th_team_master = this_thr;
1232     this_thr->th.th_team_serialized = 1;
1233 
1234     serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1235     serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1236     serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1237 
1238     propagateFPControl(serial_team);
1239 
1240     /* check if we need to allocate dispatch buffers stack */
1241     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1242     if (!serial_team->t.t_dispatch->th_disp_buffer) {
1243       serial_team->t.t_dispatch->th_disp_buffer =
1244           (dispatch_private_info_t *)__kmp_allocate(
1245               sizeof(dispatch_private_info_t));
1246     }
1247     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1248 
1249     KMP_MB();
1250 
1251   } else {
1252     /* this serialized team is already being used,
1253      * that's fine, just add another nested level */
1254     KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1255     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1256     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1257     ++serial_team->t.t_serialized;
1258     this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1259 
1260     // Nested level will be an index in the nested nthreads array
1261     int level = this_thr->th.th_team->t.t_level;
1262     // Thread value exists in the nested nthreads array for the next nested
1263     // level
1264     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1265       this_thr->th.th_current_task->td_icvs.nproc =
1266           __kmp_nested_nth.nth[level + 1];
1267     }
1268     serial_team->t.t_level++;
1269     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1270                   "of serial team %p to %d\n",
1271                   global_tid, serial_team, serial_team->t.t_level));
1272 
1273     /* allocate/push dispatch buffers stack */
1274     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1275     {
1276       dispatch_private_info_t *disp_buffer =
1277           (dispatch_private_info_t *)__kmp_allocate(
1278               sizeof(dispatch_private_info_t));
1279       disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1280       serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1281     }
1282     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1283 
1284     KMP_MB();
1285   }
1286   KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1287 
1288   // Perform the display affinity functionality for
1289   // serialized parallel regions
1290   if (__kmp_display_affinity) {
1291     if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1292         this_thr->th.th_prev_num_threads != 1) {
1293       // NULL means use the affinity-format-var ICV
1294       __kmp_aux_display_affinity(global_tid, NULL);
1295       this_thr->th.th_prev_level = serial_team->t.t_level;
1296       this_thr->th.th_prev_num_threads = 1;
1297     }
1298   }
1299 
1300   if (__kmp_env_consistency_check)
1301     __kmp_push_parallel(global_tid, NULL);
1302 #if OMPT_SUPPORT
1303   serial_team->t.ompt_team_info.master_return_address = codeptr;
1304   if (ompt_enabled.enabled &&
1305       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1306     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1307         OMPT_GET_FRAME_ADDRESS(0);
1308 
1309     ompt_lw_taskteam_t lw_taskteam;
1310     __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1311                             &ompt_parallel_data, codeptr);
1312 
1313     __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
    // Don't use lw_taskteam after linking; its contents were swapped.
1315 
1316     /* OMPT implicit task begin */
1317     implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
1318     if (ompt_enabled.ompt_callback_implicit_task) {
1319       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1320           ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1321           OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
1322           ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1323       OMPT_CUR_TASK_INFO(this_thr)->thread_num =
1324           __kmp_tid_from_gtid(global_tid);
1325     }
1326 
1327     /* OMPT state */
1328     this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1329     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1330         OMPT_GET_FRAME_ADDRESS(0);
1331   }
1332 #endif
1333 }
1334 
1335 /* most of the work for a fork */
1336 /* return true if we really went parallel, false if serialized */
1337 int __kmp_fork_call(ident_t *loc, int gtid,
1338                     enum fork_context_e call_context, // Intel, GNU, ...
1339                     kmp_int32 argc, microtask_t microtask, launch_t invoker,
1340                     kmp_va_list ap) {
1341   void **argv;
1342   int i;
1343   int master_tid;
1344   int master_this_cons;
1345   kmp_team_t *team;
1346   kmp_team_t *parent_team;
1347   kmp_info_t *master_th;
1348   kmp_root_t *root;
1349   int nthreads;
1350   int master_active;
1351   int master_set_numthreads;
1352   int level;
1353   int active_level;
1354   int teams_level;
1355 #if KMP_NESTED_HOT_TEAMS
1356   kmp_hot_team_ptr_t **p_hot_teams;
1357 #endif
1358   { // KMP_TIME_BLOCK
1359     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1360     KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1361 
1362     KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1363     if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1364       /* Some systems prefer the stack for the root thread(s) to start with */
1365       /* some gap from the parent stack to prevent false sharing. */
1366       void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1367       /* These 2 lines below are so this does not get optimized out */
1368       if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1369         __kmp_stkpadding += (short)((kmp_int64)dummy);
1370     }
1371 
1372     /* initialize if needed */
1373     KMP_DEBUG_ASSERT(
1374         __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1375     if (!TCR_4(__kmp_init_parallel))
1376       __kmp_parallel_initialize();
1377     __kmp_resume_if_soft_paused();
1378 
1379     /* setup current data */
1380     master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1381     // shutdown
1382     parent_team = master_th->th.th_team;
1383     master_tid = master_th->th.th_info.ds.ds_tid;
1384     master_this_cons = master_th->th.th_local.this_construct;
1385     root = master_th->th.th_root;
1386     master_active = root->r.r_active;
1387     master_set_numthreads = master_th->th.th_set_nproc;
1388 
1389 #if OMPT_SUPPORT
1390     ompt_data_t ompt_parallel_data = ompt_data_none;
1391     ompt_data_t *parent_task_data;
1392     ompt_frame_t *ompt_frame;
1393     ompt_data_t *implicit_task_data;
1394     void *return_address = NULL;
1395 
1396     if (ompt_enabled.enabled) {
1397       __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1398                                     NULL, NULL);
1399       return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1400     }
1401 #endif
1402 
1403     // Nested level will be an index in the nested nthreads array
1404     level = parent_team->t.t_level;
1405     // used to launch non-serialized teams even if nesting is not allowed
1406     active_level = parent_team->t.t_active_level;
1407     // needed to check nesting inside the teams
1408     teams_level = master_th->th.th_teams_level;
1409 #if KMP_NESTED_HOT_TEAMS
1410     p_hot_teams = &master_th->th.th_hot_teams;
1411     if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1412       *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1413           sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1414       (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1415       // it is either the actual hot team or not needed (when active_level > 0)
1416       (*p_hot_teams)[0].hot_team_nth = 1;
1417     }
1418 #endif
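
    // Illustrative note (a sketch, not authoritative): __kmp_hot_teams_max_level
    // is presumably controlled by the KMP_HOT_TEAMS_MAX_LEVEL setting. With a
    // value of 2, the array allocated above caches one hot team per nesting
    // level, so nested parallel regions up to that depth can reuse their teams
    // instead of re-creating them on every fork.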
1419 
1420 #if OMPT_SUPPORT
1421     if (ompt_enabled.enabled) {
1422       if (ompt_enabled.ompt_callback_parallel_begin) {
1423         int team_size = master_set_numthreads
1424                             ? master_set_numthreads
1425                             : get__nproc_2(parent_team, master_tid);
1426         int flags = OMPT_INVOKER(call_context) |
1427                     ((microtask == (microtask_t)__kmp_teams_master)
1428                          ? ompt_parallel_league
1429                          : ompt_parallel_team);
1430         ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1431             parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1432             return_address);
1433       }
1434       master_th->th.ompt_thread_info.state = ompt_state_overhead;
1435     }
1436 #endif
1437 
1438     master_th->th.th_ident = loc;
1439 
1440     if (master_th->th.th_teams_microtask && ap &&
1441         microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1442       // AC: Start of a parallel nested inside a teams construct. The team is
1443       // the actual (hot) team; all workers are ready at the fork barrier. No
1444       // lock is needed to initialize the team a bit and then release workers.
1445       parent_team->t.t_ident = loc;
1446       __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1447       parent_team->t.t_argc = argc;
1448       argv = (void **)parent_team->t.t_argv;
1449       for (i = argc - 1; i >= 0; --i)
1450         *argv++ = va_arg(kmp_va_deref(ap), void *);
1451       // Increment our nested depth level without increasing the serialization
1452       if (parent_team == master_th->th.th_serial_team) {
1453         // AC: we are in serialized parallel
1454         __kmpc_serialized_parallel(loc, gtid);
1455         KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1456 
1457         if (call_context == fork_context_gnu) {
1458           // AC: need to decrement t_serialized for enquiry functions to work
1459           // correctly, will restore at join time
1460           parent_team->t.t_serialized--;
1461           return TRUE;
1462         }
1463 
1464 #if OMPT_SUPPORT
1465         void *dummy;
1466         void **exit_frame_p;
1467 
1468         ompt_lw_taskteam_t lw_taskteam;
1469 
1470         if (ompt_enabled.enabled) {
1471           __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1472                                   &ompt_parallel_data, return_address);
1473           exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1474 
1475           __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1476           // don't use lw_taskteam after linking. content was swapped
1477 
1478           /* OMPT implicit task begin */
1479           implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1480           if (ompt_enabled.ompt_callback_implicit_task) {
1481             OMPT_CUR_TASK_INFO(master_th)->thread_num =
1482                 __kmp_tid_from_gtid(gtid);
1483             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1484                 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1485                 implicit_task_data, 1,
1486                 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1487           }
1488 
1489           /* OMPT state */
1490           master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1491         } else {
1492           exit_frame_p = &dummy;
1493         }
1494 #endif
1495         // AC: need to decrement t_serialized for enquiry functions to work
1496         // correctly, will restore at join time
1497         parent_team->t.t_serialized--;
1498 
1499         {
1500           KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1501           KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1502           __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1503 #if OMPT_SUPPORT
1504                                  ,
1505                                  exit_frame_p
1506 #endif
1507           );
1508         }
1509 
1510 #if OMPT_SUPPORT
1511         if (ompt_enabled.enabled) {
1512           *exit_frame_p = NULL;
1513           OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1514           if (ompt_enabled.ompt_callback_implicit_task) {
1515             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1516                 ompt_scope_end, NULL, implicit_task_data, 1,
1517                 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1518           }
1519           ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1520           __ompt_lw_taskteam_unlink(master_th);
1521           if (ompt_enabled.ompt_callback_parallel_end) {
1522             ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1523                 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1524                 OMPT_INVOKER(call_context) | ompt_parallel_team,
1525                 return_address);
1526           }
1527           master_th->th.ompt_thread_info.state = ompt_state_overhead;
1528         }
1529 #endif
1530         return TRUE;
1531       }
1532 
1533       parent_team->t.t_pkfn = microtask;
1534       parent_team->t.t_invoke = invoker;
1535       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1536       parent_team->t.t_active_level++;
1537       parent_team->t.t_level++;
1538       parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1539 
1540 #if OMPT_SUPPORT
1541       if (ompt_enabled.enabled) {
1542         ompt_lw_taskteam_t lw_taskteam;
1543         __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1544                                 &ompt_parallel_data, return_address);
1545         __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1546       }
1547 #endif
1548 
1549       /* Change number of threads in the team if requested */
1550       if (master_set_numthreads) { // The parallel has num_threads clause
1551         if (master_set_numthreads < master_th->th.th_teams_size.nth) {
1552           // AC: can only reduce number of threads dynamically, can't increase
1553           kmp_info_t **other_threads = parent_team->t.t_threads;
1554           parent_team->t.t_nproc = master_set_numthreads;
1555           for (i = 0; i < master_set_numthreads; ++i) {
1556             other_threads[i]->th.th_team_nproc = master_set_numthreads;
1557           }
1558           // Keep extra threads hot in the team for possible next parallels
1559         }
1560         master_th->th.th_set_nproc = 0;
1561       }
1562 
1563 #if USE_DEBUGGER
1564       if (__kmp_debugging) { // Let debugger override number of threads.
1565         int nth = __kmp_omp_num_threads(loc);
1566         if (nth > 0) { // 0 means debugger doesn't want to change num threads
1567           master_set_numthreads = nth;
1568         }
1569       }
1570 #endif
1571 
1572 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1573       if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1574            KMP_ITT_DEBUG) &&
1575           __kmp_forkjoin_frames_mode == 3 &&
1576           parent_team->t.t_active_level == 1 // only report frames at level 1
1577           && master_th->th.th_teams_size.nteams == 1) {
1578         kmp_uint64 tmp_time = __itt_get_timestamp();
1579         master_th->th.th_frame_time = tmp_time;
1580         parent_team->t.t_region_time = tmp_time;
1581       }
1582       if (__itt_stack_caller_create_ptr) {
1583         KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
1584         // create new stack stitching id before entering fork barrier
1585         parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1586       }
1587 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1588 
1589       KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1590                     "master_th=%p, gtid=%d\n",
1591                     root, parent_team, master_th, gtid));
1592       __kmp_internal_fork(loc, gtid, parent_team);
1593       KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1594                     "master_th=%p, gtid=%d\n",
1595                     root, parent_team, master_th, gtid));
1596 
1597       if (call_context == fork_context_gnu)
1598         return TRUE;
1599 
1600       /* Invoke microtask for MASTER thread */
1601       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1602                     parent_team->t.t_id, parent_team->t.t_pkfn));
1603 
1604       if (!parent_team->t.t_invoke(gtid)) {
1605         KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
1606       }
1607       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1608                     parent_team->t.t_id, parent_team->t.t_pkfn));
1609       KMP_MB(); /* Flush all pending memory write invalidates.  */
1610 
1611       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1612 
1613       return TRUE;
1614     } // Parallel closely nested in teams construct
1615 
1616 #if KMP_DEBUG
1617     if (__kmp_tasking_mode != tskm_immediate_exec) {
1618       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1619                        parent_team->t.t_task_team[master_th->th.th_task_state]);
1620     }
1621 #endif
1622 
1623     int enter_teams = 0;
1624     if (parent_team->t.t_active_level >=
1625         master_th->th.th_current_task->td_icvs.max_active_levels) {
1626       nthreads = 1;
1627     } else {
1628       enter_teams = ((ap == NULL && active_level == 0) ||
1629                      (ap && teams_level > 0 && teams_level == level));
1630       nthreads =
1631           master_set_numthreads
1632               ? master_set_numthreads
1633               : get__nproc_2(
1634                     parent_team,
1635                     master_tid); // TODO: get nproc directly from current task
1636 
1637       // Check whether to take the forkjoin lock (no need for a serialized
1638       // parallel outside of a teams construct). This code was moved here from
1639       // __kmp_reserve_threads() to speed up nested serialized parallels.
1640       if (nthreads > 1) {
1641         if ((get__max_active_levels(master_th) == 1 &&
1642              (root->r.r_in_parallel && !enter_teams)) ||
1643             (__kmp_library == library_serial)) {
1644           KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1645                         " threads\n",
1646                         gtid, nthreads));
1647           nthreads = 1;
1648         }
1649       }
1650       if (nthreads > 1) {
1651         /* determine how many new threads we can use */
1652         __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1653         /* AC: If we execute teams from parallel region (on host), then teams
1654            should be created but each can only have 1 thread if nesting is
1655            disabled. If teams called from serial region, then teams and their
1656            threads should be created regardless of the nesting setting. */
1657         nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
1658                                          nthreads, enter_teams);
1659         if (nthreads == 1) {
1660           // Free lock for single thread execution here; for multi-thread
1661           // execution it will be freed later after team of threads created
1662           // and initialized
1663           __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1664         }
1665       }
1666     }
1667     KMP_DEBUG_ASSERT(nthreads > 0);
1668 
1669     // If we temporarily changed the set number of threads then restore it now
1670     master_th->th.th_set_nproc = 0;
1671 
1672     /* create a serialized parallel region? */
1673     if (nthreads == 1) {
1674 /* josh todo: hypothetical question: what do we do for OS X*? */
1675 #if KMP_OS_LINUX &&                                                            \
1676     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1677       void *args[argc];
1678 #else
1679       void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1680 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1681           KMP_ARCH_AARCH64) */
1682 
1683       KA_TRACE(20,
1684                ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1685 
1686       __kmpc_serialized_parallel(loc, gtid);
1687 
1688       if (call_context == fork_context_intel) {
1689         /* TODO this sucks, use the compiler itself to pass args! :) */
1690         master_th->th.th_serial_team->t.t_ident = loc;
1691         if (!ap) {
1692           // revert change made in __kmpc_serialized_parallel()
1693           master_th->th.th_serial_team->t.t_level--;
1694           // Get args from parent team for teams construct
1695 
1696 #if OMPT_SUPPORT
1697           void *dummy;
1698           void **exit_frame_p;
1699           ompt_task_info_t *task_info;
1700 
1701           ompt_lw_taskteam_t lw_taskteam;
1702 
1703           if (ompt_enabled.enabled) {
1704             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1705                                     &ompt_parallel_data, return_address);
1706 
1707             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1708             // don't use lw_taskteam after linking. content was swapped
1709 
1710             task_info = OMPT_CUR_TASK_INFO(master_th);
1711             exit_frame_p = &(task_info->frame.exit_frame.ptr);
1712             if (ompt_enabled.ompt_callback_implicit_task) {
1713               OMPT_CUR_TASK_INFO(master_th)->thread_num =
1714                   __kmp_tid_from_gtid(gtid);
1715               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1716                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1717                   &(task_info->task_data), 1,
1718                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1719                   ompt_task_implicit);
1720             }
1721 
1722             /* OMPT state */
1723             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1724           } else {
1725             exit_frame_p = &dummy;
1726           }
1727 #endif
1728 
1729           {
1730             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1731             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1732             __kmp_invoke_microtask(microtask, gtid, 0, argc,
1733                                    parent_team->t.t_argv
1734 #if OMPT_SUPPORT
1735                                    ,
1736                                    exit_frame_p
1737 #endif
1738             );
1739           }
1740 
1741 #if OMPT_SUPPORT
1742           if (ompt_enabled.enabled) {
1743             *exit_frame_p = NULL;
1744             if (ompt_enabled.ompt_callback_implicit_task) {
1745               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1746                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1747                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1748                   ompt_task_implicit);
1749             }
1750             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1751             __ompt_lw_taskteam_unlink(master_th);
1752             if (ompt_enabled.ompt_callback_parallel_end) {
1753               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1754                   &ompt_parallel_data, parent_task_data,
1755                   OMPT_INVOKER(call_context) | ompt_parallel_team,
1756                   return_address);
1757             }
1758             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1759           }
1760 #endif
1761         } else if (microtask == (microtask_t)__kmp_teams_master) {
1762           KMP_DEBUG_ASSERT(master_th->th.th_team ==
1763                            master_th->th.th_serial_team);
1764           team = master_th->th.th_team;
1765           // team->t.t_pkfn = microtask;
1766           team->t.t_invoke = invoker;
1767           __kmp_alloc_argv_entries(argc, team, TRUE);
1768           team->t.t_argc = argc;
1769           argv = (void **)team->t.t_argv;
1770           if (ap) {
1771             for (i = argc - 1; i >= 0; --i)
1772               *argv++ = va_arg(kmp_va_deref(ap), void *);
1773           } else {
1774             for (i = 0; i < argc; ++i)
1775               // Get args from parent team for teams construct
1776               argv[i] = parent_team->t.t_argv[i];
1777           }
1778           // AC: revert change made in __kmpc_serialized_parallel()
1779           //     because initial code in teams should have level=0
1780           team->t.t_level--;
1781           // AC: call special invoker for outer "parallel" of teams construct
1782           invoker(gtid);
1783 #if OMPT_SUPPORT
1784           if (ompt_enabled.enabled) {
1785             ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1786             if (ompt_enabled.ompt_callback_implicit_task) {
1787               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1788                   ompt_scope_end, NULL, &(task_info->task_data), 0,
1789                   OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1790             }
1791             if (ompt_enabled.ompt_callback_parallel_end) {
1792               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1793                   &ompt_parallel_data, parent_task_data,
1794                   OMPT_INVOKER(call_context) | ompt_parallel_league,
1795                   return_address);
1796             }
1797             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1798           }
1799 #endif
1800         } else {
1801           argv = args;
1802           for (i = argc - 1; i >= 0; --i)
1803             *argv++ = va_arg(kmp_va_deref(ap), void *);
1804           KMP_MB();
1805 
1806 #if OMPT_SUPPORT
1807           void *dummy;
1808           void **exit_frame_p;
1809           ompt_task_info_t *task_info;
1810 
1811           ompt_lw_taskteam_t lw_taskteam;
1812 
1813           if (ompt_enabled.enabled) {
1814             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1815                                     &ompt_parallel_data, return_address);
1816             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1817             // don't use lw_taskteam after linking. content was swapped
1818             task_info = OMPT_CUR_TASK_INFO(master_th);
1819             exit_frame_p = &(task_info->frame.exit_frame.ptr);
1820 
1821             /* OMPT implicit task begin */
1822             implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1823             if (ompt_enabled.ompt_callback_implicit_task) {
1824               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1825                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1826                   implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1827                   ompt_task_implicit);
1828               OMPT_CUR_TASK_INFO(master_th)->thread_num =
1829                   __kmp_tid_from_gtid(gtid);
1830             }
1831 
1832             /* OMPT state */
1833             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1834           } else {
1835             exit_frame_p = &dummy;
1836           }
1837 #endif
1838 
1839           {
1840             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1841             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1842             __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1843 #if OMPT_SUPPORT
1844                                    ,
1845                                    exit_frame_p
1846 #endif
1847             );
1848           }
1849 
1850 #if OMPT_SUPPORT
1851           if (ompt_enabled.enabled) {
1852             *exit_frame_p = NULL;
1853             if (ompt_enabled.ompt_callback_implicit_task) {
1854               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1855                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1856                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1857                   ompt_task_implicit);
1858             }
1859 
1860             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1861             __ompt_lw_taskteam_unlink(master_th);
1862             if (ompt_enabled.ompt_callback_parallel_end) {
1863               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1864                   &ompt_parallel_data, parent_task_data,
1865                   OMPT_INVOKER(call_context) | ompt_parallel_team,
1866                   return_address);
1867             }
1868             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1869           }
1870 #endif
1871         }
1872       } else if (call_context == fork_context_gnu) {
1873 #if OMPT_SUPPORT
1874         ompt_lw_taskteam_t lwt;
1875         __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1876                                 return_address);
1877 
1878         lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1879         __ompt_lw_taskteam_link(&lwt, master_th, 1);
1880 // don't use lw_taskteam after linking. content was swapped
1881 #endif
1882 
1883         // we were called from GNU native code
1884         KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1885         return FALSE;
1886       } else {
1887         KMP_ASSERT2(call_context < fork_context_last,
1888                     "__kmp_fork_call: unknown fork_context parameter");
1889       }
1890 
1891       KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1892       KMP_MB();
1893       return FALSE;
1894     } // if (nthreads == 1)
1895 
1896     // GEH: only modify the executing flag in the case when not serialized;
1897     //      the serialized case is handled in __kmpc_serialized_parallel
1898     KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1899                   "curtask=%p, curtask_max_aclevel=%d\n",
1900                   parent_team->t.t_active_level, master_th,
1901                   master_th->th.th_current_task,
1902                   master_th->th.th_current_task->td_icvs.max_active_levels));
1903     // TODO: GEH - cannot do this assertion because root thread not set up as
1904     // executing
1905     // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1906     master_th->th.th_current_task->td_flags.executing = 0;
1907 
1908     if (!master_th->th.th_teams_microtask || level > teams_level) {
1909       /* Increment our nested depth level */
1910       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1911     }
1912 
1913     // See if we need to make a copy of the ICVs.
1914     int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1915     if ((level + 1 < __kmp_nested_nth.used) &&
1916         (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1917       nthreads_icv = __kmp_nested_nth.nth[level + 1];
1918     } else {
1919       nthreads_icv = 0; // don't update
1920     }
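
    // Illustrative example (sketch): with a nesting list such as
    // OMP_NUM_THREADS="8,2", __kmp_nested_nth holds {8, 2}. Forking from level 0
    // looks at entry level+1 == 1, so the new team's threads inherit an
    // nthreads-var of 2, and a parallel region nested inside them would default
    // to 2 threads. The exact parsing of the list lives in the settings code.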
1921 
1922     // Figure out the proc_bind_policy for the new team.
1923     kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1924     kmp_proc_bind_t proc_bind_icv =
1925         proc_bind_default; // proc_bind_default means don't update
1926     if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1927       proc_bind = proc_bind_false;
1928     } else {
1929       if (proc_bind == proc_bind_default) {
1930         // No proc_bind clause specified; use current proc-bind-var for this
1931         // parallel region
1932         proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1933       }
1934       /* else: The proc_bind policy was specified explicitly on parallel clause.
1935          This overrides proc-bind-var for this parallel region, but does not
1936          change proc-bind-var. */
1937       // Figure the value of proc-bind-var for the child threads.
1938       if ((level + 1 < __kmp_nested_proc_bind.used) &&
1939           (__kmp_nested_proc_bind.bind_types[level + 1] !=
1940            master_th->th.th_current_task->td_icvs.proc_bind)) {
1941         proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1942       }
1943     }
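
    // Illustrative example (sketch): with a bind list such as
    // OMP_PROC_BIND="spread,close" and no proc_bind clause, this fork binds the
    // new team with 'spread' (the current proc-bind-var), while bind_types[1] ==
    // 'close' becomes the proc-bind-var the team's threads use for any parallel
    // region nested inside them.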
1944 
1945     // Reset for next parallel region
1946     master_th->th.th_set_proc_bind = proc_bind_default;
1947 
1948     if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
1949       kmp_internal_control_t new_icvs;
1950       copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
1951       new_icvs.next = NULL;
1952       if (nthreads_icv > 0) {
1953         new_icvs.nproc = nthreads_icv;
1954       }
1955       if (proc_bind_icv != proc_bind_default) {
1956         new_icvs.proc_bind = proc_bind_icv;
1957       }
1958 
1959       /* allocate a new parallel team */
1960       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
1961       team = __kmp_allocate_team(root, nthreads, nthreads,
1962 #if OMPT_SUPPORT
1963                                  ompt_parallel_data,
1964 #endif
1965                                  proc_bind, &new_icvs,
1966                                  argc USE_NESTED_HOT_ARG(master_th));
1967     } else {
1968       /* allocate a new parallel team */
1969       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
1970       team = __kmp_allocate_team(root, nthreads, nthreads,
1971 #if OMPT_SUPPORT
1972                                  ompt_parallel_data,
1973 #endif
1974                                  proc_bind,
1975                                  &master_th->th.th_current_task->td_icvs,
1976                                  argc USE_NESTED_HOT_ARG(master_th));
1977     }
1978     KF_TRACE(
1979         10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
1980 
1981     /* setup the new team */
1982     KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
1983     KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
1984     KMP_CHECK_UPDATE(team->t.t_ident, loc);
1985     KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
1986     KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
1987 #if OMPT_SUPPORT
1988     KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
1989                           return_address);
1990 #endif
1991     KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
1992     // TODO: parent_team->t.t_level == INT_MAX ???
1993     if (!master_th->th.th_teams_microtask || level > teams_level) {
1994       int new_level = parent_team->t.t_level + 1;
1995       KMP_CHECK_UPDATE(team->t.t_level, new_level);
1996       new_level = parent_team->t.t_active_level + 1;
1997       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
1998     } else {
1999       // AC: Do not increase parallel level at start of the teams construct
2000       int new_level = parent_team->t.t_level;
2001       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2002       new_level = parent_team->t.t_active_level;
2003       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2004     }
2005     kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2006     // set master's schedule as new run-time schedule
2007     KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2008 
2009     KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2010     KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2011 
2012     // Update the floating point rounding in the team if required.
2013     propagateFPControl(team);
2014 
2015     if (__kmp_tasking_mode != tskm_immediate_exec) {
2016       // Set master's task team to team's task team. Unless this is a hot team,
2017       // it should be NULL.
2018       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2019                        parent_team->t.t_task_team[master_th->th.th_task_state]);
2020       KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
2021                     "%p, new task_team %p / team %p\n",
2022                     __kmp_gtid_from_thread(master_th),
2023                     master_th->th.th_task_team, parent_team,
2024                     team->t.t_task_team[master_th->th.th_task_state], team));
2025 
2026       if (active_level || master_th->th.th_task_team) {
2027         // Take a memo of master's task_state
2028         KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2029         if (master_th->th.th_task_state_top >=
2030             master_th->th.th_task_state_stack_sz) { // increase size
2031           kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2032           kmp_uint8 *old_stack, *new_stack;
2033           kmp_uint32 i;
2034           new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2035           for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2036             new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2037           }
2038           for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2039                ++i) { // zero-init rest of stack
2040             new_stack[i] = 0;
2041           }
2042           old_stack = master_th->th.th_task_state_memo_stack;
2043           master_th->th.th_task_state_memo_stack = new_stack;
2044           master_th->th.th_task_state_stack_sz = new_size;
2045           __kmp_free(old_stack);
2046         }
2047         // Store master's task_state on stack
2048         master_th->th
2049             .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2050             master_th->th.th_task_state;
2051         master_th->th.th_task_state_top++;
2052 #if KMP_NESTED_HOT_TEAMS
2053         if (master_th->th.th_hot_teams &&
2054             active_level < __kmp_hot_teams_max_level &&
2055             team == master_th->th.th_hot_teams[active_level].hot_team) {
2056           // Restore master's nested state if nested hot team
2057           master_th->th.th_task_state =
2058               master_th->th
2059                   .th_task_state_memo_stack[master_th->th.th_task_state_top];
2060         } else {
2061 #endif
2062           master_th->th.th_task_state = 0;
2063 #if KMP_NESTED_HOT_TEAMS
2064         }
2065 #endif
2066       }
2067 #if !KMP_NESTED_HOT_TEAMS
2068       KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2069                        (team == root->r.r_hot_team));
2070 #endif
2071     }
2072 
2073     KA_TRACE(
2074         20,
2075         ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2076          gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2077          team->t.t_nproc));
2078     KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2079                      (team->t.t_master_tid == 0 &&
2080                       (team->t.t_parent == root->r.r_root_team ||
2081                        team->t.t_parent->t.t_serialized)));
2082     KMP_MB();
2083 
2084     /* now, setup the arguments */
2085     argv = (void **)team->t.t_argv;
2086     if (ap) {
2087       for (i = argc - 1; i >= 0; --i) {
2088         void *new_argv = va_arg(kmp_va_deref(ap), void *);
2089         KMP_CHECK_UPDATE(*argv, new_argv);
2090         argv++;
2091       }
2092     } else {
2093       for (i = 0; i < argc; ++i) {
2094         // Get args from parent team for teams construct
2095         KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2096       }
2097     }
2098 
2099     /* now actually fork the threads */
2100     KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2101     if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2102       root->r.r_active = TRUE;
2103 
2104     __kmp_fork_team_threads(root, team, master_th, gtid);
2105     __kmp_setup_icv_copy(team, nthreads,
2106                          &master_th->th.th_current_task->td_icvs, loc);
2107 
2108 #if OMPT_SUPPORT
2109     master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2110 #endif
2111 
2112     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2113 
2114 #if USE_ITT_BUILD
2115     if (team->t.t_active_level == 1 // only report frames at level 1
2116         && !master_th->th.th_teams_microtask) { // not in teams construct
2117 #if USE_ITT_NOTIFY
2118       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2119           (__kmp_forkjoin_frames_mode == 3 ||
2120            __kmp_forkjoin_frames_mode == 1)) {
2121         kmp_uint64 tmp_time = 0;
2122         if (__itt_get_timestamp_ptr)
2123           tmp_time = __itt_get_timestamp();
2124         // Internal fork - report frame begin
2125         master_th->th.th_frame_time = tmp_time;
2126         if (__kmp_forkjoin_frames_mode == 3)
2127           team->t.t_region_time = tmp_time;
2128       } else
2129 // only one notification scheme (either "submit" or "forking/joined", not both)
2130 #endif /* USE_ITT_NOTIFY */
2131           if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2132               __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2133         // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2134         __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2135       }
2136     }
2137 #endif /* USE_ITT_BUILD */
2138 
2139     /* now go on and do the work */
2140     KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2141     KMP_MB();
2142     KF_TRACE(10,
2143              ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2144               root, team, master_th, gtid));
2145 
2146 #if USE_ITT_BUILD
2147     if (__itt_stack_caller_create_ptr) {
2148       // create new stack stitching id before entering fork barrier
2149       if (!enter_teams) {
2150         KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
2151         team->t.t_stack_id = __kmp_itt_stack_caller_create();
2152       } else if (parent_team->t.t_serialized) {
2153         // keep stack stitching id in the serialized parent_team;
2154         // current team will be used for parallel inside the teams;
2155         // if parent_team is active, then it already keeps stack stitching id
2156         // for the league of teams
2157         KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
2158         parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
2159       }
2160     }
2161 #endif /* USE_ITT_BUILD */
2162 
2163     // AC: skip __kmp_internal_fork for the teams construct; let only the
2164     // master threads execute
2165     if (ap) {
2166       __kmp_internal_fork(loc, gtid, team);
2167       KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2168                     "master_th=%p, gtid=%d\n",
2169                     root, team, master_th, gtid));
2170     }
2171 
2172     if (call_context == fork_context_gnu) {
2173       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2174       return TRUE;
2175     }
2176 
2177     /* Invoke microtask for MASTER thread */
2178     KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2179                   team->t.t_id, team->t.t_pkfn));
2180   } // END of timer KMP_fork_call block
2181 
2182 #if KMP_STATS_ENABLED
2183   // If beginning a teams construct, then change thread state
2184   stats_state_e previous_state = KMP_GET_THREAD_STATE();
2185   if (!ap) {
2186     KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2187   }
2188 #endif
2189 
2190   if (!team->t.t_invoke(gtid)) {
2191     KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
2192   }
2193 
2194 #if KMP_STATS_ENABLED
2195   // If was beginning of a teams construct, then reset thread state
2196   if (!ap) {
2197     KMP_SET_THREAD_STATE(previous_state);
2198   }
2199 #endif
2200 
2201   KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2202                 team->t.t_id, team->t.t_pkfn));
2203   KMP_MB(); /* Flush all pending memory write invalidates.  */
2204 
2205   KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2206 
2207 #if OMPT_SUPPORT
2208   if (ompt_enabled.enabled) {
2209     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2210   }
2211 #endif
2212 
2213   return TRUE;
2214 }
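
// Illustrative mapping (a sketch, not an exact reproduction of any compiler's
// output): a user-level region such as
//
//     #pragma omp parallel num_threads(4)
//     { work(); }
//
// is lowered to an outlined microtask plus a call into a runtime entry point
// (e.g. __kmpc_fork_call for the Intel-style interface), which reaches
// __kmp_fork_call() above with the matching fork_context_e value and the
// outlined function as 'microtask'.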
2215 
2216 #if OMPT_SUPPORT
2217 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2218                                             kmp_team_t *team) {
2219   // restore state outside the region
2220   thread->th.ompt_thread_info.state =
2221       ((team->t.t_serialized) ? ompt_state_work_serial
2222                               : ompt_state_work_parallel);
2223 }
2224 
2225 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2226                                    kmp_team_t *team, ompt_data_t *parallel_data,
2227                                    int flags, void *codeptr) {
2228   ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2229   if (ompt_enabled.ompt_callback_parallel_end) {
2230     ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2231         parallel_data, &(task_info->task_data), flags, codeptr);
2232   }
2233 
2234   task_info->frame.enter_frame = ompt_data_none;
2235   __kmp_join_restore_state(thread, team);
2236 }
2237 #endif
2238 
2239 void __kmp_join_call(ident_t *loc, int gtid
2240 #if OMPT_SUPPORT
2241                      ,
2242                      enum fork_context_e fork_context
2243 #endif
2244                      ,
2245                      int exit_teams) {
2246   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2247   kmp_team_t *team;
2248   kmp_team_t *parent_team;
2249   kmp_info_t *master_th;
2250   kmp_root_t *root;
2251   int master_active;
2252 
2253   KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2254 
2255   /* setup current data */
2256   master_th = __kmp_threads[gtid];
2257   root = master_th->th.th_root;
2258   team = master_th->th.th_team;
2259   parent_team = team->t.t_parent;
2260 
2261   master_th->th.th_ident = loc;
2262 
2263 #if OMPT_SUPPORT
2264   void *team_microtask = (void *)team->t.t_pkfn;
2265   // For the GOMP interface with a serialized parallel region, we need
2266   // __kmpc_end_serialized_parallel to call the hooks for the OMPT
2267   // end-implicit-task and end-parallel events.
2268   if (ompt_enabled.enabled &&
2269       !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2270     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2271   }
2272 #endif
2273 
2274 #if KMP_DEBUG
2275   if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2276     KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2277                   "th_task_team = %p\n",
2278                   __kmp_gtid_from_thread(master_th), team,
2279                   team->t.t_task_team[master_th->th.th_task_state],
2280                   master_th->th.th_task_team));
2281     KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2282                      team->t.t_task_team[master_th->th.th_task_state]);
2283   }
2284 #endif
2285 
2286   if (team->t.t_serialized) {
2287     if (master_th->th.th_teams_microtask) {
2288       // We are in teams construct
2289       int level = team->t.t_level;
2290       int tlevel = master_th->th.th_teams_level;
2291       if (level == tlevel) {
2292         // AC: we haven't incremented it earlier at the start of the teams
2293         //     construct, so do it here, at the end of the teams construct
2294         team->t.t_level++;
2295       } else if (level == tlevel + 1) {
2296         // AC: we are exiting parallel inside teams, need to increment
2297         // serialization in order to restore it in the next call to
2298         // __kmpc_end_serialized_parallel
2299         team->t.t_serialized++;
2300       }
2301     }
2302     __kmpc_end_serialized_parallel(loc, gtid);
2303 
2304 #if OMPT_SUPPORT
2305     if (ompt_enabled.enabled) {
2306       __kmp_join_restore_state(master_th, parent_team);
2307     }
2308 #endif
2309 
2310     return;
2311   }
2312 
2313   master_active = team->t.t_master_active;
2314 
2315   if (!exit_teams) {
2316     // AC: No barrier for internal teams at exit from teams construct.
2317     //     But there is barrier for external team (league).
2318     __kmp_internal_join(loc, gtid, team);
2319 #if USE_ITT_BUILD
2320     if (__itt_stack_caller_create_ptr) {
2321       KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
2322       // destroy the stack stitching id after join barrier
2323       __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2324       team->t.t_stack_id = NULL;
2325     }
2326 #endif
2327   } else {
2328     master_th->th.th_task_state =
2329         0; // AC: no tasking in teams (out of any parallel)
2330 #if USE_ITT_BUILD
2331     if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
2332       KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
2333       // destroy the stack stitching id on exit from the teams construct
2334       // if parent_team is active, then the id will be destroyed later on
2335       // by master of the league of teams
2336       __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
2337       parent_team->t.t_stack_id = NULL;
2338     }
2339 #endif
2340   }
2341 
2342   KMP_MB();
2343 
2344 #if OMPT_SUPPORT
2345   ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2346   void *codeptr = team->t.ompt_team_info.master_return_address;
2347 #endif
2348 
2349 #if USE_ITT_BUILD
2350   // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2351   if (team->t.t_active_level == 1 &&
2352       (!master_th->th.th_teams_microtask || /* not in teams construct */
2353        master_th->th.th_teams_size.nteams == 1)) {
2354     master_th->th.th_ident = loc;
2355     // only one notification scheme (either "submit" or "forking/joined", not
2356     // both)
2357     if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2358         __kmp_forkjoin_frames_mode == 3)
2359       __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2360                              master_th->th.th_frame_time, 0, loc,
2361                              master_th->th.th_team_nproc, 1);
2362     else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2363              !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2364       __kmp_itt_region_joined(gtid);
2365   } // active_level == 1
2366 #endif /* USE_ITT_BUILD */
2367 
2368   if (master_th->th.th_teams_microtask && !exit_teams &&
2369       team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2370       team->t.t_level == master_th->th.th_teams_level + 1) {
2371 // AC: We need to leave the team structure intact at the end of a parallel
2372 // region inside the teams construct, so that the next parallel region reuses
2373 // the same (hot) team; only adjust the nesting levels.
2374 #if OMPT_SUPPORT
2375     ompt_data_t ompt_parallel_data = ompt_data_none;
2376     if (ompt_enabled.enabled) {
2377       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2378       if (ompt_enabled.ompt_callback_implicit_task) {
2379         int ompt_team_size = team->t.t_nproc;
2380         ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2381             ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2382             OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2383       }
2384       task_info->frame.exit_frame = ompt_data_none;
2385       task_info->task_data = ompt_data_none;
2386       ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2387       __ompt_lw_taskteam_unlink(master_th);
2388     }
2389 #endif
2390     /* Decrement our nested depth level */
2391     team->t.t_level--;
2392     team->t.t_active_level--;
2393     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2394 
2395     // Restore the number of threads in the team if needed. This code relies
2396     // on th_teams_size.nth being properly adjusted after the fork in
2397     // __kmp_teams_master on each teams master, in case __kmp_reserve_threads
2398     // reduced it.
2399     if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2400       int old_num = master_th->th.th_team_nproc;
2401       int new_num = master_th->th.th_teams_size.nth;
2402       kmp_info_t **other_threads = team->t.t_threads;
2403       team->t.t_nproc = new_num;
2404       for (int i = 0; i < old_num; ++i) {
2405         other_threads[i]->th.th_team_nproc = new_num;
2406       }
2407       // Adjust the state of the team's unused threads
2408       for (int i = old_num; i < new_num; ++i) {
2409         // Re-initialize thread's barrier data.
2410         KMP_DEBUG_ASSERT(other_threads[i]);
2411         kmp_balign_t *balign = other_threads[i]->th.th_bar;
2412         for (int b = 0; b < bs_last_barrier; ++b) {
2413           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2414           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2415 #if USE_DEBUGGER
2416           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2417 #endif
2418         }
2419         if (__kmp_tasking_mode != tskm_immediate_exec) {
2420           // Synchronize thread's task state
2421           other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2422         }
2423       }
2424     }
2425 
2426 #if OMPT_SUPPORT
2427     if (ompt_enabled.enabled) {
2428       __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2429                       OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2430     }
2431 #endif
2432 
2433     return;
2434   }
2435 
2436   /* do cleanup and restore the parent team */
2437   master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2438   master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2439 
2440   master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2441 
2442   /* jc: The following lock has instructions with REL and ACQ semantics,
2443      separating the parallel user code called in this parallel region
2444      from the serial user code called after this function returns. */
2445   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2446 
2447   if (!master_th->th.th_teams_microtask ||
2448       team->t.t_level > master_th->th.th_teams_level) {
2449     /* Decrement our nested depth level */
2450     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2451   }
2452   KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2453 
2454 #if OMPT_SUPPORT
2455   if (ompt_enabled.enabled) {
2456     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2457     if (ompt_enabled.ompt_callback_implicit_task) {
2458       int flags = (team_microtask == (void *)__kmp_teams_master)
2459                       ? ompt_task_initial
2460                       : ompt_task_implicit;
2461       int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2462       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2463           ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2464           OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2465     }
2466     task_info->frame.exit_frame = ompt_data_none;
2467     task_info->task_data = ompt_data_none;
2468   }
2469 #endif
2470 
2471   KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2472                 master_th, team));
2473   __kmp_pop_current_task_from_thread(master_th);
2474 
2475 #if KMP_AFFINITY_SUPPORTED
2476   // Restore master thread's partition.
2477   master_th->th.th_first_place = team->t.t_first_place;
2478   master_th->th.th_last_place = team->t.t_last_place;
2479 #endif // KMP_AFFINITY_SUPPORTED
2480   master_th->th.th_def_allocator = team->t.t_def_allocator;
2481 
2482   updateHWFPControl(team);
2483 
2484   if (root->r.r_active != master_active)
2485     root->r.r_active = master_active;
2486 
2487   __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2488                             master_th)); // this will free worker threads
2489 
2490   /* This race was fun to find. Make sure the following stays in the critical
2491      region; otherwise assertions may fail occasionally since the old team may
2492      be reallocated and the hierarchy appears inconsistent. It is safe to run
2493      and won't cause any bugs, but it will trigger those assertion failures.
2494      It's only one deref & assign, so keep it inside the critical region. */
2495   master_th->th.th_team = parent_team;
2496   master_th->th.th_team_nproc = parent_team->t.t_nproc;
2497   master_th->th.th_team_master = parent_team->t.t_threads[0];
2498   master_th->th.th_team_serialized = parent_team->t.t_serialized;
2499 
2500   /* restore serialized team, if need be */
2501   if (parent_team->t.t_serialized &&
2502       parent_team != master_th->th.th_serial_team &&
2503       parent_team != root->r.r_root_team) {
2504     __kmp_free_team(root,
2505                     master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2506     master_th->th.th_serial_team = parent_team;
2507   }
2508 
2509   if (__kmp_tasking_mode != tskm_immediate_exec) {
2510     if (master_th->th.th_task_state_top >
2511         0) { // Restore task state from memo stack
2512       KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2513       // Remember master's state if we re-use this nested hot team
2514       master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2515           master_th->th.th_task_state;
2516       --master_th->th.th_task_state_top; // pop
2517       // Now restore state at this level
2518       master_th->th.th_task_state =
2519           master_th->th
2520               .th_task_state_memo_stack[master_th->th.th_task_state_top];
2521     }
2522     // Copy the task team from the parent team to the master thread
2523     master_th->th.th_task_team =
2524         parent_team->t.t_task_team[master_th->th.th_task_state];
2525     KA_TRACE(20,
2526              ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2527               __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2528               parent_team));
2529   }
2530 
2531   // TODO: GEH - cannot do this assertion because root thread not set up as
2532   // executing
2533   // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2534   master_th->th.th_current_task->td_flags.executing = 1;
2535 
2536   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2537 
2538 #if OMPT_SUPPORT
2539   int flags =
2540       OMPT_INVOKER(fork_context) |
2541       ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2542                                                       : ompt_parallel_team);
2543   if (ompt_enabled.enabled) {
2544     __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2545                     codeptr);
2546   }
2547 #endif
2548 
2549   KMP_MB();
2550   KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2551 }
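
// Illustrative pairing (sketch): on the Intel-style path, __kmpc_fork_call()
// runs __kmp_fork_call() and the parallel region, then calls __kmp_join_call()
// above to tear the region down; the GOMP compatibility layer reaches the same
// fork/join pair through its own wrappers. The exact call chain differs per
// interface.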
2552 
2553 /* Check whether we should push an internal control record onto the
2554    serial team stack.  If so, do it.  */
2555 void __kmp_save_internal_controls(kmp_info_t *thread) {
2556 
2557   if (thread->th.th_team != thread->th.th_serial_team) {
2558     return;
2559   }
2560   if (thread->th.th_team->t.t_serialized > 1) {
2561     int push = 0;
2562 
2563     if (thread->th.th_team->t.t_control_stack_top == NULL) {
2564       push = 1;
2565     } else {
2566       if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2567           thread->th.th_team->t.t_serialized) {
2568         push = 1;
2569       }
2570     }
2571     if (push) { /* push a record on the serial team's stack */
2572       kmp_internal_control_t *control =
2573           (kmp_internal_control_t *)__kmp_allocate(
2574               sizeof(kmp_internal_control_t));
2575 
2576       copy_icvs(control, &thread->th.th_current_task->td_icvs);
2577 
2578       control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2579 
2580       control->next = thread->th.th_team->t.t_control_stack_top;
2581       thread->th.th_team->t.t_control_stack_top = control;
2582     }
2583   }
2584 }
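
// Illustrative sketch of the intent: inside two nested serialized parallel
// regions (t_serialized == 2), a call like omp_set_num_threads() first lands
// here and pushes a record tagged serial_nesting_level == 2 holding the current
// ICVs; the matching pop/restore happens on the end-serialized-parallel path,
// so the outer region sees its original ICVs again.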
2585 
2586 /* Changes set_nproc */
2587 void __kmp_set_num_threads(int new_nth, int gtid) {
2588   kmp_info_t *thread;
2589   kmp_root_t *root;
2590 
2591   KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2592   KMP_DEBUG_ASSERT(__kmp_init_serial);
2593 
2594   if (new_nth < 1)
2595     new_nth = 1;
2596   else if (new_nth > __kmp_max_nth)
2597     new_nth = __kmp_max_nth;
2598 
2599   KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2600   thread = __kmp_threads[gtid];
2601   if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2602     return; // nothing to do
2603 
2604   __kmp_save_internal_controls(thread);
2605 
2606   set__nproc(thread, new_nth);
2607 
2608   // If this omp_set_num_threads() call will cause the hot team size to be
2609   // reduced (in the absence of a num_threads clause), then reduce it now,
2610   // rather than waiting for the next parallel region.
2611   root = thread->th.th_root;
2612   if (__kmp_init_parallel && (!root->r.r_active) &&
2613       (root->r.r_hot_team->t.t_nproc > new_nth)
2614 #if KMP_NESTED_HOT_TEAMS
2615       && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2616 #endif
2617   ) {
2618     kmp_team_t *hot_team = root->r.r_hot_team;
2619     int f;
2620 
2621     __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2622 
2623     // Release the extra threads we don't need any more.
2624     for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2625       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2626       if (__kmp_tasking_mode != tskm_immediate_exec) {
2627         // When decreasing the team size, threads no longer in the team should
2628         // unreference the task team.
2629         hot_team->t.t_threads[f]->th.th_task_team = NULL;
2630       }
2631       __kmp_free_thread(hot_team->t.t_threads[f]);
2632       hot_team->t.t_threads[f] = NULL;
2633     }
2634     hot_team->t.t_nproc = new_nth;
2635 #if KMP_NESTED_HOT_TEAMS
2636     if (thread->th.th_hot_teams) {
2637       KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2638       thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2639     }
2640 #endif
2641 
2642     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2643 
2644     // Update the t_nproc field in the threads that are still active.
2645     for (f = 0; f < new_nth; f++) {
2646       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2647       hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2648     }
2649     // Special flag to mark that omp_set_num_threads() was called
2650     hot_team->t.t_size_changed = -1;
2651   }
2652 }
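
// Illustrative mapping (sketch): a user call such as omp_set_num_threads(8)
// reaches __kmp_set_num_threads(8, gtid) through the API entry layer. It
// updates the calling thread's nthreads-var ICV and, as coded above, may shrink
// the hot team immediately rather than waiting for the next parallel region.
// The exact entry-point plumbing is interface-specific.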
2653 
2654 /* Changes max_active_levels */
2655 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2656   kmp_info_t *thread;
2657 
2658   KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2659                 "%d = (%d)\n",
2660                 gtid, max_active_levels));
2661   KMP_DEBUG_ASSERT(__kmp_init_serial);
2662 
2663   // validate max_active_levels
2664   if (max_active_levels < 0) {
2665     KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2666     // We ignore this call if the user has specified a negative value.
2667     // The current setting won't be changed. The last valid setting will be
2668     // used. A warning will be issued (if warnings are allowed as controlled by
2669     // the KMP_WARNINGS env var).
2670     KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2671                   "max_active_levels for thread %d = (%d)\n",
2672                   gtid, max_active_levels));
2673     return;
2674   }
2675   if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2676     // it's OK, the max_active_levels is within the valid range: [ 0;
2677     // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2678     // We allow a zero value. (implementation defined behavior)
2679   } else {
2680     KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2681                 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2682     max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
    // The current upper limit is MAX_INT. (implementation defined behavior)
    // If the input exceeds the upper limit, we clamp it to that limit.
    // (implementation defined behavior)
    // In practice this branch is unreachable as long as the limit is MAX_INT.
2687   }
2688   KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2689                 "max_active_levels for thread %d = (%d)\n",
2690                 gtid, max_active_levels));
2691 
2692   thread = __kmp_threads[gtid];
2693 
2694   __kmp_save_internal_controls(thread);
2695 
2696   set__max_active_levels(thread, max_active_levels);
2697 }
2698 
2699 /* Gets max_active_levels */
2700 int __kmp_get_max_active_levels(int gtid) {
2701   kmp_info_t *thread;
2702 
2703   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2704   KMP_DEBUG_ASSERT(__kmp_init_serial);
2705 
2706   thread = __kmp_threads[gtid];
2707   KMP_DEBUG_ASSERT(thread->th.th_current_task);
2708   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2709                 "curtask_maxaclevel=%d\n",
2710                 gtid, thread->th.th_current_task,
2711                 thread->th.th_current_task->td_icvs.max_active_levels));
2712   return thread->th.th_current_task->td_icvs.max_active_levels;
2713 }
2714 
2715 // nteams-var per-device ICV
2716 void __kmp_set_num_teams(int num_teams) {
2717   if (num_teams > 0)
2718     __kmp_nteams = num_teams;
2719 }
2720 int __kmp_get_max_teams(void) { return __kmp_nteams; }
2721 // teams-thread-limit-var per-device ICV
2722 void __kmp_set_teams_thread_limit(int limit) {
2723   if (limit > 0)
2724     __kmp_teams_thread_limit = limit;
2725 }
2726 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
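
// Illustrative sketch (hedged): these per-device ICVs back the OpenMP 5.1
// teams API, assuming omp_set_num_teams()/omp_get_max_teams() and
// omp_set_teams_thread_limit()/omp_get_teams_thread_limit() forward here:
//   omp_set_num_teams(8);          // nteams-var := 8
//   omp_set_teams_thread_limit(4); // teams-thread-limit-var := 4
// Non-positive arguments are ignored and the previous values are kept.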
2727 
2728 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2729 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2730 
2731 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2732 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2733   kmp_info_t *thread;
2734   kmp_sched_t orig_kind;
2735   //    kmp_team_t *team;
2736 
2737   KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2738                 gtid, (int)kind, chunk));
2739   KMP_DEBUG_ASSERT(__kmp_init_serial);
2740 
2741   // Check if the kind parameter is valid, correct if needed.
2742   // Valid parameters should fit in one of two intervals - standard or extended:
2743   //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2744   // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
2745   orig_kind = kind;
2746   kind = __kmp_sched_without_mods(kind);
2747 
2748   if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2749       (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2750     // TODO: Hint needs attention in case we change the default schedule.
2751     __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2752               KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2753               __kmp_msg_null);
2754     kind = kmp_sched_default;
2755     chunk = 0; // ignore chunk value in case of bad kind
2756   }
2757 
2758   thread = __kmp_threads[gtid];
2759 
2760   __kmp_save_internal_controls(thread);
2761 
2762   if (kind < kmp_sched_upper_std) {
2763     if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
      // differentiate static chunked vs. unchunked: the chunk should be
      // invalid to indicate an unchunked schedule (which is the default)
2766       thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2767     } else {
2768       thread->th.th_current_task->td_icvs.sched.r_sched_type =
2769           __kmp_sch_map[kind - kmp_sched_lower - 1];
2770     }
2771   } else {
2772     //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2773     //    kmp_sched_lower - 2 ];
2774     thread->th.th_current_task->td_icvs.sched.r_sched_type =
2775         __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2776                       kmp_sched_lower - 2];
2777   }
2778   __kmp_sched_apply_mods_intkind(
2779       orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2780   if (kind == kmp_sched_auto || chunk < 1) {
2781     // ignore parameter chunk for schedule auto
2782     thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2783   } else {
2784     thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2785   }
2786 }
2787 
2788 /* Gets def_sched_var ICV values */
2789 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2790   kmp_info_t *thread;
2791   enum sched_type th_type;
2792 
2793   KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2794   KMP_DEBUG_ASSERT(__kmp_init_serial);
2795 
2796   thread = __kmp_threads[gtid];
2797 
2798   th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2799   switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2800   case kmp_sch_static:
2801   case kmp_sch_static_greedy:
2802   case kmp_sch_static_balanced:
2803     *kind = kmp_sched_static;
2804     __kmp_sched_apply_mods_stdkind(kind, th_type);
    *chunk = 0; // chunk was not set; signal this fact with a zero value
2806     return;
2807   case kmp_sch_static_chunked:
2808     *kind = kmp_sched_static;
2809     break;
2810   case kmp_sch_dynamic_chunked:
2811     *kind = kmp_sched_dynamic;
2812     break;
2813   case kmp_sch_guided_chunked:
2814   case kmp_sch_guided_iterative_chunked:
2815   case kmp_sch_guided_analytical_chunked:
2816     *kind = kmp_sched_guided;
2817     break;
2818   case kmp_sch_auto:
2819     *kind = kmp_sched_auto;
2820     break;
2821   case kmp_sch_trapezoidal:
2822     *kind = kmp_sched_trapezoidal;
2823     break;
2824 #if KMP_STATIC_STEAL_ENABLED
2825   case kmp_sch_static_steal:
2826     *kind = kmp_sched_static_steal;
2827     break;
2828 #endif
2829   default:
2830     KMP_FATAL(UnknownSchedulingType, th_type);
2831   }
2832 
2833   __kmp_sched_apply_mods_stdkind(kind, th_type);
2834   *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2835 }
2836 
2837 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2838 
2839   int ii, dd;
2840   kmp_team_t *team;
2841   kmp_info_t *thr;
2842 
2843   KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2844   KMP_DEBUG_ASSERT(__kmp_init_serial);
2845 
2846   // validate level
2847   if (level == 0)
2848     return 0;
2849   if (level < 0)
2850     return -1;
2851   thr = __kmp_threads[gtid];
2852   team = thr->th.th_team;
2853   ii = team->t.t_level;
2854   if (level > ii)
2855     return -1;
2856 
2857   if (thr->th.th_teams_microtask) {
    // AC: we are in a teams region where multiple nested teams have the same
    // level
    int tlevel = thr->th.th_teams_level; // the level of the teams construct
    if (level <=
        tlevel) { // otherwise the usual algorithm works (won't touch teams)
      KMP_DEBUG_ASSERT(ii >= tlevel);
      // AC: As we need to pass through the teams league, we artificially
      // increase ii
      if (ii == tlevel) {
        ii += 2; // three teams have the same level
      } else {
        ii++; // two teams have the same level
2869       }
2870     }
2871   }
2872 
2873   if (ii == level)
2874     return __kmp_tid_from_gtid(gtid);
2875 
2876   dd = team->t.t_serialized;
2877   level++;
2878   while (ii > level) {
2879     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2880     }
2881     if ((team->t.t_serialized) && (!dd)) {
2882       team = team->t.t_parent;
2883       continue;
2884     }
2885     if (ii > level) {
2886       team = team->t.t_parent;
2887       dd = team->t.t_serialized;
2888       ii--;
2889     }
2890   }
2891 
2892   return (dd > 1) ? (0) : (team->t.t_master_tid);
2893 }
2894 
2895 int __kmp_get_team_size(int gtid, int level) {
2896 
2897   int ii, dd;
2898   kmp_team_t *team;
2899   kmp_info_t *thr;
2900 
2901   KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2902   KMP_DEBUG_ASSERT(__kmp_init_serial);
2903 
2904   // validate level
2905   if (level == 0)
2906     return 1;
2907   if (level < 0)
2908     return -1;
2909   thr = __kmp_threads[gtid];
2910   team = thr->th.th_team;
2911   ii = team->t.t_level;
2912   if (level > ii)
2913     return -1;
2914 
2915   if (thr->th.th_teams_microtask) {
    // AC: we are in a teams region where multiple nested teams have the same
    // level
    int tlevel = thr->th.th_teams_level; // the level of the teams construct
    if (level <=
        tlevel) { // otherwise the usual algorithm works (won't touch teams)
      KMP_DEBUG_ASSERT(ii >= tlevel);
      // AC: As we need to pass through the teams league, we artificially
      // increase ii
      if (ii == tlevel) {
        ii += 2; // three teams have the same level
      } else {
        ii++; // two teams have the same level
2927       }
2928     }
2929   }
2930 
2931   while (ii > level) {
2932     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2933     }
2934     if (team->t.t_serialized && (!dd)) {
2935       team = team->t.t_parent;
2936       continue;
2937     }
2938     if (ii > level) {
2939       team = team->t.t_parent;
2940       ii--;
2941     }
2942   }
2943 
2944   return team->t.t_nproc;
2945 }
2946 
2947 kmp_r_sched_t __kmp_get_schedule_global() {
  // This routine exists because the pairs (__kmp_sched, __kmp_chunk) and
  // (__kmp_static, __kmp_guided) may be changed independently by
  // kmp_set_defaults, so the updated schedule can be obtained here.
2951 
2952   kmp_r_sched_t r_sched;
2953 
  // Create the schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static
  // and __kmp_guided. __kmp_sched keeps its original value, so that the user
  // can set KMP_SCHEDULE multiple times and thus have different run-time
  // schedules in different roots (even in OMP 2.5).
2958   enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
2959   enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
2960   if (s == kmp_sch_static) {
2961     // replace STATIC with more detailed schedule (balanced or greedy)
2962     r_sched.r_sched_type = __kmp_static;
2963   } else if (s == kmp_sch_guided_chunked) {
2964     // replace GUIDED with more detailed schedule (iterative or analytical)
2965     r_sched.r_sched_type = __kmp_guided;
2966   } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2967     r_sched.r_sched_type = __kmp_sched;
2968   }
2969   SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
2970 
2971   if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
    // __kmp_chunk may be wrong here (if it was never set)
2973     r_sched.chunk = KMP_DEFAULT_CHUNK;
2974   } else {
2975     r_sched.chunk = __kmp_chunk;
2976   }
2977 
2978   return r_sched;
2979 }
2980 
/* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
   at least argc number of *t_argv entries for the requested team. */
2983 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
2984 
2985   KMP_DEBUG_ASSERT(team);
2986   if (!realloc || argc > team->t.t_max_argc) {
2987 
2988     KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
2989                    "current entries=%d\n",
2990                    team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
2991     /* if previously allocated heap space for args, free them */
2992     if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
2993       __kmp_free((void *)team->t.t_argv);
2994 
2995     if (argc <= KMP_INLINE_ARGV_ENTRIES) {
2996       /* use unused space in the cache line for arguments */
2997       team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
2998       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
2999                      "argv entries\n",
3000                      team->t.t_id, team->t.t_max_argc));
3001       team->t.t_argv = &team->t.t_inline_argv[0];
3002       if (__kmp_storage_map) {
3003         __kmp_print_storage_map_gtid(
3004             -1, &team->t.t_inline_argv[0],
3005             &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3006             (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3007             team->t.t_id);
3008       }
3009     } else {
3010       /* allocate space for arguments in the heap */
3011       team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3012                                ? KMP_MIN_MALLOC_ARGV_ENTRIES
3013                                : 2 * argc;
3014       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3015                      "argv entries\n",
3016                      team->t.t_id, team->t.t_max_argc));
3017       team->t.t_argv =
3018           (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3019       if (__kmp_storage_map) {
3020         __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3021                                      &team->t.t_argv[team->t.t_max_argc],
3022                                      sizeof(void *) * team->t.t_max_argc,
3023                                      "team_%d.t_argv", team->t.t_id);
3024       }
3025     }
3026   }
3027 }
3028 
3029 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3030   int i;
3031   int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3032   team->t.t_threads =
3033       (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3034   team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3035       sizeof(dispatch_shared_info_t) * num_disp_buff);
3036   team->t.t_dispatch =
3037       (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3038   team->t.t_implicit_task_taskdata =
3039       (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3040   team->t.t_max_nproc = max_nth;
3041 
3042   /* setup dispatch buffers */
3043   for (i = 0; i < num_disp_buff; ++i) {
3044     team->t.t_disp_buffer[i].buffer_index = i;
3045     team->t.t_disp_buffer[i].doacross_buf_idx = i;
3046   }
3047 }
3048 
3049 static void __kmp_free_team_arrays(kmp_team_t *team) {
3050   /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3051   int i;
3052   for (i = 0; i < team->t.t_max_nproc; ++i) {
3053     if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3054       __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3055       team->t.t_dispatch[i].th_disp_buffer = NULL;
3056     }
3057   }
3058 #if KMP_USE_HIER_SCHED
3059   __kmp_dispatch_free_hierarchies(team);
3060 #endif
3061   __kmp_free(team->t.t_threads);
3062   __kmp_free(team->t.t_disp_buffer);
3063   __kmp_free(team->t.t_dispatch);
3064   __kmp_free(team->t.t_implicit_task_taskdata);
3065   team->t.t_threads = NULL;
3066   team->t.t_disp_buffer = NULL;
3067   team->t.t_dispatch = NULL;
3068   team->t.t_implicit_task_taskdata = 0;
3069 }
3070 
3071 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3072   kmp_info_t **oldThreads = team->t.t_threads;
3073 
3074   __kmp_free(team->t.t_disp_buffer);
3075   __kmp_free(team->t.t_dispatch);
3076   __kmp_free(team->t.t_implicit_task_taskdata);
3077   __kmp_allocate_team_arrays(team, max_nth);
3078 
3079   KMP_MEMCPY(team->t.t_threads, oldThreads,
3080              team->t.t_nproc * sizeof(kmp_info_t *));
3081 
3082   __kmp_free(oldThreads);
3083 }
3084 
3085 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3086 
3087   kmp_r_sched_t r_sched =
3088       __kmp_get_schedule_global(); // get current state of scheduling globals
3089 
3090   KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3091 
3092   kmp_internal_control_t g_icvs = {
3093     0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3094     (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3095     // adjustment of threads (per thread)
3096     (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3097     // whether blocktime is explicitly set
3098     __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3099 #if KMP_USE_MONITOR
3100     __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3101 // intervals
3102 #endif
3103     __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3104     // next parallel region (per thread)
3105     // (use a max ub on value if __kmp_parallel_initialize not called yet)
3106     __kmp_cg_max_nth, // int thread_limit;
3107     __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3108     // for max_active_levels
3109     r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3110     // {sched,chunk} pair
3111     __kmp_nested_proc_bind.bind_types[0],
3112     __kmp_default_device,
3113     NULL // struct kmp_internal_control *next;
3114   };
3115 
3116   return g_icvs;
3117 }
3118 
3119 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3120 
3121   kmp_internal_control_t gx_icvs;
3122   gx_icvs.serial_nesting_level =
3123       0; // probably =team->t.t_serial like in save_inter_controls
3124   copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3125   gx_icvs.next = NULL;
3126 
3127   return gx_icvs;
3128 }
3129 
3130 static void __kmp_initialize_root(kmp_root_t *root) {
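  // Set up a freshly allocated root: initialize its bookkeeping fields, then
  // allocate two single-threaded teams for it -- the (serialized) root team
  // that represents the sequential part of the program, and the hot team that
  // will be grown and reused for this root's parallel regions.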
3131   int f;
3132   kmp_team_t *root_team;
3133   kmp_team_t *hot_team;
3134   int hot_team_max_nth;
3135   kmp_r_sched_t r_sched =
3136       __kmp_get_schedule_global(); // get current state of scheduling globals
3137   kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3138   KMP_DEBUG_ASSERT(root);
3139   KMP_ASSERT(!root->r.r_begin);
3140 
3141   /* setup the root state structure */
3142   __kmp_init_lock(&root->r.r_begin_lock);
3143   root->r.r_begin = FALSE;
3144   root->r.r_active = FALSE;
3145   root->r.r_in_parallel = 0;
3146   root->r.r_blocktime = __kmp_dflt_blocktime;
3147 
3148   /* setup the root team for this task */
3149   /* allocate the root team structure */
3150   KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3151 
3152   root_team =
3153       __kmp_allocate_team(root,
3154                           1, // new_nproc
3155                           1, // max_nproc
3156 #if OMPT_SUPPORT
3157                           ompt_data_none, // root parallel id
3158 #endif
3159                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3160                           0 // argc
3161                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3162       );
3163 #if USE_DEBUGGER
3164   // Non-NULL value should be assigned to make the debugger display the root
3165   // team.
3166   TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3167 #endif
3168 
3169   KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3170 
3171   root->r.r_root_team = root_team;
3172   root_team->t.t_control_stack_top = NULL;
3173 
3174   /* initialize root team */
3175   root_team->t.t_threads[0] = NULL;
3176   root_team->t.t_nproc = 1;
3177   root_team->t.t_serialized = 1;
3178   // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3179   root_team->t.t_sched.sched = r_sched.sched;
3180   KA_TRACE(
3181       20,
3182       ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3183        root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3184 
  /* setup the hot team for this task */
3186   /* allocate the hot team structure */
3187   KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3188 
3189   hot_team =
3190       __kmp_allocate_team(root,
3191                           1, // new_nproc
3192                           __kmp_dflt_team_nth_ub * 2, // max_nproc
3193 #if OMPT_SUPPORT
3194                           ompt_data_none, // root parallel id
3195 #endif
3196                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3197                           0 // argc
3198                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3199       );
3200   KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3201 
3202   root->r.r_hot_team = hot_team;
3203   root_team->t.t_control_stack_top = NULL;
3204 
3205   /* first-time initialization */
3206   hot_team->t.t_parent = root_team;
3207 
3208   /* initialize hot team */
3209   hot_team_max_nth = hot_team->t.t_max_nproc;
3210   for (f = 0; f < hot_team_max_nth; ++f) {
3211     hot_team->t.t_threads[f] = NULL;
3212   }
3213   hot_team->t.t_nproc = 1;
3214   // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3215   hot_team->t.t_sched.sched = r_sched.sched;
3216   hot_team->t.t_size_changed = 0;
3217 }
3218 
3219 #ifdef KMP_DEBUG
3220 
3221 typedef struct kmp_team_list_item {
3222   kmp_team_p const *entry;
3223   struct kmp_team_list_item *next;
3224 } kmp_team_list_item_t;
3225 typedef kmp_team_list_item_t *kmp_team_list_t;
3226 
3227 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3228     kmp_team_list_t list, // List of teams.
3229     kmp_team_p const *team // Team to add.
3230 ) {
3231 
3232   // List must terminate with item where both entry and next are NULL.
3233   // Team is added to the list only once.
3234   // List is sorted in ascending order by team id.
3235   // Team id is *not* a key.
3236 
3237   kmp_team_list_t l;
3238 
3239   KMP_DEBUG_ASSERT(list != NULL);
3240   if (team == NULL) {
3241     return;
3242   }
3243 
3244   __kmp_print_structure_team_accum(list, team->t.t_parent);
3245   __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3246 
3247   // Search list for the team.
3248   l = list;
3249   while (l->next != NULL && l->entry != team) {
3250     l = l->next;
3251   }
3252   if (l->next != NULL) {
3253     return; // Team has been added before, exit.
3254   }
3255 
3256   // Team is not found. Search list again for insertion point.
3257   l = list;
3258   while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3259     l = l->next;
3260   }
3261 
3262   // Insert team.
3263   {
3264     kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3265         sizeof(kmp_team_list_item_t));
3266     *item = *l;
3267     l->entry = team;
3268     l->next = item;
3269   }
3270 }
3271 
static void __kmp_print_structure_team(char const *title,
                                       kmp_team_p const *team) {
3275   __kmp_printf("%s", title);
3276   if (team != NULL) {
3277     __kmp_printf("%2x %p\n", team->t.t_id, team);
3278   } else {
3279     __kmp_printf(" - (nil)\n");
3280   }
3281 }
3282 
3283 static void __kmp_print_structure_thread(char const *title,
3284                                          kmp_info_p const *thread) {
3285   __kmp_printf("%s", title);
3286   if (thread != NULL) {
3287     __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3288   } else {
3289     __kmp_printf(" - (nil)\n");
3290   }
3291 }
3292 
3293 void __kmp_print_structure(void) {
3294 
3295   kmp_team_list_t list;
3296 
3297   // Initialize list of teams.
3298   list =
3299       (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3300   list->entry = NULL;
3301   list->next = NULL;
3302 
3303   __kmp_printf("\n------------------------------\nGlobal Thread "
3304                "Table\n------------------------------\n");
3305   {
3306     int gtid;
3307     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3308       __kmp_printf("%2d", gtid);
3309       if (__kmp_threads != NULL) {
3310         __kmp_printf(" %p", __kmp_threads[gtid]);
3311       }
3312       if (__kmp_root != NULL) {
3313         __kmp_printf(" %p", __kmp_root[gtid]);
3314       }
3315       __kmp_printf("\n");
3316     }
3317   }
3318 
3319   // Print out __kmp_threads array.
3320   __kmp_printf("\n------------------------------\nThreads\n--------------------"
3321                "----------\n");
3322   if (__kmp_threads != NULL) {
3323     int gtid;
3324     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3325       kmp_info_t const *thread = __kmp_threads[gtid];
3326       if (thread != NULL) {
3327         __kmp_printf("GTID %2d %p:\n", gtid, thread);
3328         __kmp_printf("    Our Root:        %p\n", thread->th.th_root);
3329         __kmp_print_structure_team("    Our Team:     ", thread->th.th_team);
3330         __kmp_print_structure_team("    Serial Team:  ",
3331                                    thread->th.th_serial_team);
3332         __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
3333         __kmp_print_structure_thread("    Master:       ",
3334                                      thread->th.th_team_master);
3335         __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
3336         __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
3337         __kmp_printf("    Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3338         __kmp_print_structure_thread("    Next in pool: ",
3339                                      thread->th.th_next_pool);
3340         __kmp_printf("\n");
3341         __kmp_print_structure_team_accum(list, thread->th.th_team);
3342         __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3343       }
3344     }
3345   } else {
3346     __kmp_printf("Threads array is not allocated.\n");
3347   }
3348 
3349   // Print out __kmp_root array.
3350   __kmp_printf("\n------------------------------\nUbers\n----------------------"
3351                "--------\n");
3352   if (__kmp_root != NULL) {
3353     int gtid;
3354     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3355       kmp_root_t const *root = __kmp_root[gtid];
3356       if (root != NULL) {
3357         __kmp_printf("GTID %2d %p:\n", gtid, root);
3358         __kmp_print_structure_team("    Root Team:    ", root->r.r_root_team);
3359         __kmp_print_structure_team("    Hot Team:     ", root->r.r_hot_team);
3360         __kmp_print_structure_thread("    Uber Thread:  ",
3361                                      root->r.r_uber_thread);
3362         __kmp_printf("    Active?:      %2d\n", root->r.r_active);
3363         __kmp_printf("    In Parallel:  %2d\n",
3364                      KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3365         __kmp_printf("\n");
3366         __kmp_print_structure_team_accum(list, root->r.r_root_team);
3367         __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3368       }
3369     }
3370   } else {
3371     __kmp_printf("Ubers array is not allocated.\n");
3372   }
3373 
3374   __kmp_printf("\n------------------------------\nTeams\n----------------------"
3375                "--------\n");
3376   while (list->next != NULL) {
3377     kmp_team_p const *team = list->entry;
3378     int i;
3379     __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3380     __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
3381     __kmp_printf("    Master TID:       %2d\n", team->t.t_master_tid);
3382     __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
3383     __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
3384     __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
3385     for (i = 0; i < team->t.t_nproc; ++i) {
3386       __kmp_printf("    Thread %2d:      ", i);
3387       __kmp_print_structure_thread("", team->t.t_threads[i]);
3388     }
3389     __kmp_print_structure_team("    Next in pool:     ", team->t.t_next_pool);
3390     __kmp_printf("\n");
3391     list = list->next;
3392   }
3393 
3394   // Print out __kmp_thread_pool and __kmp_team_pool.
3395   __kmp_printf("\n------------------------------\nPools\n----------------------"
3396                "--------\n");
3397   __kmp_print_structure_thread("Thread pool:          ",
3398                                CCAST(kmp_info_t *, __kmp_thread_pool));
3399   __kmp_print_structure_team("Team pool:            ",
3400                              CCAST(kmp_team_t *, __kmp_team_pool));
3401   __kmp_printf("\n");
3402 
3403   // Free team list.
3404   while (list != NULL) {
3405     kmp_team_list_item_t *item = list;
3406     list = list->next;
3407     KMP_INTERNAL_FREE(item);
3408   }
3409 }
3410 
3411 #endif
3412 
3413 //---------------------------------------------------------------------------
3414 //  Stuff for per-thread fast random number generator
3415 //  Table of primes
3416 static const unsigned __kmp_primes[] = {
3417     0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3418     0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3419     0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3420     0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3421     0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3422     0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3423     0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3424     0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3425     0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3426     0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3427     0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3428 
3429 //---------------------------------------------------------------------------
3430 //  __kmp_get_random: Get a random number using a linear congruential method.
3431 unsigned short __kmp_get_random(kmp_info_t *thread) {
3432   unsigned x = thread->th.th_x;
3433   unsigned short r = (unsigned short)(x >> 16);
3434 
3435   thread->th.th_x = x * thread->th.th_a + 1;
3436 
3437   KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3438                 thread->th.th_info.ds.ds_tid, r));
3439 
3440   return r;
3441 }
3442 //--------------------------------------------------------
3443 // __kmp_init_random: Initialize a random number generator
3444 void __kmp_init_random(kmp_info_t *thread) {
3445   unsigned seed = thread->th.th_info.ds.ds_tid;
3446 
3447   thread->th.th_a =
3448       __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3449   thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3450   KA_TRACE(30,
3451            ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3452 }
3453 
3454 #if KMP_OS_WINDOWS
3455 /* reclaim array entries for root threads that are already dead, returns number
3456  * reclaimed */
3457 static int __kmp_reclaim_dead_roots(void) {
3458   int i, r = 0;
3459 
3460   for (i = 0; i < __kmp_threads_capacity; ++i) {
3461     if (KMP_UBER_GTID(i) &&
3462         !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3463         !__kmp_root[i]
3464              ->r.r_active) { // AC: reclaim only roots died in non-active state
3465       r += __kmp_unregister_root_other_thread(i);
3466     }
3467   }
3468   return r;
3469 }
3470 #endif
3471 
3472 /* This function attempts to create free entries in __kmp_threads and
3473    __kmp_root, and returns the number of free entries generated.
3474 
3475    For Windows* OS static library, the first mechanism used is to reclaim array
3476    entries for root threads that are already dead.
3477 
   On all platforms, expansion is attempted on the arrays __kmp_threads and
   __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3480    capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3481    threadprivate cache array has been created. Synchronization with
3482    __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3483 
3484    After any dead root reclamation, if the clipping value allows array expansion
3485    to result in the generation of a total of nNeed free slots, the function does
3486    that expansion. If not, nothing is done beyond the possible initial root
3487    thread reclamation.
3488 
3489    If any argument is negative, the behavior is undefined. */
3490 static int __kmp_expand_threads(int nNeed) {
3491   int added = 0;
3492   int minimumRequiredCapacity;
3493   int newCapacity;
3494   kmp_info_t **newThreads;
3495   kmp_root_t **newRoot;
3496 
3497   // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3498   // resizing __kmp_threads does not need additional protection if foreign
3499   // threads are present
3500 
3501 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3502   /* only for Windows static library */
3503   /* reclaim array entries for root threads that are already dead */
3504   added = __kmp_reclaim_dead_roots();
3505 
3506   if (nNeed) {
3507     nNeed -= added;
3508     if (nNeed < 0)
3509       nNeed = 0;
3510   }
3511 #endif
3512   if (nNeed <= 0)
3513     return added;
3514 
3515   // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3516   // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3517   // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3518   // > __kmp_max_nth in one of two ways:
3519   //
3520   // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
3521   //    may not be reused by another thread, so we may need to increase
3522   //    __kmp_threads_capacity to __kmp_max_nth + 1.
3523   //
3524   // 2) New foreign root(s) are encountered.  We always register new foreign
3525   //    roots. This may cause a smaller # of threads to be allocated at
3526   //    subsequent parallel regions, but the worker threads hang around (and
3527   //    eventually go to sleep) and need slots in the __kmp_threads[] array.
3528   //
3529   // Anyway, that is the reason for moving the check to see if
3530   // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3531   // instead of having it performed here. -BB
3532 
3533   KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3534 
3535   /* compute expansion headroom to check if we can expand */
3536   if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3537     /* possible expansion too small -- give up */
3538     return added;
3539   }
3540   minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3541 
3542   newCapacity = __kmp_threads_capacity;
3543   do {
3544     newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3545                                                           : __kmp_sys_max_nth;
3546   } while (newCapacity < minimumRequiredCapacity);
3547   newThreads = (kmp_info_t **)__kmp_allocate(
3548       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3549   newRoot =
3550       (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3551   KMP_MEMCPY(newThreads, __kmp_threads,
3552              __kmp_threads_capacity * sizeof(kmp_info_t *));
3553   KMP_MEMCPY(newRoot, __kmp_root,
3554              __kmp_threads_capacity * sizeof(kmp_root_t *));
3555 
3556   kmp_info_t **temp_threads = __kmp_threads;
3557   *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3558   *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3559   __kmp_free(temp_threads);
3560   added += newCapacity - __kmp_threads_capacity;
3561   *(volatile int *)&__kmp_threads_capacity = newCapacity;
3562 
3563   if (newCapacity > __kmp_tp_capacity) {
3564     __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3565     if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3566       __kmp_threadprivate_resize_cache(newCapacity);
3567     } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3568       *(volatile int *)&__kmp_tp_capacity = newCapacity;
3569     }
3570     __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3571   }
3572 
3573   return added;
3574 }
3575 
3576 /* Register the current thread as a root thread and obtain our gtid. We must
3577    have the __kmp_initz_lock held at this point. Argument TRUE only if are the
3578    thread that calls from __kmp_do_serial_initialize() */
3579 int __kmp_register_root(int initial_thread) {
3580   kmp_info_t *root_thread;
3581   kmp_root_t *root;
3582   int gtid;
3583   int capacity;
3584   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3585   KA_TRACE(20, ("__kmp_register_root: entered\n"));
3586   KMP_MB();
3587 
3588   /* 2007-03-02:
     If the initial thread has not invoked the OpenMP RTL yet, and this thread
     is not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity"
     condition does not work as expected -- it may return false (meaning there
     is at least one empty slot in the __kmp_threads array), but it is possible
     that the only free slot is #0, which is reserved for the initial thread
     and so cannot be used for this one. The following code works around this
     bug.

     However, the right solution seems to be not to reserve slot #0 for the
     initial thread because:
     (1) there is no magic in slot #0,
     (2) we cannot detect the initial thread reliably (the first thread that
         performs serial initialization may not be a real initial thread).
3601   */
3602   capacity = __kmp_threads_capacity;
3603   if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3604     --capacity;
3605   }
3606 
3607   /* see if there are too many threads */
3608   if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3609     if (__kmp_tp_cached) {
3610       __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3611                   KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3612                   KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3613     } else {
3614       __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3615                   __kmp_msg_null);
3616     }
3617   }
3618 
3619   // When hidden helper task is enabled, __kmp_threads is organized as follows:
3620   // 0: initial thread, also a regular OpenMP thread.
3621   // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3622   // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3623   // regular OpenMP threads.
3624   if (TCR_4(__kmp_init_hidden_helper_threads)) {
    // Find an available thread slot for a hidden helper thread. Slots for
    // hidden helper threads go from 1 to __kmp_hidden_helper_threads_num.
3627     for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3628                    gtid <= __kmp_hidden_helper_threads_num;
3629          gtid++)
3630       ;
3631     KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3632     KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3633                  "hidden helper thread: T#%d\n",
3634                  gtid));
3635   } else {
3636     /* find an available thread slot */
    // Don't reassign the zero slot since we need that to only be used by the
    // initial thread. Slots for hidden helper threads should also be skipped.
3639     if (initial_thread && __kmp_threads[0] == NULL) {
3640       gtid = 0;
3641     } else {
3642       for (gtid = __kmp_hidden_helper_threads_num + 1;
3643            TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3644         ;
3645     }
3646     KA_TRACE(
3647         1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3648     KMP_ASSERT(gtid < __kmp_threads_capacity);
3649   }
3650 
3651   /* update global accounting */
3652   __kmp_all_nth++;
3653   TCW_4(__kmp_nth, __kmp_nth + 1);
3654 
3655   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3656   // numbers of procs, and method #2 (keyed API call) for higher numbers.
3657   if (__kmp_adjust_gtid_mode) {
3658     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3659       if (TCR_4(__kmp_gtid_mode) != 2) {
3660         TCW_4(__kmp_gtid_mode, 2);
3661       }
3662     } else {
3663       if (TCR_4(__kmp_gtid_mode) != 1) {
3664         TCW_4(__kmp_gtid_mode, 1);
3665       }
3666     }
3667   }
3668 
3669 #ifdef KMP_ADJUST_BLOCKTIME
3670   /* Adjust blocktime to zero if necessary            */
3671   /* Middle initialization might not have occurred yet */
3672   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3673     if (__kmp_nth > __kmp_avail_proc) {
3674       __kmp_zero_bt = TRUE;
3675     }
3676   }
3677 #endif /* KMP_ADJUST_BLOCKTIME */
3678 
3679   /* setup this new hierarchy */
3680   if (!(root = __kmp_root[gtid])) {
3681     root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3682     KMP_DEBUG_ASSERT(!root->r.r_root_team);
3683   }
3684 
3685 #if KMP_STATS_ENABLED
3686   // Initialize stats as soon as possible (right after gtid assignment).
3687   __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3688   __kmp_stats_thread_ptr->startLife();
3689   KMP_SET_THREAD_STATE(SERIAL_REGION);
3690   KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3691 #endif
3692   __kmp_initialize_root(root);
3693 
3694   /* setup new root thread structure */
3695   if (root->r.r_uber_thread) {
3696     root_thread = root->r.r_uber_thread;
3697   } else {
3698     root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3699     if (__kmp_storage_map) {
3700       __kmp_print_thread_storage_map(root_thread, gtid);
3701     }
3702     root_thread->th.th_info.ds.ds_gtid = gtid;
3703 #if OMPT_SUPPORT
3704     root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3705 #endif
3706     root_thread->th.th_root = root;
3707     if (__kmp_env_consistency_check) {
3708       root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3709     }
3710 #if USE_FAST_MEMORY
3711     __kmp_initialize_fast_memory(root_thread);
3712 #endif /* USE_FAST_MEMORY */
3713 
3714 #if KMP_USE_BGET
3715     KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3716     __kmp_initialize_bget(root_thread);
3717 #endif
3718     __kmp_init_random(root_thread); // Initialize random number generator
3719   }
3720 
3721   /* setup the serial team held in reserve by the root thread */
3722   if (!root_thread->th.th_serial_team) {
3723     kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3724     KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3725     root_thread->th.th_serial_team = __kmp_allocate_team(
3726         root, 1, 1,
3727 #if OMPT_SUPPORT
3728         ompt_data_none, // root parallel id
3729 #endif
3730         proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3731   }
3732   KMP_ASSERT(root_thread->th.th_serial_team);
3733   KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3734                 root_thread->th.th_serial_team));
3735 
3736   /* drop root_thread into place */
3737   TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3738 
3739   root->r.r_root_team->t.t_threads[0] = root_thread;
3740   root->r.r_hot_team->t.t_threads[0] = root_thread;
3741   root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
  // AC: the team is created in reserve, not for execution (it is unused now).
3743   root_thread->th.th_serial_team->t.t_serialized = 0;
3744   root->r.r_uber_thread = root_thread;
3745 
3746   /* initialize the thread, get it ready to go */
3747   __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3748   TCW_4(__kmp_init_gtid, TRUE);
3749 
3750   /* prepare the master thread for get_gtid() */
3751   __kmp_gtid_set_specific(gtid);
3752 
3753 #if USE_ITT_BUILD
3754   __kmp_itt_thread_name(gtid);
3755 #endif /* USE_ITT_BUILD */
3756 
3757 #ifdef KMP_TDATA_GTID
3758   __kmp_gtid = gtid;
3759 #endif
3760   __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3761   KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3762 
3763   KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3764                 "plain=%u\n",
3765                 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3766                 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3767                 KMP_INIT_BARRIER_STATE));
3768   { // Initialize barrier data.
3769     int b;
3770     for (b = 0; b < bs_last_barrier; ++b) {
3771       root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3772 #if USE_DEBUGGER
3773       root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3774 #endif
3775     }
3776   }
3777   KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3778                    KMP_INIT_BARRIER_STATE);
3779 
3780 #if KMP_AFFINITY_SUPPORTED
3781   root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3782   root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3783   root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3784   root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3785   if (TCR_4(__kmp_init_middle)) {
3786     __kmp_affinity_set_init_mask(gtid, TRUE);
3787   }
3788 #endif /* KMP_AFFINITY_SUPPORTED */
3789   root_thread->th.th_def_allocator = __kmp_def_allocator;
3790   root_thread->th.th_prev_level = 0;
3791   root_thread->th.th_prev_num_threads = 1;
3792 
3793   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3794   tmp->cg_root = root_thread;
3795   tmp->cg_thread_limit = __kmp_cg_max_nth;
3796   tmp->cg_nthreads = 1;
3797   KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3798                  " cg_nthreads init to 1\n",
3799                  root_thread, tmp));
3800   tmp->up = NULL;
3801   root_thread->th.th_cg_roots = tmp;
3802 
3803   __kmp_root_counter++;
3804 
3805 #if OMPT_SUPPORT
3806   if (!initial_thread && ompt_enabled.enabled) {
3807 
3808     kmp_info_t *root_thread = ompt_get_thread();
3809 
3810     ompt_set_thread_state(root_thread, ompt_state_overhead);
3811 
3812     if (ompt_enabled.ompt_callback_thread_begin) {
3813       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3814           ompt_thread_initial, __ompt_get_thread_data_internal());
3815     }
3816     ompt_data_t *task_data;
3817     ompt_data_t *parallel_data;
3818     __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
3819                                   NULL);
3820     if (ompt_enabled.ompt_callback_implicit_task) {
3821       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3822           ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3823     }
3824 
3825     ompt_set_thread_state(root_thread, ompt_state_work_serial);
3826   }
3827 #endif
3828 
3829   KMP_MB();
3830   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3831 
3832   return gtid;
3833 }
3834 
3835 #if KMP_NESTED_HOT_TEAMS
3836 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3837                                 const int max_level) {
3838   int i, n, nth;
3839   kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3840   if (!hot_teams || !hot_teams[level].hot_team) {
3841     return 0;
3842   }
3843   KMP_DEBUG_ASSERT(level < max_level);
3844   kmp_team_t *team = hot_teams[level].hot_team;
3845   nth = hot_teams[level].hot_team_nth;
3846   n = nth - 1; // master is not freed
3847   if (level < max_level - 1) {
3848     for (i = 0; i < nth; ++i) {
3849       kmp_info_t *th = team->t.t_threads[i];
3850       n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3851       if (i > 0 && th->th.th_hot_teams) {
3852         __kmp_free(th->th.th_hot_teams);
3853         th->th.th_hot_teams = NULL;
3854       }
3855     }
3856   }
3857   __kmp_free_team(root, team, NULL);
3858   return n;
3859 }
3860 #endif
3861 
// Resets a root thread and clears its root and hot teams.
3863 // Returns the number of __kmp_threads entries directly and indirectly freed.
3864 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3865   kmp_team_t *root_team = root->r.r_root_team;
3866   kmp_team_t *hot_team = root->r.r_hot_team;
3867   int n = hot_team->t.t_nproc;
3868   int i;
3869 
3870   KMP_DEBUG_ASSERT(!root->r.r_active);
3871 
3872   root->r.r_root_team = NULL;
3873   root->r.r_hot_team = NULL;
3874   // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3875   // before call to __kmp_free_team().
3876   __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3877 #if KMP_NESTED_HOT_TEAMS
3878   if (__kmp_hot_teams_max_level >
3879       0) { // need to free nested hot teams and their threads if any
3880     for (i = 0; i < hot_team->t.t_nproc; ++i) {
3881       kmp_info_t *th = hot_team->t.t_threads[i];
3882       if (__kmp_hot_teams_max_level > 1) {
3883         n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3884       }
3885       if (th->th.th_hot_teams) {
3886         __kmp_free(th->th.th_hot_teams);
3887         th->th.th_hot_teams = NULL;
3888       }
3889     }
3890   }
3891 #endif
3892   __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3893 
3894   // Before we can reap the thread, we need to make certain that all other
3895   // threads in the teams that had this root as ancestor have stopped trying to
3896   // steal tasks.
3897   if (__kmp_tasking_mode != tskm_immediate_exec) {
3898     __kmp_wait_to_unref_task_teams();
3899   }
3900 
3901 #if KMP_OS_WINDOWS
3902   /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3903   KA_TRACE(
3904       10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3905            "\n",
3906            (LPVOID) & (root->r.r_uber_thread->th),
3907            root->r.r_uber_thread->th.th_info.ds.ds_thread));
3908   __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3909 #endif /* KMP_OS_WINDOWS */
3910 
3911 #if OMPT_SUPPORT
3912   ompt_data_t *task_data;
3913   ompt_data_t *parallel_data;
3914   __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
3915                                 NULL);
3916   if (ompt_enabled.ompt_callback_implicit_task) {
3917     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3918         ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
3919   }
3920   if (ompt_enabled.ompt_callback_thread_end) {
3921     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3922         &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3923   }
3924 #endif
3925 
3926   TCW_4(__kmp_nth,
3927         __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3928   i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
3929   KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
3930                  " to %d\n",
3931                  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
3932                  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
3933   if (i == 1) {
3934     // need to free contention group structure
3935     KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
3936                      root->r.r_uber_thread->th.th_cg_roots->cg_root);
3937     KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
3938     __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
3939     root->r.r_uber_thread->th.th_cg_roots = NULL;
3940   }
3941   __kmp_reap_thread(root->r.r_uber_thread, 1);
3942 
  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
  // it instead of freeing it.
3945   root->r.r_uber_thread = NULL;
3946   /* mark root as no longer in use */
3947   root->r.r_begin = FALSE;
3948 
3949   return n;
3950 }
3951 
3952 void __kmp_unregister_root_current_thread(int gtid) {
3953   KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
  /* This lock should be OK, since unregister_root_current_thread is never
     called during an abort, only during a normal close. Furthermore, if you
     hold the forkjoin lock, you should never try to get the initz lock. */
3957   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3958   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
3959     KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
3960                   "exiting T#%d\n",
3961                   gtid));
3962     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3963     return;
3964   }
3965   kmp_root_t *root = __kmp_root[gtid];
3966 
3967   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3968   KMP_ASSERT(KMP_UBER_GTID(gtid));
3969   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3970   KMP_ASSERT(root->r.r_active == FALSE);
3971 
3972   KMP_MB();
3973 
3974   kmp_info_t *thread = __kmp_threads[gtid];
3975   kmp_team_t *team = thread->th.th_team;
3976   kmp_task_team_t *task_team = thread->th.th_task_team;
3977 
3978   // we need to wait for the proxy tasks before finishing the thread
3979   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
3980 #if OMPT_SUPPORT
3981     // the runtime is shutting down so we won't report any events
3982     thread->th.ompt_thread_info.state = ompt_state_undefined;
3983 #endif
3984     __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
3985   }
3986 
3987   __kmp_reset_root(gtid, root);
3988 
3989   KMP_MB();
3990   KC_TRACE(10,
3991            ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
3992 
3993   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3994 }
3995 
3996 #if KMP_OS_WINDOWS
3997 /* __kmp_forkjoin_lock must be already held
3998    Unregisters a root thread that is not the current thread.  Returns the number
3999    of __kmp_threads entries freed as a result. */
4000 static int __kmp_unregister_root_other_thread(int gtid) {
4001   kmp_root_t *root = __kmp_root[gtid];
4002   int r;
4003 
4004   KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4005   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4006   KMP_ASSERT(KMP_UBER_GTID(gtid));
4007   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4008   KMP_ASSERT(root->r.r_active == FALSE);
4009 
4010   r = __kmp_reset_root(gtid, root);
4011   KC_TRACE(10,
4012            ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4013   return r;
4014 }
4015 #endif
4016 
4017 #if KMP_DEBUG
4018 void __kmp_task_info() {
4019 
4020   kmp_int32 gtid = __kmp_entry_gtid();
4021   kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4022   kmp_info_t *this_thr = __kmp_threads[gtid];
4023   kmp_team_t *steam = this_thr->th.th_serial_team;
4024   kmp_team_t *team = this_thr->th.th_team;
4025 
4026   __kmp_printf(
4027       "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4028       "ptask=%p\n",
4029       gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4030       team->t.t_implicit_task_taskdata[tid].td_parent);
4031 }
4032 #endif // KMP_DEBUG
4033 
4034 /* TODO optimize with one big memclr, take out what isn't needed, split
4035    responsibility to workers as much as possible, and delay initialization of
4036    features as much as possible  */
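// Bind this_thr into 'team' as member 'tid': cache the team geometry in the
// thread, set up its implicit task, attach it to the master's contention
// group (possibly releasing its old CG root), and make sure its per-thread
// dispatch buffers are allocated.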
4037 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4038                                   int tid, int gtid) {
4039   /* this_thr->th.th_info.ds.ds_gtid is setup in
4040      kmp_allocate_thread/create_worker.
4041      this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4042   kmp_info_t *master = team->t.t_threads[0];
4043   KMP_DEBUG_ASSERT(this_thr != NULL);
4044   KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4045   KMP_DEBUG_ASSERT(team);
4046   KMP_DEBUG_ASSERT(team->t.t_threads);
4047   KMP_DEBUG_ASSERT(team->t.t_dispatch);
4048   KMP_DEBUG_ASSERT(master);
4049   KMP_DEBUG_ASSERT(master->th.th_root);
4050 
4051   KMP_MB();
4052 
4053   TCW_SYNC_PTR(this_thr->th.th_team, team);
4054 
4055   this_thr->th.th_info.ds.ds_tid = tid;
4056   this_thr->th.th_set_nproc = 0;
4057   if (__kmp_tasking_mode != tskm_immediate_exec)
    // When tasking is possible, threads are not safe to reap until they are
    // done tasking; this flag is set when the tasking code is exited in wait
4060     this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4061   else // no tasking --> always safe to reap
4062     this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4063   this_thr->th.th_set_proc_bind = proc_bind_default;
4064 #if KMP_AFFINITY_SUPPORTED
4065   this_thr->th.th_new_place = this_thr->th.th_current_place;
4066 #endif
4067   this_thr->th.th_root = master->th.th_root;
4068 
4069   /* setup the thread's cache of the team structure */
4070   this_thr->th.th_team_nproc = team->t.t_nproc;
4071   this_thr->th.th_team_master = master;
4072   this_thr->th.th_team_serialized = team->t.t_serialized;
4073   TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4074 
4075   KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4076 
4077   KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4078                 tid, gtid, this_thr, this_thr->th.th_current_task));
4079 
4080   __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4081                            team, tid, TRUE);
4082 
4083   KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4084                 tid, gtid, this_thr, this_thr->th.th_current_task));
4085   // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4086   // __kmp_initialize_team()?
4087 
4088   /* TODO no worksharing in speculative threads */
4089   this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4090 
4091   this_thr->th.th_local.this_construct = 0;
4092 
4093   if (!this_thr->th.th_pri_common) {
4094     this_thr->th.th_pri_common =
4095         (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4096     if (__kmp_storage_map) {
4097       __kmp_print_storage_map_gtid(
4098           gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4099           sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4100     }
4101     this_thr->th.th_pri_head = NULL;
4102   }
4103 
4104   if (this_thr != master && // Master's CG root is initialized elsewhere
4105       this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4106     // Make new thread's CG root same as master's
4107     KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4108     kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4109     if (tmp) {
4110       // worker changes CG, need to check if old CG should be freed
4111       int i = tmp->cg_nthreads--;
4112       KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4113                      " on node %p of thread %p to %d\n",
4114                      this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4115       if (i == 1) {
4116         __kmp_free(tmp); // last thread left CG --> free it
4117       }
4118     }
4119     this_thr->th.th_cg_roots = master->th.th_cg_roots;
4120     // Increment new thread's CG root's counter to add the new thread
4121     this_thr->th.th_cg_roots->cg_nthreads++;
4122     KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4123                    " node %p of thread %p to %d\n",
4124                    this_thr, this_thr->th.th_cg_roots,
4125                    this_thr->th.th_cg_roots->cg_root,
4126                    this_thr->th.th_cg_roots->cg_nthreads));
4127     this_thr->th.th_current_task->td_icvs.thread_limit =
4128         this_thr->th.th_cg_roots->cg_thread_limit;
4129   }
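  // Illustrative walk-through of the bookkeeping above (hypothetical numbers,
  // not taken from any real run): suppose the worker's old CG root had
  // cg_nthreads == 1; the post-decrement returns 1, so the old node is freed.
  // The worker then adopts the master's CG root, bumping cg_nthreads there
  // from, say, 3 to 4, and its thread_limit ICV is refreshed from that root's
  // cg_thread_limit.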
4130 
4131   /* Initialize dynamic dispatch */
4132   {
4133     volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4134     // Use team max_nproc since this will never change for the team.
4135     size_t disp_size =
4136         sizeof(dispatch_private_info_t) *
4137         (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
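    // Sizing note (informal): a serialized team of one thread needs only a
    // single dispatch_private_info_t, while a real team gets
    // __kmp_dispatch_num_buffers of them so that several consecutive
    // dynamically scheduled (e.g. nowait) loops can be in flight before a
    // buffer has to be recycled.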
4138     KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4139                   team->t.t_max_nproc));
4140     KMP_ASSERT(dispatch);
4141     KMP_DEBUG_ASSERT(team->t.t_dispatch);
4142     KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4143 
4144     dispatch->th_disp_index = 0;
4145     dispatch->th_doacross_buf_idx = 0;
4146     if (!dispatch->th_disp_buffer) {
4147       dispatch->th_disp_buffer =
4148           (dispatch_private_info_t *)__kmp_allocate(disp_size);
4149 
4150       if (__kmp_storage_map) {
4151         __kmp_print_storage_map_gtid(
4152             gtid, &dispatch->th_disp_buffer[0],
4153             &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4154                                           ? 1
4155                                           : __kmp_dispatch_num_buffers],
4156             disp_size,
4157             "th_%d.th_dispatch.th_disp_buffer "
4158             "(team_%d.t_dispatch[%d].th_disp_buffer)",
4159             gtid, team->t.t_id, gtid);
4160       }
4161     } else {
4162       memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4163     }
4164 
4165     dispatch->th_dispatch_pr_current = 0;
4166     dispatch->th_dispatch_sh_current = 0;
4167 
4168     dispatch->th_deo_fcn = 0; /* ORDERED     */
4169     dispatch->th_dxo_fcn = 0; /* END ORDERED */
4170   }
4171 
4172   this_thr->th.th_next_pool = NULL;
4173 
4174   if (!this_thr->th.th_task_state_memo_stack) {
4175     size_t i;
4176     this_thr->th.th_task_state_memo_stack =
4177         (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4178     this_thr->th.th_task_state_top = 0;
4179     this_thr->th.th_task_state_stack_sz = 4;
4180     for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4181          ++i) // zero init the stack
4182       this_thr->th.th_task_state_memo_stack[i] = 0;
4183   }
4184 
4185   KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4186   KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4187 
4188   KMP_MB();
4189 }
4190 
/* Allocate a new thread for the requesting team. This is only called from
   within a fork/join critical section. We first try to get an available thread
   from the thread pool; if none is available, we fork a new one, assuming we
   are able to create one. This should be assured, as the caller is expected to
   have checked for available capacity first. */
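/* Illustrative call pattern (a sketch based on the hot-team growth code later
   in this file, not a prescription): while holding __kmp_forkjoin_lock and
   after verifying that capacity exists, a team grows its thread array roughly
   as
     for (f = team->t.t_nproc; f < new_nproc; f++)
       team->t.t_threads[f] = __kmp_allocate_thread(root, team, f);
*/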
4196 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4197                                   int new_tid) {
4198   kmp_team_t *serial_team;
4199   kmp_info_t *new_thr;
4200   int new_gtid;
4201 
4202   KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4203   KMP_DEBUG_ASSERT(root && team);
4204 #if !KMP_NESTED_HOT_TEAMS
4205   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4206 #endif
4207   KMP_MB();
4208 
4209   /* first, try to get one from the thread pool */
4210   if (__kmp_thread_pool) {
4211     new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4212     __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4213     if (new_thr == __kmp_thread_pool_insert_pt) {
4214       __kmp_thread_pool_insert_pt = NULL;
4215     }
4216     TCW_4(new_thr->th.th_in_pool, FALSE);
4217     __kmp_suspend_initialize_thread(new_thr);
4218     __kmp_lock_suspend_mx(new_thr);
4219     if (new_thr->th.th_active_in_pool == TRUE) {
4220       KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4221       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4222       new_thr->th.th_active_in_pool = FALSE;
4223     }
4224     __kmp_unlock_suspend_mx(new_thr);
4225 
4226     KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4227                   __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4228     KMP_ASSERT(!new_thr->th.th_team);
4229     KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4230 
4231     /* setup the thread structure */
4232     __kmp_initialize_info(new_thr, team, new_tid,
4233                           new_thr->th.th_info.ds.ds_gtid);
4234     KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4235 
4236     TCW_4(__kmp_nth, __kmp_nth + 1);
4237 
4238     new_thr->th.th_task_state = 0;
4239     new_thr->th.th_task_state_top = 0;
4240     new_thr->th.th_task_state_stack_sz = 4;
4241 
4242 #ifdef KMP_ADJUST_BLOCKTIME
4243     /* Adjust blocktime back to zero if necessary */
4244     /* Middle initialization might not have occurred yet */
4245     if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4246       if (__kmp_nth > __kmp_avail_proc) {
4247         __kmp_zero_bt = TRUE;
4248       }
4249     }
4250 #endif /* KMP_ADJUST_BLOCKTIME */
4251 
4252 #if KMP_DEBUG
4253     // If thread entered pool via __kmp_free_thread, wait_flag should !=
4254     // KMP_BARRIER_PARENT_FLAG.
4255     int b;
4256     kmp_balign_t *balign = new_thr->th.th_bar;
4257     for (b = 0; b < bs_last_barrier; ++b)
4258       KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4259 #endif
4260 
4261     KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4262                   __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4263 
4264     KMP_MB();
4265     return new_thr;
4266   }
4267 
  /* none available in the pool, so we'll fork a new one */
4269   KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4270   KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4271 
4272 #if KMP_USE_MONITOR
4273   // If this is the first worker thread the RTL is creating, then also
4274   // launch the monitor thread.  We try to do this as early as possible.
4275   if (!TCR_4(__kmp_init_monitor)) {
4276     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4277     if (!TCR_4(__kmp_init_monitor)) {
4278       KF_TRACE(10, ("before __kmp_create_monitor\n"));
4279       TCW_4(__kmp_init_monitor, 1);
4280       __kmp_create_monitor(&__kmp_monitor);
4281       KF_TRACE(10, ("after __kmp_create_monitor\n"));
4282 #if KMP_OS_WINDOWS
4283       // AC: wait until monitor has started. This is a fix for CQ232808.
      // The reason is that if the library is loaded/unloaded in a loop with
      // small (parallel) work in between, then there is a high probability
      // that the monitor thread starts after the library shutdown. At shutdown
      // it is too late to cope with the problem, because when the master is in
      // DllMain (process detach) the monitor has no chance to start (it is
      // blocked), and the master has no means to inform the monitor that the
      // library has gone, because all the memory the monitor can access is
      // going to be released/reset.
4292       while (TCR_4(__kmp_init_monitor) < 2) {
4293         KMP_YIELD(TRUE);
4294       }
4295       KF_TRACE(10, ("after monitor thread has started\n"));
4296 #endif
4297     }
4298     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4299   }
4300 #endif
4301 
4302   KMP_MB();
4303 
4304   {
4305     int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4306                              ? 1
4307                              : __kmp_hidden_helper_threads_num + 1;
4308 
4309     for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4310          ++new_gtid) {
4311       KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4312     }
4313 
4314     if (TCR_4(__kmp_init_hidden_helper_threads)) {
4315       KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4316     }
4317   }
4318 
4319   /* allocate space for it. */
4320   new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4321 
4322   TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4323 
4324 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4325   // suppress race conditions detection on synchronization flags in debug mode
4326   // this helps to analyze library internals eliminating false positives
4327   __itt_suppress_mark_range(
4328       __itt_suppress_range, __itt_suppress_threading_errors,
4329       &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4330   __itt_suppress_mark_range(
4331       __itt_suppress_range, __itt_suppress_threading_errors,
4332       &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4333 #if KMP_OS_WINDOWS
4334   __itt_suppress_mark_range(
4335       __itt_suppress_range, __itt_suppress_threading_errors,
4336       &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4337 #else
4338   __itt_suppress_mark_range(__itt_suppress_range,
4339                             __itt_suppress_threading_errors,
4340                             &new_thr->th.th_suspend_init_count,
4341                             sizeof(new_thr->th.th_suspend_init_count));
4342 #endif
4343   // TODO: check if we need to also suppress b_arrived flags
4344   __itt_suppress_mark_range(__itt_suppress_range,
4345                             __itt_suppress_threading_errors,
4346                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4347                             sizeof(new_thr->th.th_bar[0].bb.b_go));
4348   __itt_suppress_mark_range(__itt_suppress_range,
4349                             __itt_suppress_threading_errors,
4350                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4351                             sizeof(new_thr->th.th_bar[1].bb.b_go));
4352   __itt_suppress_mark_range(__itt_suppress_range,
4353                             __itt_suppress_threading_errors,
4354                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4355                             sizeof(new_thr->th.th_bar[2].bb.b_go));
4356 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4357   if (__kmp_storage_map) {
4358     __kmp_print_thread_storage_map(new_thr, new_gtid);
4359   }
4360 
4361   // add the reserve serialized team, initialized from the team's master thread
4362   {
4363     kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4364     KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4365     new_thr->th.th_serial_team = serial_team =
4366         (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4367 #if OMPT_SUPPORT
4368                                           ompt_data_none, // root parallel id
4369 #endif
4370                                           proc_bind_default, &r_icvs,
4371                                           0 USE_NESTED_HOT_ARG(NULL));
4372   }
4373   KMP_ASSERT(serial_team);
  serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not
  // for execution (it is unused for now).
4376   serial_team->t.t_threads[0] = new_thr;
4377   KF_TRACE(10,
4378            ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4379             new_thr));
4380 
4381   /* setup the thread structures */
4382   __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4383 
4384 #if USE_FAST_MEMORY
4385   __kmp_initialize_fast_memory(new_thr);
4386 #endif /* USE_FAST_MEMORY */
4387 
4388 #if KMP_USE_BGET
4389   KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4390   __kmp_initialize_bget(new_thr);
4391 #endif
4392 
4393   __kmp_init_random(new_thr); // Initialize random number generator
4394 
4395   /* Initialize these only once when thread is grabbed for a team allocation */
4396   KA_TRACE(20,
4397            ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4398             __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4399 
4400   int b;
4401   kmp_balign_t *balign = new_thr->th.th_bar;
4402   for (b = 0; b < bs_last_barrier; ++b) {
4403     balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4404     balign[b].bb.team = NULL;
4405     balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4406     balign[b].bb.use_oncore_barrier = 0;
4407   }
4408 
4409   new_thr->th.th_spin_here = FALSE;
4410   new_thr->th.th_next_waiting = 0;
4411 #if KMP_OS_UNIX
4412   new_thr->th.th_blocking = false;
4413 #endif
4414 
4415 #if KMP_AFFINITY_SUPPORTED
4416   new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4417   new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4418   new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4419   new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4420 #endif
4421   new_thr->th.th_def_allocator = __kmp_def_allocator;
4422   new_thr->th.th_prev_level = 0;
4423   new_thr->th.th_prev_num_threads = 1;
4424 
4425   TCW_4(new_thr->th.th_in_pool, FALSE);
4426   new_thr->th.th_active_in_pool = FALSE;
4427   TCW_4(new_thr->th.th_active, TRUE);
4428 
4429   /* adjust the global counters */
4430   __kmp_all_nth++;
4431   __kmp_nth++;
4432 
4433   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4434   // numbers of procs, and method #2 (keyed API call) for higher numbers.
4435   if (__kmp_adjust_gtid_mode) {
4436     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4437       if (TCR_4(__kmp_gtid_mode) != 2) {
4438         TCW_4(__kmp_gtid_mode, 2);
4439       }
4440     } else {
4441       if (TCR_4(__kmp_gtid_mode) != 1) {
4442         TCW_4(__kmp_gtid_mode, 1);
4443       }
4444     }
4445   }
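  // Illustrative effect (the threshold value here is hypothetical): with
  // __kmp_tls_gtid_min == 20, the runtime keeps the stack-address search
  // (mode 1), whose cost grows with the number of registered threads, until
  // the 20th thread exists, then switches to the keyed TLS lookup (mode 2),
  // whose cost is roughly constant per query.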
4446 
4447 #ifdef KMP_ADJUST_BLOCKTIME
4448   /* Adjust blocktime back to zero if necessary       */
4449   /* Middle initialization might not have occurred yet */
4450   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4451     if (__kmp_nth > __kmp_avail_proc) {
4452       __kmp_zero_bt = TRUE;
4453     }
4454   }
4455 #endif /* KMP_ADJUST_BLOCKTIME */
4456 
4457   /* actually fork it and create the new worker thread */
4458   KF_TRACE(
4459       10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4460   __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4461   KF_TRACE(10,
4462            ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4463 
4464   KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4465                 new_gtid));
4466   KMP_MB();
4467   return new_thr;
4468 }
4469 
/* Reinitialize team for reuse.
   The hot team code calls this routine at every fork barrier, so the EPCC
   barrier tests are extremely sensitive to changes in it, especially writes to
   the team struct, which cause a cache invalidation in all threads.
   IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4475 static void __kmp_reinitialize_team(kmp_team_t *team,
4476                                     kmp_internal_control_t *new_icvs,
4477                                     ident_t *loc) {
4478   KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4479                 team->t.t_threads[0], team));
4480   KMP_DEBUG_ASSERT(team && new_icvs);
4481   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4482   KMP_CHECK_UPDATE(team->t.t_ident, loc);
4483 
4484   KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4485   // Copy ICVs to the master thread's implicit taskdata
4486   __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4487   copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4488 
4489   KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4490                 team->t.t_threads[0], team));
4491 }
4492 
4493 /* Initialize the team data structure.
4494    This assumes the t_threads and t_max_nproc are already set.
4495    Also, we don't touch the arguments */
4496 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4497                                   kmp_internal_control_t *new_icvs,
4498                                   ident_t *loc) {
4499   KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4500 
4501   /* verify */
4502   KMP_DEBUG_ASSERT(team);
4503   KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4504   KMP_DEBUG_ASSERT(team->t.t_threads);
4505   KMP_MB();
4506 
4507   team->t.t_master_tid = 0; /* not needed */
4508   /* team->t.t_master_bar;        not needed */
4509   team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4510   team->t.t_nproc = new_nproc;
4511 
4512   /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
4513   team->t.t_next_pool = NULL;
4514   /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4515    * up hot team */
4516 
4517   TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4518   team->t.t_invoke = NULL; /* not needed */
4519 
4520   // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
4521   team->t.t_sched.sched = new_icvs->sched.sched;
4522 
4523 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4524   team->t.t_fp_control_saved = FALSE; /* not needed */
4525   team->t.t_x87_fpu_control_word = 0; /* not needed */
4526   team->t.t_mxcsr = 0; /* not needed */
4527 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4528 
4529   team->t.t_construct = 0;
4530 
4531   team->t.t_ordered.dt.t_value = 0;
4532   team->t.t_master_active = FALSE;
4533 
4534 #ifdef KMP_DEBUG
4535   team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4536 #endif
4537 #if KMP_OS_WINDOWS
4538   team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4539 #endif
4540 
4541   team->t.t_control_stack_top = NULL;
4542 
4543   __kmp_reinitialize_team(team, new_icvs, loc);
4544 
4545   KMP_MB();
4546   KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4547 }
4548 
4549 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
/* Sets the full affinity mask for the current thread and stores the previous
   mask in *old_mask (if non-NULL); no changes to affinity structures. */
4551 static void
4552 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4553   if (KMP_AFFINITY_CAPABLE()) {
4554     int status;
4555     if (old_mask != NULL) {
4556       status = __kmp_get_system_affinity(old_mask, TRUE);
4557       int error = errno;
4558       if (status != 0) {
4559         __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4560                     __kmp_msg_null);
4561       }
4562     }
4563     __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4564   }
4565 }
4566 #endif
4567 
4568 #if KMP_AFFINITY_SUPPORTED
4569 
4570 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
// It calculates the worker + master thread's partition based upon the parent
// thread's partition, and binds each worker to a place in its partition.
4573 // The master thread's partition should already include its current binding.
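// Informal summary of the cases handled below: proc_bind_master binds every
// worker to the master's place; proc_bind_close packs workers onto consecutive
// places starting from the master's place (distributing them as evenly as
// possible when there are more threads than places); proc_bind_spread carves
// the partition into roughly equal sub-partitions, one per thread, and binds
// each thread to the first place of its sub-partition.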
4574 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4575   // Copy the master thread's place partition to the team struct
4576   kmp_info_t *master_th = team->t.t_threads[0];
4577   KMP_DEBUG_ASSERT(master_th != NULL);
4578   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4579   int first_place = master_th->th.th_first_place;
4580   int last_place = master_th->th.th_last_place;
4581   int masters_place = master_th->th.th_current_place;
4582   team->t.t_first_place = first_place;
4583   team->t.t_last_place = last_place;
4584 
4585   KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4586                 "bound to place %d partition = [%d,%d]\n",
4587                 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4588                 team->t.t_id, masters_place, first_place, last_place));
4589 
4590   switch (proc_bind) {
4591 
4592   case proc_bind_default:
    // Serial teams might have the proc_bind policy set to proc_bind_default.
    // It doesn't matter, as we don't rebind the master thread for any
    // proc_bind policy.
4595     KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4596     break;
4597 
4598   case proc_bind_master: {
4599     int f;
4600     int n_th = team->t.t_nproc;
4601     for (f = 1; f < n_th; f++) {
4602       kmp_info_t *th = team->t.t_threads[f];
4603       KMP_DEBUG_ASSERT(th != NULL);
4604       th->th.th_first_place = first_place;
4605       th->th.th_last_place = last_place;
4606       th->th.th_new_place = masters_place;
4607       if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4608           team->t.t_display_affinity != 1) {
4609         team->t.t_display_affinity = 1;
4610       }
4611 
4612       KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
4613                      "partition = [%d,%d]\n",
4614                      __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4615                      f, masters_place, first_place, last_place));
4616     }
4617   } break;
4618 
4619   case proc_bind_close: {
4620     int f;
4621     int n_th = team->t.t_nproc;
4622     int n_places;
4623     if (first_place <= last_place) {
4624       n_places = last_place - first_place + 1;
4625     } else {
4626       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4627     }
4628     if (n_th <= n_places) {
4629       int place = masters_place;
4630       for (f = 1; f < n_th; f++) {
4631         kmp_info_t *th = team->t.t_threads[f];
4632         KMP_DEBUG_ASSERT(th != NULL);
4633 
4634         if (place == last_place) {
4635           place = first_place;
4636         } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4637           place = 0;
4638         } else {
4639           place++;
4640         }
4641         th->th.th_first_place = first_place;
4642         th->th.th_last_place = last_place;
4643         th->th.th_new_place = place;
4644         if (__kmp_display_affinity && place != th->th.th_current_place &&
4645             team->t.t_display_affinity != 1) {
4646           team->t.t_display_affinity = 1;
4647         }
4648 
4649         KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4650                        "partition = [%d,%d]\n",
4651                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4652                        team->t.t_id, f, place, first_place, last_place));
4653       }
4654     } else {
4655       int S, rem, gap, s_count;
4656       S = n_th / n_places;
4657       s_count = 0;
4658       rem = n_th - (S * n_places);
4659       gap = rem > 0 ? n_places / rem : n_places;
4660       int place = masters_place;
4661       int gap_ct = gap;
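      // Worked example with hypothetical numbers: n_th = 10 threads over
      // n_places = 4 places gives S = 2, rem = 2 and gap = 2, so every second
      // place receives one extra thread. Starting from the master's place the
      // resulting occupancy is 3, 2, 3, 2 threads per place, and `place` ends
      // back at masters_place, as the assertion after the loop expects.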
4662       for (f = 0; f < n_th; f++) {
4663         kmp_info_t *th = team->t.t_threads[f];
4664         KMP_DEBUG_ASSERT(th != NULL);
4665 
4666         th->th.th_first_place = first_place;
4667         th->th.th_last_place = last_place;
4668         th->th.th_new_place = place;
4669         if (__kmp_display_affinity && place != th->th.th_current_place &&
4670             team->t.t_display_affinity != 1) {
4671           team->t.t_display_affinity = 1;
4672         }
4673         s_count++;
4674 
4675         if ((s_count == S) && rem && (gap_ct == gap)) {
4676           // do nothing, add an extra thread to place on next iteration
4677         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4678           // we added an extra thread to this place; move to next place
4679           if (place == last_place) {
4680             place = first_place;
4681           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4682             place = 0;
4683           } else {
4684             place++;
4685           }
4686           s_count = 0;
4687           gap_ct = 1;
4688           rem--;
4689         } else if (s_count == S) { // place full; don't add extra
4690           if (place == last_place) {
4691             place = first_place;
4692           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4693             place = 0;
4694           } else {
4695             place++;
4696           }
4697           gap_ct++;
4698           s_count = 0;
4699         }
4700 
4701         KA_TRACE(100,
4702                  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4703                   "partition = [%d,%d]\n",
4704                   __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4705                   th->th.th_new_place, first_place, last_place));
4706       }
4707       KMP_DEBUG_ASSERT(place == masters_place);
4708     }
4709   } break;
4710 
4711   case proc_bind_spread: {
4712     int f;
4713     int n_th = team->t.t_nproc;
4714     int n_places;
4715     int thidx;
4716     if (first_place <= last_place) {
4717       n_places = last_place - first_place + 1;
4718     } else {
4719       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4720     }
4721     if (n_th <= n_places) {
4722       int place = -1;
4723 
4724       if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4725         int S = n_places / n_th;
4726         int s_count, rem, gap, gap_ct;
4727 
4728         place = masters_place;
4729         rem = n_places - n_th * S;
4730         gap = rem ? n_th / rem : 1;
4731         gap_ct = gap;
4732         thidx = n_th;
4733         if (update_master_only == 1)
4734           thidx = 1;
4735         for (f = 0; f < thidx; f++) {
4736           kmp_info_t *th = team->t.t_threads[f];
4737           KMP_DEBUG_ASSERT(th != NULL);
4738 
4739           th->th.th_first_place = place;
4740           th->th.th_new_place = place;
4741           if (__kmp_display_affinity && place != th->th.th_current_place &&
4742               team->t.t_display_affinity != 1) {
4743             team->t.t_display_affinity = 1;
4744           }
4745           s_count = 1;
4746           while (s_count < S) {
4747             if (place == last_place) {
4748               place = first_place;
4749             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4750               place = 0;
4751             } else {
4752               place++;
4753             }
4754             s_count++;
4755           }
4756           if (rem && (gap_ct == gap)) {
4757             if (place == last_place) {
4758               place = first_place;
4759             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4760               place = 0;
4761             } else {
4762               place++;
4763             }
4764             rem--;
4765             gap_ct = 0;
4766           }
4767           th->th.th_last_place = place;
4768           gap_ct++;
4769 
4770           if (place == last_place) {
4771             place = first_place;
4772           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4773             place = 0;
4774           } else {
4775             place++;
4776           }
4777 
4778           KA_TRACE(100,
4779                    ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4780                     "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4781                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4782                     f, th->th.th_new_place, th->th.th_first_place,
4783                     th->th.th_last_place, __kmp_affinity_num_masks));
4784         }
4785       } else {
        /* Given a uniform space of available computation places, we can create
           T partitions of roughly round(P/T) places each and put each thread
           into the first place of its partition. */
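        /* Worked example with hypothetical numbers, assuming the master sits
           on place 0: n_places = 8 and n_th = 3 give spacing = 9 / 3 = 3.0, so
           the threads get the sub-partitions [0,2], [3,5] and [6,7] (the last
           one clamped to n_places - 1), and each thread is bound to the first
           place of its sub-partition. */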
4789         double current = static_cast<double>(masters_place);
4790         double spacing =
4791             (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
4792         int first, last;
4793         kmp_info_t *th;
4794 
4795         thidx = n_th + 1;
4796         if (update_master_only == 1)
4797           thidx = 1;
4798         for (f = 0; f < thidx; f++) {
4799           first = static_cast<int>(current);
4800           last = static_cast<int>(current + spacing) - 1;
4801           KMP_DEBUG_ASSERT(last >= first);
4802           if (first >= n_places) {
4803             if (masters_place) {
4804               first -= n_places;
4805               last -= n_places;
4806               if (first == (masters_place + 1)) {
4807                 KMP_DEBUG_ASSERT(f == n_th);
4808                 first--;
4809               }
4810               if (last == masters_place) {
4811                 KMP_DEBUG_ASSERT(f == (n_th - 1));
4812                 last--;
4813               }
4814             } else {
4815               KMP_DEBUG_ASSERT(f == n_th);
4816               first = 0;
4817               last = 0;
4818             }
4819           }
4820           if (last >= n_places) {
4821             last = (n_places - 1);
4822           }
4823           place = first;
4824           current += spacing;
4825           if (f < n_th) {
4826             KMP_DEBUG_ASSERT(0 <= first);
4827             KMP_DEBUG_ASSERT(n_places > first);
4828             KMP_DEBUG_ASSERT(0 <= last);
4829             KMP_DEBUG_ASSERT(n_places > last);
4830             KMP_DEBUG_ASSERT(last_place >= first_place);
4831             th = team->t.t_threads[f];
4832             KMP_DEBUG_ASSERT(th);
4833             th->th.th_first_place = first;
4834             th->th.th_new_place = place;
4835             th->th.th_last_place = last;
4836             if (__kmp_display_affinity && place != th->th.th_current_place &&
4837                 team->t.t_display_affinity != 1) {
4838               team->t.t_display_affinity = 1;
4839             }
4840             KA_TRACE(100,
4841                      ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4842                       "partition = [%d,%d], spacing = %.4f\n",
4843                       __kmp_gtid_from_thread(team->t.t_threads[f]),
4844                       team->t.t_id, f, th->th.th_new_place,
4845                       th->th.th_first_place, th->th.th_last_place, spacing));
4846           }
4847         }
4848       }
4849       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4850     } else {
4851       int S, rem, gap, s_count;
4852       S = n_th / n_places;
4853       s_count = 0;
4854       rem = n_th - (S * n_places);
4855       gap = rem > 0 ? n_places / rem : n_places;
4856       int place = masters_place;
4857       int gap_ct = gap;
4858       thidx = n_th;
4859       if (update_master_only == 1)
4860         thidx = 1;
4861       for (f = 0; f < thidx; f++) {
4862         kmp_info_t *th = team->t.t_threads[f];
4863         KMP_DEBUG_ASSERT(th != NULL);
4864 
4865         th->th.th_first_place = place;
4866         th->th.th_last_place = place;
4867         th->th.th_new_place = place;
4868         if (__kmp_display_affinity && place != th->th.th_current_place &&
4869             team->t.t_display_affinity != 1) {
4870           team->t.t_display_affinity = 1;
4871         }
4872         s_count++;
4873 
4874         if ((s_count == S) && rem && (gap_ct == gap)) {
4875           // do nothing, add an extra thread to place on next iteration
4876         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4877           // we added an extra thread to this place; move on to next place
4878           if (place == last_place) {
4879             place = first_place;
4880           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4881             place = 0;
4882           } else {
4883             place++;
4884           }
4885           s_count = 0;
4886           gap_ct = 1;
4887           rem--;
4888         } else if (s_count == S) { // place is full; don't add extra thread
4889           if (place == last_place) {
4890             place = first_place;
4891           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4892             place = 0;
4893           } else {
4894             place++;
4895           }
4896           gap_ct++;
4897           s_count = 0;
4898         }
4899 
4900         KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4901                        "partition = [%d,%d]\n",
4902                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4903                        team->t.t_id, f, th->th.th_new_place,
4904                        th->th.th_first_place, th->th.th_last_place));
4905       }
4906       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4907     }
4908   } break;
4909 
4910   default:
4911     break;
4912   }
4913 
4914   KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4915 }
4916 
4917 #endif // KMP_AFFINITY_SUPPORTED
4918 
/* Allocate a new team data structure to use; take one off of the free pool if
   available */
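/* Selection order (informal summary of the logic below): first try to reuse
   the root's hot team (or, with KMP_NESTED_HOT_TEAMS, the hot team recorded
   for the current nesting level), resizing it if the thread count changed;
   otherwise take a sufficiently large team from __kmp_team_pool; only as a
   last resort allocate and initialize a brand-new kmp_team_t. */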
4921 kmp_team_t *
4922 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4923 #if OMPT_SUPPORT
4924                     ompt_data_t ompt_parallel_data,
4925 #endif
4926                     kmp_proc_bind_t new_proc_bind,
4927                     kmp_internal_control_t *new_icvs,
4928                     int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4929   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4930   int f;
4931   kmp_team_t *team;
4932   int use_hot_team = !root->r.r_active;
4933   int level = 0;
4934 
4935   KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4936   KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4937   KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4938   KMP_MB();
4939 
4940 #if KMP_NESTED_HOT_TEAMS
4941   kmp_hot_team_ptr_t *hot_teams;
4942   if (master) {
4943     team = master->th.th_team;
4944     level = team->t.t_active_level;
4945     if (master->th.th_teams_microtask) { // in teams construct?
4946       if (master->th.th_teams_size.nteams > 1 &&
4947           ( // #teams > 1
4948               team->t.t_pkfn ==
4949                   (microtask_t)__kmp_teams_master || // inner fork of the teams
4950               master->th.th_teams_level <
4951                   team->t.t_level)) { // or nested parallel inside the teams
        ++level; // do not increment if #teams==1 or for the outer fork of the
        // teams; increment otherwise
4954       }
4955     }
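    // Informal reading of the condition above: the extra level is consumed
    // only when the teams construct really produced more than one team AND
    // this fork is either the inner fork performed by __kmp_teams_master or a
    // parallel region nested inside the teams region; the outer fork of a
    // teams construct, or a teams construct with a single team, stays at the
    // enclosing level.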
4956     hot_teams = master->th.th_hot_teams;
4957     if (level < __kmp_hot_teams_max_level && hot_teams &&
4958         hot_teams[level].hot_team) {
4959       // hot team has already been allocated for given level
4960       use_hot_team = 1;
4961     } else {
4962       use_hot_team = 0;
4963     }
4964   } else {
4965     // check we won't access uninitialized hot_teams, just in case
4966     KMP_DEBUG_ASSERT(new_nproc == 1);
4967   }
4968 #endif
4969   // Optimization to use a "hot" team
4970   if (use_hot_team && new_nproc > 1) {
4971     KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
4972 #if KMP_NESTED_HOT_TEAMS
4973     team = hot_teams[level].hot_team;
4974 #else
4975     team = root->r.r_hot_team;
4976 #endif
4977 #if KMP_DEBUG
4978     if (__kmp_tasking_mode != tskm_immediate_exec) {
4979       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
4980                     "task_team[1] = %p before reinit\n",
4981                     team->t.t_task_team[0], team->t.t_task_team[1]));
4982     }
4983 #endif
4984 
4985     // Has the number of threads changed?
4986     /* Let's assume the most common case is that the number of threads is
4987        unchanged, and put that case first. */
4988     if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4989       KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
4990       // This case can mean that omp_set_num_threads() was called and the hot
4991       // team size was already reduced, so we check the special flag
4992       if (team->t.t_size_changed == -1) {
4993         team->t.t_size_changed = 1;
4994       } else {
4995         KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
4996       }
4997 
4998       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4999       kmp_r_sched_t new_sched = new_icvs->sched;
5000       // set master's schedule as new run-time schedule
5001       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5002 
5003       __kmp_reinitialize_team(team, new_icvs,
5004                               root->r.r_uber_thread->th.th_ident);
5005 
5006       KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5007                     team->t.t_threads[0], team));
5008       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5009 
5010 #if KMP_AFFINITY_SUPPORTED
5011       if ((team->t.t_size_changed == 0) &&
5012           (team->t.t_proc_bind == new_proc_bind)) {
5013         if (new_proc_bind == proc_bind_spread) {
5014           __kmp_partition_places(
5015               team, 1); // add flag to update only master for spread
5016         }
5017         KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5018                        "proc_bind = %d, partition = [%d,%d]\n",
5019                        team->t.t_id, new_proc_bind, team->t.t_first_place,
5020                        team->t.t_last_place));
5021       } else {
5022         KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5023         __kmp_partition_places(team);
5024       }
5025 #else
5026       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5027 #endif /* KMP_AFFINITY_SUPPORTED */
5028     } else if (team->t.t_nproc > new_nproc) {
5029       KA_TRACE(20,
5030                ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5031                 new_nproc));
5032 
5033       team->t.t_size_changed = 1;
5034 #if KMP_NESTED_HOT_TEAMS
5035       if (__kmp_hot_teams_mode == 0) {
        // AC: the saved number of threads should correspond to the team's
        // value in this mode; it can be bigger in mode 1, when the hot team
        // has threads in reserve
5038         KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5039         hot_teams[level].hot_team_nth = new_nproc;
5040 #endif // KMP_NESTED_HOT_TEAMS
5041         /* release the extra threads we don't need any more */
5042         for (f = new_nproc; f < team->t.t_nproc; f++) {
5043           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5044           if (__kmp_tasking_mode != tskm_immediate_exec) {
5045             // When decreasing team size, threads no longer in the team should
5046             // unref task team.
5047             team->t.t_threads[f]->th.th_task_team = NULL;
5048           }
5049           __kmp_free_thread(team->t.t_threads[f]);
5050           team->t.t_threads[f] = NULL;
5051         }
5052 #if KMP_NESTED_HOT_TEAMS
5053       } // (__kmp_hot_teams_mode == 0)
5054       else {
        // When keeping extra threads in the team, switch them to wait on their
        // own b_go flag
5057         for (f = new_nproc; f < team->t.t_nproc; ++f) {
5058           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5059           kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5060           for (int b = 0; b < bs_last_barrier; ++b) {
5061             if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5062               balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5063             }
5064             KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5065           }
5066         }
5067       }
5068 #endif // KMP_NESTED_HOT_TEAMS
5069       team->t.t_nproc = new_nproc;
5070       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5071       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5072       __kmp_reinitialize_team(team, new_icvs,
5073                               root->r.r_uber_thread->th.th_ident);
5074 
5075       // Update remaining threads
5076       for (f = 0; f < new_nproc; ++f) {
5077         team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5078       }
5079 
5080       // restore the current task state of the master thread: should be the
5081       // implicit task
5082       KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5083                     team->t.t_threads[0], team));
5084 
5085       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5086 
5087 #ifdef KMP_DEBUG
5088       for (f = 0; f < team->t.t_nproc; f++) {
5089         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5090                          team->t.t_threads[f]->th.th_team_nproc ==
5091                              team->t.t_nproc);
5092       }
5093 #endif
5094 
5095       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5096 #if KMP_AFFINITY_SUPPORTED
5097       __kmp_partition_places(team);
5098 #endif
5099     } else { // team->t.t_nproc < new_nproc
5100 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5101       kmp_affin_mask_t *old_mask;
5102       if (KMP_AFFINITY_CAPABLE()) {
5103         KMP_CPU_ALLOC(old_mask);
5104       }
5105 #endif
5106 
5107       KA_TRACE(20,
5108                ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5109                 new_nproc));
5110 
5111       team->t.t_size_changed = 1;
5112 
5113 #if KMP_NESTED_HOT_TEAMS
5114       int avail_threads = hot_teams[level].hot_team_nth;
5115       if (new_nproc < avail_threads)
5116         avail_threads = new_nproc;
5117       kmp_info_t **other_threads = team->t.t_threads;
5118       for (f = team->t.t_nproc; f < avail_threads; ++f) {
5119         // Adjust barrier data of reserved threads (if any) of the team
5120         // Other data will be set in __kmp_initialize_info() below.
5121         int b;
5122         kmp_balign_t *balign = other_threads[f]->th.th_bar;
5123         for (b = 0; b < bs_last_barrier; ++b) {
5124           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5125           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5126 #if USE_DEBUGGER
5127           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5128 #endif
5129         }
5130       }
5131       if (hot_teams[level].hot_team_nth >= new_nproc) {
        // we have all needed threads in reserve, so no need to allocate any;
        // this is only possible in mode 1, since mode 0 cannot have reserved
        // threads
5134         KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5135         team->t.t_nproc = new_nproc; // just get reserved threads involved
5136       } else {
5137         // we may have some threads in reserve, but not enough
5138         team->t.t_nproc =
5139             hot_teams[level]
5140                 .hot_team_nth; // get reserved threads involved if any
5141         hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5142 #endif // KMP_NESTED_HOT_TEAMS
5143         if (team->t.t_max_nproc < new_nproc) {
5144           /* reallocate larger arrays */
5145           __kmp_reallocate_team_arrays(team, new_nproc);
5146           __kmp_reinitialize_team(team, new_icvs, NULL);
5147         }
5148 
5149 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
        /* Temporarily set the full mask for the master thread before creating
           the workers. The reason is that workers inherit the affinity from
           the master, so if a lot of workers are created quickly on a single
           core, they don't get a chance to set their own affinity for a long
           time. */
5154         __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5155 #endif
5156 
5157         /* allocate new threads for the hot team */
5158         for (f = team->t.t_nproc; f < new_nproc; f++) {
5159           kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5160           KMP_DEBUG_ASSERT(new_worker);
5161           team->t.t_threads[f] = new_worker;
5162 
5163           KA_TRACE(20,
                   ("__kmp_allocate_team: team %d init T#%d(%d:%d) arrived: "
5165                     "join=%llu, plain=%llu\n",
5166                     team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5167                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5168                     team->t.t_bar[bs_plain_barrier].b_arrived));
5169 
5170           { // Initialize barrier data for new threads.
5171             int b;
5172             kmp_balign_t *balign = new_worker->th.th_bar;
5173             for (b = 0; b < bs_last_barrier; ++b) {
5174               balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5175               KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5176                                KMP_BARRIER_PARENT_FLAG);
5177 #if USE_DEBUGGER
5178               balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5179 #endif
5180             }
5181           }
5182         }
5183 
5184 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5185         if (KMP_AFFINITY_CAPABLE()) {
5186           /* Restore initial master thread's affinity mask */
5187           __kmp_set_system_affinity(old_mask, TRUE);
5188           KMP_CPU_FREE(old_mask);
5189         }
5190 #endif
5191 #if KMP_NESTED_HOT_TEAMS
5192       } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5193 #endif // KMP_NESTED_HOT_TEAMS
      /* make sure everyone is synchronized */
5195       int old_nproc = team->t.t_nproc; // save old value and use to update only
5196       // new threads below
5197       __kmp_initialize_team(team, new_nproc, new_icvs,
5198                             root->r.r_uber_thread->th.th_ident);
5199 
5200       /* reinitialize the threads */
5201       KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5202       for (f = 0; f < team->t.t_nproc; ++f)
5203         __kmp_initialize_info(team->t.t_threads[f], team, f,
5204                               __kmp_gtid_from_tid(f, team));
5205 
5206       if (level) { // set th_task_state for new threads in nested hot team
5207         // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5208         // only need to set the th_task_state for the new threads. th_task_state
5209         // for master thread will not be accurate until after this in
5210         // __kmp_fork_call(), so we look to the master's memo_stack to get the
5211         // correct value.
5212         for (f = old_nproc; f < team->t.t_nproc; ++f)
5213           team->t.t_threads[f]->th.th_task_state =
5214               team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5215       } else { // set th_task_state for new threads in non-nested hot team
5216         kmp_uint8 old_state =
5217             team->t.t_threads[0]->th.th_task_state; // copy master's state
5218         for (f = old_nproc; f < team->t.t_nproc; ++f)
5219           team->t.t_threads[f]->th.th_task_state = old_state;
5220       }
5221 
5222 #ifdef KMP_DEBUG
5223       for (f = 0; f < team->t.t_nproc; ++f) {
5224         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5225                          team->t.t_threads[f]->th.th_team_nproc ==
5226                              team->t.t_nproc);
5227       }
5228 #endif
5229 
5230       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5231 #if KMP_AFFINITY_SUPPORTED
5232       __kmp_partition_places(team);
5233 #endif
5234     } // Check changes in number of threads
5235 
5236     kmp_info_t *master = team->t.t_threads[0];
5237     if (master->th.th_teams_microtask) {
5238       for (f = 1; f < new_nproc; ++f) {
5239         // propagate teams construct specific info to workers
5240         kmp_info_t *thr = team->t.t_threads[f];
5241         thr->th.th_teams_microtask = master->th.th_teams_microtask;
5242         thr->th.th_teams_level = master->th.th_teams_level;
5243         thr->th.th_teams_size = master->th.th_teams_size;
5244       }
5245     }
5246 #if KMP_NESTED_HOT_TEAMS
5247     if (level) {
5248       // Sync barrier state for nested hot teams, not needed for outermost hot
5249       // team.
5250       for (f = 1; f < new_nproc; ++f) {
5251         kmp_info_t *thr = team->t.t_threads[f];
5252         int b;
5253         kmp_balign_t *balign = thr->th.th_bar;
5254         for (b = 0; b < bs_last_barrier; ++b) {
5255           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5256           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5257 #if USE_DEBUGGER
5258           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5259 #endif
5260         }
5261       }
5262     }
5263 #endif // KMP_NESTED_HOT_TEAMS
5264 
5265     /* reallocate space for arguments if necessary */
5266     __kmp_alloc_argv_entries(argc, team, TRUE);
5267     KMP_CHECK_UPDATE(team->t.t_argc, argc);
5268     // The hot team re-uses the previous task team,
5269     // if untouched during the previous release->gather phase.
5270 
5271     KF_TRACE(10, (" hot_team = %p\n", team));
5272 
5273 #if KMP_DEBUG
5274     if (__kmp_tasking_mode != tskm_immediate_exec) {
5275       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5276                     "task_team[1] = %p after reinit\n",
5277                     team->t.t_task_team[0], team->t.t_task_team[1]));
5278     }
5279 #endif
5280 
5281 #if OMPT_SUPPORT
5282     __ompt_team_assign_id(team, ompt_parallel_data);
5283 #endif
5284 
5285     KMP_MB();
5286 
5287     return team;
5288   }
5289 
5290   /* next, let's try to take one from the team pool */
5291   KMP_MB();
5292   for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5293     /* TODO: consider resizing undersized teams instead of reaping them, now
5294        that we have a resizing mechanism */
5295     if (team->t.t_max_nproc >= max_nproc) {
5296       /* take this team from the team pool */
5297       __kmp_team_pool = team->t.t_next_pool;
5298 
5299       /* setup the team for fresh use */
5300       __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5301 
5302       KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5303                     "task_team[1] %p to NULL\n",
5304                     &team->t.t_task_team[0], &team->t.t_task_team[1]));
5305       team->t.t_task_team[0] = NULL;
5306       team->t.t_task_team[1] = NULL;
5307 
5308       /* reallocate space for arguments if necessary */
5309       __kmp_alloc_argv_entries(argc, team, TRUE);
5310       KMP_CHECK_UPDATE(team->t.t_argc, argc);
5311 
5312       KA_TRACE(
5313           20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5314                team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5315       { // Initialize barrier data.
5316         int b;
5317         for (b = 0; b < bs_last_barrier; ++b) {
5318           team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5319 #if USE_DEBUGGER
5320           team->t.t_bar[b].b_master_arrived = 0;
5321           team->t.t_bar[b].b_team_arrived = 0;
5322 #endif
5323         }
5324       }
5325 
5326       team->t.t_proc_bind = new_proc_bind;
5327 
5328       KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5329                     team->t.t_id));
5330 
5331 #if OMPT_SUPPORT
5332       __ompt_team_assign_id(team, ompt_parallel_data);
5333 #endif
5334 
5335       KMP_MB();
5336 
5337       return team;
5338     }
5339 
5340     /* reap team if it is too small, then loop back and check the next one */
    // not sure if this is wise, but it will be redone during the hot-teams
    // rewrite
    /* TODO: use a technique to find the right-size hot team; don't reap them */
5344     team = __kmp_reap_team(team);
5345     __kmp_team_pool = team;
5346   }
5347 
5348   /* nothing available in the pool, no matter, make a new team! */
5349   KMP_MB();
5350   team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5351 
5352   /* and set it up */
5353   team->t.t_max_nproc = max_nproc;
  /* NOTE: for some reason, allocating one big buffer and dividing it up seems
     to really hurt performance on the P4, so we don't do that here */
5356   __kmp_allocate_team_arrays(team, max_nproc);
5357 
5358   KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5359   __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5360 
5361   KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5362                 "%p to NULL\n",
5363                 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5364   team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5365   // memory, no need to duplicate
5366   team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5367   // memory, no need to duplicate
5368 
5369   if (__kmp_storage_map) {
5370     __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5371   }
5372 
5373   /* allocate space for arguments */
5374   __kmp_alloc_argv_entries(argc, team, FALSE);
5375   team->t.t_argc = argc;
5376 
5377   KA_TRACE(20,
5378            ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5379             team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5380   { // Initialize barrier data.
5381     int b;
5382     for (b = 0; b < bs_last_barrier; ++b) {
5383       team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5384 #if USE_DEBUGGER
5385       team->t.t_bar[b].b_master_arrived = 0;
5386       team->t.t_bar[b].b_team_arrived = 0;
5387 #endif
5388     }
5389   }
5390 
5391   team->t.t_proc_bind = new_proc_bind;
5392 
5393 #if OMPT_SUPPORT
5394   __ompt_team_assign_id(team, ompt_parallel_data);
5395   team->t.ompt_serialized_team_info = NULL;
5396 #endif
5397 
5398   KMP_MB();
5399 
5400   KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5401                 team->t.t_id));
5402 
5403   return team;
5404 }
5405 
5406 /* TODO implement hot-teams at all levels */
5407 /* TODO implement lazy thread release on demand (disband request) */
5408 
5409 /* free the team.  return it to the team pool.  release all the threads
5410  * associated with it */
5411 void __kmp_free_team(kmp_root_t *root,
5412                      kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5413   int f;
5414   KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5415                 team->t.t_id));
5416 
5417   /* verify state */
5418   KMP_DEBUG_ASSERT(root);
5419   KMP_DEBUG_ASSERT(team);
5420   KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5421   KMP_DEBUG_ASSERT(team->t.t_threads);
5422 
5423   int use_hot_team = team == root->r.r_hot_team;
5424 #if KMP_NESTED_HOT_TEAMS
5425   int level;
5426   kmp_hot_team_ptr_t *hot_teams;
5427   if (master) {
5428     level = team->t.t_active_level - 1;
5429     if (master->th.th_teams_microtask) { // in teams construct?
5430       if (master->th.th_teams_size.nteams > 1) {
5431         ++level; // level was not increased in teams construct for
5432         // team_of_masters
5433       }
5434       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5435           master->th.th_teams_level == team->t.t_level) {
5436         ++level; // level was not increased in teams construct for
5437         // team_of_workers before the parallel
5438       } // team->t.t_level will be increased inside parallel
5439     }
5440     hot_teams = master->th.th_hot_teams;
5441     if (level < __kmp_hot_teams_max_level) {
5442       KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5443       use_hot_team = 1;
5444     }
5445   }
5446 #endif // KMP_NESTED_HOT_TEAMS
5447 
5448   /* team is done working */
5449   TCW_SYNC_PTR(team->t.t_pkfn,
5450                NULL); // Important for Debugging Support Library.
5451 #if KMP_OS_WINDOWS
5452   team->t.t_copyin_counter = 0; // init counter for possible reuse
5453 #endif
5454   // Do not reset pointer to parent team to NULL for hot teams.
5455 
5456   /* if we are non-hot team, release our threads */
5457   if (!use_hot_team) {
5458     if (__kmp_tasking_mode != tskm_immediate_exec) {
5459       // Wait for threads to reach reapable state
5460       for (f = 1; f < team->t.t_nproc; ++f) {
5461         KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5462         kmp_info_t *th = team->t.t_threads[f];
5463         volatile kmp_uint32 *state = &th->th.th_reap_state;
5464         while (*state != KMP_SAFE_TO_REAP) {
5465 #if KMP_OS_WINDOWS
5466           // On Windows a thread can be killed at any time, check this
5467           DWORD ecode;
5468           if (!__kmp_is_thread_alive(th, &ecode)) {
5469             *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5470             break;
5471           }
5472 #endif
5473           // first check if thread is sleeping
5474           kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5475           if (fl.is_sleeping())
5476             fl.resume(__kmp_gtid_from_thread(th));
5477           KMP_CPU_PAUSE();
5478         }
5479       }
5480 
5481       // Delete task teams
5482       int tt_idx;
5483       for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5484         kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5485         if (task_team != NULL) {
5486           for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5487             KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5488             team->t.t_threads[f]->th.th_task_team = NULL;
5489           }
5490           KA_TRACE(
5491               20,
5492               ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5493                __kmp_get_gtid(), task_team, team->t.t_id));
5494 #if KMP_NESTED_HOT_TEAMS
5495           __kmp_free_task_team(master, task_team);
5496 #endif
5497           team->t.t_task_team[tt_idx] = NULL;
5498         }
5499       }
5500     }
5501 
5502     // Reset pointer to parent team only for non-hot teams.
5503     team->t.t_parent = NULL;
5504     team->t.t_level = 0;
5505     team->t.t_active_level = 0;
5506 
5507     /* free the worker threads */
5508     for (f = 1; f < team->t.t_nproc; ++f) {
5509       KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5510       __kmp_free_thread(team->t.t_threads[f]);
5511       team->t.t_threads[f] = NULL;
5512     }
5513 
5514     /* put the team back in the team pool */
5515     /* TODO limit size of team pool, call reap_team if pool too large */
5516     team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5517     __kmp_team_pool = (volatile kmp_team_t *)team;
5518   } else { // Check if team was created for the masters in a teams construct
5519     // See if first worker is a CG root
5520     KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5521                      team->t.t_threads[1]->th.th_cg_roots);
5522     if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5523       // Clean up the CG root nodes on workers so that this team can be re-used
5524       for (f = 1; f < team->t.t_nproc; ++f) {
5525         kmp_info_t *thr = team->t.t_threads[f];
5526         KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5527                          thr->th.th_cg_roots->cg_root == thr);
5528         // Pop current CG root off list
5529         kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5530         thr->th.th_cg_roots = tmp->up;
5531         KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5532                        " up to node %p. cg_nthreads was %d\n",
5533                        thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
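        // The post-decrement returns the old count, so an old count of 1 means
        // this thread was the last one in the contention group.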
5534         int i = tmp->cg_nthreads--;
5535         if (i == 1) {
5536           __kmp_free(tmp); // free CG if we are the last thread in it
5537         }
5538         // Restore current task's thread_limit from CG root
5539         if (thr->th.th_cg_roots)
5540           thr->th.th_current_task->td_icvs.thread_limit =
5541               thr->th.th_cg_roots->cg_thread_limit;
5542       }
5543     }
5544   }
5545 
5546   KMP_MB();
5547 }
5548 
5549 /* reap the team.  destroy it, reclaim all its resources and free its memory */
5550 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5551   kmp_team_t *next_pool = team->t.t_next_pool;
5552 
5553   KMP_DEBUG_ASSERT(team);
5554   KMP_DEBUG_ASSERT(team->t.t_dispatch);
5555   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5556   KMP_DEBUG_ASSERT(team->t.t_threads);
5557   KMP_DEBUG_ASSERT(team->t.t_argv);
5558 
5559   /* TODO clean the threads that are a part of this? */
5560 
5561   /* free stuff */
5562   __kmp_free_team_arrays(team);
5563   if (team->t.t_argv != &team->t.t_inline_argv[0])
5564     __kmp_free((void *)team->t.t_argv);
5565   __kmp_free(team);
5566 
5567   KMP_MB();
5568   return next_pool;
5569 }
5570 
5571 // Free the thread.  Don't reap it, just place it on the pool of available
5572 // threads.
5573 //
5574 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5575 // binding for the affinity mechanism to be useful.
5576 //
5577 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5578 // However, we want to avoid a potential performance problem by always
5579 // scanning through the list to find the correct point at which to insert
5580 // the thread (potential N**2 behavior).  To do this we keep track of the
5581 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5582 // With single-level parallelism, threads will always be added to the tail
5583 // of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
5584 // parallelism, all bets are off and we may need to scan through the entire
5585 // free list.
5586 //
5587 // This change also has a potentially large performance benefit, for some
5588 // applications.  Previously, as threads were freed from the hot team, they
5589 // would be placed back on the free list in inverse order.  If the hot team
// grew back to its original size, then the freed threads would be placed
5591 // back on the hot team in reverse order.  This could cause bad cache
5592 // locality problems on programs where the size of the hot team regularly
5593 // grew and shrunk.
5594 //
5595 // Now, for single-level parallelism, the OMP tid is always == gtid.
5596 void __kmp_free_thread(kmp_info_t *this_th) {
5597   int gtid;
5598   kmp_info_t **scan;
5599 
5600   KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5601                 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5602 
5603   KMP_DEBUG_ASSERT(this_th);
5604 
  // When moving a thread to the pool, switch it to wait on its own b_go flag
  // and reset its barrier team pointers to NULL (uninitialized).
5607   int b;
5608   kmp_balign_t *balign = this_th->th.th_bar;
5609   for (b = 0; b < bs_last_barrier; ++b) {
5610     if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5611       balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5612     balign[b].bb.team = NULL;
5613     balign[b].bb.leaf_kids = 0;
5614   }
5615   this_th->th.th_task_state = 0;
5616   this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5617 
5618   /* put thread back on the free pool */
5619   TCW_PTR(this_th->th.th_team, NULL);
5620   TCW_PTR(this_th->th.th_root, NULL);
5621   TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5622 
5623   while (this_th->th.th_cg_roots) {
5624     this_th->th.th_cg_roots->cg_nthreads--;
5625     KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5626                    " %p of thread  %p to %d\n",
5627                    this_th, this_th->th.th_cg_roots,
5628                    this_th->th.th_cg_roots->cg_root,
5629                    this_th->th.th_cg_roots->cg_nthreads));
5630     kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5631     if (tmp->cg_root == this_th) { // Thread is a cg_root
5632       KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5633       KA_TRACE(
5634           5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5635       this_th->th.th_cg_roots = tmp->up;
5636       __kmp_free(tmp);
5637     } else { // Worker thread
5638       if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5639         __kmp_free(tmp);
5640       }
5641       this_th->th.th_cg_roots = NULL;
5642       break;
5643     }
5644   }
5645 
  /* If the implicit task assigned to this thread can be used by other threads,
   * multiple threads may share the data and try to free the task at
   * __kmp_reap_thread at exit. This duplicate use of the task data is more
   * likely when the hot team is disabled, but it can occur even when the hot
   * team is enabled */
5651   __kmp_free_implicit_task(this_th);
5652   this_th->th.th_current_task = NULL;
5653 
5654   // If the __kmp_thread_pool_insert_pt is already past the new insert
5655   // point, then we need to re-scan the entire list.
5656   gtid = this_th->th.th_info.ds.ds_gtid;
5657   if (__kmp_thread_pool_insert_pt != NULL) {
5658     KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5659     if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5660       __kmp_thread_pool_insert_pt = NULL;
5661     }
5662   }
5663 
5664   // Scan down the list to find the place to insert the thread.
5665   // scan is the address of a link in the list, possibly the address of
5666   // __kmp_thread_pool itself.
5667   //
5668   // In the absence of nested parallelism, the for loop will have 0 iterations.
5669   if (__kmp_thread_pool_insert_pt != NULL) {
5670     scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5671   } else {
5672     scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5673   }
5674   for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5675        scan = &((*scan)->th.th_next_pool))
5676     ;
5677 
5678   // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5679   // to its address.
5680   TCW_PTR(this_th->th.th_next_pool, *scan);
5681   __kmp_thread_pool_insert_pt = *scan = this_th;
5682   KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5683                    (this_th->th.th_info.ds.ds_gtid <
5684                     this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5685   TCW_4(this_th->th.th_in_pool, TRUE);
5686   __kmp_suspend_initialize_thread(this_th);
5687   __kmp_lock_suspend_mx(this_th);
5688   if (this_th->th.th_active == TRUE) {
5689     KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5690     this_th->th.th_active_in_pool = TRUE;
5691   }
5692 #if KMP_DEBUG
5693   else {
5694     KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5695   }
5696 #endif
5697   __kmp_unlock_suspend_mx(this_th);
5698 
5699   TCW_4(__kmp_nth, __kmp_nth - 1);
5700 
5701 #ifdef KMP_ADJUST_BLOCKTIME
5702   /* Adjust blocktime back to user setting or default if necessary */
5703   /* Middle initialization might never have occurred                */
5704   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5705     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5706     if (__kmp_nth <= __kmp_avail_proc) {
5707       __kmp_zero_bt = FALSE;
5708     }
5709   }
5710 #endif /* KMP_ADJUST_BLOCKTIME */
5711 
5712   KMP_MB();
5713 }
5714 
5715 /* ------------------------------------------------------------------------ */
5716 
5717 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5718 #if OMP_PROFILING_SUPPORT
5719   ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
5720   // TODO: add a configuration option for time granularity
5721   if (ProfileTraceFile)
5722     llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
5723 #endif
5724 
5725   int gtid = this_thr->th.th_info.ds.ds_gtid;
5726   /*    void                 *stack_data;*/
5727   kmp_team_t **volatile pteam;
5728 
5729   KMP_MB();
5730   KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5731 
5732   if (__kmp_env_consistency_check) {
5733     this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5734   }
5735 
5736 #if OMPT_SUPPORT
5737   ompt_data_t *thread_data;
5738   if (ompt_enabled.enabled) {
5739     thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5740     *thread_data = ompt_data_none;
5741 
5742     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5743     this_thr->th.ompt_thread_info.wait_id = 0;
5744     this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5745     this_thr->th.ompt_thread_info.parallel_flags = 0;
5746     if (ompt_enabled.ompt_callback_thread_begin) {
5747       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5748           ompt_thread_worker, thread_data);
5749     }
5750     this_thr->th.ompt_thread_info.state = ompt_state_idle;
5751   }
5752 #endif
5753 
5754   /* This is the place where threads wait for work */
5755   while (!TCR_4(__kmp_global.g.g_done)) {
5756     KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5757     KMP_MB();
5758 
5759     /* wait for work to do */
5760     KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5761 
5762     /* No tid yet since not part of a team */
5763     __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5764 
5765 #if OMPT_SUPPORT
5766     if (ompt_enabled.enabled) {
5767       this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5768     }
5769 #endif
5770 
5771     pteam = &this_thr->th.th_team;
5772 
5773     /* have we been allocated? */
5774     if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5775       /* we were just woken up, so run our new task */
5776       if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5777         int rc;
5778         KA_TRACE(20,
5779                  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5780                   gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5781                   (*pteam)->t.t_pkfn));
5782 
5783         updateHWFPControl(*pteam);
5784 
5785 #if OMPT_SUPPORT
5786         if (ompt_enabled.enabled) {
5787           this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5788         }
5789 #endif
5790 
5791         rc = (*pteam)->t.t_invoke(gtid);
5792         KMP_ASSERT(rc);
5793 
5794         KMP_MB();
5795         KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5796                       gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5797                       (*pteam)->t.t_pkfn));
5798       }
5799 #if OMPT_SUPPORT
5800       if (ompt_enabled.enabled) {
5801         /* no frame set while outside task */
5802         __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
5803 
5804         this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5805       }
5806 #endif
5807       /* join barrier after parallel region */
5808       __kmp_join_barrier(gtid);
5809     }
5810   }
5811   TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5812 
5813 #if OMPT_SUPPORT
5814   if (ompt_enabled.ompt_callback_thread_end) {
5815     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5816   }
5817 #endif
5818 
5819   this_thr->th.th_task_team = NULL;
5820   /* run the destructors for the threadprivate data for this thread */
5821   __kmp_common_destroy_gtid(gtid);
5822 
5823   KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5824   KMP_MB();
5825 
5826 #if OMP_PROFILING_SUPPORT
5827   llvm::timeTraceProfilerFinishThread();
5828 #endif
5829   return this_thr;
5830 }
5831 
5832 /* ------------------------------------------------------------------------ */
5833 
5834 void __kmp_internal_end_dest(void *specific_gtid) {
5835   // Make sure no significant bits are lost
5836   int gtid;
5837   __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
5838 
5839   KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
  /* NOTE: the gtid is stored as gtid+1 in the thread-local-storage
   * because 0 is reserved for the nothing-stored case */
5842 
5843   __kmp_internal_end_thread(gtid);
5844 }
5845 
5846 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5847 
5848 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5849   __kmp_internal_end_atexit();
5850 }
5851 
5852 #endif
5853 
5854 /* [Windows] josh: when the atexit handler is called, there may still be more
5855    than one thread alive */
5856 void __kmp_internal_end_atexit(void) {
5857   KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5858   /* [Windows]
5859      josh: ideally, we want to completely shutdown the library in this atexit
5860      handler, but stat code that depends on thread specific data for gtid fails
5861      because that data becomes unavailable at some point during the shutdown, so
5862      we call __kmp_internal_end_thread instead. We should eventually remove the
5863      dependency on __kmp_get_specific_gtid in the stat code and use
     __kmp_internal_end_library to cleanly shut down the library.
5865 
5866      // TODO: Can some of this comment about GVS be removed?
5867      I suspect that the offending stat code is executed when the calling thread
5868      tries to clean up a dead root thread's data structures, resulting in GVS
5869      code trying to close the GVS structures for that thread, but since the stat
5870      code uses __kmp_get_specific_gtid to get the gtid with the assumption that
     the calling thread is cleaning up itself instead of another thread, it gets
     confused. This happens because allowing a thread to unregister and clean up
     another thread is a recent modification for addressing an issue.
5874      Based on the current design (20050722), a thread may end up
5875      trying to unregister another thread only if thread death does not trigger
5876      the calling of __kmp_internal_end_thread.  For Linux* OS, there is the
5877      thread specific data destructor function to detect thread death. For
5878      Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5879      is nothing.  Thus, the workaround is applicable only for Windows static
5880      stat library. */
5881   __kmp_internal_end_library(-1);
5882 #if KMP_OS_WINDOWS
5883   __kmp_close_console();
5884 #endif
5885 }
5886 
5887 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5888   // It is assumed __kmp_forkjoin_lock is acquired.
5889 
5890   int gtid;
5891 
5892   KMP_DEBUG_ASSERT(thread != NULL);
5893 
5894   gtid = thread->th.th_info.ds.ds_gtid;
5895 
5896   if (!is_root) {
5897     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5898       /* Assume the threads are at the fork barrier here */
5899       KA_TRACE(
5900           20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5901                gtid));
5902       /* Need release fence here to prevent seg faults for tree forkjoin barrier
5903        * (GEH) */
5904       ANNOTATE_HAPPENS_BEFORE(thread);
5905       kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
5906                          thread);
5907       __kmp_release_64(&flag);
5908     }
5909 
5910     // Terminate OS thread.
5911     __kmp_reap_worker(thread);
5912 
5913     // The thread was killed asynchronously.  If it was actively
5914     // spinning in the thread pool, decrement the global count.
5915     //
5916     // There is a small timing hole here - if the worker thread was just waking
    // up after sleeping in the pool, had reset its th_active_in_pool flag but
5918     // not decremented the global counter __kmp_thread_pool_active_nth yet, then
5919     // the global counter might not get updated.
5920     //
5921     // Currently, this can only happen as the library is unloaded,
5922     // so there are no harmful side effects.
5923     if (thread->th.th_active_in_pool) {
5924       thread->th.th_active_in_pool = FALSE;
5925       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
5926       KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
5927     }
5928   }
5929 
5930   __kmp_free_implicit_task(thread);
5931 
5932 // Free the fast memory for tasking
5933 #if USE_FAST_MEMORY
5934   __kmp_free_fast_memory(thread);
5935 #endif /* USE_FAST_MEMORY */
5936 
5937   __kmp_suspend_uninitialize_thread(thread);
5938 
5939   KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5940   TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5941 
5942   --__kmp_all_nth;
5943   // __kmp_nth was decremented when thread is added to the pool.
5944 
5945 #ifdef KMP_ADJUST_BLOCKTIME
5946   /* Adjust blocktime back to user setting or default if necessary */
5947   /* Middle initialization might never have occurred                */
5948   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5949     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5950     if (__kmp_nth <= __kmp_avail_proc) {
5951       __kmp_zero_bt = FALSE;
5952     }
5953   }
5954 #endif /* KMP_ADJUST_BLOCKTIME */
5955 
5956   /* free the memory being used */
5957   if (__kmp_env_consistency_check) {
5958     if (thread->th.th_cons) {
5959       __kmp_free_cons_stack(thread->th.th_cons);
5960       thread->th.th_cons = NULL;
5961     }
5962   }
5963 
5964   if (thread->th.th_pri_common != NULL) {
5965     __kmp_free(thread->th.th_pri_common);
5966     thread->th.th_pri_common = NULL;
5967   }
5968 
5969   if (thread->th.th_task_state_memo_stack != NULL) {
5970     __kmp_free(thread->th.th_task_state_memo_stack);
5971     thread->th.th_task_state_memo_stack = NULL;
5972   }
5973 
5974 #if KMP_USE_BGET
5975   if (thread->th.th_local.bget_data != NULL) {
5976     __kmp_finalize_bget(thread);
5977   }
5978 #endif
5979 
5980 #if KMP_AFFINITY_SUPPORTED
5981   if (thread->th.th_affin_mask != NULL) {
5982     KMP_CPU_FREE(thread->th.th_affin_mask);
5983     thread->th.th_affin_mask = NULL;
5984   }
5985 #endif /* KMP_AFFINITY_SUPPORTED */
5986 
5987 #if KMP_USE_HIER_SCHED
5988   if (thread->th.th_hier_bar_data != NULL) {
5989     __kmp_free(thread->th.th_hier_bar_data);
5990     thread->th.th_hier_bar_data = NULL;
5991   }
5992 #endif
5993 
5994   __kmp_reap_team(thread->th.th_serial_team);
5995   thread->th.th_serial_team = NULL;
5996   __kmp_free(thread);
5997 
5998   KMP_MB();
5999 
6000 } // __kmp_reap_thread
6001 
6002 static void __kmp_internal_end(void) {
6003   int i;
6004 
6005   /* First, unregister the library */
6006   __kmp_unregister_library();
6007 
6008 #if KMP_OS_WINDOWS
6009   /* In Win static library, we can't tell when a root actually dies, so we
6010      reclaim the data structures for any root threads that have died but not
6011      unregistered themselves, in order to shut down cleanly.
6012      In Win dynamic library we also can't tell when a thread dies.  */
  // AC: moved here to always clean resources of dead roots.
  __kmp_reclaim_dead_roots();
6015 #endif
6016 
6017   for (i = 0; i < __kmp_threads_capacity; i++)
6018     if (__kmp_root[i])
6019       if (__kmp_root[i]->r.r_active)
6020         break;
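  // At this point, i < __kmp_threads_capacity iff some root is still active.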
6021   KMP_MB(); /* Flush all pending memory write invalidates.  */
6022   TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6023 
6024   if (i < __kmp_threads_capacity) {
6025 #if KMP_USE_MONITOR
6026     // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6027     KMP_MB(); /* Flush all pending memory write invalidates.  */
6028 
6029     // Need to check that monitor was initialized before reaping it. If we are
    // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6031     // __kmp_monitor will appear to contain valid data, but it is only valid in
6032     // the parent process, not the child.
6033     // New behavior (201008): instead of keying off of the flag
6034     // __kmp_init_parallel, the monitor thread creation is keyed off
6035     // of the new flag __kmp_init_monitor.
6036     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6037     if (TCR_4(__kmp_init_monitor)) {
6038       __kmp_reap_monitor(&__kmp_monitor);
6039       TCW_4(__kmp_init_monitor, 0);
6040     }
6041     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6042     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6043 #endif // KMP_USE_MONITOR
6044   } else {
6045 /* TODO move this to cleanup code */
6046 #ifdef KMP_DEBUG
6047     /* make sure that everything has properly ended */
6048     for (i = 0; i < __kmp_threads_capacity; i++) {
6049       if (__kmp_root[i]) {
6050         //                    KMP_ASSERT( ! KMP_UBER_GTID( i ) );         // AC:
6051         //                    there can be uber threads alive here
6052         KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6053       }
6054     }
6055 #endif
6056 
6057     KMP_MB();
6058 
6059     // Reap the worker threads.
6060     // This is valid for now, but be careful if threads are reaped sooner.
    while (__kmp_thread_pool != NULL) { // Loop thru all threads in the pool.
6062       // Get the next thread from the pool.
6063       kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6064       __kmp_thread_pool = thread->th.th_next_pool;
6065       // Reap it.
6066       KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6067       thread->th.th_next_pool = NULL;
6068       thread->th.th_in_pool = FALSE;
6069       __kmp_reap_thread(thread, 0);
6070     }
6071     __kmp_thread_pool_insert_pt = NULL;
6072 
6073     // Reap teams.
6074     while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
6075       // Get the next team from the pool.
6076       kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6077       __kmp_team_pool = team->t.t_next_pool;
6078       // Reap it.
6079       team->t.t_next_pool = NULL;
6080       __kmp_reap_team(team);
6081     }
6082 
6083     __kmp_reap_task_teams();
6084 
6085 #if KMP_OS_UNIX
6086     // Threads that are not reaped should not access any resources since they
6087     // are going to be deallocated soon, so the shutdown sequence should wait
6088     // until all threads either exit the final spin-waiting loop or begin
6089     // sleeping after the given blocktime.
6090     for (i = 0; i < __kmp_threads_capacity; i++) {
6091       kmp_info_t *thr = __kmp_threads[i];
6092       while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6093         KMP_CPU_PAUSE();
6094     }
6095 #endif
6096 
6097     for (i = 0; i < __kmp_threads_capacity; ++i) {
6098       // TBD: Add some checking...
6099       // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6100     }
6101 
6102     /* Make sure all threadprivate destructors get run by joining with all
6103        worker threads before resetting this flag */
6104     TCW_SYNC_4(__kmp_init_common, FALSE);
6105 
6106     KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6107     KMP_MB();
6108 
6109 #if KMP_USE_MONITOR
6110     // See note above: One of the possible fixes for CQ138434 / CQ140126
6111     //
6112     // FIXME: push both code fragments down and CSE them?
6113     // push them into __kmp_cleanup() ?
6114     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6115     if (TCR_4(__kmp_init_monitor)) {
6116       __kmp_reap_monitor(&__kmp_monitor);
6117       TCW_4(__kmp_init_monitor, 0);
6118     }
6119     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6120     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6121 #endif
6122   } /* else !__kmp_global.t_active */
6123   TCW_4(__kmp_init_gtid, FALSE);
6124   KMP_MB(); /* Flush all pending memory write invalidates.  */
6125 
6126   __kmp_cleanup();
6127 #if OMPT_SUPPORT
6128   ompt_fini();
6129 #endif
6130 }
6131 
6132 void __kmp_internal_end_library(int gtid_req) {
6133   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6134   /* this shouldn't be a race condition because __kmp_internal_end() is the
6135      only place to clear __kmp_serial_init */
6136   /* we'll check this later too, after we get the lock */
6137   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6138   // redundant, because the next check will work in any case.
6139   if (__kmp_global.g.g_abort) {
6140     KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6141     /* TODO abort? */
6142     return;
6143   }
6144   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6145     KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6146     return;
6147   }
6148 
6149   KMP_MB(); /* Flush all pending memory write invalidates.  */
6150   /* find out who we are and what we should do */
6151   {
6152     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6153     KA_TRACE(
6154         10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req));
6155     if (gtid == KMP_GTID_SHUTDOWN) {
6156       KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6157                     "already shutdown\n"));
6158       return;
6159     } else if (gtid == KMP_GTID_MONITOR) {
6160       KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6161                     "registered, or system shutdown\n"));
6162       return;
6163     } else if (gtid == KMP_GTID_DNE) {
6164       KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6165                     "shutdown\n"));
6166       /* we don't know who we are, but we may still shutdown the library */
6167     } else if (KMP_UBER_GTID(gtid)) {
6168       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6169       if (__kmp_root[gtid]->r.r_active) {
6170         __kmp_global.g.g_abort = -1;
6171         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6172         __kmp_unregister_library();
6173         KA_TRACE(10,
6174                  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6175                   gtid));
6176         return;
6177       } else {
6178         KA_TRACE(
6179             10,
6180             ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6181         __kmp_unregister_root_current_thread(gtid);
6182       }
6183     } else {
6184 /* worker threads may call this function through the atexit handler, if they
6185  * call exit() */
6186 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6187    TODO: do a thorough shutdown instead */
6188 #ifdef DUMP_DEBUG_ON_EXIT
6189       if (__kmp_debug_buf)
6190         __kmp_dump_debug_buffer();
6191 #endif
      // An unregister-library call was added here for the switch to shared
      // memory on Linux; without it, lots of files would be left in /dev/shm.
      // Clean up the shared memory file before exiting.
6195       __kmp_unregister_library();
6196       return;
6197     }
6198   }
6199   /* synchronize the termination process */
6200   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6201 
6202   /* have we already finished */
6203   if (__kmp_global.g.g_abort) {
6204     KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6205     /* TODO abort? */
6206     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6207     return;
6208   }
6209   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6210     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6211     return;
6212   }
6213 
6214   /* We need this lock to enforce mutex between this reading of
6215      __kmp_threads_capacity and the writing by __kmp_register_root.
6216      Alternatively, we can use a counter of roots that is atomically updated by
6217      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6218      __kmp_internal_end_*.  */
6219   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6220 
6221   /* now we can safely conduct the actual termination */
6222   __kmp_internal_end();
6223 
6224   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6225   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6226 
6227   KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6228 
6229 #ifdef DUMP_DEBUG_ON_EXIT
6230   if (__kmp_debug_buf)
6231     __kmp_dump_debug_buffer();
6232 #endif
6233 
6234 #if KMP_OS_WINDOWS
6235   __kmp_close_console();
6236 #endif
6237 
6238   __kmp_fini_allocator();
6239 
6240 } // __kmp_internal_end_library
6241 
6242 void __kmp_internal_end_thread(int gtid_req) {
6243   int i;
6244 
6245   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6246   /* this shouldn't be a race condition because __kmp_internal_end() is the
6247    * only place to clear __kmp_serial_init */
6248   /* we'll check this later too, after we get the lock */
6249   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6250   // redundant, because the next check will work in any case.
6251   if (__kmp_global.g.g_abort) {
6252     KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6253     /* TODO abort? */
6254     return;
6255   }
6256   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6257     KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6258     return;
6259   }
6260 
6261   // If hidden helper team has been initialized, we need to deinit it
6262   if (TCR_4(__kmp_init_hidden_helper)) {
6263     TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6264     // First release the main thread to let it continue its work
6265     __kmp_hidden_helper_main_thread_release();
6266     // Wait until the hidden helper team has been destroyed
6267     __kmp_hidden_helper_threads_deinitz_wait();
6268   }
6269 
6270   KMP_MB(); /* Flush all pending memory write invalidates.  */
6271 
6272   /* find out who we are and what we should do */
6273   {
6274     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6275     KA_TRACE(10,
6276              ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req));
6277     if (gtid == KMP_GTID_SHUTDOWN) {
6278       KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6279                     "already shutdown\n"));
6280       return;
6281     } else if (gtid == KMP_GTID_MONITOR) {
6282       KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6283                     "registered, or system shutdown\n"));
6284       return;
6285     } else if (gtid == KMP_GTID_DNE) {
6286       KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6287                     "shutdown\n"));
6288       return;
6289       /* we don't know who we are */
6290     } else if (KMP_UBER_GTID(gtid)) {
6291       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6292       if (__kmp_root[gtid]->r.r_active) {
6293         __kmp_global.g.g_abort = -1;
6294         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6295         KA_TRACE(10,
6296                  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6297                   gtid));
6298         return;
6299       } else {
6300         KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6301                       gtid));
6302         __kmp_unregister_root_current_thread(gtid);
6303       }
6304     } else {
6305       /* just a worker thread, let's leave */
6306       KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6307 
6308       if (gtid >= 0) {
6309         __kmp_threads[gtid]->th.th_task_team = NULL;
6310       }
6311 
6312       KA_TRACE(10,
6313                ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6314                 gtid));
6315       return;
6316     }
6317   }
6318 #if KMP_DYNAMIC_LIB
6319   if (__kmp_pause_status != kmp_hard_paused)
  // AC: let's not shut down the dynamic library at the exit of the uber thread
  // because it is better to shut down later in the library destructor.
6322   {
6323     KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6324     return;
6325   }
6326 #endif
6327   /* synchronize the termination process */
6328   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6329 
6330   /* have we already finished */
6331   if (__kmp_global.g.g_abort) {
6332     KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6333     /* TODO abort? */
6334     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6335     return;
6336   }
6337   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6338     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6339     return;
6340   }
6341 
6342   /* We need this lock to enforce mutex between this reading of
6343      __kmp_threads_capacity and the writing by __kmp_register_root.
6344      Alternatively, we can use a counter of roots that is atomically updated by
6345      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6346      __kmp_internal_end_*.  */
6347 
6348   /* should we finish the run-time?  are all siblings done? */
6349   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6350 
6351   for (i = 0; i < __kmp_threads_capacity; ++i) {
6352     if (KMP_UBER_GTID(i)) {
6353       KA_TRACE(
6354           10,
6355           ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6356       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6357       __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6358       return;
6359     }
6360   }
6361 
6362   /* now we can safely conduct the actual termination */
6363 
6364   __kmp_internal_end();
6365 
6366   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6367   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6368 
6369   KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6370 
6371 #ifdef DUMP_DEBUG_ON_EXIT
6372   if (__kmp_debug_buf)
6373     __kmp_dump_debug_buffer();
6374 #endif
6375 } // __kmp_internal_end_thread
6376 
6377 // -----------------------------------------------------------------------------
6378 // Library registration stuff.
6379 
6380 static long __kmp_registration_flag = 0;
6381 // Random value used to indicate library initialization.
6382 static char *__kmp_registration_str = NULL;
6383 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6384 
6385 static inline char *__kmp_reg_status_name() {
/* On RHEL 3u5, if linked statically, getpid() returns different values in
   each thread. If registration and unregistration happen in different threads
   (omp_misc_other_root_exit.cpp test case), the registered_lib_env env var
   cannot be found, because the name will contain a different pid. */
// macOS* complains about the name being too long with the additional getuid()
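// With a hypothetical pid of 12345 and uid of 1000, the name produced below is
// "__KMP_REGISTERED_LIB_12345_1000" (or "__KMP_REGISTERED_LIB_12345" on the
// other branch).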
6391 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6392   return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6393                           (int)getuid());
6394 #else
6395   return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6396 #endif
} // __kmp_reg_status_name
6398 
6399 void __kmp_register_library_startup(void) {
6400 
6401   char *name = __kmp_reg_status_name(); // Name of the environment variable.
6402   int done = 0;
6403   union {
6404     double dtime;
6405     long ltime;
6406   } time;
6407 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6408   __kmp_initialize_system_tick();
6409 #endif
6410   __kmp_read_system_time(&time.dtime);
6411   __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6412   __kmp_registration_str =
6413       __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6414                        __kmp_registration_flag, KMP_LIBRARY_FILE);
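  // The registered value has the form "<flag address>-<flag value>-<library
  // file>", where the flag value is always 0xCAFExxxx (low 16 bits taken from
  // the current time). A later copy of the library parses this string and
  // dereferences the recorded address to check whether this copy is still
  // alive.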
6415 
6416   KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6417                 __kmp_registration_str));
6418 
6419   while (!done) {
6420 
6421     char *value = NULL; // Actual value of the environment variable.
6422 
6423 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6424     char *shm_name = __kmp_str_format("/%s", name);
6425     int shm_preexist = 0;
6426     char *data1;
6427     int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6428     if ((fd1 == -1) && (errno == EEXIST)) {
6429       // file didn't open because it already exists.
6430       // try opening existing file
6431       fd1 = shm_open(shm_name, O_RDWR, 0666);
6432       if (fd1 == -1) { // file didn't open
6433         // error out here
6434         __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0),
6435                     __kmp_msg_null);
6436       } else {
6437         // able to open existing file
6438         shm_preexist = 1;
6439       }
    } else if (fd1 == -1) {
      // SHM didn't open due to an error other than "already exists";
      // error out here.
6443       __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno),
6444                   __kmp_msg_null);
6445     }
6446     if (shm_preexist == 0) {
6447       // we created SHM now set size
6448       if (ftruncate(fd1, SHM_SIZE) == -1) {
        // error occurred setting size;
6450         __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"),
6451                     KMP_ERR(errno), __kmp_msg_null);
6452       }
6453     }
6454     data1 =
6455         (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
6456     if (data1 == MAP_FAILED) {
6457       // failed to map shared memory
6458       __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno),
6459                   __kmp_msg_null);
6460     }
6461     if (shm_preexist == 0) { // set data to SHM, set value
6462       KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6463     }
6464     // Read value from either what we just wrote or existing file.
6465     value = __kmp_str_format("%s", data1); // read value from SHM
6466     munmap(data1, SHM_SIZE);
6467     close(fd1);
6468 #else // Windows and unix with static library
    // Set the environment variable, but do not overwrite it if it already
    // exists.
6470     __kmp_env_set(name, __kmp_registration_str, 0);
6471     // read value to see if it got set
6472     value = __kmp_env_get(name);
6473 #endif
6474 
6475     if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6476       done = 1; // Ok, environment variable set successfully, exit the loop.
6477     } else {
      // Oops. The write failed. Another copy of the OpenMP RTL is in memory.
      // Check whether it is alive or dead.
6480       int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6481       char *tail = value;
6482       char *flag_addr_str = NULL;
6483       char *flag_val_str = NULL;
6484       char const *file_name = NULL;
6485       __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6486       __kmp_str_split(tail, '-', &flag_val_str, &tail);
6487       file_name = tail;
6488       if (tail != NULL) {
6489         long *flag_addr = 0;
6490         unsigned long flag_val = 0;
6491         KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6492         KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6493         if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6494           // First, check whether environment-encoded address is mapped into
6495           // addr space.
6496           // If so, dereference it to see if it still has the right value.
6497           if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6498             neighbor = 1;
6499           } else {
6500             // If not, then we know the other copy of the library is no longer
6501             // running.
6502             neighbor = 2;
6503           }
6504         }
6505       }
6506       switch (neighbor) {
6507       case 0: // Cannot parse environment variable -- neighbor status unknown.
        // Assume it is the incompatible format of a future version of the
        // library, and assume the other library is alive.
6510         // WARN( ... ); // TODO: Issue a warning.
6511         file_name = "unknown library";
6512         KMP_FALLTHROUGH();
      // Attention! Falling through to the next case. That's intentional.
6514       case 1: { // Neighbor is alive.
6515         // Check it is allowed.
6516         char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6517         if (!__kmp_str_match_true(duplicate_ok)) {
6518           // That's not allowed. Issue fatal error.
6519           __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6520                       KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6521         }
6522         KMP_INTERNAL_FREE(duplicate_ok);
6523         __kmp_duplicate_library_ok = 1;
6524         done = 1; // Exit the loop.
6525       } break;
6526       case 2: { // Neighbor is dead.
6527 
6528 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6529         // close shared memory.
6530         shm_unlink(shm_name); // this removes file in /dev/shm
6531 #else
6532         // Clear the variable and try to register library again.
6533         __kmp_env_unset(name);
6534 #endif
6535       } break;
6536       default: {
6537         KMP_DEBUG_ASSERT(0);
6538       } break;
6539       }
6540     }
6541     KMP_INTERNAL_FREE((void *)value);
6542 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6543     KMP_INTERNAL_FREE((void *)shm_name);
6544 #endif
6545   } // while
6546   KMP_INTERNAL_FREE((void *)name);
6547 
6548 } // func __kmp_register_library_startup
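
// A compiled-out sketch (illustrative only) of the environment-variable
// handshake used on the non-shared-memory path above, with the standard POSIX
// setenv()/getenv() standing in for __kmp_env_set()/__kmp_env_get(). The
// helper name is hypothetical.
#if 0
#include <stdlib.h>
#include <string.h>
static int demo_try_register(char const *name, char const *my_value) {
  setenv(name, my_value, 0); // do not clobber a value set by another copy
  char const *current = getenv(name);
  // Reading back our own value means we won the registration; anything else
  // means another copy of the runtime registered first and must be checked
  // to see whether it is still alive.
  return current != NULL && strcmp(current, my_value) == 0;
}
#endif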
6549 
6550 void __kmp_unregister_library(void) {
6551 
6552   char *name = __kmp_reg_status_name();
6553   char *value = NULL;
6554 
6555 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6556   char *shm_name = __kmp_str_format("/%s", name);
6557   int fd1 = shm_open(shm_name, O_RDONLY, 0666);
6558   if (fd1 == -1) {
6559     // file did not open. return.
6560     return;
6561   }
6562   char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6563   if (data1 != MAP_FAILED) {
6564     value = __kmp_str_format("%s", data1); // read value from SHM
6565     munmap(data1, SHM_SIZE);
6566   }
6567   close(fd1);
6568 #else
6569   value = __kmp_env_get(name);
6570 #endif
6571 
6572   KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6573   KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6574   if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6575 //  Ok, this is our variable. Delete it.
6576 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6577     shm_unlink(shm_name); // this removes file in /dev/shm
6578 #else
6579     __kmp_env_unset(name);
6580 #endif
6581   }
6582 
6583 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6584   KMP_INTERNAL_FREE(shm_name);
6585 #endif
6586 
6587   KMP_INTERNAL_FREE(__kmp_registration_str);
6588   KMP_INTERNAL_FREE(value);
6589   KMP_INTERNAL_FREE(name);
6590 
6591   __kmp_registration_flag = 0;
6592   __kmp_registration_str = NULL;
6593 
6594 } // __kmp_unregister_library
6595 
6596 // End of Library registration stuff.
6597 // -----------------------------------------------------------------------------
6598 
6599 #if KMP_MIC_SUPPORTED
6600 
6601 static void __kmp_check_mic_type() {
6602   kmp_cpuid_t cpuid_state = {0};
6603   kmp_cpuid_t *cs_p = &cpuid_state;
6604   __kmp_x86_cpuid(1, 0, cs_p);
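  // CPUID leaf 1: EAX encodes stepping/model/family; the masks below match the
  // Knights Corner (mic2) and Knights Landing (mic3) family/model signatures.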
6605   // We don't support mic1 at the moment
6606   if ((cs_p->eax & 0xff0) == 0xB10) {
6607     __kmp_mic_type = mic2;
6608   } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6609     __kmp_mic_type = mic3;
6610   } else {
6611     __kmp_mic_type = non_mic;
6612   }
6613 }
6614 
6615 #endif /* KMP_MIC_SUPPORTED */
6616 
6617 #if KMP_HAVE_UMWAIT
6618 static void __kmp_user_level_mwait_init() {
6619   struct kmp_cpuid buf;
6620   __kmp_x86_cpuid(7, 0, &buf);
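  // CPUID leaf 7 (sub-leaf 0): ECX bit 5 is the WAITPKG feature flag,
  // indicating UMONITOR/UMWAIT/TPAUSE support.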
6621   __kmp_umwait_enabled = ((buf.ecx >> 5) & 1) && __kmp_user_level_mwait;
6622   KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
6623                 __kmp_umwait_enabled));
6624 }
6625 #elif KMP_HAVE_MWAIT
6626 #ifndef AT_INTELPHIUSERMWAIT
// Spurious, non-existent value that should always fail to return anything.
// Will be replaced with the correct value when we know it.
6629 #define AT_INTELPHIUSERMWAIT 10000
6630 #endif
// The getauxval() function is available in RHEL7 and SLES12. If a system with
// an earlier OS is used to build the RTL, we'll use the following internal
// function when the entry is not found.
6634 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
6635 unsigned long getauxval(unsigned long) { return 0; }
6636 
6637 static void __kmp_user_level_mwait_init() {
6638   // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available
6639   // use them to find if the user-level mwait is enabled. Otherwise, forcibly
6640   // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable
6641   // KMP_USER_LEVEL_MWAIT was set to TRUE.
6642   if (__kmp_mic_type == mic3) {
6643     unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
6644     if ((res & 0x1) || __kmp_user_level_mwait) {
6645       __kmp_mwait_enabled = TRUE;
6646       if (__kmp_user_level_mwait) {
6647         KMP_INFORM(EnvMwaitWarn);
6648       }
6649     } else {
6650       __kmp_mwait_enabled = FALSE;
6651     }
6652   }
6653   KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
6654                 "__kmp_mwait_enabled = %d\n",
6655                 __kmp_mic_type, __kmp_mwait_enabled));
6656 }
6657 #endif /* KMP_HAVE_UMWAIT */
6658 
6659 static void __kmp_do_serial_initialize(void) {
6660   int i, gtid;
6661   size_t size;
6662 
6663   KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6664 
6665   KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6666   KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6667   KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6668   KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6669   KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6670 
6671 #if OMPT_SUPPORT
6672   ompt_pre_init();
6673 #endif
6674 
6675   __kmp_validate_locks();
6676 
6677   /* Initialize internal memory allocator */
6678   __kmp_init_allocator();
6679 
6680   /* Register the library startup via an environment variable and check to see
6681      whether another copy of the library is already registered. */
6682 
6683   __kmp_register_library_startup();
6684 
6685   /* TODO reinitialization of library */
6686   if (TCR_4(__kmp_global.g.g_done)) {
6687     KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6688   }
6689 
6690   __kmp_global.g.g_abort = 0;
6691   TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6692 
6693 /* initialize the locks */
6694 #if KMP_USE_ADAPTIVE_LOCKS
6695 #if KMP_DEBUG_ADAPTIVE_LOCKS
6696   __kmp_init_speculative_stats();
6697 #endif
6698 #endif
6699 #if KMP_STATS_ENABLED
6700   __kmp_stats_init();
6701 #endif
6702   __kmp_init_lock(&__kmp_global_lock);
6703   __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6704   __kmp_init_lock(&__kmp_debug_lock);
6705   __kmp_init_atomic_lock(&__kmp_atomic_lock);
6706   __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6707   __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6708   __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6709   __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6710   __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6711   __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6712   __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6713   __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6714   __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6715   __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6716   __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6717   __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6718   __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6719   __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6720 #if KMP_USE_MONITOR
6721   __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6722 #endif
6723   __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6724 
6725   /* conduct initialization and initial setup of configuration */
6726 
6727   __kmp_runtime_initialize();
6728 
6729 #if KMP_MIC_SUPPORTED
6730   __kmp_check_mic_type();
6731 #endif
6732 
6733 // Some global variable initialization moved here from kmp_env_initialize()
6734 #ifdef KMP_DEBUG
6735   kmp_diag = 0;
6736 #endif
6737   __kmp_abort_delay = 0;
6738 
6739   // From __kmp_init_dflt_team_nth()
6740   /* assume the entire machine will be used */
6741   __kmp_dflt_team_nth_ub = __kmp_xproc;
6742   if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6743     __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6744   }
6745   if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6746     __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6747   }
6748   __kmp_max_nth = __kmp_sys_max_nth;
6749   __kmp_cg_max_nth = __kmp_sys_max_nth;
6750   __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6751   if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6752     __kmp_teams_max_nth = __kmp_sys_max_nth;
6753   }
6754 
6755   // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6756   // part
6757   __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6758 #if KMP_USE_MONITOR
6759   __kmp_monitor_wakeups =
6760       KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6761   __kmp_bt_intervals =
6762       KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6763 #endif
6764   // From "KMP_LIBRARY" part of __kmp_env_initialize()
6765   __kmp_library = library_throughput;
6766   // From KMP_SCHEDULE initialization
6767   __kmp_static = kmp_sch_static_balanced;
6768 // AC: do not use analytical here, because it is non-monotonous
6769 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6770 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6771 // need to repeat assignment
6772 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6773 // bit control and barrier method control parts
6774 #if KMP_FAST_REDUCTION_BARRIER
6775 #define kmp_reduction_barrier_gather_bb ((int)1)
6776 #define kmp_reduction_barrier_release_bb ((int)1)
6777 #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6778 #define kmp_reduction_barrier_release_pat bp_hyper_bar
6779 #endif // KMP_FAST_REDUCTION_BARRIER
6780   for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6781     __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6782     __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6783     __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6784     __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6785 #if KMP_FAST_REDUCTION_BARRIER
    if (i == bs_reduction_barrier) {
      // Tested and confirmed on ALTIX only (lin_64): hyper,1
6788       __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6789       __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6790       __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6791       __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6792     }
6793 #endif // KMP_FAST_REDUCTION_BARRIER
6794   }
6795 #if KMP_FAST_REDUCTION_BARRIER
6796 #undef kmp_reduction_barrier_release_pat
6797 #undef kmp_reduction_barrier_gather_pat
6798 #undef kmp_reduction_barrier_release_bb
6799 #undef kmp_reduction_barrier_gather_bb
6800 #endif // KMP_FAST_REDUCTION_BARRIER
6801 #if KMP_MIC_SUPPORTED
6802   if (__kmp_mic_type == mic2) { // KNC
6803     // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6804     __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6805     __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6806         1; // forkjoin release
6807     __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6808     __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6809   }
6810 #if KMP_FAST_REDUCTION_BARRIER
6811   if (__kmp_mic_type == mic2) { // KNC
6812     __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6813     __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6814   }
6815 #endif // KMP_FAST_REDUCTION_BARRIER
6816 #endif // KMP_MIC_SUPPORTED
6817 
6818 // From KMP_CHECKS initialization
6819 #ifdef KMP_DEBUG
6820   __kmp_env_checks = TRUE; /* development versions have the extra checks */
6821 #else
6822   __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6823 #endif
6824 
6825   // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6826   __kmp_foreign_tp = TRUE;
6827 
6828   __kmp_global.g.g_dynamic = FALSE;
6829   __kmp_global.g.g_dynamic_mode = dynamic_default;
6830 
6831   __kmp_env_initialize(NULL);
6832 
6833 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
6834   __kmp_user_level_mwait_init();
6835 #endif
6836 // Print all messages in message catalog for testing purposes.
6837 #ifdef KMP_DEBUG
6838   char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6839   if (__kmp_str_match_true(val)) {
6840     kmp_str_buf_t buffer;
6841     __kmp_str_buf_init(&buffer);
6842     __kmp_i18n_dump_catalog(&buffer);
6843     __kmp_printf("%s", buffer.str);
6844     __kmp_str_buf_free(&buffer);
6845   }
6846   __kmp_env_free(&val);
6847 #endif
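  // Debug-build usage sketch: running with KMP_DUMP_CATALOG=1 makes the block
  // above print every message in the i18n catalog at startup, which helps
  // verify that the catalog was found and parsed correctly.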
6848 
6849   __kmp_threads_capacity =
6850       __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6851   // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6852   __kmp_tp_capacity = __kmp_default_tp_capacity(
6853       __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6854 
  // If the library is shut down properly, these pools must be NULL. Just in
  // case, set them to NULL -- some memory may leak, but subsequent code will
  // work even if the pools are not freed.
6858   KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6859   KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6860   KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6861   __kmp_thread_pool = NULL;
6862   __kmp_thread_pool_insert_pt = NULL;
6863   __kmp_team_pool = NULL;
6864 
6865   /* Allocate all of the variable sized records */
6866   /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6867    * expandable */
6868   /* Since allocation is cache-aligned, just add extra padding at the end */
6869   size =
6870       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6871       CACHE_LINE;
6872   __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6873   __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6874                                sizeof(kmp_info_t *) * __kmp_threads_capacity);
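  // Layout note: __kmp_threads and __kmp_root share one cache-aligned
  // allocation; the root array simply starts right after the last thread
  // pointer, which is why __kmp_cleanup() frees only __kmp_threads.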
6875 
6876   /* init thread counts */
  // These asserts fail if the library is reinitializing and something went
  // wrong during termination.
  KMP_DEBUG_ASSERT(__kmp_all_nth == 0);
  KMP_DEBUG_ASSERT(__kmp_nth == 0);
6880   __kmp_all_nth = 0;
6881   __kmp_nth = 0;
6882 
6883   /* setup the uber master thread and hierarchy */
6884   gtid = __kmp_register_root(TRUE);
6885   KA_TRACE(10, ("__kmp_do_serial_initialize  T#%d\n", gtid));
6886   KMP_ASSERT(KMP_UBER_GTID(gtid));
6887   KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6888 
6889   KMP_MB(); /* Flush all pending memory write invalidates.  */
6890 
6891   __kmp_common_initialize();
6892 
6893 #if KMP_OS_UNIX
6894   /* invoke the child fork handler */
6895   __kmp_register_atfork();
6896 #endif
6897 
6898 #if !KMP_DYNAMIC_LIB
6899   {
6900     /* Invoke the exit handler when the program finishes, only for static
6901        library. For dynamic library, we already have _fini and DllMain. */
6902     int rc = atexit(__kmp_internal_end_atexit);
6903     if (rc != 0) {
6904       __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6905                   __kmp_msg_null);
6906     }
6907   }
6908 #endif
6909 
6910 #if KMP_HANDLE_SIGNALS
6911 #if KMP_OS_UNIX
  /* NOTE: make sure that this is called before the user installs their own
     signal handlers so that the user handlers are called first. This way they
     can return false, not call our handler, avoid terminating the library, and
     continue execution where they left off. */
6916   __kmp_install_signals(FALSE);
6917 #endif /* KMP_OS_UNIX */
6918 #if KMP_OS_WINDOWS
6919   __kmp_install_signals(TRUE);
6920 #endif /* KMP_OS_WINDOWS */
6921 #endif
6922 
6923   /* we have finished the serial initialization */
6924   __kmp_init_counter++;
6925 
6926   __kmp_init_serial = TRUE;
6927 
6928   if (__kmp_settings) {
6929     __kmp_env_print();
6930   }
6931 
6932   if (__kmp_display_env || __kmp_display_env_verbose) {
6933     __kmp_env_print_2();
6934   }
6935 
6936 #if OMPT_SUPPORT
6937   ompt_post_init();
6938 #endif
6939 
6940   KMP_MB();
6941 
6942   KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
6943 }
6944 
6945 void __kmp_serial_initialize(void) {
6946   if (__kmp_init_serial) {
6947     return;
6948   }
6949   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6950   if (__kmp_init_serial) {
6951     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6952     return;
6953   }
6954   __kmp_do_serial_initialize();
6955   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6956 }
6957 
6958 static void __kmp_do_middle_initialize(void) {
6959   int i, j;
6960   int prev_dflt_team_nth;
6961 
6962   if (!__kmp_init_serial) {
6963     __kmp_do_serial_initialize();
6964   }
6965 
6966   KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
6967 
6968   // Save the previous value for the __kmp_dflt_team_nth so that
6969   // we can avoid some reinitialization if it hasn't changed.
6970   prev_dflt_team_nth = __kmp_dflt_team_nth;
6971 
6972 #if KMP_AFFINITY_SUPPORTED
6973   // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6974   // number of cores on the machine.
6975   __kmp_affinity_initialize();
6976 
6977   // Run through the __kmp_threads array and set the affinity mask
6978   // for each root thread that is currently registered with the RTL.
6979   for (i = 0; i < __kmp_threads_capacity; i++) {
6980     if (TCR_PTR(__kmp_threads[i]) != NULL) {
6981       __kmp_affinity_set_init_mask(i, TRUE);
6982     }
6983   }
6984 #endif /* KMP_AFFINITY_SUPPORTED */
6985 
6986   KMP_ASSERT(__kmp_xproc > 0);
6987   if (__kmp_avail_proc == 0) {
6988     __kmp_avail_proc = __kmp_xproc;
6989   }
6990 
6991   // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
6992   // correct them now
6993   j = 0;
6994   while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
6995     __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
6996         __kmp_avail_proc;
6997     j++;
6998   }
6999 
7000   if (__kmp_dflt_team_nth == 0) {
7001 #ifdef KMP_DFLT_NTH_CORES
7002     // Default #threads = #cores
7003     __kmp_dflt_team_nth = __kmp_ncores;
7004     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7005                   "__kmp_ncores (%d)\n",
7006                   __kmp_dflt_team_nth));
7007 #else
7008     // Default #threads = #available OS procs
7009     __kmp_dflt_team_nth = __kmp_avail_proc;
7010     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7011                   "__kmp_avail_proc(%d)\n",
7012                   __kmp_dflt_team_nth));
7013 #endif /* KMP_DFLT_NTH_CORES */
7014   }
7015 
7016   if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7017     __kmp_dflt_team_nth = KMP_MIN_NTH;
7018   }
7019   if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7020     __kmp_dflt_team_nth = __kmp_sys_max_nth;
7021   }
7022 
7023   // There's no harm in continuing if the following check fails,
7024   // but it indicates an error in the previous logic.
7025   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7026 
7027   if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7028     // Run through the __kmp_threads array and set the num threads icv for each
7029     // root thread that is currently registered with the RTL (which has not
7030     // already explicitly set its nthreads-var with a call to
7031     // omp_set_num_threads()).
7032     for (i = 0; i < __kmp_threads_capacity; i++) {
7033       kmp_info_t *thread = __kmp_threads[i];
7034       if (thread == NULL)
7035         continue;
7036       if (thread->th.th_current_task->td_icvs.nproc != 0)
7037         continue;
7038 
7039       set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7040     }
7041   }
7042   KA_TRACE(
7043       20,
7044       ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7045        __kmp_dflt_team_nth));
7046 
7047 #ifdef KMP_ADJUST_BLOCKTIME
7048   /* Adjust blocktime to zero if necessary  now that __kmp_avail_proc is set */
7049   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7050     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7051     if (__kmp_nth > __kmp_avail_proc) {
7052       __kmp_zero_bt = TRUE;
7053     }
7054   }
7055 #endif /* KMP_ADJUST_BLOCKTIME */
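  // Rationale for the adjustment above: when more OpenMP threads exist than
  // available procs and the user did not set blocktime explicitly, setting
  // __kmp_zero_bt makes idle threads give up their spin-wait immediately
  // rather than burn cycles on an oversubscribed machine.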
7056 
7057   /* we have finished middle initialization */
7058   TCW_SYNC_4(__kmp_init_middle, TRUE);
7059 
7060   KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7061 }
7062 
7063 void __kmp_middle_initialize(void) {
7064   if (__kmp_init_middle) {
7065     return;
7066   }
7067   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7068   if (__kmp_init_middle) {
7069     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7070     return;
7071   }
7072   __kmp_do_middle_initialize();
7073   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7074 }
7075 
7076 void __kmp_parallel_initialize(void) {
7077   int gtid = __kmp_entry_gtid(); // this might be a new root
7078 
7079   /* synchronize parallel initialization (for sibling) */
7080   if (TCR_4(__kmp_init_parallel))
7081     return;
7082   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7083   if (TCR_4(__kmp_init_parallel)) {
7084     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7085     return;
7086   }
7087 
7088   /* TODO reinitialization after we have already shut down */
7089   if (TCR_4(__kmp_global.g.g_done)) {
7090     KA_TRACE(
7091         10,
7092         ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7093     __kmp_infinite_loop();
7094   }
7095 
  /* jc: The lock __kmp_initz_lock is already held, so calling
     __kmp_serial_initialize would cause a deadlock. So we call
     __kmp_do_middle_initialize directly, which runs __kmp_do_serial_initialize
     first if needed. */
7099   if (!__kmp_init_middle) {
7100     __kmp_do_middle_initialize();
7101   }
7102   __kmp_resume_if_hard_paused();
7103 
7104   /* begin initialization */
7105   KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7106   KMP_ASSERT(KMP_UBER_GTID(gtid));
7107 
7108 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7109   // Save the FP control regs.
7110   // Worker threads will set theirs to these values at thread startup.
7111   __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7112   __kmp_store_mxcsr(&__kmp_init_mxcsr);
7113   __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7114 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7115 
7116 #if KMP_OS_UNIX
7117 #if KMP_HANDLE_SIGNALS
7118   /*  must be after __kmp_serial_initialize  */
7119   __kmp_install_signals(TRUE);
7120 #endif
7121 #endif
7122 
7123   __kmp_suspend_initialize();
7124 
7125 #if defined(USE_LOAD_BALANCE)
7126   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7127     __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7128   }
7129 #else
7130   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7131     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7132   }
7133 #endif
7134 
7135   if (__kmp_version) {
7136     __kmp_print_version_2();
7137   }
7138 
7139   /* we have finished parallel initialization */
7140   TCW_SYNC_4(__kmp_init_parallel, TRUE);
7141 
7142   KMP_MB();
7143   KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7144 
7145   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7146 }
7147 
7148 void __kmp_hidden_helper_initialize() {
7149   if (TCR_4(__kmp_init_hidden_helper))
7150     return;
7151 
7152   // __kmp_parallel_initialize is required before we initialize hidden helper
7153   if (!TCR_4(__kmp_init_parallel))
7154     __kmp_parallel_initialize();
7155 
  // Double check. Note that this double check should not be placed before
  // __kmp_parallel_initialize as it will cause a deadlock.
7158   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7159   if (TCR_4(__kmp_init_hidden_helper)) {
7160     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7161     return;
7162   }
7163 
7164   // Set the count of hidden helper tasks to be executed to zero
7165   KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7166 
7167   // Set the global variable indicating that we're initializing hidden helper
7168   // team/threads
7169   TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7170 
7171   // Platform independent initialization
7172   __kmp_do_initialize_hidden_helper_threads();
7173 
7174   // Wait here for the finish of initialization of hidden helper teams
7175   __kmp_hidden_helper_threads_initz_wait();
7176 
7177   // We have finished hidden helper initialization
7178   TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7179 
7180   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7181 }
7182 
7183 /* ------------------------------------------------------------------------ */
7184 
7185 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7186                                    kmp_team_t *team) {
7187   kmp_disp_t *dispatch;
7188 
7189   KMP_MB();
7190 
7191   /* none of the threads have encountered any constructs, yet. */
7192   this_thr->th.th_local.this_construct = 0;
7193 #if KMP_CACHE_MANAGE
7194   KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7195 #endif /* KMP_CACHE_MANAGE */
7196   dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7197   KMP_DEBUG_ASSERT(dispatch);
7198   KMP_DEBUG_ASSERT(team->t.t_dispatch);
7199   // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7200   // this_thr->th.th_info.ds.ds_tid ] );
7201 
7202   dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7203   dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7204   if (__kmp_env_consistency_check)
7205     __kmp_push_parallel(gtid, team->t.t_ident);
7206 
7207   KMP_MB(); /* Flush all pending memory write invalidates.  */
7208 }
7209 
7210 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7211                                   kmp_team_t *team) {
7212   if (__kmp_env_consistency_check)
7213     __kmp_pop_parallel(gtid, team->t.t_ident);
7214 
7215   __kmp_finish_implicit_task(this_thr);
7216 }
7217 
7218 int __kmp_invoke_task_func(int gtid) {
7219   int rc;
7220   int tid = __kmp_tid_from_gtid(gtid);
7221   kmp_info_t *this_thr = __kmp_threads[gtid];
7222   kmp_team_t *team = this_thr->th.th_team;
7223 
7224   __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7225 #if USE_ITT_BUILD
7226   if (__itt_stack_caller_create_ptr) {
7227     // inform ittnotify about entering user's code
7228     if (team->t.t_stack_id != NULL) {
7229       __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id);
7230     } else {
7231       KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7232       __kmp_itt_stack_callee_enter(
7233           (__itt_caller)team->t.t_parent->t.t_stack_id);
7234     }
7235   }
7236 #endif /* USE_ITT_BUILD */
7237 #if INCLUDE_SSC_MARKS
7238   SSC_MARK_INVOKING();
7239 #endif
7240 
7241 #if OMPT_SUPPORT
7242   void *dummy;
7243   void **exit_frame_p;
7244   ompt_data_t *my_task_data;
7245   ompt_data_t *my_parallel_data;
7246   int ompt_team_size;
7247 
7248   if (ompt_enabled.enabled) {
7249     exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
7250                          .ompt_task_info.frame.exit_frame.ptr);
7251   } else {
7252     exit_frame_p = &dummy;
7253   }
7254 
7255   my_task_data =
7256       &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7257   my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7258   if (ompt_enabled.ompt_callback_implicit_task) {
7259     ompt_team_size = team->t.t_nproc;
7260     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7261         ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7262         __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7263     OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7264   }
7265 #endif
7266 
7267 #if KMP_STATS_ENABLED
7268   stats_state_e previous_state = KMP_GET_THREAD_STATE();
7269   if (previous_state == stats_state_e::TEAMS_REGION) {
7270     KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7271   } else {
7272     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7273   }
7274   KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7275 #endif
7276 
7277   rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7278                               tid, (int)team->t.t_argc, (void **)team->t.t_argv
7279 #if OMPT_SUPPORT
7280                               ,
7281                               exit_frame_p
7282 #endif
7283   );
7284 #if OMPT_SUPPORT
7285   *exit_frame_p = NULL;
7286   this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7287 #endif
7288 
7289 #if KMP_STATS_ENABLED
7290   if (previous_state == stats_state_e::TEAMS_REGION) {
7291     KMP_SET_THREAD_STATE(previous_state);
7292   }
7293   KMP_POP_PARTITIONED_TIMER();
7294 #endif
7295 
7296 #if USE_ITT_BUILD
7297   if (__itt_stack_caller_create_ptr) {
7298     // inform ittnotify about leaving user's code
7299     if (team->t.t_stack_id != NULL) {
7300       __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id);
7301     } else {
7302       KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7303       __kmp_itt_stack_callee_leave(
7304           (__itt_caller)team->t.t_parent->t.t_stack_id);
7305     }
7306   }
7307 #endif /* USE_ITT_BUILD */
7308   __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7309 
7310   return rc;
7311 }
7312 
7313 void __kmp_teams_master(int gtid) {
  // This routine is called by all master threads in the teams construct
7315   kmp_info_t *thr = __kmp_threads[gtid];
7316   kmp_team_t *team = thr->th.th_team;
7317   ident_t *loc = team->t.t_ident;
7318   thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7319   KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7320   KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7321   KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7322                 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7323 
7324   // This thread is a new CG root.  Set up the proper variables.
7325   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7326   tmp->cg_root = thr; // Make thr the CG root
7327   // Init to thread limit that was stored when league masters were forked
7328   tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7329   tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7330   KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7331                  " cg_nthreads to 1\n",
7332                  thr, tmp));
7333   tmp->up = thr->th.th_cg_roots;
7334   thr->th.th_cg_roots = tmp;
7335 
// Launch the league of teams now, but do not let workers execute
// (they hang on the fork barrier until the next parallel region)
7338 #if INCLUDE_SSC_MARKS
7339   SSC_MARK_FORKING();
7340 #endif
7341   __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7342                   (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7343                   VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7344 #if INCLUDE_SSC_MARKS
7345   SSC_MARK_JOINING();
7346 #endif
7347   // If the team size was reduced from the limit, set it to the new size
7348   if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7349     thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7350   // AC: last parameter "1" eliminates join barrier which won't work because
7351   // worker threads are in a fork barrier waiting for more parallel regions
7352   __kmp_join_call(loc, gtid
7353 #if OMPT_SUPPORT
7354                   ,
7355                   fork_context_intel
7356 #endif
7357                   ,
7358                   1);
7359 }
7360 
7361 int __kmp_invoke_teams_master(int gtid) {
7362   kmp_info_t *this_thr = __kmp_threads[gtid];
7363   kmp_team_t *team = this_thr->th.th_team;
7364 #if KMP_DEBUG
7365   if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7366     KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7367                      (void *)__kmp_teams_master);
7368 #endif
7369   __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7370 #if OMPT_SUPPORT
7371   int tid = __kmp_tid_from_gtid(gtid);
7372   ompt_data_t *task_data =
7373       &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7374   ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7375   if (ompt_enabled.ompt_callback_implicit_task) {
7376     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7377         ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7378         ompt_task_initial);
7379     OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7380   }
7381 #endif
7382   __kmp_teams_master(gtid);
7383 #if OMPT_SUPPORT
7384   this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7385 #endif
7386   __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7387   return 1;
7388 }
7389 
/* This sets the requested number of threads for the next parallel region
   encountered by this team. Since this should be enclosed in the fork/join
   critical section, it should avoid race conditions with asymmetrical nested
   parallelism. */
7394 
7395 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7396   kmp_info_t *thr = __kmp_threads[gtid];
7397 
7398   if (num_threads > 0)
7399     thr->th.th_set_nproc = num_threads;
7400 }
7401 
7402 static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
7403                                     int num_threads) {
7404   KMP_DEBUG_ASSERT(thr);
7405   // Remember the number of threads for inner parallel regions
7406   if (!TCR_4(__kmp_init_middle))
7407     __kmp_middle_initialize(); // get internal globals calculated
7408   KMP_DEBUG_ASSERT(__kmp_avail_proc);
7409   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7410 
7411   if (num_threads == 0) {
7412     if (__kmp_teams_thread_limit > 0) {
7413       num_threads = __kmp_teams_thread_limit;
7414     } else {
7415       num_threads = __kmp_avail_proc / num_teams;
7416     }
    // Adjust num_threads w/o warning as it is not a user setting.
    // num_threads = min(num_threads, nthreads-var, thread-limit-var)
    // No thread_limit clause specified - do not change thread-limit-var ICV.
7420     if (num_threads > __kmp_dflt_team_nth) {
7421       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7422     }
7423     if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7424       num_threads = thr->th.th_current_task->td_icvs.thread_limit;
    } // prevent the team size from exceeding thread-limit-var
7426     if (num_teams * num_threads > __kmp_teams_max_nth) {
7427       num_threads = __kmp_teams_max_nth / num_teams;
7428     }
7429     if (num_threads == 0) {
7430       num_threads = 1;
7431     }
7432   } else {
7433     // This thread will be the master of the league masters
7434     // Store new thread limit; old limit is saved in th_cg_roots list
7435     thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7436     // num_threads = min(num_threads, nthreads-var)
7437     if (num_threads > __kmp_dflt_team_nth) {
7438       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7439     }
7440     if (num_teams * num_threads > __kmp_teams_max_nth) {
7441       int new_threads = __kmp_teams_max_nth / num_teams;
7442       if (new_threads == 0) {
7443         new_threads = 1;
7444       }
7445       if (new_threads != num_threads) {
7446         if (!__kmp_reserve_warn) { // user asked for too many threads
7447           __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7448           __kmp_msg(kmp_ms_warning,
7449                     KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7450                     KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7451         }
7452       }
7453       num_threads = new_threads;
7454     }
7455   }
7456   thr->th.th_teams_size.nth = num_threads;
7457 }
7458 
/* This sets the requested number of teams for the teams region and/or
   the number of threads for the next parallel region encountered. */
7461 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7462                           int num_threads) {
7463   kmp_info_t *thr = __kmp_threads[gtid];
7464   KMP_DEBUG_ASSERT(num_teams >= 0);
7465   KMP_DEBUG_ASSERT(num_threads >= 0);
7466 
7467   if (num_teams == 0) {
7468     if (__kmp_nteams > 0) {
7469       num_teams = __kmp_nteams;
7470     } else {
7471       num_teams = 1; // default number of teams is 1.
7472     }
7473   }
7474   if (num_teams > __kmp_teams_max_nth) { // if too many teams requested?
7475     if (!__kmp_reserve_warn) {
7476       __kmp_reserve_warn = 1;
7477       __kmp_msg(kmp_ms_warning,
7478                 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7479                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7480     }
7481     num_teams = __kmp_teams_max_nth;
7482   }
7483   // Set number of teams (number of threads in the outer "parallel" of the
7484   // teams)
7485   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7486 
7487   __kmp_push_thread_limit(thr, num_teams, num_threads);
7488 }
7489 
7490 /* This sets the requested number of teams for the teams region and/or
7491    the number of threads for the next parallel region encountered  */
7492 void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
7493                              int num_teams_ub, int num_threads) {
7494   kmp_info_t *thr = __kmp_threads[gtid];
7495   KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
7496   KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
7497   KMP_DEBUG_ASSERT(num_threads >= 0);
7498 
7499   if (num_teams_lb > num_teams_ub) {
7500     __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
7501                 KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
7502   }
7503 
  int num_teams = 1; // default number of teams is 1.
7505 
7506   if (num_teams_lb == 0 && num_teams_ub > 0)
7507     num_teams_lb = num_teams_ub;
7508 
7509   if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
7510     num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
7511     if (num_teams > __kmp_teams_max_nth) {
7512       if (!__kmp_reserve_warn) {
7513         __kmp_reserve_warn = 1;
7514         __kmp_msg(kmp_ms_warning,
7515                   KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7516                   KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7517       }
7518       num_teams = __kmp_teams_max_nth;
7519     }
7520   } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
7521     num_teams = num_teams_ub;
7522   } else { // num_teams_lb <= num_teams <= num_teams_ub
7523     if (num_threads == 0) {
7524       if (num_teams_ub > __kmp_teams_max_nth) {
7525         num_teams = num_teams_lb;
7526       } else {
7527         num_teams = num_teams_ub;
7528       }
7529     } else {
7530       num_teams = (num_threads > __kmp_teams_max_nth)
7531                       ? num_teams
7532                       : __kmp_teams_max_nth / num_threads;
7533       if (num_teams < num_teams_lb) {
7534         num_teams = num_teams_lb;
7535       } else if (num_teams > num_teams_ub) {
7536         num_teams = num_teams_ub;
7537       }
7538     }
7539   }
7540   // Set number of teams (number of threads in the outer "parallel" of the
7541   // teams)
7542   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7543 
7544   __kmp_push_thread_limit(thr, num_teams, num_threads);
7545 }
7546 
7547 // Set the proc_bind var to use in the following parallel region.
7548 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7549   kmp_info_t *thr = __kmp_threads[gtid];
7550   thr->th.th_set_proc_bind = proc_bind;
7551 }
7552 
7553 /* Launch the worker threads into the microtask. */
7554 
7555 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7556   kmp_info_t *this_thr = __kmp_threads[gtid];
7557 
7558 #ifdef KMP_DEBUG
7559   int f;
7560 #endif /* KMP_DEBUG */
7561 
7562   KMP_DEBUG_ASSERT(team);
7563   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7564   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7565   KMP_MB(); /* Flush all pending memory write invalidates.  */
7566 
7567   team->t.t_construct = 0; /* no single directives seen yet */
7568   team->t.t_ordered.dt.t_value =
7569       0; /* thread 0 enters the ordered section first */
7570 
7571   /* Reset the identifiers on the dispatch buffer */
7572   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7573   if (team->t.t_max_nproc > 1) {
7574     int i;
7575     for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7576       team->t.t_disp_buffer[i].buffer_index = i;
7577       team->t.t_disp_buffer[i].doacross_buf_idx = i;
7578     }
7579   } else {
7580     team->t.t_disp_buffer[0].buffer_index = 0;
7581     team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7582   }
7583 
7584   KMP_MB(); /* Flush all pending memory write invalidates.  */
7585   KMP_ASSERT(this_thr->th.th_team == team);
7586 
7587 #ifdef KMP_DEBUG
7588   for (f = 0; f < team->t.t_nproc; f++) {
7589     KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7590                      team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7591   }
7592 #endif /* KMP_DEBUG */
7593 
7594   /* release the worker threads so they may begin working */
7595   __kmp_fork_barrier(gtid, 0);
7596 }
7597 
7598 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7599   kmp_info_t *this_thr = __kmp_threads[gtid];
7600 
7601   KMP_DEBUG_ASSERT(team);
7602   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7603   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7604   KMP_MB(); /* Flush all pending memory write invalidates.  */
7605 
7606   /* Join barrier after fork */
7607 
7608 #ifdef KMP_DEBUG
7609   if (__kmp_threads[gtid] &&
7610       __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7611     __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7612                  __kmp_threads[gtid]);
7613     __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7614                  "team->t.t_nproc=%d\n",
7615                  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7616                  team->t.t_nproc);
7617     __kmp_print_structure();
7618   }
7619   KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7620                    __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7621 #endif /* KMP_DEBUG */
7622 
7623   __kmp_join_barrier(gtid); /* wait for everyone */
7624 #if OMPT_SUPPORT
7625   if (ompt_enabled.enabled &&
7626       this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7627     int ds_tid = this_thr->th.th_info.ds.ds_tid;
7628     ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7629     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7630 #if OMPT_OPTIONAL
7631     void *codeptr = NULL;
7632     if (KMP_MASTER_TID(ds_tid) &&
7633         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7634          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7635       codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7636 
7637     if (ompt_enabled.ompt_callback_sync_region_wait) {
7638       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7639           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7640           codeptr);
7641     }
7642     if (ompt_enabled.ompt_callback_sync_region) {
7643       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7644           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7645           codeptr);
7646     }
7647 #endif
7648     if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7649       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7650           ompt_scope_end, NULL, task_data, 0, ds_tid,
7651           ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7652     }
7653   }
7654 #endif
7655 
7656   KMP_MB(); /* Flush all pending memory write invalidates.  */
7657   KMP_ASSERT(this_thr->th.th_team == team);
7658 }
7659 
7660 /* ------------------------------------------------------------------------ */
7661 
7662 #ifdef USE_LOAD_BALANCE
7663 
// Return the number of worker threads actively spinning in the hot team, if
// we are at the outermost level of parallelism. Otherwise, return 0.
7666 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7667   int i;
7668   int retval;
7669   kmp_team_t *hot_team;
7670 
7671   if (root->r.r_active) {
7672     return 0;
7673   }
7674   hot_team = root->r.r_hot_team;
7675   if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7676     return hot_team->t.t_nproc - 1; // Don't count master thread
7677   }
7678 
7679   // Skip the master thread - it is accounted for elsewhere.
7680   retval = 0;
7681   for (i = 1; i < hot_team->t.t_nproc; i++) {
7682     if (hot_team->t.t_threads[i]->th.th_active) {
7683       retval++;
7684     }
7685   }
7686   return retval;
7687 }
7688 
7689 // Perform an automatic adjustment to the number of
7690 // threads used by the next parallel region.
7691 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7692   int retval;
7693   int pool_active;
7694   int hot_team_active;
7695   int team_curr_active;
7696   int system_active;
7697 
7698   KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7699                 set_nproc));
7700   KMP_DEBUG_ASSERT(root);
7701   KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7702                        ->th.th_current_task->td_icvs.dynamic == TRUE);
7703   KMP_DEBUG_ASSERT(set_nproc > 1);
7704 
7705   if (set_nproc == 1) {
7706     KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7707     return 1;
7708   }
7709 
7710   // Threads that are active in the thread pool, active in the hot team for this
7711   // particular root (if we are at the outer par level), and the currently
7712   // executing thread (to become the master) are available to add to the new
7713   // team, but are currently contributing to the system load, and must be
7714   // accounted for.
7715   pool_active = __kmp_thread_pool_active_nth;
7716   hot_team_active = __kmp_active_hot_team_nproc(root);
7717   team_curr_active = pool_active + hot_team_active + 1;
7718 
7719   // Check the system load.
7720   system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7721   KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7722                 "hot team active = %d\n",
7723                 system_active, pool_active, hot_team_active));
7724 
7725   if (system_active < 0) {
7726     // There was an error reading the necessary info from /proc, so use the
7727     // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7728     // = dynamic_thread_limit, we shouldn't wind up getting back here.
7729     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7730     KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7731 
7732     // Make this call behave like the thread limit algorithm.
7733     retval = __kmp_avail_proc - __kmp_nth +
7734              (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7735     if (retval > set_nproc) {
7736       retval = set_nproc;
7737     }
7738     if (retval < KMP_MIN_NTH) {
7739       retval = KMP_MIN_NTH;
7740     }
7741 
7742     KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7743                   retval));
7744     return retval;
7745   }
7746 
  // There is a slight delay in the load balance algorithm in detecting new
  // running procs. The real system load at this instant should be at least
  // as large as the number of active OMP threads available to add to the team.
7750   if (system_active < team_curr_active) {
7751     system_active = team_curr_active;
7752   }
7753   retval = __kmp_avail_proc - system_active + team_curr_active;
7754   if (retval > set_nproc) {
7755     retval = set_nproc;
7756   }
7757   if (retval < KMP_MIN_NTH) {
7758     retval = KMP_MIN_NTH;
7759   }
7760 
7761   KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7762   return retval;
7763 } // __kmp_load_balance_nproc()
7764 
7765 #endif /* USE_LOAD_BALANCE */
7766 
7767 /* ------------------------------------------------------------------------ */
7768 
7769 /* NOTE: this is called with the __kmp_init_lock held */
7770 void __kmp_cleanup(void) {
7771   int f;
7772 
7773   KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7774 
7775   if (TCR_4(__kmp_init_parallel)) {
7776 #if KMP_HANDLE_SIGNALS
7777     __kmp_remove_signals();
7778 #endif
7779     TCW_4(__kmp_init_parallel, FALSE);
7780   }
7781 
7782   if (TCR_4(__kmp_init_middle)) {
7783 #if KMP_AFFINITY_SUPPORTED
7784     __kmp_affinity_uninitialize();
7785 #endif /* KMP_AFFINITY_SUPPORTED */
7786     __kmp_cleanup_hierarchy();
7787     TCW_4(__kmp_init_middle, FALSE);
7788   }
7789 
7790   KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7791 
7792   if (__kmp_init_serial) {
7793     __kmp_runtime_destroy();
7794     __kmp_init_serial = FALSE;
7795   }
7796 
7797   __kmp_cleanup_threadprivate_caches();
7798 
7799   for (f = 0; f < __kmp_threads_capacity; f++) {
7800     if (__kmp_root[f] != NULL) {
7801       __kmp_free(__kmp_root[f]);
7802       __kmp_root[f] = NULL;
7803     }
7804   }
7805   __kmp_free(__kmp_threads);
  // __kmp_threads and __kmp_root were allocated at once, as a single block, so
  // there is no need to free __kmp_root separately.
7808   __kmp_threads = NULL;
7809   __kmp_root = NULL;
7810   __kmp_threads_capacity = 0;
7811 
7812 #if KMP_USE_DYNAMIC_LOCK
7813   __kmp_cleanup_indirect_user_locks();
7814 #else
7815   __kmp_cleanup_user_locks();
7816 #endif
7817 
7818 #if KMP_AFFINITY_SUPPORTED
7819   KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7820   __kmp_cpuinfo_file = NULL;
7821 #endif /* KMP_AFFINITY_SUPPORTED */
7822 
7823 #if KMP_USE_ADAPTIVE_LOCKS
7824 #if KMP_DEBUG_ADAPTIVE_LOCKS
7825   __kmp_print_speculative_stats();
7826 #endif
7827 #endif
7828   KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7829   __kmp_nested_nth.nth = NULL;
7830   __kmp_nested_nth.size = 0;
7831   __kmp_nested_nth.used = 0;
7832   KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7833   __kmp_nested_proc_bind.bind_types = NULL;
7834   __kmp_nested_proc_bind.size = 0;
7835   __kmp_nested_proc_bind.used = 0;
7836   if (__kmp_affinity_format) {
7837     KMP_INTERNAL_FREE(__kmp_affinity_format);
7838     __kmp_affinity_format = NULL;
7839   }
7840 
7841   __kmp_i18n_catclose();
7842 
7843 #if KMP_USE_HIER_SCHED
7844   __kmp_hier_scheds.deallocate();
7845 #endif
7846 
7847 #if KMP_STATS_ENABLED
7848   __kmp_stats_fini();
7849 #endif
7850 
7851   KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7852 }
7853 
7854 /* ------------------------------------------------------------------------ */
7855 
7856 int __kmp_ignore_mppbeg(void) {
7857   char *env;
7858 
7859   if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7860     if (__kmp_str_match_false(env))
7861       return FALSE;
7862   }
7863   // By default __kmpc_begin() is no-op.
7864   return TRUE;
7865 }
7866 
7867 int __kmp_ignore_mppend(void) {
7868   char *env;
7869 
7870   if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7871     if (__kmp_str_match_false(env))
7872       return FALSE;
7873   }
7874   // By default __kmpc_end() is no-op.
7875   return TRUE;
7876 }
7877 
7878 void __kmp_internal_begin(void) {
7879   int gtid;
7880   kmp_root_t *root;
7881 
7882   /* this is a very important step as it will register new sibling threads
7883      and assign these new uber threads a new gtid */
7884   gtid = __kmp_entry_gtid();
7885   root = __kmp_threads[gtid]->th.th_root;
7886   KMP_ASSERT(KMP_UBER_GTID(gtid));
7887 
7888   if (root->r.r_begin)
7889     return;
7890   __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7891   if (root->r.r_begin) {
7892     __kmp_release_lock(&root->r.r_begin_lock, gtid);
7893     return;
7894   }
7895 
7896   root->r.r_begin = TRUE;
7897 
7898   __kmp_release_lock(&root->r.r_begin_lock, gtid);
7899 }
7900 
7901 /* ------------------------------------------------------------------------ */
7902 
7903 void __kmp_user_set_library(enum library_type arg) {
7904   int gtid;
7905   kmp_root_t *root;
7906   kmp_info_t *thread;
7907 
7908   /* first, make sure we are initialized so we can get our gtid */
7909 
7910   gtid = __kmp_entry_gtid();
7911   thread = __kmp_threads[gtid];
7912 
7913   root = thread->th.th_root;
7914 
7915   KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7916                 library_serial));
7917   if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7918                                   thread */
7919     KMP_WARNING(SetLibraryIncorrectCall);
7920     return;
7921   }
7922 
7923   switch (arg) {
7924   case library_serial:
7925     thread->th.th_set_nproc = 0;
7926     set__nproc(thread, 1);
7927     break;
7928   case library_turnaround:
7929     thread->th.th_set_nproc = 0;
7930     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7931                                            : __kmp_dflt_team_nth_ub);
7932     break;
7933   case library_throughput:
7934     thread->th.th_set_nproc = 0;
7935     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7936                                            : __kmp_dflt_team_nth_ub);
7937     break;
7938   default:
7939     KMP_FATAL(UnknownLibraryType, arg);
7940   }
7941 
7942   __kmp_aux_set_library(arg);
7943 }
7944 
7945 void __kmp_aux_set_stacksize(size_t arg) {
7946   if (!__kmp_init_serial)
7947     __kmp_serial_initialize();
7948 
7949 #if KMP_OS_DARWIN
7950   if (arg & (0x1000 - 1)) {
7951     arg &= ~(0x1000 - 1);
7952     if (arg + 0x1000) /* check for overflow if we round up */
7953       arg += 0x1000;
7954   }
7955 #endif
7956   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7957 
7958   /* only change the default stacksize before the first parallel region */
7959   if (!TCR_4(__kmp_init_parallel)) {
7960     size_t value = arg; /* argument is in bytes */
7961 
7962     if (value < __kmp_sys_min_stksize)
7963       value = __kmp_sys_min_stksize;
7964     else if (value > KMP_MAX_STKSIZE)
7965       value = KMP_MAX_STKSIZE;
7966 
7967     __kmp_stksize = value;
7968 
7969     __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
7970   }
7971 
7972   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7973 }
7974 
7975 /* set the behaviour of the runtime library */
7976 /* TODO this can cause some odd behaviour with sibling parallelism... */
7977 void __kmp_aux_set_library(enum library_type arg) {
7978   __kmp_library = arg;
7979 
7980   switch (__kmp_library) {
7981   case library_serial: {
7982     KMP_INFORM(LibraryIsSerial);
7983   } break;
7984   case library_turnaround:
7985     if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
7986       __kmp_use_yield = 2; // only yield when oversubscribed
7987     break;
7988   case library_throughput:
7989     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
7990       __kmp_dflt_blocktime = 200;
7991     break;
7992   default:
7993     KMP_FATAL(UnknownLibraryType, arg);
7994   }
7995 }
7996 
7997 /* Getting team information common for all team API */
7998 // Returns NULL if not in teams construct
7999 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
8000   kmp_info_t *thr = __kmp_entry_thread();
8001   teams_serialized = 0;
8002   if (thr->th.th_teams_microtask) {
8003     kmp_team_t *team = thr->th.th_team;
8004     int tlevel = thr->th.th_teams_level; // the level of the teams construct
8005     int ii = team->t.t_level;
8006     teams_serialized = team->t.t_serialized;
8007     int level = tlevel + 1;
8008     KMP_DEBUG_ASSERT(ii >= tlevel);
8009     while (ii > level) {
8010       for (teams_serialized = team->t.t_serialized;
8011            (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
8012       }
8013       if (team->t.t_serialized && (!teams_serialized)) {
8014         team = team->t.t_parent;
8015         continue;
8016       }
8017       if (ii > level) {
8018         team = team->t.t_parent;
8019         ii--;
8020       }
8021     }
8022     return team;
8023   }
8024   return NULL;
8025 }
8026 
8027 int __kmp_aux_get_team_num() {
8028   int serialized;
8029   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8030   if (team) {
8031     if (serialized > 1) {
8032       return 0; // teams region is serialized ( 1 team of 1 thread ).
8033     } else {
8034       return team->t.t_master_tid;
8035     }
8036   }
8037   return 0;
8038 }
8039 
8040 int __kmp_aux_get_num_teams() {
8041   int serialized;
8042   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8043   if (team) {
8044     if (serialized > 1) {
8045       return 1;
8046     } else {
8047       return team->t.t_parent->t.t_nproc;
8048     }
8049   }
8050   return 1;
8051 }
8052 
8053 /* ------------------------------------------------------------------------ */
8054 
8055 /*
8056  * Affinity Format Parser
8057  *
8058  * Field is in form of: %[[[0].]size]type
8059  * % and type are required (%% means print a literal '%')
8060  * type is either single char or long name surrounded by {},
8061  * e.g., N or {num_threads}
8062  * 0 => leading zeros
8063  * . => right justified when size is specified
8064  * by default output is left justified
8065  * size is the *minimum* field length
8066  * All other characters are printed as is
8067  *
 * Available field types:
 * t {team_num}          - omp_get_team_num()
 * T {num_teams}         - omp_get_num_teams()
 * L {nesting_level}     - omp_get_level()
 * n {thread_num}        - omp_get_thread_num()
 * N {num_threads}       - omp_get_num_threads()
 * a {ancestor_tnum}     - omp_get_ancestor_thread_num(omp_get_level()-1)
 * H {host}              - name of host machine
 * P {process_id}        - process id (integer)
 * i {native_thread_id}  - native thread identifier (integer)
 * A {thread_affinity}   - comma separated list of integers or integer ranges
 *                         (values of affinity mask)
8078  *
8079  * Implementation-specific field types can be added
8080  * If a type is unknown, print "undefined"
8081  */
8082 
8083 // Structure holding the short name, long name, and corresponding data type
8084 // for snprintf.  A table of these will represent the entire valid keyword
8085 // field types.
8086 typedef struct kmp_affinity_format_field_t {
8087   char short_name; // from spec e.g., L -> thread level
8088   const char *long_name; // from spec thread_level -> thread level
8089   char field_format; // data type for snprintf (typically 'd' or 's'
8090   // for integer or string)
8091 } kmp_affinity_format_field_t;
8092 
8093 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
8094 #if KMP_AFFINITY_SUPPORTED
8095     {'A', "thread_affinity", 's'},
8096 #endif
8097     {'t', "team_num", 'd'},
8098     {'T', "num_teams", 'd'},
8099     {'L', "nesting_level", 'd'},
8100     {'n', "thread_num", 'd'},
8101     {'N', "num_threads", 'd'},
8102     {'a', "ancestor_tnum", 'd'},
8103     {'H', "host", 's'},
8104     {'P', "process_id", 'd'},
8105     {'i', "native_thread_id", 'd'}};
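// NOTE: keep this table in sync with the switch on absolute_short_name in
// __kmp_aux_capture_affinity_field() below; a short name listed here without
// a matching case falls through to the "undefined" branch.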
8106 
8107 // Return the number of characters it takes to hold field
8108 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8109                                             const char **ptr,
8110                                             kmp_str_buf_t *field_buffer) {
8111   int rc, format_index, field_value;
8112   const char *width_left, *width_right;
8113   bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8114   static const int FORMAT_SIZE = 20;
8115   char format[FORMAT_SIZE] = {0};
8116   char absolute_short_name = 0;
8117 
8118   KMP_DEBUG_ASSERT(gtid >= 0);
8119   KMP_DEBUG_ASSERT(th);
8120   KMP_DEBUG_ASSERT(**ptr == '%');
8121   KMP_DEBUG_ASSERT(field_buffer);
8122 
8123   __kmp_str_buf_clear(field_buffer);
8124 
8125   // Skip the initial %
8126   (*ptr)++;
8127 
8128   // Check for %% first
8129   if (**ptr == '%') {
8130     __kmp_str_buf_cat(field_buffer, "%", 1);
8131     (*ptr)++; // skip over the second %
8132     return 1;
8133   }
8134 
8135   // Parse field modifiers if they are present
8136   pad_zeros = false;
8137   if (**ptr == '0') {
8138     pad_zeros = true;
8139     (*ptr)++; // skip over 0
8140   }
8141   right_justify = false;
8142   if (**ptr == '.') {
8143     right_justify = true;
8144     (*ptr)++; // skip over .
8145   }
8146   // Parse width of field: [width_left, width_right)
8147   width_left = width_right = NULL;
8148   if (**ptr >= '0' && **ptr <= '9') {
8149     width_left = *ptr;
8150     SKIP_DIGITS(*ptr);
8151     width_right = *ptr;
8152   }
8153 
8154   // Create the format for KMP_SNPRINTF based on flags parsed above
8155   format_index = 0;
8156   format[format_index++] = '%';
8157   if (!right_justify)
8158     format[format_index++] = '-';
8159   if (pad_zeros)
8160     format[format_index++] = '0';
8161   if (width_left && width_right) {
8162     int i = 0;
    // Only allow 8-digit number widths.
    // This also prevents overflowing the format variable.
8165     while (i < 8 && width_left < width_right) {
8166       format[format_index++] = *width_left;
8167       width_left++;
8168       i++;
8169     }
8170   }
8171 
8172   // Parse a name (long or short)
8173   // Canonicalize the name into absolute_short_name
8174   found_valid_name = false;
8175   parse_long_name = (**ptr == '{');
8176   if (parse_long_name)
8177     (*ptr)++; // skip initial left brace
8178   for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8179                              sizeof(__kmp_affinity_format_table[0]);
8180        ++i) {
8181     char short_name = __kmp_affinity_format_table[i].short_name;
8182     const char *long_name = __kmp_affinity_format_table[i].long_name;
8183     char field_format = __kmp_affinity_format_table[i].field_format;
8184     if (parse_long_name) {
8185       size_t length = KMP_STRLEN(long_name);
8186       if (strncmp(*ptr, long_name, length) == 0) {
8187         found_valid_name = true;
8188         (*ptr) += length; // skip the long name
8189       }
8190     } else if (**ptr == short_name) {
8191       found_valid_name = true;
8192       (*ptr)++; // skip the short name
8193     }
8194     if (found_valid_name) {
8195       format[format_index++] = field_format;
8196       format[format_index++] = '\0';
8197       absolute_short_name = short_name;
8198       break;
8199     }
8200   }
8201   if (parse_long_name) {
8202     if (**ptr != '}') {
8203       absolute_short_name = 0;
8204     } else {
8205       (*ptr)++; // skip over the right brace
8206     }
8207   }
8208 
8209   // Attempt to fill the buffer with the requested
8210   // value using snprintf within __kmp_str_buf_print()
8211   switch (absolute_short_name) {
8212   case 't':
8213     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8214     break;
8215   case 'T':
8216     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8217     break;
8218   case 'L':
8219     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8220     break;
8221   case 'n':
8222     rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8223     break;
8224   case 'H': {
8225     static const int BUFFER_SIZE = 256;
8226     char buf[BUFFER_SIZE];
8227     __kmp_expand_host_name(buf, BUFFER_SIZE);
8228     rc = __kmp_str_buf_print(field_buffer, format, buf);
8229   } break;
8230   case 'P':
8231     rc = __kmp_str_buf_print(field_buffer, format, getpid());
8232     break;
8233   case 'i':
8234     rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8235     break;
8236   case 'N':
8237     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8238     break;
8239   case 'a':
8240     field_value =
8241         __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8242     rc = __kmp_str_buf_print(field_buffer, format, field_value);
8243     break;
8244 #if KMP_AFFINITY_SUPPORTED
8245   case 'A': {
8246     kmp_str_buf_t buf;
8247     __kmp_str_buf_init(&buf);
8248     __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8249     rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8250     __kmp_str_buf_free(&buf);
8251   } break;
8252 #endif
8253   default:
    // According to the spec, if an implementation does not have info for a
    // field type, then "undefined" is printed.
8256     rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8257     // Skip the field
8258     if (parse_long_name) {
8259       SKIP_TOKEN(*ptr);
8260       if (**ptr == '}')
8261         (*ptr)++;
8262     } else {
8263       (*ptr)++;
8264     }
8265   }
8266 
8267   KMP_ASSERT(format_index <= FORMAT_SIZE);
8268   return rc;
8269 }
8270 
8271 /*
8272  * Return number of characters needed to hold the affinity string
8273  * (not including null byte character)
8274  * The resultant string is printed to buffer, which the caller can then
8275  * handle afterwards
8276  */
8277 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8278                                   kmp_str_buf_t *buffer) {
8279   const char *parse_ptr;
8280   size_t retval;
8281   const kmp_info_t *th;
8282   kmp_str_buf_t field;
8283 
8284   KMP_DEBUG_ASSERT(buffer);
8285   KMP_DEBUG_ASSERT(gtid >= 0);
8286 
8287   __kmp_str_buf_init(&field);
8288   __kmp_str_buf_clear(buffer);
8289 
8290   th = __kmp_threads[gtid];
8291   retval = 0;
8292 
8293   // If format is NULL or zero-length string, then we use
8294   // affinity-format-var ICV
8295   parse_ptr = format;
8296   if (parse_ptr == NULL || *parse_ptr == '\0') {
8297     parse_ptr = __kmp_affinity_format;
8298   }
8299   KMP_DEBUG_ASSERT(parse_ptr);
8300 
8301   while (*parse_ptr != '\0') {
8302     // Parse a field
8303     if (*parse_ptr == '%') {
8304       // Put field in the buffer
8305       int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8306       __kmp_str_buf_catbuf(buffer, &field);
8307       retval += rc;
8308     } else {
8309       // Put literal character in buffer
8310       __kmp_str_buf_cat(buffer, parse_ptr, 1);
8311       retval++;
8312       parse_ptr++;
8313     }
8314   }
8315   __kmp_str_buf_free(&field);
8316   return retval;
8317 }
8318 
8319 // Displays the affinity string to stdout
8320 void __kmp_aux_display_affinity(int gtid, const char *format) {
8321   kmp_str_buf_t buf;
8322   __kmp_str_buf_init(&buf);
8323   __kmp_aux_capture_affinity(gtid, format, &buf);
8324   __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8325   __kmp_str_buf_free(&buf);
8326 }

/* ------------------------------------------------------------------------ */

void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
  int blocktime = arg; /* argument is in milliseconds */
#if KMP_USE_MONITOR
  int bt_intervals;
#endif
  kmp_int8 bt_set;

  __kmp_save_internal_controls(thread);

  /* Normalize and set blocktime for the teams */
  if (blocktime < KMP_MIN_BLOCKTIME)
    blocktime = KMP_MIN_BLOCKTIME;
  else if (blocktime > KMP_MAX_BLOCKTIME)
    blocktime = KMP_MAX_BLOCKTIME;

  set__blocktime_team(thread->th.th_team, tid, blocktime);
  set__blocktime_team(thread->th.th_serial_team, 0, blocktime);

#if KMP_USE_MONITOR
  /* Calculate and set blocktime intervals for the teams */
  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);

  set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
  set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
#endif

  /* Record that blocktime has been explicitly set */
  bt_set = TRUE;

  set__bt_set_team(thread->th.th_team, tid, bt_set);
  set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
#if KMP_USE_MONITOR
  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
                "bt_intervals=%d, monitor_updates=%d\n",
                __kmp_gtid_from_tid(tid, thread->th.th_team),
                thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
                __kmp_monitor_wakeups));
#else
  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
                __kmp_gtid_from_tid(tid, thread->th.th_team),
                thread->th.th_team->t.t_id, tid, blocktime));
#endif
}
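
/* Usage sketch (illustration only). This routine sits behind the
   kmp_set_blocktime() extension and the KMP_BLOCKTIME environment variable;
   the argument is in milliseconds and is clamped to
   [KMP_MIN_BLOCKTIME, KMP_MAX_BLOCKTIME]:

     // Environment alternative: KMP_BLOCKTIME=0
     #include <omp.h>

     void configure_idle_behavior(void) {
       // 0 => idle workers go to sleep immediately instead of spin-waiting.
       kmp_set_blocktime(0);
     }
*/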

void __kmp_aux_set_defaults(char const *str, size_t len) {
  if (!__kmp_init_serial) {
    __kmp_serial_initialize();
  }
  __kmp_env_initialize(str);

  if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
    __kmp_env_print();
  }
} // __kmp_aux_set_defaults
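
/* Usage sketch (illustration only). __kmp_aux_set_defaults() sits behind the
   kmp_set_defaults() extension, which lets a program feed environment-style
   settings to the runtime programmatically:

     #include <omp.h>

     void early_config(void) {
       // Best called before the first parallel region so the setting is
       // picked up during runtime initialization.
       kmp_set_defaults("KMP_BLOCKTIME=200");
     }
*/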

/* ------------------------------------------------------------------------ */
/* internal fast reduction routines */

PACKED_REDUCTION_METHOD_T
__kmp_determine_reduction_method(
    ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
    void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
    kmp_critical_name *lck) {

  // Default reduction method: critical construct ( lck != NULL, as in the
  // current PAROPT ).
  // If ( reduce_data != NULL && reduce_func != NULL ): the tree-reduction
  // method can be selected by the RTL.
  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
  // can be selected by the RTL.
  // Finally, it is up to the OpenMP RTL to decide which method to select
  // among those generated by PAROPT.

  PACKED_REDUCTION_METHOD_T retval;

  int team_size;

  KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )

#define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
  ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
#define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))

  retval = critical_reduce_block;

  // Another way of getting the team size (with 1 dynamic dereference) is
  // slower.
  team_size = __kmp_get_team_num_threads(global_tid);
  if (team_size == 1) {

    retval = empty_reduce_block;

  } else {

    int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;

#if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 ||                   \
    KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64

#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
    KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD

    int teamsize_cutoff = 4;

#if KMP_MIC_SUPPORTED
    if (__kmp_mic_type != non_mic) {
      teamsize_cutoff = 8;
    }
#endif
    int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
    if (tree_available) {
      if (team_size <= teamsize_cutoff) {
        if (atomic_available) {
          retval = atomic_reduce_block;
        }
      } else {
        retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
      }
    } else if (atomic_available) {
      retval = atomic_reduce_block;
    }
#else
#error "Unknown or unsupported OS"
#endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
       // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD

#elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS

#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD

    // basic tuning

    if (atomic_available) {
      if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
        retval = atomic_reduce_block;
      }
    } // otherwise: use critical section

#elif KMP_OS_DARWIN

    int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
    if (atomic_available && (num_vars <= 3)) {
      retval = atomic_reduce_block;
    } else if (tree_available) {
      if ((reduce_size > (9 * sizeof(kmp_real64))) &&
          (reduce_size < (2000 * sizeof(kmp_real64)))) {
        retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
      }
    } // otherwise: use critical section

#else
#error "Unknown or unsupported OS"
#endif

#else
#error "Unknown or unsupported architecture"
#endif
  }

  // KMP_FORCE_REDUCTION

  // If the team is serialized (team_size == 1), ignore the forced reduction
  // method and stay with the unsynchronized method (empty_reduce_block)
  if (__kmp_force_reduction_method != reduction_method_not_defined &&
      team_size != 1) {

    PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;

    int atomic_available, tree_available;

    switch ((forced_retval = __kmp_force_reduction_method)) {
    case critical_reduce_block:
      KMP_ASSERT(lck); // lck should be != 0
      break;

    case atomic_reduce_block:
      atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
      if (!atomic_available) {
        KMP_WARNING(RedMethodNotSupported, "atomic");
        forced_retval = critical_reduce_block;
      }
      break;

    case tree_reduce_block:
      tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
      if (!tree_available) {
        KMP_WARNING(RedMethodNotSupported, "tree");
        forced_retval = critical_reduce_block;
      } else {
#if KMP_FAST_REDUCTION_BARRIER
        forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
#endif
      }
      break;

    default:
      KMP_ASSERT(0); // "unsupported method specified"
    }

    retval = forced_retval;
  }

  KA_TRACE(10, ("reduction method selected=%08x\n", retval));

#undef FAST_REDUCTION_TREE_METHOD_GENERATED
#undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED

  return (retval);
}
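
/* Usage sketch (illustration only). The selection above runs when
   compiler-generated code calls __kmpc_reduce()/__kmpc_reduce_nowait() for a
   reduction clause; a minimal program that exercises it:

     #include <stdio.h>

     int main(void) {
       double sum = 0.0;
     #pragma omp parallel for reduction(+ : sum)
       for (int i = 0; i < 1000; ++i)
         sum += i * 0.5;
       printf("sum = %f\n", sum);
       return 0;
     }

   Depending on architecture, OS, team size, and what the compiler passes in
   (reduce_data/reduce_func, KMP_IDENT_ATOMIC_REDUCE), the runtime picks the
   critical-section, atomic, or tree-reduction method. */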
// This function is used for testing the set/get/determine reduce method.
kmp_int32 __kmp_get_reduce_method(void) {
  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
}

// Soft pause sets up threads to ignore blocktime and just go to sleep.
// Spin-wait code checks __kmp_pause_status and reacts accordingly.
void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }

// Hard pause shuts down the runtime completely.  Resume happens naturally when
// OpenMP is used subsequently.
void __kmp_hard_pause() {
  __kmp_pause_status = kmp_hard_paused;
  __kmp_internal_end_thread(-1);
}

// Soft resume sets __kmp_pause_status back to kmp_not_paused and wakes up all
// sleeping threads.
void __kmp_resume_if_soft_paused() {
  if (__kmp_pause_status == kmp_soft_paused) {
    __kmp_pause_status = kmp_not_paused;

    for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
      kmp_info_t *thread = __kmp_threads[gtid];
      if (thread) { // Wake it if sleeping
        kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
                         thread);
        if (fl.is_sleeping())
          fl.resume(gtid);
        else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
          __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
        } else { // thread holds the lock and may sleep soon
          do { // until either the thread sleeps, or we can get the lock
            if (fl.is_sleeping()) {
              fl.resume(gtid);
              break;
            } else if (__kmp_try_suspend_mx(thread)) {
              __kmp_unlock_suspend_mx(thread);
              break;
            }
          } while (1);
        }
      }
    }
  }
}

// This function is called via __kmpc_pause_resource. Returns 0 if successful.
// TODO: add warning messages
int __kmp_pause_resource(kmp_pause_status_t level) {
  if (level == kmp_not_paused) { // requesting resume
    if (__kmp_pause_status == kmp_not_paused) {
      // error message about runtime not being paused, so can't resume
      return 1;
    } else {
      KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
                       __kmp_pause_status == kmp_hard_paused);
      __kmp_pause_status = kmp_not_paused;
      return 0;
    }
  } else if (level == kmp_soft_paused) { // requesting soft pause
    if (__kmp_pause_status != kmp_not_paused) {
      // error message about already being paused
      return 1;
    } else {
      __kmp_soft_pause();
      return 0;
    }
  } else if (level == kmp_hard_paused) { // requesting hard pause
    if (__kmp_pause_status != kmp_not_paused) {
      // error message about already being paused
      return 1;
    } else {
      __kmp_hard_pause();
      return 0;
    }
  } else {
    // error message about invalid level
    return 1;
  }
}
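
/* Usage sketch (illustration only). __kmp_pause_resource() sits behind the
   OpenMP 5.0 pause API; an application can release runtime resources between
   phases like this:

     #include <omp.h>

     void between_phases(void) {
       // Soft pause: worker threads go to sleep but runtime state is kept.
       if (omp_pause_resource_all(omp_pause_soft) != 0) {
         // Nonzero means the pause request was not honored.
       }
       // ... non-OpenMP work ...
       // The runtime resumes automatically on the next OpenMP construct.
     }
*/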

void __kmp_omp_display_env(int verbose) {
  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
  if (__kmp_init_serial == 0)
    __kmp_do_serial_initialize();
  __kmp_display_env_impl(!verbose, verbose);
  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
}
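
/* Usage sketch (illustration only). This is the path behind OMP_DISPLAY_ENV
   and, where available, the OpenMP 5.1 omp_display_env() entry point:

     // Environment alternative: OMP_DISPLAY_ENV=TRUE (or VERBOSE)
     #include <omp.h>

     void dump_runtime_settings(void) {
       // A nonzero argument requests the verbose listing.
       omp_display_env(0);
     }
*/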

// Globals and functions for hidden helper tasks
kmp_info_t **__kmp_hidden_helper_threads;
kmp_info_t *__kmp_hidden_helper_main_thread;
kmp_int32 __kmp_hidden_helper_threads_num = 8;
std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
#if KMP_OS_LINUX
kmp_int32 __kmp_enable_hidden_helper = TRUE;
#else
kmp_int32 __kmp_enable_hidden_helper = FALSE;
#endif

namespace {
std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;

void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
  // This is an explicit synchronization of all hidden helper threads. Without
  // it, a regular thread could push a hidden helper task to a hidden helper
  // thread that has not yet been woken up even once after being released by
  // the main thread when the team was created.
  KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
  while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
         __kmp_hidden_helper_threads_num)
    ;

  // If this is the master thread, then wait for the signal
  if (__kmpc_master(nullptr, *gtid)) {
    // First, unset the initial state and release the initial thread
    TCW_4(__kmp_init_hidden_helper_threads, FALSE);
    __kmp_hidden_helper_initz_release();
    __kmp_hidden_helper_main_thread_wait();
    // Now wake up all worker threads
    for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
      __kmp_hidden_helper_worker_thread_signal();
    }
  }
}
} // namespace

void __kmp_hidden_helper_threads_initz_routine() {
  // Create a new root for hidden helper team/threads
  const int gtid = __kmp_register_root(TRUE);
  __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
  __kmp_hidden_helper_threads = &__kmp_threads[gtid];
  __kmp_hidden_helper_main_thread->th.th_set_nproc =
      __kmp_hidden_helper_threads_num;

  KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);

  __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);

  // Set the initialization flag to FALSE
  TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);

  __kmp_hidden_helper_threads_deinitz_release();
}
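
/* Usage sketch (illustration only). Hidden helper threads exist to execute
   hidden helper tasks, e.g. the tasks generated for asynchronous offloading;
   a typical trigger is a "target nowait" region:

     void async_offload(float *a, int n) {
     #pragma omp target map(tofrom : a[:n]) nowait
       for (int i = 0; i < n; ++i)
         a[i] *= 2.0f;
     #pragma omp taskwait
     }

   The helper team size defaults to __kmp_hidden_helper_threads_num (8 above)
   and can be changed through its corresponding environment setting (see
   kmp_settings.cpp). */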