1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
29 #endif
30 
31 #if OMPT_SUPPORT
32 #include "ompt-specific.h"
33 #endif
34 
35 #if OMP_PROFILING_SUPPORT
36 #include "llvm/Support/TimeProfiler.h"
37 static char *ProfileTraceFile = nullptr;
38 #endif
39 
40 /* these are temporary issues to be dealt with */
41 #define KMP_USE_PRCTL 0
42 
43 #if KMP_OS_WINDOWS
44 #include <process.h>
45 #endif
46 
47 #include "tsan_annotations.h"
48 
49 #if KMP_OS_WINDOWS
50 // Windows does not need these include files as it doesn't use shared memory
51 #else
52 #include <sys/mman.h>
53 #include <sys/stat.h>
54 #include <fcntl.h>
55 #define SHM_SIZE 1024
56 #endif
57 
58 #if defined(KMP_GOMP_COMPAT)
59 char const __kmp_version_alt_comp[] =
60     KMP_VERSION_PREFIX "alternative compiler support: yes";
61 #endif /* defined(KMP_GOMP_COMPAT) */
62 
63 char const __kmp_version_omp_api[] =
64     KMP_VERSION_PREFIX "API version: 5.0 (201611)";
65 
66 #ifdef KMP_DEBUG
67 char const __kmp_version_lock[] =
68     KMP_VERSION_PREFIX "lock type: run time selectable";
69 #endif /* KMP_DEBUG */
70 
71 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
72 
73 /* ------------------------------------------------------------------------ */
74 
75 #if KMP_USE_MONITOR
76 kmp_info_t __kmp_monitor;
77 #endif
78 
79 /* Forward declarations */
80 
81 void __kmp_cleanup(void);
82 
83 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
84                                   int gtid);
85 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
86                                   kmp_internal_control_t *new_icvs,
87                                   ident_t *loc);
88 #if KMP_AFFINITY_SUPPORTED
89 static void __kmp_partition_places(kmp_team_t *team,
90                                    int update_master_only = 0);
91 #endif
92 static void __kmp_do_serial_initialize(void);
93 void __kmp_fork_barrier(int gtid, int tid);
94 void __kmp_join_barrier(int gtid);
95 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
96                           kmp_internal_control_t *new_icvs, ident_t *loc);
97 
98 #ifdef USE_LOAD_BALANCE
99 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
100 #endif
101 
102 static int __kmp_expand_threads(int nNeed);
103 #if KMP_OS_WINDOWS
104 static int __kmp_unregister_root_other_thread(int gtid);
105 #endif
106 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
107 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
108 
109 /* Calculate the identifier of the current thread */
110 /* fast (and somewhat portable) way to get unique identifier of executing
111    thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
112 int __kmp_get_global_thread_id() {
113   int i;
114   kmp_info_t **other_threads;
115   size_t stack_data;
116   char *stack_addr;
117   size_t stack_size;
118   char *stack_base;
119 
120   KA_TRACE(
121       1000,
122       ("*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
123        __kmp_nth, __kmp_all_nth));
124 
125   /* JPH - To handle the case where __kmpc_end(0) is called immediately prior
126      to a parallel region, this returns KMP_GTID_DNE to force serial_initialize
127      by the caller. KMP_GTID_DNE must be handled at every call site, or else
128      __kmp_init_gtid must be guaranteed, for this to work. */
129 
130   if (!TCR_4(__kmp_init_gtid))
131     return KMP_GTID_DNE;
132 
133 #ifdef KMP_TDATA_GTID
134   if (TCR_4(__kmp_gtid_mode) >= 3) {
135     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
136     return __kmp_gtid;
137   }
138 #endif
139   if (TCR_4(__kmp_gtid_mode) >= 2) {
140     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
141     return __kmp_gtid_get_specific();
142   }
143   KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
144 
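  /* Stack-based search: find the registered thread whose stack contains the
     address of a local variable of this function. */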
145   stack_addr = (char *)&stack_data;
146   other_threads = __kmp_threads;
147 
148   /* ATT: The code below is a source of potential bugs due to unsynchronized
149      access to __kmp_threads array. For example:
150      1. Current thread loads other_threads[i] to thr and checks it, it is
151         non-NULL.
152      2. Current thread is suspended by OS.
153      3. Another thread unregisters and finishes (debug versions of free()
154         may fill memory with something like 0xEF).
155      4. Current thread is resumed.
156      5. Current thread reads junk from *thr.
157      TODO: Fix it.  --ln  */
158 
159   for (i = 0; i < __kmp_threads_capacity; i++) {
160 
161     kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
162     if (!thr)
163       continue;
164 
165     stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
166     stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
167 
168     /* stack grows down -- search through all of the active threads */
169 
170     if (stack_addr <= stack_base) {
171       size_t stack_diff = stack_base - stack_addr;
172 
173       if (stack_diff <= stack_size) {
174         /* The only way we can be closer than the allocated */
175         /* stack size is if we are running on this thread. */
176         KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
177         return i;
178       }
179     }
180   }
181 
182   /* get specific to try and determine our gtid */
183   KA_TRACE(1000,
184            ("*** __kmp_get_global_thread_id: internal alg. failed to find "
185             "thread, using TLS\n"));
186   i = __kmp_gtid_get_specific();
187 
188   /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
189 
190   /* if we haven't been assigned a gtid, then return the error code */
191   if (i < 0)
192     return i;
193 
194   /* dynamically updated stack window for uber threads to avoid get_specific
195      call */
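  /* If this thread's stack is not allowed to grow, an address outside the
     recorded bounds indicates a genuine stack overflow. */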
196   if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
197     KMP_FATAL(StackOverflow, i);
198   }
199 
200   stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
201   if (stack_addr > stack_base) {
202     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
203     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
204             other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
205                 stack_base);
206   } else {
207     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
208             stack_base - stack_addr);
209   }
210 
211   /* Reprint stack bounds for ubermaster since they have been refined */
212   if (__kmp_storage_map) {
213     char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
214     char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
215     __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
216                                  other_threads[i]->th.th_info.ds.ds_stacksize,
217                                  "th_%d stack (refinement)", i);
218   }
219   return i;
220 }
221 
222 int __kmp_get_global_thread_id_reg() {
223   int gtid;
224 
225   if (!__kmp_init_serial) {
226     gtid = KMP_GTID_DNE;
227   } else
228 #ifdef KMP_TDATA_GTID
229       if (TCR_4(__kmp_gtid_mode) >= 3) {
230     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
231     gtid = __kmp_gtid;
232   } else
233 #endif
234       if (TCR_4(__kmp_gtid_mode) >= 2) {
235     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
236     gtid = __kmp_gtid_get_specific();
237   } else {
238     KA_TRACE(1000,
239              ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
240     gtid = __kmp_get_global_thread_id();
241   }
242 
243   /* we must be a new uber master sibling thread */
244   if (gtid == KMP_GTID_DNE) {
245     KA_TRACE(10,
246              ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
247               "Registering a new gtid.\n"));
248     __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
249     if (!__kmp_init_serial) {
250       __kmp_do_serial_initialize();
251       gtid = __kmp_gtid_get_specific();
252     } else {
253       gtid = __kmp_register_root(FALSE);
254     }
255     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
256     /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
257   }
258 
259   KMP_DEBUG_ASSERT(gtid >= 0);
260 
261   return gtid;
262 }
263 
264 /* caller must hold forkjoin_lock */
265 void __kmp_check_stack_overlap(kmp_info_t *th) {
266   int f;
267   char *stack_beg = NULL;
268   char *stack_end = NULL;
269   int gtid;
270 
271   KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
272   if (__kmp_storage_map) {
273     stack_end = (char *)th->th.th_info.ds.ds_stackbase;
274     stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
275 
276     gtid = __kmp_gtid_from_thread(th);
277 
278     if (gtid == KMP_GTID_MONITOR) {
279       __kmp_print_storage_map_gtid(
280           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
281           "th_%s stack (%s)", "mon",
282           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
283     } else {
284       __kmp_print_storage_map_gtid(
285           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
286           "th_%d stack (%s)", gtid,
287           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
288     }
289   }
290 
291   /* No point in checking ubermaster threads since they use refinement and
292    * cannot overlap */
293   gtid = __kmp_gtid_from_thread(th);
294   if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
295     KA_TRACE(10,
296              ("__kmp_check_stack_overlap: performing extensive checking\n"));
297     if (stack_beg == NULL) {
298       stack_end = (char *)th->th.th_info.ds.ds_stackbase;
299       stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
300     }
301 
302     for (f = 0; f < __kmp_threads_capacity; f++) {
303       kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
304 
305       if (f_th && f_th != th) {
306         char *other_stack_end =
307             (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
308         char *other_stack_beg =
309             other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
310         if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
311             (stack_end > other_stack_beg && stack_end < other_stack_end)) {
312 
313           /* Print the other stack values before the abort */
314           if (__kmp_storage_map)
315             __kmp_print_storage_map_gtid(
316                 -1, other_stack_beg, other_stack_end,
317                 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
318                 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
319 
320           __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
321                       __kmp_msg_null);
322         }
323       }
324     }
325   }
326   KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
327 }
328 
329 /* ------------------------------------------------------------------------ */
330 
331 void __kmp_infinite_loop(void) {
332   static int done = FALSE;
333 
334   while (!done) {
335     KMP_YIELD(TRUE);
336   }
337 }
338 
339 #define MAX_MESSAGE 512
340 
341 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
342                                   char const *format, ...) {
343   char buffer[MAX_MESSAGE];
344   va_list ap;
345 
346   va_start(ap, format);
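  /* The caller's format string is embedded into the buffer; the composed
     string then serves as the format for __kmp_vprintf, which consumes the
     remaining varargs. */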
347   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
348                p2, (unsigned long)size, format);
349   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
350   __kmp_vprintf(kmp_err, buffer, ap);
351 #if KMP_PRINT_DATA_PLACEMENT
352   int node;
353   if (gtid >= 0) {
354     if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
355       if (__kmp_storage_map_verbose) {
356         node = __kmp_get_host_node(p1);
357         if (node < 0) /* doesn't work, so don't try this next time */
358           __kmp_storage_map_verbose = FALSE;
359         else {
360           char *last;
361           int lastNode;
362           int localProc = __kmp_get_cpu_from_gtid(gtid);
363 
364           const int page_size = KMP_GET_PAGE_SIZE();
365 
366           p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
367           p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
368           if (localProc >= 0)
369             __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
370                                  localProc >> 1);
371           else
372             __kmp_printf_no_lock("  GTID %d\n", gtid);
373 #if KMP_USE_PRCTL
374           /* The more elaborate format is disabled for now because of the prctl
375            * hanging bug. */
376           do {
377             last = p1;
378             lastNode = node;
379             /* This loop collates adjacent pages with the same host node. */
380             do {
381               p1 = (char *)p1 + page_size;
382             } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
383             __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
384                                  lastNode);
385           } while (p1 <= p2);
386 #else
387           __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
388                                (char *)p1 + (page_size - 1),
389                                __kmp_get_host_node(p1));
390           if (p1 < p2) {
391             __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
392                                  (char *)p2 + (page_size - 1),
393                                  __kmp_get_host_node(p2));
394           }
395 #endif
396         }
397       }
398     } else
399       __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
400   }
401 #endif /* KMP_PRINT_DATA_PLACEMENT */
402   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
  va_end(ap);
403 }
404 
405 void __kmp_warn(char const *format, ...) {
406   char buffer[MAX_MESSAGE];
407   va_list ap;
408 
409   if (__kmp_generate_warnings == kmp_warnings_off) {
410     return;
411   }
412 
413   va_start(ap, format);
414 
415   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
416   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
417   __kmp_vprintf(kmp_err, buffer, ap);
418   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
419 
420   va_end(ap);
421 }
422 
423 void __kmp_abort_process() {
424   // Later threads may stall here, but that's ok because abort() will kill them.
425   __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
426 
427   if (__kmp_debug_buf) {
428     __kmp_dump_debug_buffer();
429   }
430 
431   if (KMP_OS_WINDOWS) {
432     // Let other threads know of abnormal termination and prevent deadlock
433     // if abort happened during library initialization or shutdown
434     __kmp_global.g.g_abort = SIGABRT;
435 
436     /* On Windows* OS, abort() by default causes a pop-up error box, which
437        stalls nightly testing. Unfortunately, we cannot reliably suppress
438        pop-up error boxes. _set_abort_behavior() works well, but this function
439        is not available in VS7 (this is not a problem for the DLL, but it is a
440        problem for the static OpenMP RTL). SetErrorMode (and so the timelimit
441        utility) does not help, at least in some versions of the MS C RTL.
442 
443        The following sequence seems to be the only way to simulate abort() and
444        avoid the pop-up error box. */
445     raise(SIGABRT);
446     _exit(3); // Just in case, if signal ignored, exit anyway.
447   } else {
448     __kmp_unregister_library();
449     abort();
450   }
451 
452   __kmp_infinite_loop();
453   __kmp_release_bootstrap_lock(&__kmp_exit_lock);
454 
455 } // __kmp_abort_process
456 
457 void __kmp_abort_thread(void) {
458   // TODO: Eliminate g_abort global variable and this function.
459   // In case of abort just call abort(), it will kill all the threads.
460   __kmp_infinite_loop();
461 } // __kmp_abort_thread
462 
463 /* Print out the storage map for the major kmp_info_t thread data structures
464    that are allocated together. */
465 
466 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
467   __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
468                                gtid);
469 
470   __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
471                                sizeof(kmp_desc_t), "th_%d.th_info", gtid);
472 
473   __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
474                                sizeof(kmp_local_t), "th_%d.th_local", gtid);
475 
476   __kmp_print_storage_map_gtid(
477       gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
478       sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
479 
480   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
481                                &thr->th.th_bar[bs_plain_barrier + 1],
482                                sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
483                                gtid);
484 
485   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
486                                &thr->th.th_bar[bs_forkjoin_barrier + 1],
487                                sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
488                                gtid);
489 
490 #if KMP_FAST_REDUCTION_BARRIER
491   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
492                                &thr->th.th_bar[bs_reduction_barrier + 1],
493                                sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
494                                gtid);
495 #endif // KMP_FAST_REDUCTION_BARRIER
496 }
497 
498 /* Print out the storage map for the major kmp_team_t team data structures
499    that are allocated together. */
500 
501 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
502                                          int team_id, int num_thr) {
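  // A team limited to a single thread has only two dispatch buffers; otherwise
  // print the configured __kmp_dispatch_num_buffers.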
503   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
504   __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
505                                header, team_id);
506 
507   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
508                                &team->t.t_bar[bs_last_barrier],
509                                sizeof(kmp_balign_team_t) * bs_last_barrier,
510                                "%s_%d.t_bar", header, team_id);
511 
512   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
513                                &team->t.t_bar[bs_plain_barrier + 1],
514                                sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
515                                header, team_id);
516 
517   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
518                                &team->t.t_bar[bs_forkjoin_barrier + 1],
519                                sizeof(kmp_balign_team_t),
520                                "%s_%d.t_bar[forkjoin]", header, team_id);
521 
522 #if KMP_FAST_REDUCTION_BARRIER
523   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
524                                &team->t.t_bar[bs_reduction_barrier + 1],
525                                sizeof(kmp_balign_team_t),
526                                "%s_%d.t_bar[reduction]", header, team_id);
527 #endif // KMP_FAST_REDUCTION_BARRIER
528 
529   __kmp_print_storage_map_gtid(
530       -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
531       sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
532 
533   __kmp_print_storage_map_gtid(
534       -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
535       sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
536 
537   __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
538                                &team->t.t_disp_buffer[num_disp_buff],
539                                sizeof(dispatch_shared_info_t) * num_disp_buff,
540                                "%s_%d.t_disp_buffer", header, team_id);
541 }
542 
543 static void __kmp_init_allocator() {
544   __kmp_init_memkind();
545   __kmp_init_target_mem();
546 }
547 static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
548 
549 /* ------------------------------------------------------------------------ */
550 
551 #if KMP_DYNAMIC_LIB
552 #if KMP_OS_WINDOWS
553 
554 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
555   //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
556 
557   switch (fdwReason) {
558 
559   case DLL_PROCESS_ATTACH:
560     KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
561 
562     return TRUE;
563 
564   case DLL_PROCESS_DETACH:
565     KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
566 
567     // According to Windows* documentation for DllMain entry point:
568     // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
569     //   lpReserved == NULL when FreeLibrary() is called,
570     //   lpReserved != NULL when the process is terminated.
571     // When FreeLibrary() is called, worker threads remain alive. So the
572     // runtime's state is consistent and executing proper shutdown is OK.
573     // When the process is terminated, worker threads have exited or been
574     // forcefully terminated by the OS and only the shutdown thread remains.
575     // This can leave the runtime in an inconsistent state.
576     // Hence, only attempt proper cleanup when FreeLibrary() is called.
577     // Otherwise, rely on OS to reclaim resources.
578     if (lpReserved == NULL)
579       __kmp_internal_end_library(__kmp_gtid_get_specific());
580 
581     return TRUE;
582 
583   case DLL_THREAD_ATTACH:
584     KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
585 
586     /* If we want to register new sibling threads on every thread attach,
587      * call __kmp_get_gtid() here. */
588     return TRUE;
589 
590   case DLL_THREAD_DETACH:
591     KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
592 
593     __kmp_internal_end_thread(__kmp_gtid_get_specific());
594     return TRUE;
595   }
596 
597   return TRUE;
598 }
599 
600 #endif /* KMP_OS_WINDOWS */
601 #endif /* KMP_DYNAMIC_LIB */
602 
603 /* __kmp_parallel_deo -- Wait until it's our turn. */
604 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
605   int gtid = *gtid_ref;
606 #ifdef BUILD_PARALLEL_ORDERED
607   kmp_team_t *team = __kmp_team_from_gtid(gtid);
608 #endif /* BUILD_PARALLEL_ORDERED */
609 
610   if (__kmp_env_consistency_check) {
611     if (__kmp_threads[gtid]->th.th_root->r.r_active)
612 #if KMP_USE_DYNAMIC_LOCK
613       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
614 #else
615       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
616 #endif
617   }
618 #ifdef BUILD_PARALLEL_ORDERED
619   if (!team->t.t_serialized) {
620     KMP_MB();
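    /* Spin until the ordered token (t_ordered.dt.t_value) equals our tid;
       __kmp_parallel_dxo passes the token on to the next thread in the team. */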
621     KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
622              NULL);
623     KMP_MB();
624   }
625 #endif /* BUILD_PARALLEL_ORDERED */
626 }
627 
628 /* __kmp_parallel_dxo -- Signal the next task. */
629 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
630   int gtid = *gtid_ref;
631 #ifdef BUILD_PARALLEL_ORDERED
632   int tid = __kmp_tid_from_gtid(gtid);
633   kmp_team_t *team = __kmp_team_from_gtid(gtid);
634 #endif /* BUILD_PARALLEL_ORDERED */
635 
636   if (__kmp_env_consistency_check) {
637     if (__kmp_threads[gtid]->th.th_root->r.r_active)
638       __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
639   }
640 #ifdef BUILD_PARALLEL_ORDERED
641   if (!team->t.t_serialized) {
642     KMP_MB(); /* Flush all pending memory write invalidates.  */
643 
644     /* use the tid of the next thread in this team */
645     /* TODO replace with general release procedure */
646     team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
647 
648     KMP_MB(); /* Flush all pending memory write invalidates.  */
649   }
650 #endif /* BUILD_PARALLEL_ORDERED */
651 }
652 
653 /* ------------------------------------------------------------------------ */
654 /* The BARRIER for a SINGLE process section is always explicit   */
655 
656 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
657   int status;
658   kmp_info_t *th;
659   kmp_team_t *team;
660 
661   if (!TCR_4(__kmp_init_parallel))
662     __kmp_parallel_initialize();
663   __kmp_resume_if_soft_paused();
664 
665   th = __kmp_threads[gtid];
666   team = th->th.th_team;
667   status = 0;
668 
669   th->th.th_ident = id_ref;
670 
671   if (team->t.t_serialized) {
672     status = 1;
673   } else {
674     kmp_int32 old_this = th->th.th_local.this_construct;
675 
676     ++th->th.th_local.this_construct;
677     /* try to set team count to thread count--success means thread got the
678        single block */
679     /* TODO: Should this be acquire or release? */
680     if (team->t.t_construct == old_this) {
681       status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
682                                               th->th.th_local.this_construct);
683     }
684 #if USE_ITT_BUILD
685     if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
686         KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
687         team->t.t_active_level == 1) {
688       // Only report metadata by primary thread of active team at level 1
689       __kmp_itt_metadata_single(id_ref);
690     }
691 #endif /* USE_ITT_BUILD */
692   }
693 
694   if (__kmp_env_consistency_check) {
695     if (status && push_ws) {
696       __kmp_push_workshare(gtid, ct_psingle, id_ref);
697     } else {
698       __kmp_check_workshare(gtid, ct_psingle, id_ref);
699     }
700   }
701 #if USE_ITT_BUILD
702   if (status) {
703     __kmp_itt_single_start(gtid);
704   }
705 #endif /* USE_ITT_BUILD */
706   return status;
707 }
708 
709 void __kmp_exit_single(int gtid) {
710 #if USE_ITT_BUILD
711   __kmp_itt_single_end(gtid);
712 #endif /* USE_ITT_BUILD */
713   if (__kmp_env_consistency_check)
714     __kmp_pop_workshare(gtid, ct_psingle, NULL);
715 }
716 
717 /* Determine whether we can go parallel or must use a serialized parallel
718  * region, and how many threads we can use.
719  * set_nthreads is the number of threads requested for the team.
720  * Returns 1 if we should serialize or only use one thread,
721  * otherwise the number of threads to use.
722  * The forkjoin lock is held by the caller. */
723 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
724                                  int master_tid, int set_nthreads,
725                                  int enter_teams) {
726   int capacity;
727   int new_nthreads;
728   KMP_DEBUG_ASSERT(__kmp_init_serial);
729   KMP_DEBUG_ASSERT(root && parent_team);
730   kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
731 
732   // If dyn-var is set, dynamically adjust the number of desired threads,
733   // according to the method specified by dynamic_mode.
734   new_nthreads = set_nthreads;
735   if (!get__dynamic_2(parent_team, master_tid)) {
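    // dyn-var is off: keep the requested thread count as-is; the hard limits
    // (KMP_DEVICE_THREAD_LIMIT, OMP_THREAD_LIMIT, capacity) still apply below.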
736     ;
737   }
738 #ifdef USE_LOAD_BALANCE
739   else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
740     new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
741     if (new_nthreads == 1) {
742       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
743                     "reservation to 1 thread\n",
744                     master_tid));
745       return 1;
746     }
747     if (new_nthreads < set_nthreads) {
748       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
749                     "reservation to %d threads\n",
750                     master_tid, new_nthreads));
751     }
752   }
753 #endif /* USE_LOAD_BALANCE */
754   else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
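    // __kmp_nth already includes the threads that will be reused for this team
    // (the active root's primary thread, or the idle hot team), so add them
    // back when computing the remaining available processors.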
755     new_nthreads = __kmp_avail_proc - __kmp_nth +
756                    (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
757     if (new_nthreads <= 1) {
758       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
759                     "reservation to 1 thread\n",
760                     master_tid));
761       return 1;
762     }
763     if (new_nthreads < set_nthreads) {
764       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
765                     "reservation to %d threads\n",
766                     master_tid, new_nthreads));
767     } else {
768       new_nthreads = set_nthreads;
769     }
770   } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
771     if (set_nthreads > 2) {
772       new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
773       new_nthreads = (new_nthreads % set_nthreads) + 1;
774       if (new_nthreads == 1) {
775         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
776                       "reservation to 1 thread\n",
777                       master_tid));
778         return 1;
779       }
780       if (new_nthreads < set_nthreads) {
781         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
782                       "reservation to %d threads\n",
783                       master_tid, new_nthreads));
784       }
785     }
786   } else {
787     KMP_ASSERT(0);
788   }
789 
790   // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
791   if (__kmp_nth + new_nthreads -
792           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
793       __kmp_max_nth) {
794     int tl_nthreads = __kmp_max_nth - __kmp_nth +
795                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
796     if (tl_nthreads <= 0) {
797       tl_nthreads = 1;
798     }
799 
800     // If dyn-var is false, emit a 1-time warning.
801     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
802       __kmp_reserve_warn = 1;
803       __kmp_msg(kmp_ms_warning,
804                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
805                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
806     }
807     if (tl_nthreads == 1) {
808       KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
809                     "reduced reservation to 1 thread\n",
810                     master_tid));
811       return 1;
812     }
813     KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
814                   "reservation to %d threads\n",
815                   master_tid, tl_nthreads));
816     new_nthreads = tl_nthreads;
817   }
818 
819   // Respect OMP_THREAD_LIMIT
820   int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
821   int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
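  // cg_nthreads counts the threads already in this contention group; the new
  // team's additional workers must fit under cg_thread_limit.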
822   if (cg_nthreads + new_nthreads -
823           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
824       max_cg_threads) {
825     int tl_nthreads = max_cg_threads - cg_nthreads +
826                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
827     if (tl_nthreads <= 0) {
828       tl_nthreads = 1;
829     }
830 
831     // If dyn-var is false, emit a 1-time warning.
832     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
833       __kmp_reserve_warn = 1;
834       __kmp_msg(kmp_ms_warning,
835                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
836                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
837     }
838     if (tl_nthreads == 1) {
839       KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
840                     "reduced reservation to 1 thread\n",
841                     master_tid));
842       return 1;
843     }
844     KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
845                   "reservation to %d threads\n",
846                   master_tid, tl_nthreads));
847     new_nthreads = tl_nthreads;
848   }
849 
850   // Check if the threads array is large enough, or needs expanding.
851   // See comment in __kmp_register_root() about the adjustment if
852   // __kmp_threads[0] == NULL.
853   capacity = __kmp_threads_capacity;
854   if (TCR_PTR(__kmp_threads[0]) == NULL) {
855     --capacity;
856   }
857   if (__kmp_nth + new_nthreads -
858           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
859       capacity) {
860     // Expand the threads array.
861     int slotsRequired = __kmp_nth + new_nthreads -
862                         (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
863                         capacity;
864     int slotsAdded = __kmp_expand_threads(slotsRequired);
865     if (slotsAdded < slotsRequired) {
866       // The threads array was not expanded enough.
867       new_nthreads -= (slotsRequired - slotsAdded);
868       KMP_ASSERT(new_nthreads >= 1);
869 
870       // If dyn-var is false, emit a 1-time warning.
871       if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
872         __kmp_reserve_warn = 1;
873         if (__kmp_tp_cached) {
874           __kmp_msg(kmp_ms_warning,
875                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
876                     KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
877                     KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
878         } else {
879           __kmp_msg(kmp_ms_warning,
880                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
881                     KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
882         }
883       }
884     }
885   }
886 
887 #ifdef KMP_DEBUG
888   if (new_nthreads == 1) {
889     KC_TRACE(10,
890              ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
891               "dead roots and rechecking; requested %d threads\n",
892               __kmp_get_gtid(), set_nthreads));
893   } else {
894     KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
895                   " %d threads\n",
896                   __kmp_get_gtid(), new_nthreads, set_nthreads));
897   }
898 #endif // KMP_DEBUG
899   return new_nthreads;
900 }
901 
902 /* Allocate threads from the thread pool and assign them to the new team. We
903    are assured that there are enough threads available, because we checked
904    earlier while holding the forkjoin lock. */
905 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
906                                     kmp_info_t *master_th, int master_gtid) {
907   int i;
908   int use_hot_team;
909 
910   KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
911   KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
912   KMP_MB();
913 
914   /* first, let's setup the primary thread */
915   master_th->th.th_info.ds.ds_tid = 0;
916   master_th->th.th_team = team;
917   master_th->th.th_team_nproc = team->t.t_nproc;
918   master_th->th.th_team_master = master_th;
919   master_th->th.th_team_serialized = FALSE;
920   master_th->th.th_dispatch = &team->t.t_dispatch[0];
921 
922 /* make sure we are not the optimized hot team */
923 #if KMP_NESTED_HOT_TEAMS
924   use_hot_team = 0;
925   kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
926   if (hot_teams) { // hot teams array is not allocated if
927     // KMP_HOT_TEAMS_MAX_LEVEL=0
928     int level = team->t.t_active_level - 1; // index in array of hot teams
929     if (master_th->th.th_teams_microtask) { // are we inside the teams?
930       if (master_th->th.th_teams_size.nteams > 1) {
931         ++level; // level was not increased in teams construct for
932         // team_of_masters
933       }
934       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
935           master_th->th.th_teams_level == team->t.t_level) {
936         ++level; // level was not increased in teams construct for
937         // team_of_workers before the parallel
938       } // team->t.t_level will be increased inside parallel
939     }
940     if (level < __kmp_hot_teams_max_level) {
941       if (hot_teams[level].hot_team) {
942         // hot team has already been allocated for given level
943         KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
944         use_hot_team = 1; // the team is ready to use
945       } else {
946         use_hot_team = 0; // AC: threads are not allocated yet
947         hot_teams[level].hot_team = team; // remember new hot team
948         hot_teams[level].hot_team_nth = team->t.t_nproc;
949       }
950     } else {
951       use_hot_team = 0;
952     }
953   }
954 #else
955   use_hot_team = team == root->r.r_hot_team;
956 #endif
957   if (!use_hot_team) {
958 
959     /* install the primary thread */
960     team->t.t_threads[0] = master_th;
961     __kmp_initialize_info(master_th, team, 0, master_gtid);
962 
963     /* now, install the worker threads */
964     for (i = 1; i < team->t.t_nproc; i++) {
965 
966       /* fork or reallocate a new thread and install it in team */
967       kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
968       team->t.t_threads[i] = thr;
969       KMP_DEBUG_ASSERT(thr);
970       KMP_DEBUG_ASSERT(thr->th.th_team == team);
971       /* align team and thread arrived states */
972       KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
973                     "T#%d(%d:%d) join =%llu, plain=%llu\n",
974                     __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
975                     __kmp_gtid_from_tid(i, team), team->t.t_id, i,
976                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
977                     team->t.t_bar[bs_plain_barrier].b_arrived));
978       thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
979       thr->th.th_teams_level = master_th->th.th_teams_level;
980       thr->th.th_teams_size = master_th->th.th_teams_size;
981       { // Initialize threads' barrier data.
982         int b;
983         kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
984         for (b = 0; b < bs_last_barrier; ++b) {
985           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
986           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
987 #if USE_DEBUGGER
988           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
989 #endif
990         }
991       }
992     }
993 
994 #if KMP_AFFINITY_SUPPORTED
995     __kmp_partition_places(team);
996 #endif
997   }
998 
999   if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
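    // Request affinity display if any thread's team size or nesting level
    // changed since its previous parallel region.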
1000     for (i = 0; i < team->t.t_nproc; i++) {
1001       kmp_info_t *thr = team->t.t_threads[i];
1002       if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1003           thr->th.th_prev_level != team->t.t_level) {
1004         team->t.t_display_affinity = 1;
1005         break;
1006       }
1007     }
1008   }
1009 
1010   KMP_MB();
1011 }
1012 
1013 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1014 // Propagate any changes to the floating point control registers out to the
1015 // team. We try to avoid unnecessary writes to the relevant cache line in the
1016 // team structure, so we don't make changes unless they are needed.
1017 inline static void propagateFPControl(kmp_team_t *team) {
1018   if (__kmp_inherit_fp_control) {
1019     kmp_int16 x87_fpu_control_word;
1020     kmp_uint32 mxcsr;
1021 
1022     // Get primary thread's values of FPU control flags (both X87 and vector)
1023     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1024     __kmp_store_mxcsr(&mxcsr);
1025     mxcsr &= KMP_X86_MXCSR_MASK;
1026 
1027     // There is no point looking at t_fp_control_saved here.
1028     // If it is TRUE, we still have to update the values if they are different
1029     // from those we now have. If it is FALSE we didn't save anything yet, but
1030     // our objective is the same. We have to ensure that the values in the team
1031     // are the same as those we have.
1032     // So, this code achieves what we need whether or not t_fp_control_saved is
1033     // true. By checking whether the value needs updating we avoid unnecessary
1034     // writes that would put the cache-line into a written state, causing all
1035     // threads in the team to have to read it again.
1036     KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1037     KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1038     // Although we don't use this value, other code in the runtime wants to know
1039     // whether it should restore them. So we must ensure it is correct.
1040     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1041   } else {
1042     // Similarly here. Don't write to this cache-line in the team structure
1043     // unless we have to.
1044     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1045   }
1046 }
1047 
1048 // Do the opposite, setting the hardware registers to the updated values from
1049 // the team.
1050 inline static void updateHWFPControl(kmp_team_t *team) {
1051   if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1052     // Only reset the fp control regs if they have been changed in the
1053     // parallel region that we are exiting.
1054     kmp_int16 x87_fpu_control_word;
1055     kmp_uint32 mxcsr;
1056     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1057     __kmp_store_mxcsr(&mxcsr);
1058     mxcsr &= KMP_X86_MXCSR_MASK;
1059 
1060     if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1061       __kmp_clear_x87_fpu_status_word();
1062       __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1063     }
1064 
1065     if (team->t.t_mxcsr != mxcsr) {
1066       __kmp_load_mxcsr(&team->t.t_mxcsr);
1067     }
1068   }
1069 }
1070 #else
1071 #define propagateFPControl(x) ((void)0)
1072 #define updateHWFPControl(x) ((void)0)
1073 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1074 
1075 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1076                                      int realloc); // forward declaration
1077 
1078 /* Run a parallel region that has been serialized, so that it runs in a team
1079    of only the single primary thread. */
1080 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1081   kmp_info_t *this_thr;
1082   kmp_team_t *serial_team;
1083 
1084   KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1085 
1086   /* Skip all this code for autopar serialized loops since it results in
1087      unacceptable overhead */
1088   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1089     return;
1090 
1091   if (!TCR_4(__kmp_init_parallel))
1092     __kmp_parallel_initialize();
1093   __kmp_resume_if_soft_paused();
1094 
1095   this_thr = __kmp_threads[global_tid];
1096   serial_team = this_thr->th.th_serial_team;
1097 
1098   /* utilize the serialized team held by this thread */
1099   KMP_DEBUG_ASSERT(serial_team);
1100   KMP_MB();
1101 
1102   if (__kmp_tasking_mode != tskm_immediate_exec) {
1103     KMP_DEBUG_ASSERT(
1104         this_thr->th.th_task_team ==
1105         this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1106     KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1107                      NULL);
1108     KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1109                   "team %p, new task_team = NULL\n",
1110                   global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1111     this_thr->th.th_task_team = NULL;
1112   }
1113 
1114   kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1115   if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1116     proc_bind = proc_bind_false;
1117   } else if (proc_bind == proc_bind_default) {
1118     // No proc_bind clause was specified, so use the current value
1119     // of proc-bind-var for this parallel region.
1120     proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1121   }
1122   // Reset for next parallel region
1123   this_thr->th.th_set_proc_bind = proc_bind_default;
1124 
1125 #if OMPT_SUPPORT
1126   ompt_data_t ompt_parallel_data = ompt_data_none;
1127   ompt_data_t *implicit_task_data;
1128   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1129   if (ompt_enabled.enabled &&
1130       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1131 
1132     ompt_task_info_t *parent_task_info;
1133     parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1134 
1135     parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1136     if (ompt_enabled.ompt_callback_parallel_begin) {
1137       int team_size = 1;
1138 
1139       ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1140           &(parent_task_info->task_data), &(parent_task_info->frame),
1141           &ompt_parallel_data, team_size,
1142           ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1143     }
1144   }
1145 #endif // OMPT_SUPPORT
1146 
1147   if (this_thr->th.th_team != serial_team) {
1148     // Nested level will be an index in the nested nthreads array
1149     int level = this_thr->th.th_team->t.t_level;
1150 
1151     if (serial_team->t.t_serialized) {
1152       /* this serial team was already used
1153          TODO: increase performance by making these locks more specific */
1154       kmp_team_t *new_team;
1155 
1156       __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1157 
1158       new_team =
1159           __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1160 #if OMPT_SUPPORT
1161                               ompt_parallel_data,
1162 #endif
1163                               proc_bind, &this_thr->th.th_current_task->td_icvs,
1164                               0 USE_NESTED_HOT_ARG(NULL));
1165       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1166       KMP_ASSERT(new_team);
1167 
1168       /* setup new serialized team and install it */
1169       new_team->t.t_threads[0] = this_thr;
1170       new_team->t.t_parent = this_thr->th.th_team;
1171       serial_team = new_team;
1172       this_thr->th.th_serial_team = serial_team;
1173 
1174       KF_TRACE(
1175           10,
1176           ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1177            global_tid, serial_team));
1178 
1179       /* TODO: the above breaks the requirement that serialized teams must still
1180          be guaranteed to work even if we run out of resources, since we may
1181          need to allocate a new one */
1182     } else {
1183       KF_TRACE(
1184           10,
1185           ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1186            global_tid, serial_team));
1187     }
1188 
1189     /* we have to initialize this serial team */
1190     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1191     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1192     KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1193     serial_team->t.t_ident = loc;
1194     serial_team->t.t_serialized = 1;
1195     serial_team->t.t_nproc = 1;
1196     serial_team->t.t_parent = this_thr->th.th_team;
1197     serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1198     this_thr->th.th_team = serial_team;
1199     serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1200 
1201     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1202                   this_thr->th.th_current_task));
1203     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1204     this_thr->th.th_current_task->td_flags.executing = 0;
1205 
1206     __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1207 
1208     /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1209        implicit task for each serialized task represented by
1210        team->t.t_serialized? */
1211     copy_icvs(&this_thr->th.th_current_task->td_icvs,
1212               &this_thr->th.th_current_task->td_parent->td_icvs);
1213 
1214     // Use the thread count from the nested nthreads array, if an entry exists
1215     // for the next nesting level
1216     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1217       this_thr->th.th_current_task->td_icvs.nproc =
1218           __kmp_nested_nth.nth[level + 1];
1219     }
1220 
1221     if (__kmp_nested_proc_bind.used &&
1222         (level + 1 < __kmp_nested_proc_bind.used)) {
1223       this_thr->th.th_current_task->td_icvs.proc_bind =
1224           __kmp_nested_proc_bind.bind_types[level + 1];
1225     }
1226 
1227 #if USE_DEBUGGER
1228     serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1229 #endif
1230     this_thr->th.th_info.ds.ds_tid = 0;
1231 
1232     /* set thread cache values */
1233     this_thr->th.th_team_nproc = 1;
1234     this_thr->th.th_team_master = this_thr;
1235     this_thr->th.th_team_serialized = 1;
1236 
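    // A serialized region adds one nesting level but no active level.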
1237     serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1238     serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1239     serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1240 
1241     propagateFPControl(serial_team);
1242 
1243     /* check if we need to allocate dispatch buffers stack */
1244     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1245     if (!serial_team->t.t_dispatch->th_disp_buffer) {
1246       serial_team->t.t_dispatch->th_disp_buffer =
1247           (dispatch_private_info_t *)__kmp_allocate(
1248               sizeof(dispatch_private_info_t));
1249     }
1250     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1251 
1252     KMP_MB();
1253 
1254   } else {
1255     /* this serialized team is already being used,
1256      * that's fine, just add another nested level */
1257     KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1258     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1259     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1260     ++serial_team->t.t_serialized;
1261     this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1262 
1263     // Nested level will be an index in the nested nthreads array
1264     int level = this_thr->th.th_team->t.t_level;
1265     // Use the thread count from the nested nthreads array, if an entry exists
1266     // for the next nesting level
1267     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1268       this_thr->th.th_current_task->td_icvs.nproc =
1269           __kmp_nested_nth.nth[level + 1];
1270     }
1271     serial_team->t.t_level++;
1272     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1273                   "of serial team %p to %d\n",
1274                   global_tid, serial_team, serial_team->t.t_level));
1275 
1276     /* allocate/push dispatch buffers stack */
1277     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1278     {
1279       dispatch_private_info_t *disp_buffer =
1280           (dispatch_private_info_t *)__kmp_allocate(
1281               sizeof(dispatch_private_info_t));
1282       disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1283       serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1284     }
1285     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1286 
1287     KMP_MB();
1288   }
1289   KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1290 
1291   // Perform the display affinity functionality for
1292   // serialized parallel regions
1293   if (__kmp_display_affinity) {
1294     if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1295         this_thr->th.th_prev_num_threads != 1) {
1296       // NULL means use the affinity-format-var ICV
1297       __kmp_aux_display_affinity(global_tid, NULL);
1298       this_thr->th.th_prev_level = serial_team->t.t_level;
1299       this_thr->th.th_prev_num_threads = 1;
1300     }
1301   }
1302 
1303   if (__kmp_env_consistency_check)
1304     __kmp_push_parallel(global_tid, NULL);
1305 #if OMPT_SUPPORT
1306   serial_team->t.ompt_team_info.master_return_address = codeptr;
1307   if (ompt_enabled.enabled &&
1308       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1309     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1310         OMPT_GET_FRAME_ADDRESS(0);
1311 
1312     ompt_lw_taskteam_t lw_taskteam;
1313     __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1314                             &ompt_parallel_data, codeptr);
1315 
1316     __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1317     // don't use lw_taskteam after linking. content was swapped
1318 
1319     /* OMPT implicit task begin */
1320     implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
1321     if (ompt_enabled.ompt_callback_implicit_task) {
1322       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1323           ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1324           OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
1325           ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1326       OMPT_CUR_TASK_INFO(this_thr)->thread_num =
1327           __kmp_tid_from_gtid(global_tid);
1328     }
1329 
1330     /* OMPT state */
1331     this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1332     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1333         OMPT_GET_FRAME_ADDRESS(0);
1334   }
1335 #endif
1336 }
1337 
1338 /* most of the work for a fork */
1339 /* return true if we really went parallel, false if serialized */
1340 int __kmp_fork_call(ident_t *loc, int gtid,
1341                     enum fork_context_e call_context, // Intel, GNU, ...
1342                     kmp_int32 argc, microtask_t microtask, launch_t invoker,
1343                     kmp_va_list ap) {
1344   void **argv;
1345   int i;
1346   int master_tid;
1347   int master_this_cons;
1348   kmp_team_t *team;
1349   kmp_team_t *parent_team;
1350   kmp_info_t *master_th;
1351   kmp_root_t *root;
1352   int nthreads;
1353   int master_active;
1354   int master_set_numthreads;
1355   int level;
1356   int active_level;
1357   int teams_level;
1358 #if KMP_NESTED_HOT_TEAMS
1359   kmp_hot_team_ptr_t **p_hot_teams;
1360 #endif
1361   { // KMP_TIME_BLOCK
1362     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1363     KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1364 
1365     KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1366     if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1367       /* Some systems prefer the stack for the root thread(s) to start with */
1368       /* some gap from the parent stack to prevent false sharing. */
1369       void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1370       /* These 2 lines below are so this does not get optimized out */
1371       if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1372         __kmp_stkpadding += (short)((kmp_int64)dummy);
1373     }
1374 
1375     /* initialize if needed */
1376     KMP_DEBUG_ASSERT(
1377         __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1378     if (!TCR_4(__kmp_init_parallel))
1379       __kmp_parallel_initialize();
1380     __kmp_resume_if_soft_paused();
1381 
1382     /* setup current data */
1383     master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1384     // shutdown
1385     parent_team = master_th->th.th_team;
1386     master_tid = master_th->th.th_info.ds.ds_tid;
1387     master_this_cons = master_th->th.th_local.this_construct;
1388     root = master_th->th.th_root;
1389     master_active = root->r.r_active;
1390     master_set_numthreads = master_th->th.th_set_nproc;
1391 
1392 #if OMPT_SUPPORT
1393     ompt_data_t ompt_parallel_data = ompt_data_none;
1394     ompt_data_t *parent_task_data;
1395     ompt_frame_t *ompt_frame;
1396     ompt_data_t *implicit_task_data;
1397     void *return_address = NULL;
1398 
1399     if (ompt_enabled.enabled) {
1400       __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1401                                     NULL, NULL);
1402       return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1403     }
1404 #endif
1405 
1406     // Nested level will be an index in the nested nthreads array
1407     level = parent_team->t.t_level;
1408     // used to launch non-serial teams even if nesting is not allowed
1409     active_level = parent_team->t.t_active_level;
1410     // needed to check nesting inside the teams
1411     teams_level = master_th->th.th_teams_level;
1412 #if KMP_NESTED_HOT_TEAMS
1413     p_hot_teams = &master_th->th.th_hot_teams;
1414     if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1415       *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1416           sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1417       (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1418       // this is either the actual hot team or not needed (when active_level > 0)
1419       (*p_hot_teams)[0].hot_team_nth = 1;
1420     }
1421 #endif
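    // Note: the hot-teams descriptor array is allocated lazily on the first
    // fork and sized by __kmp_hot_teams_max_level (typically controlled by the
    // KMP_HOT_TEAMS_MAX_LEVEL environment variable); entry 0 simply records
    // the root's existing hot team.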
1422 
1423 #if OMPT_SUPPORT
1424     if (ompt_enabled.enabled) {
1425       if (ompt_enabled.ompt_callback_parallel_begin) {
1426         int team_size = master_set_numthreads
1427                             ? master_set_numthreads
1428                             : get__nproc_2(parent_team, master_tid);
1429         int flags = OMPT_INVOKER(call_context) |
1430                     ((microtask == (microtask_t)__kmp_teams_master)
1431                          ? ompt_parallel_league
1432                          : ompt_parallel_team);
1433         ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1434             parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1435             return_address);
1436       }
1437       master_th->th.ompt_thread_info.state = ompt_state_overhead;
1438     }
1439 #endif
1440 
1441     master_th->th.th_ident = loc;
1442 
1443     if (master_th->th.th_teams_microtask && ap &&
1444         microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1445       // AC: Start of a parallel region nested inside a teams construct. The
1446       // team is the actual (hot) team; all workers are ready at the fork
1447       // barrier. No lock needed to initialize the team a bit and free workers.
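      // Illustrative user code (sketch only) that reaches this path -- a
      // parallel region nested directly inside a teams construct:
      //   #pragma omp teams num_teams(2)
      //   {
      //     #pragma omp parallel num_threads(4)
      //     { /* ... */ }
      //   }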
1448       parent_team->t.t_ident = loc;
1449       __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1450       parent_team->t.t_argc = argc;
1451       argv = (void **)parent_team->t.t_argv;
1452       for (i = argc - 1; i >= 0; --i)
1453         *argv++ = va_arg(kmp_va_deref(ap), void *);
1454       // Increment our nested depth level, but do not increase the serialization
1455       if (parent_team == master_th->th.th_serial_team) {
1456         // AC: we are in serialized parallel
1457         __kmpc_serialized_parallel(loc, gtid);
1458         KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1459 
1460         if (call_context == fork_context_gnu) {
1461           // AC: need to decrement t_serialized for enquiry functions to work
1462           // correctly, will restore at join time
1463           parent_team->t.t_serialized--;
1464           return TRUE;
1465         }
1466 
1467 #if OMPT_SUPPORT
1468         void *dummy;
1469         void **exit_frame_p;
1470 
1471         ompt_lw_taskteam_t lw_taskteam;
1472 
1473         if (ompt_enabled.enabled) {
1474           __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1475                                   &ompt_parallel_data, return_address);
1476           exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1477 
1478           __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1479           // don't use lw_taskteam after linking; its content was swapped
1480 
1481           /* OMPT implicit task begin */
1482           implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1483           if (ompt_enabled.ompt_callback_implicit_task) {
1484             OMPT_CUR_TASK_INFO(master_th)->thread_num =
1485                 __kmp_tid_from_gtid(gtid);
1486             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1487                 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1488                 implicit_task_data, 1,
1489                 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1490           }
1491 
1492           /* OMPT state */
1493           master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1494         } else {
1495           exit_frame_p = &dummy;
1496         }
1497 #endif
1498         // AC: need to decrement t_serialized for enquiry functions to work
1499         // correctly, will restore at join time
1500         parent_team->t.t_serialized--;
1501 
1502         {
1503           KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1504           KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1505           __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1506 #if OMPT_SUPPORT
1507                                  ,
1508                                  exit_frame_p
1509 #endif
1510           );
1511         }
1512 
1513 #if OMPT_SUPPORT
1514         if (ompt_enabled.enabled) {
1515           *exit_frame_p = NULL;
1516           OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1517           if (ompt_enabled.ompt_callback_implicit_task) {
1518             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1519                 ompt_scope_end, NULL, implicit_task_data, 1,
1520                 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1521           }
1522           ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1523           __ompt_lw_taskteam_unlink(master_th);
1524           if (ompt_enabled.ompt_callback_parallel_end) {
1525             ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1526                 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1527                 OMPT_INVOKER(call_context) | ompt_parallel_team,
1528                 return_address);
1529           }
1530           master_th->th.ompt_thread_info.state = ompt_state_overhead;
1531         }
1532 #endif
1533         return TRUE;
1534       }
1535 
1536       parent_team->t.t_pkfn = microtask;
1537       parent_team->t.t_invoke = invoker;
1538       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1539       parent_team->t.t_active_level++;
1540       parent_team->t.t_level++;
1541       parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1542 
1543 #if OMPT_SUPPORT
1544       if (ompt_enabled.enabled) {
1545         ompt_lw_taskteam_t lw_taskteam;
1546         __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1547                                 &ompt_parallel_data, return_address);
1548         __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1549       }
1550 #endif
1551 
1552       /* Change number of threads in the team if requested */
1553       if (master_set_numthreads) { // The parallel has num_threads clause
1554         if (master_set_numthreads < master_th->th.th_teams_size.nth) {
1555           // AC: can only reduce the number of threads dynamically, not increase
1556           kmp_info_t **other_threads = parent_team->t.t_threads;
1557           parent_team->t.t_nproc = master_set_numthreads;
1558           for (i = 0; i < master_set_numthreads; ++i) {
1559             other_threads[i]->th.th_team_nproc = master_set_numthreads;
1560           }
1561           // Keep extra threads hot in the team for possible next parallels
1562         }
1563         master_th->th.th_set_nproc = 0;
1564       }
1565 
1566 #if USE_DEBUGGER
1567       if (__kmp_debugging) { // Let debugger override number of threads.
1568         int nth = __kmp_omp_num_threads(loc);
1569         if (nth > 0) { // 0 means debugger doesn't want to change num threads
1570           master_set_numthreads = nth;
1571         }
1572       }
1573 #endif
1574 
1575 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1576       if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1577            KMP_ITT_DEBUG) &&
1578           __kmp_forkjoin_frames_mode == 3 &&
1579           parent_team->t.t_active_level == 1 // only report frames at level 1
1580           && master_th->th.th_teams_size.nteams == 1) {
1581         kmp_uint64 tmp_time = __itt_get_timestamp();
1582         master_th->th.th_frame_time = tmp_time;
1583         parent_team->t.t_region_time = tmp_time;
1584       }
1585       if (__itt_stack_caller_create_ptr) {
1586         KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
1587         // create new stack stitching id before entering fork barrier
1588         parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1589       }
1590 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1591 
1592       KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1593                     "master_th=%p, gtid=%d\n",
1594                     root, parent_team, master_th, gtid));
1595       __kmp_internal_fork(loc, gtid, parent_team);
1596       KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1597                     "master_th=%p, gtid=%d\n",
1598                     root, parent_team, master_th, gtid));
1599 
1600       if (call_context == fork_context_gnu)
1601         return TRUE;
1602 
1603       /* Invoke microtask for PRIMARY thread */
1604       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1605                     parent_team->t.t_id, parent_team->t.t_pkfn));
1606 
1607       if (!parent_team->t.t_invoke(gtid)) {
1608         KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
1609       }
1610       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1611                     parent_team->t.t_id, parent_team->t.t_pkfn));
1612       KMP_MB(); /* Flush all pending memory write invalidates.  */
1613 
1614       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1615 
1616       return TRUE;
1617     } // Parallel closely nested in teams construct
1618 
1619 #if KMP_DEBUG
1620     if (__kmp_tasking_mode != tskm_immediate_exec) {
1621       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1622                        parent_team->t.t_task_team[master_th->th.th_task_state]);
1623     }
1624 #endif
1625 
1626     int enter_teams = 0;
1627     if (parent_team->t.t_active_level >=
1628         master_th->th.th_current_task->td_icvs.max_active_levels) {
1629       nthreads = 1;
1630     } else {
1631       enter_teams = ((ap == NULL && active_level == 0) ||
1632                      (ap && teams_level > 0 && teams_level == level));
1633       nthreads =
1634           master_set_numthreads
1635               ? master_set_numthreads
1636               : get__nproc_2(
1637                     parent_team,
1638                     master_tid); // TODO: get nproc directly from current task
1639 
1640       // Check whether we need to take the forkjoin lock (not needed for a
1641       // serialized parallel outside of a teams construct). This code was moved
1642       // here from __kmp_reserve_threads() to speed up nested serialized regions.
1643       if (nthreads > 1) {
1644         if ((get__max_active_levels(master_th) == 1 &&
1645              (root->r.r_in_parallel && !enter_teams)) ||
1646             (__kmp_library == library_serial)) {
1647           KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1648                         " threads\n",
1649                         gtid, nthreads));
1650           nthreads = 1;
1651         }
1652       }
1653       if (nthreads > 1) {
1654         /* determine how many new threads we can use */
1655         __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1656         /* AC: If we execute teams from parallel region (on host), then teams
1657            should be created but each can only have 1 thread if nesting is
1658            disabled. If teams called from serial region, then teams and their
1659            threads should be created regardless of the nesting setting. */
1660         nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
1661                                          nthreads, enter_teams);
1662         if (nthreads == 1) {
1663           // Free lock for single thread execution here; for multi-thread
1664           // execution it will be freed later after team of threads created
1665           // and initialized
1666           __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1667         }
1668       }
1669     }
1670     KMP_DEBUG_ASSERT(nthreads > 0);
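    // At this point nthreads is either 1 (the region will be serialized:
    // nesting depth exhausted, serial library, nesting disabled inside an
    // active parallel, or nothing reservable) or the count returned by
    // __kmp_reserve_threads(); in the latter case the forkjoin lock is still
    // held and is released only after the new team has been set up.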
1671 
1672     // If we temporarily changed the set number of threads then restore it now
1673     master_th->th.th_set_nproc = 0;
1674 
1675     /* create a serialized parallel region? */
1676     if (nthreads == 1) {
1677 /* josh todo: hypothetical question: what do we do for OS X*? */
1678 #if KMP_OS_LINUX &&                                                            \
1679     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1680       void *args[argc];
1681 #else
1682       void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1683 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1684           KMP_ARCH_AARCH64) */
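      // On these Linux targets a C99-style variable-length array (a GCC/Clang
      // extension in C++) is used for the argument copy; other configurations
      // fall back to KMP_ALLOCA, which likewise places the array on the stack.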
1685 
1686       KA_TRACE(20,
1687                ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1688 
1689       __kmpc_serialized_parallel(loc, gtid);
1690 
1691       if (call_context == fork_context_intel) {
1692         /* TODO this sucks, use the compiler itself to pass args! :) */
1693         master_th->th.th_serial_team->t.t_ident = loc;
1694         if (!ap) {
1695           // revert change made in __kmpc_serialized_parallel()
1696           master_th->th.th_serial_team->t.t_level--;
1697           // Get args from parent team for teams construct
1698 
1699 #if OMPT_SUPPORT
1700           void *dummy;
1701           void **exit_frame_p;
1702           ompt_task_info_t *task_info;
1703 
1704           ompt_lw_taskteam_t lw_taskteam;
1705 
1706           if (ompt_enabled.enabled) {
1707             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1708                                     &ompt_parallel_data, return_address);
1709 
1710             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1711             // don't use lw_taskteam after linking; its content was swapped
1712 
1713             task_info = OMPT_CUR_TASK_INFO(master_th);
1714             exit_frame_p = &(task_info->frame.exit_frame.ptr);
1715             if (ompt_enabled.ompt_callback_implicit_task) {
1716               OMPT_CUR_TASK_INFO(master_th)->thread_num =
1717                   __kmp_tid_from_gtid(gtid);
1718               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1719                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1720                   &(task_info->task_data), 1,
1721                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1722                   ompt_task_implicit);
1723             }
1724 
1725             /* OMPT state */
1726             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1727           } else {
1728             exit_frame_p = &dummy;
1729           }
1730 #endif
1731 
1732           {
1733             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1734             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1735             __kmp_invoke_microtask(microtask, gtid, 0, argc,
1736                                    parent_team->t.t_argv
1737 #if OMPT_SUPPORT
1738                                    ,
1739                                    exit_frame_p
1740 #endif
1741             );
1742           }
1743 
1744 #if OMPT_SUPPORT
1745           if (ompt_enabled.enabled) {
1746             *exit_frame_p = NULL;
1747             if (ompt_enabled.ompt_callback_implicit_task) {
1748               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1749                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1750                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1751                   ompt_task_implicit);
1752             }
1753             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1754             __ompt_lw_taskteam_unlink(master_th);
1755             if (ompt_enabled.ompt_callback_parallel_end) {
1756               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1757                   &ompt_parallel_data, parent_task_data,
1758                   OMPT_INVOKER(call_context) | ompt_parallel_team,
1759                   return_address);
1760             }
1761             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1762           }
1763 #endif
1764         } else if (microtask == (microtask_t)__kmp_teams_master) {
1765           KMP_DEBUG_ASSERT(master_th->th.th_team ==
1766                            master_th->th.th_serial_team);
1767           team = master_th->th.th_team;
1768           // team->t.t_pkfn = microtask;
1769           team->t.t_invoke = invoker;
1770           __kmp_alloc_argv_entries(argc, team, TRUE);
1771           team->t.t_argc = argc;
1772           argv = (void **)team->t.t_argv;
1773           if (ap) {
1774             for (i = argc - 1; i >= 0; --i)
1775               *argv++ = va_arg(kmp_va_deref(ap), void *);
1776           } else {
1777             for (i = 0; i < argc; ++i)
1778               // Get args from parent team for teams construct
1779               argv[i] = parent_team->t.t_argv[i];
1780           }
1781           // AC: revert change made in __kmpc_serialized_parallel()
1782           //     because initial code in teams should have level=0
1783           team->t.t_level--;
1784           // AC: call special invoker for outer "parallel" of teams construct
1785           invoker(gtid);
1786 #if OMPT_SUPPORT
1787           if (ompt_enabled.enabled) {
1788             ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1789             if (ompt_enabled.ompt_callback_implicit_task) {
1790               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1791                   ompt_scope_end, NULL, &(task_info->task_data), 0,
1792                   OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1793             }
1794             if (ompt_enabled.ompt_callback_parallel_end) {
1795               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1796                   &ompt_parallel_data, parent_task_data,
1797                   OMPT_INVOKER(call_context) | ompt_parallel_league,
1798                   return_address);
1799             }
1800             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1801           }
1802 #endif
1803         } else {
1804           argv = args;
1805           for (i = argc - 1; i >= 0; --i)
1806             *argv++ = va_arg(kmp_va_deref(ap), void *);
1807           KMP_MB();
1808 
1809 #if OMPT_SUPPORT
1810           void *dummy;
1811           void **exit_frame_p;
1812           ompt_task_info_t *task_info;
1813 
1814           ompt_lw_taskteam_t lw_taskteam;
1815 
1816           if (ompt_enabled.enabled) {
1817             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1818                                     &ompt_parallel_data, return_address);
1819             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1820             // don't use lw_taskteam after linking; its content was swapped
1821             task_info = OMPT_CUR_TASK_INFO(master_th);
1822             exit_frame_p = &(task_info->frame.exit_frame.ptr);
1823 
1824             /* OMPT implicit task begin */
1825             implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1826             if (ompt_enabled.ompt_callback_implicit_task) {
1827               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1828                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1829                   implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1830                   ompt_task_implicit);
1831               OMPT_CUR_TASK_INFO(master_th)->thread_num =
1832                   __kmp_tid_from_gtid(gtid);
1833             }
1834 
1835             /* OMPT state */
1836             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1837           } else {
1838             exit_frame_p = &dummy;
1839           }
1840 #endif
1841 
1842           {
1843             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1844             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1845             __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1846 #if OMPT_SUPPORT
1847                                    ,
1848                                    exit_frame_p
1849 #endif
1850             );
1851           }
1852 
1853 #if OMPT_SUPPORT
1854           if (ompt_enabled.enabled) {
1855             *exit_frame_p = NULL;
1856             if (ompt_enabled.ompt_callback_implicit_task) {
1857               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1858                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1859                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1860                   ompt_task_implicit);
1861             }
1862 
1863             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1864             __ompt_lw_taskteam_unlink(master_th);
1865             if (ompt_enabled.ompt_callback_parallel_end) {
1866               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1867                   &ompt_parallel_data, parent_task_data,
1868                   OMPT_INVOKER(call_context) | ompt_parallel_team,
1869                   return_address);
1870             }
1871             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1872           }
1873 #endif
1874         }
1875       } else if (call_context == fork_context_gnu) {
1876 #if OMPT_SUPPORT
1877         ompt_lw_taskteam_t lwt;
1878         __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1879                                 return_address);
1880 
1881         lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1882         __ompt_lw_taskteam_link(&lwt, master_th, 1);
1883 // don't use lw_taskteam after linking; its content was swapped
1884 #endif
1885 
1886         // we were called from GNU native code
1887         KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1888         return FALSE;
1889       } else {
1890         KMP_ASSERT2(call_context < fork_context_last,
1891                     "__kmp_fork_call: unknown fork_context parameter");
1892       }
1893 
1894       KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1895       KMP_MB();
1896       return FALSE;
1897     } // if (nthreads == 1)
1898 
1899     // GEH: only modify the executing flag in the case when not serialized;
1900     //      the serialized case is handled in __kmpc_serialized_parallel
1901     KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1902                   "curtask=%p, curtask_max_aclevel=%d\n",
1903                   parent_team->t.t_active_level, master_th,
1904                   master_th->th.th_current_task,
1905                   master_th->th.th_current_task->td_icvs.max_active_levels));
1906     // TODO: GEH - cannot do this assertion because root thread not set up as
1907     // executing
1908     // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1909     master_th->th.th_current_task->td_flags.executing = 0;
1910 
1911     if (!master_th->th.th_teams_microtask || level > teams_level) {
1912       /* Increment our nested depth level */
1913       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1914     }
1915 
1916     // See if we need to make a copy of the ICVs.
1917     int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1918     if ((level + 1 < __kmp_nested_nth.used) &&
1919         (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1920       nthreads_icv = __kmp_nested_nth.nth[level + 1];
1921     } else {
1922       nthreads_icv = 0; // don't update
1923     }
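    // __kmp_nested_nth holds the per-level nthreads-var list, e.g. (sketch)
    // OMP_NUM_THREADS="4,2" gives nth[0]=4, nth[1]=2. The new team's implicit
    // tasks inherit nth[level + 1] as their nproc ICV, so a further nested
    // parallel region picks it up automatically.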
1924 
1925     // Figure out the proc_bind_policy for the new team.
1926     kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1927     kmp_proc_bind_t proc_bind_icv =
1928         proc_bind_default; // proc_bind_default means don't update
1929     if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1930       proc_bind = proc_bind_false;
1931     } else {
1932       if (proc_bind == proc_bind_default) {
1933         // No proc_bind clause specified; use current proc-bind-var for this
1934         // parallel region
1935         proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1936       }
1937       /* else: The proc_bind policy was specified explicitly on parallel clause.
1938          This overrides proc-bind-var for this parallel region, but does not
1939          change proc-bind-var. */
1940       // Figure the value of proc-bind-var for the child threads.
1941       if ((level + 1 < __kmp_nested_proc_bind.used) &&
1942           (__kmp_nested_proc_bind.bind_types[level + 1] !=
1943            master_th->th.th_current_task->td_icvs.proc_bind)) {
1944         proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1945       }
1946     }
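    // Resolution order, per the code above: if proc-bind-var is false, binding
    // is disabled regardless of any clause; otherwise an explicit proc_bind
    // clause wins for this region, else the current proc-bind-var is used. The
    // children's proc-bind-var may come from the nested list, e.g. (sketch)
    // OMP_PROC_BIND="spread,close" gives bind_types[0]=spread,
    // bind_types[1]=close.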
1947 
1948     // Reset for next parallel region
1949     master_th->th.th_set_proc_bind = proc_bind_default;
1950 
1951     if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
1952       kmp_internal_control_t new_icvs;
1953       copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
1954       new_icvs.next = NULL;
1955       if (nthreads_icv > 0) {
1956         new_icvs.nproc = nthreads_icv;
1957       }
1958       if (proc_bind_icv != proc_bind_default) {
1959         new_icvs.proc_bind = proc_bind_icv;
1960       }
1961 
1962       /* allocate a new parallel team */
1963       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
1964       team = __kmp_allocate_team(root, nthreads, nthreads,
1965 #if OMPT_SUPPORT
1966                                  ompt_parallel_data,
1967 #endif
1968                                  proc_bind, &new_icvs,
1969                                  argc USE_NESTED_HOT_ARG(master_th));
1970     } else {
1971       /* allocate a new parallel team */
1972       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
1973       team = __kmp_allocate_team(root, nthreads, nthreads,
1974 #if OMPT_SUPPORT
1975                                  ompt_parallel_data,
1976 #endif
1977                                  proc_bind,
1978                                  &master_th->th.th_current_task->td_icvs,
1979                                  argc USE_NESTED_HOT_ARG(master_th));
1980     }
1981     KF_TRACE(
1982         10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
1983 
1984     /* setup the new team */
1985     KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
1986     KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
1987     KMP_CHECK_UPDATE(team->t.t_ident, loc);
1988     KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
1989     KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
1990 #if OMPT_SUPPORT
1991     KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
1992                           return_address);
1993 #endif
1994     KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
1995     // TODO: parent_team->t.t_level == INT_MAX ???
1996     if (!master_th->th.th_teams_microtask || level > teams_level) {
1997       int new_level = parent_team->t.t_level + 1;
1998       KMP_CHECK_UPDATE(team->t.t_level, new_level);
1999       new_level = parent_team->t.t_active_level + 1;
2000       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2001     } else {
2002       // AC: Do not increase parallel level at start of the teams construct
2003       int new_level = parent_team->t.t_level;
2004       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2005       new_level = parent_team->t.t_active_level;
2006       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2007     }
2008     kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2009     // set primary thread's schedule as new run-time schedule
2010     KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2011 
2012     KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2013     KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2014 
2015     // Update the floating point rounding in the team if required.
2016     propagateFPControl(team);
2017 
2018     if (__kmp_tasking_mode != tskm_immediate_exec) {
2019       // Set the primary thread's task team to the team's task team. Unless
2020       // this is a hot team, it should be NULL.
2021       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2022                        parent_team->t.t_task_team[master_th->th.th_task_state]);
2023       KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team "
2024                     "%p, new task_team %p / team %p\n",
2025                     __kmp_gtid_from_thread(master_th),
2026                     master_th->th.th_task_team, parent_team,
2027                     team->t.t_task_team[master_th->th.th_task_state], team));
2028 
2029       if (active_level || master_th->th.th_task_team) {
2030         // Save the primary thread's task_state on the memo stack
2031         KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2032         if (master_th->th.th_task_state_top >=
2033             master_th->th.th_task_state_stack_sz) { // increase size
2034           kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2035           kmp_uint8 *old_stack, *new_stack;
2036           kmp_uint32 i;
2037           new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2038           for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2039             new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2040           }
2041           for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2042                ++i) { // zero-init rest of stack
2043             new_stack[i] = 0;
2044           }
2045           old_stack = master_th->th.th_task_state_memo_stack;
2046           master_th->th.th_task_state_memo_stack = new_stack;
2047           master_th->th.th_task_state_stack_sz = new_size;
2048           __kmp_free(old_stack);
2049         }
2050         // Store primary thread's task_state on stack
2051         master_th->th
2052             .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2053             master_th->th.th_task_state;
2054         master_th->th.th_task_state_top++;
2055 #if KMP_NESTED_HOT_TEAMS
2056         if (master_th->th.th_hot_teams &&
2057             active_level < __kmp_hot_teams_max_level &&
2058             team == master_th->th.th_hot_teams[active_level].hot_team) {
2059           // Restore primary thread's nested state if nested hot team
2060           master_th->th.th_task_state =
2061               master_th->th
2062                   .th_task_state_memo_stack[master_th->th.th_task_state_top];
2063         } else {
2064 #endif
2065           master_th->th.th_task_state = 0;
2066 #if KMP_NESTED_HOT_TEAMS
2067         }
2068 #endif
2069       }
2070 #if !KMP_NESTED_HOT_TEAMS
2071       KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2072                        (team == root->r.r_hot_team));
2073 #endif
2074     }
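    // The th_task_state memo stack pushed above is popped in __kmp_join_call()
    // so the primary thread resumes the task_state of the enclosing team; the
    // stack capacity grows by doubling when nesting exceeds the current size.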
2075 
2076     KA_TRACE(
2077         20,
2078         ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2079          gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2080          team->t.t_nproc));
2081     KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2082                      (team->t.t_master_tid == 0 &&
2083                       (team->t.t_parent == root->r.r_root_team ||
2084                        team->t.t_parent->t.t_serialized)));
2085     KMP_MB();
2086 
2087     /* now, setup the arguments */
2088     argv = (void **)team->t.t_argv;
2089     if (ap) {
2090       for (i = argc - 1; i >= 0; --i) {
2091         void *new_argv = va_arg(kmp_va_deref(ap), void *);
2092         KMP_CHECK_UPDATE(*argv, new_argv);
2093         argv++;
2094       }
2095     } else {
2096       for (i = 0; i < argc; ++i) {
2097         // Get args from parent team for teams construct
2098         KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2099       }
2100     }
2101 
2102     /* now actually fork the threads */
2103     KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2104     if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2105       root->r.r_active = TRUE;
2106 
2107     __kmp_fork_team_threads(root, team, master_th, gtid);
2108     __kmp_setup_icv_copy(team, nthreads,
2109                          &master_th->th.th_current_task->td_icvs, loc);
2110 
2111 #if OMPT_SUPPORT
2112     master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2113 #endif
2114 
2115     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2116 
2117 #if USE_ITT_BUILD
2118     if (team->t.t_active_level == 1 // only report frames at level 1
2119         && !master_th->th.th_teams_microtask) { // not in teams construct
2120 #if USE_ITT_NOTIFY
2121       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2122           (__kmp_forkjoin_frames_mode == 3 ||
2123            __kmp_forkjoin_frames_mode == 1)) {
2124         kmp_uint64 tmp_time = 0;
2125         if (__itt_get_timestamp_ptr)
2126           tmp_time = __itt_get_timestamp();
2127         // Internal fork - report frame begin
2128         master_th->th.th_frame_time = tmp_time;
2129         if (__kmp_forkjoin_frames_mode == 3)
2130           team->t.t_region_time = tmp_time;
2131       } else
2132 // only one notification scheme (either "submit" or "forking/joined", not both)
2133 #endif /* USE_ITT_NOTIFY */
2134           if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2135               __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2136         // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2137         __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2138       }
2139     }
2140 #endif /* USE_ITT_BUILD */
2141 
2142     /* now go on and do the work */
2143     KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2144     KMP_MB();
2145     KF_TRACE(10,
2146              ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2147               root, team, master_th, gtid));
2148 
2149 #if USE_ITT_BUILD
2150     if (__itt_stack_caller_create_ptr) {
2151       // create new stack stitching id before entering fork barrier
2152       if (!enter_teams) {
2153         KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
2154         team->t.t_stack_id = __kmp_itt_stack_caller_create();
2155       } else if (parent_team->t.t_serialized) {
2156         // keep stack stitching id in the serialized parent_team;
2157         // current team will be used for parallel inside the teams;
2158         // if parent_team is active, then it already keeps stack stitching id
2159         // for the league of teams
2160         KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
2161         parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
2162       }
2163     }
2164 #endif /* USE_ITT_BUILD */
2165 
2166     // AC: skip __kmp_internal_fork at teams construct, let only primary
2167     // threads execute
2168     if (ap) {
2169       __kmp_internal_fork(loc, gtid, team);
2170       KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2171                     "master_th=%p, gtid=%d\n",
2172                     root, team, master_th, gtid));
2173     }
2174 
2175     if (call_context == fork_context_gnu) {
2176       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2177       return TRUE;
2178     }
2179 
2180     /* Invoke microtask for PRIMARY thread */
2181     KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2182                   team->t.t_id, team->t.t_pkfn));
2183   } // END of timer KMP_fork_call block
2184 
2185 #if KMP_STATS_ENABLED
2186   // If beginning a teams construct, then change thread state
2187   stats_state_e previous_state = KMP_GET_THREAD_STATE();
2188   if (!ap) {
2189     KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2190   }
2191 #endif
2192 
2193   if (!team->t.t_invoke(gtid)) {
2194     KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
2195   }
2196 
2197 #if KMP_STATS_ENABLED
2198   // If was beginning of a teams construct, then reset thread state
2199   if (!ap) {
2200     KMP_SET_THREAD_STATE(previous_state);
2201   }
2202 #endif
2203 
2204   KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2205                 team->t.t_id, team->t.t_pkfn));
2206   KMP_MB(); /* Flush all pending memory write invalidates.  */
2207 
2208   KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2209 
2210 #if OMPT_SUPPORT
2211   if (ompt_enabled.enabled) {
2212     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2213   }
2214 #endif
2215 
2216   return TRUE;
2217 }
2218 
2219 #if OMPT_SUPPORT
2220 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2221                                             kmp_team_t *team) {
2222   // restore state outside the region
2223   thread->th.ompt_thread_info.state =
2224       ((team->t.t_serialized) ? ompt_state_work_serial
2225                               : ompt_state_work_parallel);
2226 }
2227 
2228 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2229                                    kmp_team_t *team, ompt_data_t *parallel_data,
2230                                    int flags, void *codeptr) {
2231   ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2232   if (ompt_enabled.ompt_callback_parallel_end) {
2233     ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2234         parallel_data, &(task_info->task_data), flags, codeptr);
2235   }
2236 
2237   task_info->frame.enter_frame = ompt_data_none;
2238   __kmp_join_restore_state(thread, team);
2239 }
2240 #endif
2241 
2242 void __kmp_join_call(ident_t *loc, int gtid
2243 #if OMPT_SUPPORT
2244                      ,
2245                      enum fork_context_e fork_context
2246 #endif
2247                      ,
2248                      int exit_teams) {
2249   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2250   kmp_team_t *team;
2251   kmp_team_t *parent_team;
2252   kmp_info_t *master_th;
2253   kmp_root_t *root;
2254   int master_active;
2255 
2256   KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2257 
2258   /* setup current data */
2259   master_th = __kmp_threads[gtid];
2260   root = master_th->th.th_root;
2261   team = master_th->th.th_team;
2262   parent_team = team->t.t_parent;
2263 
2264   master_th->th.th_ident = loc;
2265 
2266 #if OMPT_SUPPORT
2267   void *team_microtask = (void *)team->t.t_pkfn;
2268   // For GOMP interface with serialized parallel, need the
2269   // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task
2270   // and end-parallel events.
2271   if (ompt_enabled.enabled &&
2272       !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2273     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2274   }
2275 #endif
2276 
2277 #if KMP_DEBUG
2278   if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2279     KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2280                   "th_task_team = %p\n",
2281                   __kmp_gtid_from_thread(master_th), team,
2282                   team->t.t_task_team[master_th->th.th_task_state],
2283                   master_th->th.th_task_team));
2284     KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2285                      team->t.t_task_team[master_th->th.th_task_state]);
2286   }
2287 #endif
2288 
2289   if (team->t.t_serialized) {
2290     if (master_th->th.th_teams_microtask) {
2291       // We are in teams construct
2292       int level = team->t.t_level;
2293       int tlevel = master_th->th.th_teams_level;
2294       if (level == tlevel) {
2295         // AC: we haven't incremented it earlier at start of teams construct,
2296         //     so do it here - at the end of teams construct
2297         team->t.t_level++;
2298       } else if (level == tlevel + 1) {
2299         // AC: we are exiting parallel inside teams, need to increment
2300         // serialization in order to restore it in the next call to
2301         // __kmpc_end_serialized_parallel
2302         team->t.t_serialized++;
2303       }
2304     }
2305     __kmpc_end_serialized_parallel(loc, gtid);
2306 
2307 #if OMPT_SUPPORT
2308     if (ompt_enabled.enabled) {
2309       __kmp_join_restore_state(master_th, parent_team);
2310     }
2311 #endif
2312 
2313     return;
2314   }
2315 
2316   master_active = team->t.t_master_active;
2317 
2318   if (!exit_teams) {
2319     // AC: No barrier for internal teams at exit from teams construct.
2320     //     But there is barrier for external team (league).
2321     __kmp_internal_join(loc, gtid, team);
2322 #if USE_ITT_BUILD
2323     if (__itt_stack_caller_create_ptr) {
2324       KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
2325       // destroy the stack stitching id after join barrier
2326       __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2327       team->t.t_stack_id = NULL;
2328     }
2329 #endif
2330   } else {
2331     master_th->th.th_task_state =
2332         0; // AC: no tasking in teams (out of any parallel)
2333 #if USE_ITT_BUILD
2334     if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
2335       KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
2336       // destroy the stack stitching id on exit from the teams construct
2337       // if parent_team is active, then the id will be destroyed later on
2338       // by master of the league of teams
2339       __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
2340       parent_team->t.t_stack_id = NULL;
2341     }
2342 #endif
2343   }
2344 
2345   KMP_MB();
2346 
2347 #if OMPT_SUPPORT
2348   ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2349   void *codeptr = team->t.ompt_team_info.master_return_address;
2350 #endif
2351 
2352 #if USE_ITT_BUILD
2353   // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2354   if (team->t.t_active_level == 1 &&
2355       (!master_th->th.th_teams_microtask || /* not in teams construct */
2356        master_th->th.th_teams_size.nteams == 1)) {
2357     master_th->th.th_ident = loc;
2358     // only one notification scheme (either "submit" or "forking/joined", not
2359     // both)
2360     if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2361         __kmp_forkjoin_frames_mode == 3)
2362       __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2363                              master_th->th.th_frame_time, 0, loc,
2364                              master_th->th.th_team_nproc, 1);
2365     else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2366              !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2367       __kmp_itt_region_joined(gtid);
2368   } // active_level == 1
2369 #endif /* USE_ITT_BUILD */
2370 
2371   if (master_th->th.th_teams_microtask && !exit_teams &&
2372       team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2373       team->t.t_level == master_th->th.th_teams_level + 1) {
2374 // AC: We need to leave the team structure intact at the end of a parallel
2375 // region inside the teams construct, so that the next parallel region can
2376 // reuse the same (hot) team; only adjust the nesting levels.
2377 #if OMPT_SUPPORT
2378     ompt_data_t ompt_parallel_data = ompt_data_none;
2379     if (ompt_enabled.enabled) {
2380       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2381       if (ompt_enabled.ompt_callback_implicit_task) {
2382         int ompt_team_size = team->t.t_nproc;
2383         ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2384             ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2385             OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2386       }
2387       task_info->frame.exit_frame = ompt_data_none;
2388       task_info->task_data = ompt_data_none;
2389       ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2390       __ompt_lw_taskteam_unlink(master_th);
2391     }
2392 #endif
2393     /* Decrement our nested depth level */
2394     team->t.t_level--;
2395     team->t.t_active_level--;
2396     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2397 
2398     // Restore number of threads in the team if needed. This code relies on
2399     // the proper adjustment of th_teams_size.nth after the fork in
2400     // __kmp_teams_master on each teams primary thread in the case that
2401     // __kmp_reserve_threads reduced it.
2402     if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2403       int old_num = master_th->th.th_team_nproc;
2404       int new_num = master_th->th.th_teams_size.nth;
2405       kmp_info_t **other_threads = team->t.t_threads;
2406       team->t.t_nproc = new_num;
2407       for (int i = 0; i < old_num; ++i) {
2408         other_threads[i]->th.th_team_nproc = new_num;
2409       }
2410       // Adjust states of non-used threads of the team
2411       for (int i = old_num; i < new_num; ++i) {
2412         // Re-initialize thread's barrier data.
2413         KMP_DEBUG_ASSERT(other_threads[i]);
2414         kmp_balign_t *balign = other_threads[i]->th.th_bar;
2415         for (int b = 0; b < bs_last_barrier; ++b) {
2416           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2417           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2418 #if USE_DEBUGGER
2419           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2420 #endif
2421         }
2422         if (__kmp_tasking_mode != tskm_immediate_exec) {
2423           // Synchronize thread's task state
2424           other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2425         }
2426       }
2427     }
2428 
2429 #if OMPT_SUPPORT
2430     if (ompt_enabled.enabled) {
2431       __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2432                       OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2433     }
2434 #endif
2435 
2436     return;
2437   }
2438 
2439   /* do cleanup and restore the parent team */
2440   master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2441   master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2442 
2443   master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2444 
2445   /* jc: The following lock has instructions with REL and ACQ semantics,
2446      separating the parallel user code called in this parallel region
2447      from the serial user code called after this function returns. */
2448   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2449 
2450   if (!master_th->th.th_teams_microtask ||
2451       team->t.t_level > master_th->th.th_teams_level) {
2452     /* Decrement our nested depth level */
2453     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2454   }
2455   KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2456 
2457 #if OMPT_SUPPORT
2458   if (ompt_enabled.enabled) {
2459     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2460     if (ompt_enabled.ompt_callback_implicit_task) {
2461       int flags = (team_microtask == (void *)__kmp_teams_master)
2462                       ? ompt_task_initial
2463                       : ompt_task_implicit;
2464       int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2465       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2466           ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2467           OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2468     }
2469     task_info->frame.exit_frame = ompt_data_none;
2470     task_info->task_data = ompt_data_none;
2471   }
2472 #endif
2473 
2474   KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2475                 master_th, team));
2476   __kmp_pop_current_task_from_thread(master_th);
2477 
2478 #if KMP_AFFINITY_SUPPORTED
2479   // Restore master thread's partition.
2480   master_th->th.th_first_place = team->t.t_first_place;
2481   master_th->th.th_last_place = team->t.t_last_place;
2482 #endif // KMP_AFFINITY_SUPPORTED
2483   master_th->th.th_def_allocator = team->t.t_def_allocator;
2484 
2485   updateHWFPControl(team);
2486 
2487   if (root->r.r_active != master_active)
2488     root->r.r_active = master_active;
2489 
2490   __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2491                             master_th)); // this will free worker threads
2492 
2493   /* this race was fun to find. make sure the following is in the critical
2494      region otherwise assertions may fail occasionally since the old team may be
2495      reallocated and the hierarchy appears inconsistent. it is actually safe to
2496      run and won't cause any bugs, but will cause those assertion failures. it's
2497      only one deref&assign so might as well put this in the critical region */
2498   master_th->th.th_team = parent_team;
2499   master_th->th.th_team_nproc = parent_team->t.t_nproc;
2500   master_th->th.th_team_master = parent_team->t.t_threads[0];
2501   master_th->th.th_team_serialized = parent_team->t.t_serialized;
2502 
2503   /* restore serialized team, if need be */
2504   if (parent_team->t.t_serialized &&
2505       parent_team != master_th->th.th_serial_team &&
2506       parent_team != root->r.r_root_team) {
2507     __kmp_free_team(root,
2508                     master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2509     master_th->th.th_serial_team = parent_team;
2510   }
2511 
2512   if (__kmp_tasking_mode != tskm_immediate_exec) {
2513     if (master_th->th.th_task_state_top >
2514         0) { // Restore task state from memo stack
2515       KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2516       // Remember primary thread's state if we re-use this nested hot team
2517       master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2518           master_th->th.th_task_state;
2519       --master_th->th.th_task_state_top; // pop
2520       // Now restore state at this level
2521       master_th->th.th_task_state =
2522           master_th->th
2523               .th_task_state_memo_stack[master_th->th.th_task_state_top];
2524     }
2525     // Copy the task team from the parent team to the primary thread
2526     master_th->th.th_task_team =
2527         parent_team->t.t_task_team[master_th->th.th_task_state];
2528     KA_TRACE(20,
2529              ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
2530               __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2531               parent_team));
2532   }
2533 
2534   // TODO: GEH - cannot do this assertion because root thread not set up as
2535   // executing
2536   // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2537   master_th->th.th_current_task->td_flags.executing = 1;
2538 
2539   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2540 
2541 #if OMPT_SUPPORT
2542   int flags =
2543       OMPT_INVOKER(fork_context) |
2544       ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2545                                                       : ompt_parallel_team);
2546   if (ompt_enabled.enabled) {
2547     __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2548                     codeptr);
2549   }
2550 #endif
2551 
2552   KMP_MB();
2553   KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2554 }
2555 
2556 /* Check whether we should push an internal control record onto the
2557    serial team stack.  If so, do it.  */
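/* A record is pushed only while the thread runs on its serial team with
   t_serialized > 1, i.e. from inside a nested, serialized parallel region.
   ICV setters (e.g. __kmp_set_num_threads below) call this first so that the
   ICVs of the enclosing level can be restored when the inner serialized
   region ends (presumably in __kmpc_end_serialized_parallel). Illustrative
   user code (sketch only):

     #pragma omp parallel if(0)          // serialized
     {
       #pragma omp parallel if(0)        // nested, serialized
       { omp_set_num_threads(4); }       // current ICVs saved before the change
     }
*/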
2558 void __kmp_save_internal_controls(kmp_info_t *thread) {
2559 
2560   if (thread->th.th_team != thread->th.th_serial_team) {
2561     return;
2562   }
2563   if (thread->th.th_team->t.t_serialized > 1) {
2564     int push = 0;
2565 
2566     if (thread->th.th_team->t.t_control_stack_top == NULL) {
2567       push = 1;
2568     } else {
2569       if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2570           thread->th.th_team->t.t_serialized) {
2571         push = 1;
2572       }
2573     }
2574     if (push) { /* push a record on the serial team's stack */
2575       kmp_internal_control_t *control =
2576           (kmp_internal_control_t *)__kmp_allocate(
2577               sizeof(kmp_internal_control_t));
2578 
2579       copy_icvs(control, &thread->th.th_current_task->td_icvs);
2580 
2581       control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2582 
2583       control->next = thread->th.th_team->t.t_control_stack_top;
2584       thread->th.th_team->t.t_control_stack_top = control;
2585     }
2586   }
2587 }
2588 
2589 /* Changes set_nproc */
2590 void __kmp_set_num_threads(int new_nth, int gtid) {
2591   kmp_info_t *thread;
2592   kmp_root_t *root;
2593 
2594   KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2595   KMP_DEBUG_ASSERT(__kmp_init_serial);
2596 
2597   if (new_nth < 1)
2598     new_nth = 1;
2599   else if (new_nth > __kmp_max_nth)
2600     new_nth = __kmp_max_nth;
2601 
2602   KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2603   thread = __kmp_threads[gtid];
2604   if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2605     return; // nothing to do
2606 
2607   __kmp_save_internal_controls(thread);
2608 
2609   set__nproc(thread, new_nth);
2610 
2611   // If this omp_set_num_threads() call will cause the hot team size to be
2612   // reduced (in the absence of a num_threads clause), then reduce it now,
2613   // rather than waiting for the next parallel region.
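  // Illustrative effect (sketch only): after
  //   omp_set_num_threads(8);
  //   #pragma omp parallel
  //   { /* hot team of 8 is created */ }
  //   omp_set_num_threads(2);
  // the code below releases the six extra hot-team threads right away
  // (subject to the guards on the condition) instead of keeping them around.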
2614   root = thread->th.th_root;
2615   if (__kmp_init_parallel && (!root->r.r_active) &&
2616       (root->r.r_hot_team->t.t_nproc > new_nth)
2617 #if KMP_NESTED_HOT_TEAMS
2618       && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2619 #endif
2620   ) {
2621     kmp_team_t *hot_team = root->r.r_hot_team;
2622     int f;
2623 
2624     __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2625 
2626     // Release the extra threads we don't need any more.
2627     for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2628       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2629       if (__kmp_tasking_mode != tskm_immediate_exec) {
2630         // When decreasing the team size, threads no longer in the team should
2631         // unreference their task team.
2632         hot_team->t.t_threads[f]->th.th_task_team = NULL;
2633       }
2634       __kmp_free_thread(hot_team->t.t_threads[f]);
2635       hot_team->t.t_threads[f] = NULL;
2636     }
2637     hot_team->t.t_nproc = new_nth;
2638 #if KMP_NESTED_HOT_TEAMS
2639     if (thread->th.th_hot_teams) {
2640       KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2641       thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2642     }
2643 #endif
2644 
2645     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2646 
2647     // Update the t_nproc field in the threads that are still active.
2648     for (f = 0; f < new_nth; f++) {
2649       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2650       hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2651     }
2652     // Special flag: the team size was changed by an omp_set_num_threads() call
2653     hot_team->t.t_size_changed = -1;
2654   }
2655 }
2656 
2657 /* Changes max_active_levels */
2658 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2659   kmp_info_t *thread;
2660 
2661   KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2662                 "%d = (%d)\n",
2663                 gtid, max_active_levels));
2664   KMP_DEBUG_ASSERT(__kmp_init_serial);
2665 
2666   // validate max_active_levels
2667   if (max_active_levels < 0) {
2668     KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2669     // We ignore this call if the user has specified a negative value.
2670     // The current setting won't be changed. The last valid setting will be
2671     // used. A warning will be issued (if warnings are allowed as controlled by
2672     // the KMP_WARNINGS env var).
2673     KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2674                   "max_active_levels for thread %d = (%d)\n",
2675                   gtid, max_active_levels));
2676     return;
2677   }
2678   if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2679     // OK: max_active_levels is within the valid range
2680     // [0; KMP_MAX_ACTIVE_LEVELS_LIMIT].
2681     // A zero value is allowed (implementation-defined behavior).
2682   } else {
2683     KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2684                 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2685     max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
    // The current upper limit is MAX_INT. (implementation defined behavior)
    // If the input exceeds the upper limit, it is clamped to the upper limit.
    // (implementation defined behavior)
    // In practice, this branch is unreachable while the limit is MAX_INT.
2690   }
2691   KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2692                 "max_active_levels for thread %d = (%d)\n",
2693                 gtid, max_active_levels));
2694 
2695   thread = __kmp_threads[gtid];
2696 
2697   __kmp_save_internal_controls(thread);
2698 
2699   set__max_active_levels(thread, max_active_levels);
2700 }
2701 
2702 /* Gets max_active_levels */
2703 int __kmp_get_max_active_levels(int gtid) {
2704   kmp_info_t *thread;
2705 
2706   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2707   KMP_DEBUG_ASSERT(__kmp_init_serial);
2708 
2709   thread = __kmp_threads[gtid];
2710   KMP_DEBUG_ASSERT(thread->th.th_current_task);
2711   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2712                 "curtask_maxaclevel=%d\n",
2713                 gtid, thread->th.th_current_task,
2714                 thread->th.th_current_task->td_icvs.max_active_levels));
2715   return thread->th.th_current_task->td_icvs.max_active_levels;
2716 }
2717 
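// The following setters/getters back the OpenMP 5.1 routines
// omp_set_num_teams() / omp_get_max_teams() and omp_set_teams_thread_limit() /
// omp_get_teams_thread_limit() (assumed to route here via the usual
// entry-point layer). Non-positive values are ignored by the setters.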
2718 // nteams-var per-device ICV
2719 void __kmp_set_num_teams(int num_teams) {
2720   if (num_teams > 0)
2721     __kmp_nteams = num_teams;
2722 }
2723 int __kmp_get_max_teams(void) { return __kmp_nteams; }
2724 // teams-thread-limit-var per-device ICV
2725 void __kmp_set_teams_thread_limit(int limit) {
2726   if (limit > 0)
2727     __kmp_teams_thread_limit = limit;
2728 }
2729 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
2730 
2731 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2732 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2733 
2734 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
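/* Illustrative mapping (derived from the code below; the user-level
   omp_set_schedule() routine is assumed to route here):
   - kmp_sched_static with chunk < KMP_DEFAULT_CHUNK maps to the unchunked
     kmp_sch_static; the other standard kinds (and chunked static) use the
     corresponding __kmp_sch_map entry;
   - for kmp_sched_auto, or when chunk < 1, the chunk argument is ignored and
     KMP_DEFAULT_CHUNK is stored instead. */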
2735 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2736   kmp_info_t *thread;
2737   kmp_sched_t orig_kind;
2738   //    kmp_team_t *team;
2739 
2740   KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2741                 gtid, (int)kind, chunk));
2742   KMP_DEBUG_ASSERT(__kmp_init_serial);
2743 
2744   // Check if the kind parameter is valid, correct if needed.
2745   // Valid parameters should fit in one of two intervals - standard or extended:
2746   //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2747   // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
2748   orig_kind = kind;
2749   kind = __kmp_sched_without_mods(kind);
2750 
2751   if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2752       (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2753     // TODO: Hint needs attention in case we change the default schedule.
2754     __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2755               KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2756               __kmp_msg_null);
2757     kind = kmp_sched_default;
2758     chunk = 0; // ignore chunk value in case of bad kind
2759   }
2760 
2761   thread = __kmp_threads[gtid];
2762 
2763   __kmp_save_internal_controls(thread);
2764 
2765   if (kind < kmp_sched_upper_std) {
2766     if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
      // distinguish static chunked vs. unchunked: an invalid chunk value
      // indicates the unchunked schedule (which is the default)
2769       thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2770     } else {
2771       thread->th.th_current_task->td_icvs.sched.r_sched_type =
2772           __kmp_sch_map[kind - kmp_sched_lower - 1];
2773     }
2774   } else {
2775     //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2776     //    kmp_sched_lower - 2 ];
2777     thread->th.th_current_task->td_icvs.sched.r_sched_type =
2778         __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2779                       kmp_sched_lower - 2];
2780   }
2781   __kmp_sched_apply_mods_intkind(
2782       orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2783   if (kind == kmp_sched_auto || chunk < 1) {
2784     // ignore parameter chunk for schedule auto
2785     thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2786   } else {
2787     thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2788   }
2789 }
2790 
2791 /* Gets def_sched_var ICV values */
2792 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2793   kmp_info_t *thread;
2794   enum sched_type th_type;
2795 
2796   KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2797   KMP_DEBUG_ASSERT(__kmp_init_serial);
2798 
2799   thread = __kmp_threads[gtid];
2800 
2801   th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
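  // Fold the internal, more detailed schedule kinds (greedy/balanced static,
  // iterative/analytical guided) back onto the standard OpenMP kinds before
  // reporting them.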
2802   switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2803   case kmp_sch_static:
2804   case kmp_sch_static_greedy:
2805   case kmp_sch_static_balanced:
2806     *kind = kmp_sched_static;
2807     __kmp_sched_apply_mods_stdkind(kind, th_type);
    *chunk = 0; // report zero to indicate that no chunk value was set
2809     return;
2810   case kmp_sch_static_chunked:
2811     *kind = kmp_sched_static;
2812     break;
2813   case kmp_sch_dynamic_chunked:
2814     *kind = kmp_sched_dynamic;
2815     break;
2816   case kmp_sch_guided_chunked:
2817   case kmp_sch_guided_iterative_chunked:
2818   case kmp_sch_guided_analytical_chunked:
2819     *kind = kmp_sched_guided;
2820     break;
2821   case kmp_sch_auto:
2822     *kind = kmp_sched_auto;
2823     break;
2824   case kmp_sch_trapezoidal:
2825     *kind = kmp_sched_trapezoidal;
2826     break;
2827 #if KMP_STATIC_STEAL_ENABLED
2828   case kmp_sch_static_steal:
2829     *kind = kmp_sched_static_steal;
2830     break;
2831 #endif
2832   default:
2833     KMP_FATAL(UnknownSchedulingType, th_type);
2834   }
2835 
2836   __kmp_sched_apply_mods_stdkind(kind, th_type);
2837   *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2838 }
2839 
2840 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2841 
2842   int ii, dd;
2843   kmp_team_t *team;
2844   kmp_info_t *thr;
2845 
2846   KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2847   KMP_DEBUG_ASSERT(__kmp_init_serial);
2848 
2849   // validate level
2850   if (level == 0)
2851     return 0;
2852   if (level < 0)
2853     return -1;
2854   thr = __kmp_threads[gtid];
2855   team = thr->th.th_team;
2856   ii = team->t.t_level;
2857   if (level > ii)
2858     return -1;
2859 
2860   if (thr->th.th_teams_microtask) {
    // AC: we are in a teams region where multiple nested teams share a level
    int tlevel = thr->th.th_teams_level; // the level of the teams construct
    if (level <= tlevel) {
      // otherwise the usual algorithm works and will not touch the teams
      KMP_DEBUG_ASSERT(ii >= tlevel);
      // AC: since we need to walk past the teams league, artificially
      // increase ii
2868       if (ii == tlevel) {
2869         ii += 2; // three teams have same level
2870       } else {
2871         ii++; // two teams have same level
2872       }
2873     }
2874   }
2875 
2876   if (ii == level)
2877     return __kmp_tid_from_gtid(gtid);
2878 
2879   dd = team->t.t_serialized;
2880   level++;
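  // Walk up the team tree toward the requested level; serialized nestings
  // consume levels via the inner loop, while stepping to the parent of an
  // active team consumes one level per step.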
2881   while (ii > level) {
2882     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2883     }
2884     if ((team->t.t_serialized) && (!dd)) {
2885       team = team->t.t_parent;
2886       continue;
2887     }
2888     if (ii > level) {
2889       team = team->t.t_parent;
2890       dd = team->t.t_serialized;
2891       ii--;
2892     }
2893   }
2894 
2895   return (dd > 1) ? (0) : (team->t.t_master_tid);
2896 }
2897 
2898 int __kmp_get_team_size(int gtid, int level) {
2899 
2900   int ii, dd;
2901   kmp_team_t *team;
2902   kmp_info_t *thr;
2903 
2904   KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2905   KMP_DEBUG_ASSERT(__kmp_init_serial);
2906 
2907   // validate level
2908   if (level == 0)
2909     return 1;
2910   if (level < 0)
2911     return -1;
2912   thr = __kmp_threads[gtid];
2913   team = thr->th.th_team;
2914   ii = team->t.t_level;
2915   if (level > ii)
2916     return -1;
2917 
2918   if (thr->th.th_teams_microtask) {
    // AC: we are in a teams region where multiple nested teams share a level
    int tlevel = thr->th.th_teams_level; // the level of the teams construct
    if (level <= tlevel) {
      // otherwise the usual algorithm works and will not touch the teams
      KMP_DEBUG_ASSERT(ii >= tlevel);
      // AC: since we need to walk past the teams league, artificially
      // increase ii
2926       if (ii == tlevel) {
2927         ii += 2; // three teams have same level
2928       } else {
2929         ii++; // two teams have same level
2930       }
2931     }
2932   }
2933 
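  // Same walk as in __kmp_get_ancestor_thread_num: move up the team tree,
  // consuming serialized nestings and parent levels, until the team at the
  // requested level is reached.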
2934   while (ii > level) {
2935     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2936     }
2937     if (team->t.t_serialized && (!dd)) {
2938       team = team->t.t_parent;
2939       continue;
2940     }
2941     if (ii > level) {
2942       team = team->t.t_parent;
2943       ii--;
2944     }
2945   }
2946 
2947   return team->t.t_nproc;
2948 }
2949 
2950 kmp_r_sched_t __kmp_get_schedule_global() {
  // This routine exists because the pairs (__kmp_sched, __kmp_chunk) and
  // (__kmp_static, __kmp_guided) may be changed independently by
  // kmp_set_defaults, so the up-to-date schedule can be obtained here.
2954 
2955   kmp_r_sched_t r_sched;
2956 
2957   // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
  // __kmp_guided. __kmp_sched should keep its original value, so that the user
  // can set KMP_SCHEDULE multiple times and thus have different run-time
  // schedules in different roots (even in OMP 2.5).
2961   enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
2962   enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
2963   if (s == kmp_sch_static) {
2964     // replace STATIC with more detailed schedule (balanced or greedy)
2965     r_sched.r_sched_type = __kmp_static;
2966   } else if (s == kmp_sch_guided_chunked) {
2967     // replace GUIDED with more detailed schedule (iterative or analytical)
2968     r_sched.r_sched_type = __kmp_guided;
2969   } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2970     r_sched.r_sched_type = __kmp_sched;
2971   }
2972   SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
2973 
2974   if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
2975     // __kmp_chunk may be wrong here (if it was not ever set)
2976     r_sched.chunk = KMP_DEFAULT_CHUNK;
2977   } else {
2978     r_sched.chunk = __kmp_chunk;
2979   }
2980 
2981   return r_sched;
2982 }
2983 
/* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
   at least argc number of *t_argv entries for the requested team. */
2986 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
2987 
2988   KMP_DEBUG_ASSERT(team);
2989   if (!realloc || argc > team->t.t_max_argc) {
2990 
2991     KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
2992                    "current entries=%d\n",
2993                    team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
2994     /* if previously allocated heap space for args, free them */
2995     if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
2996       __kmp_free((void *)team->t.t_argv);
2997 
2998     if (argc <= KMP_INLINE_ARGV_ENTRIES) {
2999       /* use unused space in the cache line for arguments */
3000       team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3001       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3002                      "argv entries\n",
3003                      team->t.t_id, team->t.t_max_argc));
3004       team->t.t_argv = &team->t.t_inline_argv[0];
3005       if (__kmp_storage_map) {
3006         __kmp_print_storage_map_gtid(
3007             -1, &team->t.t_inline_argv[0],
3008             &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3009             (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3010             team->t.t_id);
3011       }
3012     } else {
3013       /* allocate space for arguments in the heap */
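      // Growth policy: requests of at most half of KMP_MIN_MALLOC_ARGV_ENTRIES
      // get KMP_MIN_MALLOC_ARGV_ENTRIES slots; larger requests get twice the
      // requested argc to amortize future reallocations.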
3014       team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3015                                ? KMP_MIN_MALLOC_ARGV_ENTRIES
3016                                : 2 * argc;
3017       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3018                      "argv entries\n",
3019                      team->t.t_id, team->t.t_max_argc));
3020       team->t.t_argv =
3021           (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3022       if (__kmp_storage_map) {
3023         __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3024                                      &team->t.t_argv[team->t.t_max_argc],
3025                                      sizeof(void *) * team->t.t_max_argc,
3026                                      "team_%d.t_argv", team->t.t_id);
3027       }
3028     }
3029   }
3030 }
3031 
3032 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3033   int i;
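  // A serial team (max_nth == 1) needs only two dispatch buffers; parallel
  // teams use the configured __kmp_dispatch_num_buffers.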
3034   int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3035   team->t.t_threads =
3036       (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3037   team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3038       sizeof(dispatch_shared_info_t) * num_disp_buff);
3039   team->t.t_dispatch =
3040       (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3041   team->t.t_implicit_task_taskdata =
3042       (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3043   team->t.t_max_nproc = max_nth;
3044 
3045   /* setup dispatch buffers */
3046   for (i = 0; i < num_disp_buff; ++i) {
3047     team->t.t_disp_buffer[i].buffer_index = i;
3048     team->t.t_disp_buffer[i].doacross_buf_idx = i;
3049   }
3050 }
3051 
3052 static void __kmp_free_team_arrays(kmp_team_t *team) {
3053   /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3054   int i;
3055   for (i = 0; i < team->t.t_max_nproc; ++i) {
3056     if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3057       __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3058       team->t.t_dispatch[i].th_disp_buffer = NULL;
3059     }
3060   }
3061 #if KMP_USE_HIER_SCHED
3062   __kmp_dispatch_free_hierarchies(team);
3063 #endif
3064   __kmp_free(team->t.t_threads);
3065   __kmp_free(team->t.t_disp_buffer);
3066   __kmp_free(team->t.t_dispatch);
3067   __kmp_free(team->t.t_implicit_task_taskdata);
3068   team->t.t_threads = NULL;
3069   team->t.t_disp_buffer = NULL;
3070   team->t.t_dispatch = NULL;
  team->t.t_implicit_task_taskdata = NULL;
3072 }
3073 
3074 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3075   kmp_info_t **oldThreads = team->t.t_threads;
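  // Only the thread pointer array is preserved across the resize; the dispatch
  // buffers, dispatch structures, and implicit task data are freed here and
  // recreated from scratch by __kmp_allocate_team_arrays().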
3076 
3077   __kmp_free(team->t.t_disp_buffer);
3078   __kmp_free(team->t.t_dispatch);
3079   __kmp_free(team->t.t_implicit_task_taskdata);
3080   __kmp_allocate_team_arrays(team, max_nth);
3081 
3082   KMP_MEMCPY(team->t.t_threads, oldThreads,
3083              team->t.t_nproc * sizeof(kmp_info_t *));
3084 
3085   __kmp_free(oldThreads);
3086 }
3087 
3088 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3089 
3090   kmp_r_sched_t r_sched =
3091       __kmp_get_schedule_global(); // get current state of scheduling globals
3092 
3093   KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3094 
3095   kmp_internal_control_t g_icvs = {
3096     0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3097     (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3098     // adjustment of threads (per thread)
3099     (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3100     // whether blocktime is explicitly set
3101     __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3102 #if KMP_USE_MONITOR
3103     __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3104 // intervals
3105 #endif
3106     __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3107     // next parallel region (per thread)
3108     // (use a max ub on value if __kmp_parallel_initialize not called yet)
3109     __kmp_cg_max_nth, // int thread_limit;
3110     __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3111     // for max_active_levels
3112     r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3113     // {sched,chunk} pair
3114     __kmp_nested_proc_bind.bind_types[0],
3115     __kmp_default_device,
3116     NULL // struct kmp_internal_control *next;
3117   };
3118 
3119   return g_icvs;
3120 }
3121 
3122 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3123 
3124   kmp_internal_control_t gx_icvs;
3125   gx_icvs.serial_nesting_level =
      0; // probably should be team->t.t_serialized, as done in
         // __kmp_save_internal_controls
3127   copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3128   gx_icvs.next = NULL;
3129 
3130   return gx_icvs;
3131 }
3132 
3133 static void __kmp_initialize_root(kmp_root_t *root) {
3134   int f;
3135   kmp_team_t *root_team;
3136   kmp_team_t *hot_team;
3137   int hot_team_max_nth;
3138   kmp_r_sched_t r_sched =
3139       __kmp_get_schedule_global(); // get current state of scheduling globals
3140   kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3141   KMP_DEBUG_ASSERT(root);
3142   KMP_ASSERT(!root->r.r_begin);
3143 
3144   /* setup the root state structure */
3145   __kmp_init_lock(&root->r.r_begin_lock);
3146   root->r.r_begin = FALSE;
3147   root->r.r_active = FALSE;
3148   root->r.r_in_parallel = 0;
3149   root->r.r_blocktime = __kmp_dflt_blocktime;
3150 
3151   /* setup the root team for this task */
3152   /* allocate the root team structure */
3153   KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3154 
3155   root_team =
3156       __kmp_allocate_team(root,
3157                           1, // new_nproc
3158                           1, // max_nproc
3159 #if OMPT_SUPPORT
3160                           ompt_data_none, // root parallel id
3161 #endif
3162                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3163                           0 // argc
3164                           USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3165                           );
3166 #if USE_DEBUGGER
3167   // Non-NULL value should be assigned to make the debugger display the root
3168   // team.
3169   TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3170 #endif
3171 
3172   KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3173 
3174   root->r.r_root_team = root_team;
3175   root_team->t.t_control_stack_top = NULL;
3176 
3177   /* initialize root team */
3178   root_team->t.t_threads[0] = NULL;
3179   root_team->t.t_nproc = 1;
3180   root_team->t.t_serialized = 1;
3181   // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3182   root_team->t.t_sched.sched = r_sched.sched;
3183   KA_TRACE(
3184       20,
3185       ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3186        root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3187 
3188   /* setup the  hot team for this task */
3189   /* allocate the hot team structure */
3190   KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3191 
3192   hot_team =
3193       __kmp_allocate_team(root,
3194                           1, // new_nproc
3195                           __kmp_dflt_team_nth_ub * 2, // max_nproc
3196 #if OMPT_SUPPORT
3197                           ompt_data_none, // root parallel id
3198 #endif
3199                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3200                           0 // argc
3201                           USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3202                           );
3203   KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3204 
3205   root->r.r_hot_team = hot_team;
3206   root_team->t.t_control_stack_top = NULL;
3207 
3208   /* first-time initialization */
3209   hot_team->t.t_parent = root_team;
3210 
3211   /* initialize hot team */
3212   hot_team_max_nth = hot_team->t.t_max_nproc;
3213   for (f = 0; f < hot_team_max_nth; ++f) {
3214     hot_team->t.t_threads[f] = NULL;
3215   }
3216   hot_team->t.t_nproc = 1;
3217   // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3218   hot_team->t.t_sched.sched = r_sched.sched;
3219   hot_team->t.t_size_changed = 0;
3220 }
3221 
3222 #ifdef KMP_DEBUG
3223 
3224 typedef struct kmp_team_list_item {
3225   kmp_team_p const *entry;
3226   struct kmp_team_list_item *next;
3227 } kmp_team_list_item_t;
3228 typedef kmp_team_list_item_t *kmp_team_list_t;
3229 
3230 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3231     kmp_team_list_t list, // List of teams.
3232     kmp_team_p const *team // Team to add.
3233 ) {
3234 
3235   // List must terminate with item where both entry and next are NULL.
3236   // Team is added to the list only once.
3237   // List is sorted in ascending order by team id.
3238   // Team id is *not* a key.
3239 
3240   kmp_team_list_t l;
3241 
3242   KMP_DEBUG_ASSERT(list != NULL);
3243   if (team == NULL) {
3244     return;
3245   }
3246 
3247   __kmp_print_structure_team_accum(list, team->t.t_parent);
3248   __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3249 
3250   // Search list for the team.
3251   l = list;
3252   while (l->next != NULL && l->entry != team) {
3253     l = l->next;
3254   }
3255   if (l->next != NULL) {
3256     return; // Team has been added before, exit.
3257   }
3258 
3259   // Team is not found. Search list again for insertion point.
3260   l = list;
3261   while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3262     l = l->next;
3263   }
3264 
3265   // Insert team.
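  // Insertion trick: copy the current node into the new item and then
  // overwrite the current node in place, which inserts before `l` without
  // having to track the previous node.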
3266   {
3267     kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3268         sizeof(kmp_team_list_item_t));
3269     *item = *l;
3270     l->entry = team;
3271     l->next = item;
3272   }
3273 }
3274 
static void __kmp_print_structure_team(char const *title,
                                       kmp_team_p const *team) {
3278   __kmp_printf("%s", title);
3279   if (team != NULL) {
3280     __kmp_printf("%2x %p\n", team->t.t_id, team);
3281   } else {
3282     __kmp_printf(" - (nil)\n");
3283   }
3284 }
3285 
3286 static void __kmp_print_structure_thread(char const *title,
3287                                          kmp_info_p const *thread) {
3288   __kmp_printf("%s", title);
3289   if (thread != NULL) {
3290     __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3291   } else {
3292     __kmp_printf(" - (nil)\n");
3293   }
3294 }
3295 
3296 void __kmp_print_structure(void) {
3297 
3298   kmp_team_list_t list;
3299 
3300   // Initialize list of teams.
3301   list =
3302       (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3303   list->entry = NULL;
3304   list->next = NULL;
3305 
3306   __kmp_printf("\n------------------------------\nGlobal Thread "
3307                "Table\n------------------------------\n");
3308   {
3309     int gtid;
3310     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3311       __kmp_printf("%2d", gtid);
3312       if (__kmp_threads != NULL) {
3313         __kmp_printf(" %p", __kmp_threads[gtid]);
3314       }
3315       if (__kmp_root != NULL) {
3316         __kmp_printf(" %p", __kmp_root[gtid]);
3317       }
3318       __kmp_printf("\n");
3319     }
3320   }
3321 
3322   // Print out __kmp_threads array.
3323   __kmp_printf("\n------------------------------\nThreads\n--------------------"
3324                "----------\n");
3325   if (__kmp_threads != NULL) {
3326     int gtid;
3327     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3328       kmp_info_t const *thread = __kmp_threads[gtid];
3329       if (thread != NULL) {
3330         __kmp_printf("GTID %2d %p:\n", gtid, thread);
3331         __kmp_printf("    Our Root:        %p\n", thread->th.th_root);
3332         __kmp_print_structure_team("    Our Team:     ", thread->th.th_team);
3333         __kmp_print_structure_team("    Serial Team:  ",
3334                                    thread->th.th_serial_team);
3335         __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
3336         __kmp_print_structure_thread("    Primary:      ",
3337                                      thread->th.th_team_master);
3338         __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
3339         __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
3340         __kmp_printf("    Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3341         __kmp_print_structure_thread("    Next in pool: ",
3342                                      thread->th.th_next_pool);
3343         __kmp_printf("\n");
3344         __kmp_print_structure_team_accum(list, thread->th.th_team);
3345         __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3346       }
3347     }
3348   } else {
3349     __kmp_printf("Threads array is not allocated.\n");
3350   }
3351 
3352   // Print out __kmp_root array.
3353   __kmp_printf("\n------------------------------\nUbers\n----------------------"
3354                "--------\n");
3355   if (__kmp_root != NULL) {
3356     int gtid;
3357     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3358       kmp_root_t const *root = __kmp_root[gtid];
3359       if (root != NULL) {
3360         __kmp_printf("GTID %2d %p:\n", gtid, root);
3361         __kmp_print_structure_team("    Root Team:    ", root->r.r_root_team);
3362         __kmp_print_structure_team("    Hot Team:     ", root->r.r_hot_team);
3363         __kmp_print_structure_thread("    Uber Thread:  ",
3364                                      root->r.r_uber_thread);
3365         __kmp_printf("    Active?:      %2d\n", root->r.r_active);
3366         __kmp_printf("    In Parallel:  %2d\n",
3367                      KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3368         __kmp_printf("\n");
3369         __kmp_print_structure_team_accum(list, root->r.r_root_team);
3370         __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3371       }
3372     }
3373   } else {
3374     __kmp_printf("Ubers array is not allocated.\n");
3375   }
3376 
3377   __kmp_printf("\n------------------------------\nTeams\n----------------------"
3378                "--------\n");
3379   while (list->next != NULL) {
3380     kmp_team_p const *team = list->entry;
3381     int i;
3382     __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3383     __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
3384     __kmp_printf("    Primary TID:      %2d\n", team->t.t_master_tid);
3385     __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
3386     __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
3387     __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
3388     for (i = 0; i < team->t.t_nproc; ++i) {
3389       __kmp_printf("    Thread %2d:      ", i);
3390       __kmp_print_structure_thread("", team->t.t_threads[i]);
3391     }
3392     __kmp_print_structure_team("    Next in pool:     ", team->t.t_next_pool);
3393     __kmp_printf("\n");
3394     list = list->next;
3395   }
3396 
3397   // Print out __kmp_thread_pool and __kmp_team_pool.
3398   __kmp_printf("\n------------------------------\nPools\n----------------------"
3399                "--------\n");
3400   __kmp_print_structure_thread("Thread pool:          ",
3401                                CCAST(kmp_info_t *, __kmp_thread_pool));
3402   __kmp_print_structure_team("Team pool:            ",
3403                              CCAST(kmp_team_t *, __kmp_team_pool));
3404   __kmp_printf("\n");
3405 
3406   // Free team list.
3407   while (list != NULL) {
3408     kmp_team_list_item_t *item = list;
3409     list = list->next;
3410     KMP_INTERNAL_FREE(item);
3411   }
3412 }
3413 
3414 #endif
3415 
3416 //---------------------------------------------------------------------------
3417 //  Stuff for per-thread fast random number generator
3418 //  Table of primes
3419 static const unsigned __kmp_primes[] = {
3420     0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3421     0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3422     0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3423     0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3424     0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3425     0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3426     0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3427     0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3428     0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3429     0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3430     0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3431 
3432 //---------------------------------------------------------------------------
3433 //  __kmp_get_random: Get a random number using a linear congruential method.
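//  A per-thread 32-bit LCG: x_{n+1} = a * x_n + 1 (mod 2^32), where the
//  multiplier `a` is a thread-specific prime chosen in __kmp_init_random.
//  The upper 16 bits of x are returned, since the low-order bits of an LCG
//  with a power-of-two modulus have short periods.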
3434 unsigned short __kmp_get_random(kmp_info_t *thread) {
3435   unsigned x = thread->th.th_x;
3436   unsigned short r = (unsigned short)(x >> 16);
3437 
3438   thread->th.th_x = x * thread->th.th_a + 1;
3439 
3440   KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3441                 thread->th.th_info.ds.ds_tid, r));
3442 
3443   return r;
3444 }
3445 //--------------------------------------------------------
3446 // __kmp_init_random: Initialize a random number generator
3447 void __kmp_init_random(kmp_info_t *thread) {
3448   unsigned seed = thread->th.th_info.ds.ds_tid;
3449 
3450   thread->th.th_a =
3451       __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3452   thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3453   KA_TRACE(30,
3454            ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3455 }
3456 
3457 #if KMP_OS_WINDOWS
3458 /* reclaim array entries for root threads that are already dead, returns number
3459  * reclaimed */
3460 static int __kmp_reclaim_dead_roots(void) {
3461   int i, r = 0;
3462 
3463   for (i = 0; i < __kmp_threads_capacity; ++i) {
3464     if (KMP_UBER_GTID(i) &&
3465         !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3466         !__kmp_root[i]
3467              ->r.r_active) { // AC: reclaim only roots died in non-active state
3468       r += __kmp_unregister_root_other_thread(i);
3469     }
3470   }
3471   return r;
3472 }
3473 #endif
3474 
3475 /* This function attempts to create free entries in __kmp_threads and
3476    __kmp_root, and returns the number of free entries generated.
3477 
3478    For Windows* OS static library, the first mechanism used is to reclaim array
3479    entries for root threads that are already dead.
3480 
   On all platforms, expansion is attempted on the arrays __kmp_threads and
3482    __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3483    capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3484    threadprivate cache array has been created. Synchronization with
3485    __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3486 
3487    After any dead root reclamation, if the clipping value allows array expansion
3488    to result in the generation of a total of nNeed free slots, the function does
3489    that expansion. If not, nothing is done beyond the possible initial root
3490    thread reclamation.
3491 
3492    If any argument is negative, the behavior is undefined. */
3493 static int __kmp_expand_threads(int nNeed) {
3494   int added = 0;
3495   int minimumRequiredCapacity;
3496   int newCapacity;
3497   kmp_info_t **newThreads;
3498   kmp_root_t **newRoot;
3499 
3500   // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3501   // resizing __kmp_threads does not need additional protection if foreign
3502   // threads are present
3503 
3504 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3505   /* only for Windows static library */
3506   /* reclaim array entries for root threads that are already dead */
3507   added = __kmp_reclaim_dead_roots();
3508 
3509   if (nNeed) {
3510     nNeed -= added;
3511     if (nNeed < 0)
3512       nNeed = 0;
3513   }
3514 #endif
3515   if (nNeed <= 0)
3516     return added;
3517 
3518   // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3519   // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3520   // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3521   // > __kmp_max_nth in one of two ways:
3522   //
3523   // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
3524   //    may not be reused by another thread, so we may need to increase
3525   //    __kmp_threads_capacity to __kmp_max_nth + 1.
3526   //
3527   // 2) New foreign root(s) are encountered.  We always register new foreign
3528   //    roots. This may cause a smaller # of threads to be allocated at
3529   //    subsequent parallel regions, but the worker threads hang around (and
3530   //    eventually go to sleep) and need slots in the __kmp_threads[] array.
3531   //
3532   // Anyway, that is the reason for moving the check to see if
3533   // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3534   // instead of having it performed here. -BB
3535 
3536   KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3537 
3538   /* compute expansion headroom to check if we can expand */
3539   if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3540     /* possible expansion too small -- give up */
3541     return added;
3542   }
3543   minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3544 
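  // Grow the capacity by repeated doubling, clipped to __kmp_sys_max_nth. The
  // new __kmp_threads and __kmp_root arrays are carved out of a single
  // allocation, then published by updating the two global pointers while the
  // fork/join lock is held.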
3545   newCapacity = __kmp_threads_capacity;
3546   do {
3547     newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3548                                                           : __kmp_sys_max_nth;
3549   } while (newCapacity < minimumRequiredCapacity);
3550   newThreads = (kmp_info_t **)__kmp_allocate(
3551       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3552   newRoot =
3553       (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3554   KMP_MEMCPY(newThreads, __kmp_threads,
3555              __kmp_threads_capacity * sizeof(kmp_info_t *));
3556   KMP_MEMCPY(newRoot, __kmp_root,
3557              __kmp_threads_capacity * sizeof(kmp_root_t *));
3558 
3559   kmp_info_t **temp_threads = __kmp_threads;
3560   *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3561   *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3562   __kmp_free(temp_threads);
3563   added += newCapacity - __kmp_threads_capacity;
3564   *(volatile int *)&__kmp_threads_capacity = newCapacity;
3565 
3566   if (newCapacity > __kmp_tp_capacity) {
3567     __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3568     if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3569       __kmp_threadprivate_resize_cache(newCapacity);
3570     } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3571       *(volatile int *)&__kmp_tp_capacity = newCapacity;
3572     }
3573     __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3574   }
3575 
3576   return added;
3577 }
3578 
3579 /* Register the current thread as a root thread and obtain our gtid. We must
3580    have the __kmp_initz_lock held at this point. Argument TRUE only if are the
3581    thread that calls from __kmp_do_serial_initialize() */
3582 int __kmp_register_root(int initial_thread) {
3583   kmp_info_t *root_thread;
3584   kmp_root_t *root;
3585   int gtid;
3586   int capacity;
3587   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3588   KA_TRACE(20, ("__kmp_register_root: entered\n"));
3589   KMP_MB();
3590 
  /* 2007-03-02:
     If the initial thread has not invoked the OpenMP RTL yet, and this thread
     is not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity"
     condition does not work as expected -- it may return false (meaning there
     is at least one empty slot in the __kmp_threads array), while the only
     free slot may be #0, which is reserved for the initial thread and so
     cannot be used here. The following code works around this problem.

     However, the right solution seems to be not reserving slot #0 for the
     initial thread, because:
     (1) there is no magic in slot #0,
     (2) we cannot detect the initial thread reliably (the first thread that
         performs serial initialization may not be the real initial thread).
  */
3605   capacity = __kmp_threads_capacity;
3606   if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3607     --capacity;
3608   }
3609 
3610   /* see if there are too many threads */
3611   if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3612     if (__kmp_tp_cached) {
3613       __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3614                   KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3615                   KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3616     } else {
3617       __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3618                   __kmp_msg_null);
3619     }
3620   }
3621 
3622   // When hidden helper task is enabled, __kmp_threads is organized as follows:
3623   // 0: initial thread, also a regular OpenMP thread.
3624   // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3625   // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3626   // regular OpenMP threads.
3627   if (TCR_4(__kmp_init_hidden_helper_threads)) {
3628     // Find an available thread slot for hidden helper thread. Slots for hidden
3629     // helper threads start from 1 to __kmp_hidden_helper_threads_num.
3630     for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3631                    gtid <= __kmp_hidden_helper_threads_num;
3632          gtid++)
3633       ;
3634     KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3635     KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3636                  "hidden helper thread: T#%d\n",
3637                  gtid));
3638   } else {
3639     /* find an available thread slot */
    // Don't reassign the zero slot since we need that to only be used by the
    // initial thread. Slots for hidden helper threads should also be skipped.
3642     if (initial_thread && __kmp_threads[0] == NULL) {
3643       gtid = 0;
3644     } else {
3645       for (gtid = __kmp_hidden_helper_threads_num + 1;
3646            TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3647         ;
3648     }
3649     KA_TRACE(
3650         1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3651     KMP_ASSERT(gtid < __kmp_threads_capacity);
3652   }
3653 
3654   /* update global accounting */
3655   __kmp_all_nth++;
3656   TCW_4(__kmp_nth, __kmp_nth + 1);
3657 
3658   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3659   // numbers of procs, and method #2 (keyed API call) for higher numbers.
3660   if (__kmp_adjust_gtid_mode) {
3661     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3662       if (TCR_4(__kmp_gtid_mode) != 2) {
3663         TCW_4(__kmp_gtid_mode, 2);
3664       }
3665     } else {
3666       if (TCR_4(__kmp_gtid_mode) != 1) {
3667         TCW_4(__kmp_gtid_mode, 1);
3668       }
3669     }
3670   }
3671 
3672 #ifdef KMP_ADJUST_BLOCKTIME
3673   /* Adjust blocktime to zero if necessary            */
3674   /* Middle initialization might not have occurred yet */
3675   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3676     if (__kmp_nth > __kmp_avail_proc) {
3677       __kmp_zero_bt = TRUE;
3678     }
3679   }
3680 #endif /* KMP_ADJUST_BLOCKTIME */
3681 
3682   /* setup this new hierarchy */
3683   if (!(root = __kmp_root[gtid])) {
3684     root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3685     KMP_DEBUG_ASSERT(!root->r.r_root_team);
3686   }
3687 
3688 #if KMP_STATS_ENABLED
3689   // Initialize stats as soon as possible (right after gtid assignment).
3690   __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3691   __kmp_stats_thread_ptr->startLife();
3692   KMP_SET_THREAD_STATE(SERIAL_REGION);
3693   KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3694 #endif
3695   __kmp_initialize_root(root);
3696 
3697   /* setup new root thread structure */
3698   if (root->r.r_uber_thread) {
3699     root_thread = root->r.r_uber_thread;
3700   } else {
3701     root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3702     if (__kmp_storage_map) {
3703       __kmp_print_thread_storage_map(root_thread, gtid);
3704     }
3705     root_thread->th.th_info.ds.ds_gtid = gtid;
3706 #if OMPT_SUPPORT
3707     root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3708 #endif
3709     root_thread->th.th_root = root;
3710     if (__kmp_env_consistency_check) {
3711       root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3712     }
3713 #if USE_FAST_MEMORY
3714     __kmp_initialize_fast_memory(root_thread);
3715 #endif /* USE_FAST_MEMORY */
3716 
3717 #if KMP_USE_BGET
3718     KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3719     __kmp_initialize_bget(root_thread);
3720 #endif
3721     __kmp_init_random(root_thread); // Initialize random number generator
3722   }
3723 
3724   /* setup the serial team held in reserve by the root thread */
3725   if (!root_thread->th.th_serial_team) {
3726     kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3727     KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3728     root_thread->th.th_serial_team = __kmp_allocate_team(
3729         root, 1, 1,
3730 #if OMPT_SUPPORT
3731         ompt_data_none, // root parallel id
3732 #endif
3733         proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3734   }
3735   KMP_ASSERT(root_thread->th.th_serial_team);
3736   KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3737                 root_thread->th.th_serial_team));
3738 
3739   /* drop root_thread into place */
3740   TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3741 
3742   root->r.r_root_team->t.t_threads[0] = root_thread;
3743   root->r.r_hot_team->t.t_threads[0] = root_thread;
3744   root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
  // AC: the team is created in reserve, not for execution (unused for now).
3746   root_thread->th.th_serial_team->t.t_serialized = 0;
3747   root->r.r_uber_thread = root_thread;
3748 
3749   /* initialize the thread, get it ready to go */
3750   __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3751   TCW_4(__kmp_init_gtid, TRUE);
3752 
3753   /* prepare the primary thread for get_gtid() */
3754   __kmp_gtid_set_specific(gtid);
3755 
3756 #if USE_ITT_BUILD
3757   __kmp_itt_thread_name(gtid);
3758 #endif /* USE_ITT_BUILD */
3759 
3760 #ifdef KMP_TDATA_GTID
3761   __kmp_gtid = gtid;
3762 #endif
3763   __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3764   KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3765 
3766   KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3767                 "plain=%u\n",
3768                 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3769                 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3770                 KMP_INIT_BARRIER_STATE));
3771   { // Initialize barrier data.
3772     int b;
3773     for (b = 0; b < bs_last_barrier; ++b) {
3774       root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3775 #if USE_DEBUGGER
3776       root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3777 #endif
3778     }
3779   }
3780   KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3781                    KMP_INIT_BARRIER_STATE);
3782 
3783 #if KMP_AFFINITY_SUPPORTED
3784   root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3785   root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3786   root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3787   root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3788   if (TCR_4(__kmp_init_middle)) {
3789     __kmp_affinity_set_init_mask(gtid, TRUE);
3790   }
3791 #endif /* KMP_AFFINITY_SUPPORTED */
3792   root_thread->th.th_def_allocator = __kmp_def_allocator;
3793   root_thread->th.th_prev_level = 0;
3794   root_thread->th.th_prev_num_threads = 1;
3795 
3796   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3797   tmp->cg_root = root_thread;
3798   tmp->cg_thread_limit = __kmp_cg_max_nth;
3799   tmp->cg_nthreads = 1;
3800   KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3801                  " cg_nthreads init to 1\n",
3802                  root_thread, tmp));
3803   tmp->up = NULL;
3804   root_thread->th.th_cg_roots = tmp;
3805 
3806   __kmp_root_counter++;
3807 
3808 #if OMPT_SUPPORT
3809   if (!initial_thread && ompt_enabled.enabled) {
3810 
3811     kmp_info_t *root_thread = ompt_get_thread();
3812 
3813     ompt_set_thread_state(root_thread, ompt_state_overhead);
3814 
3815     if (ompt_enabled.ompt_callback_thread_begin) {
3816       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3817           ompt_thread_initial, __ompt_get_thread_data_internal());
3818     }
3819     ompt_data_t *task_data;
3820     ompt_data_t *parallel_data;
3821     __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
3822                                   NULL);
3823     if (ompt_enabled.ompt_callback_implicit_task) {
3824       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3825           ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3826     }
3827 
3828     ompt_set_thread_state(root_thread, ompt_state_work_serial);
3829   }
3830 #endif
3831 
3832   KMP_MB();
3833   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3834 
3835   return gtid;
3836 }
3837 
3838 #if KMP_NESTED_HOT_TEAMS
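// Recursively free the nested hot teams reachable from 'thr' at nesting depth
// 'level' and below; returns the number of thread slots released (the primary
// thread of each hot team is not counted, since it is not freed here).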
3839 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3840                                 const int max_level) {
3841   int i, n, nth;
3842   kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3843   if (!hot_teams || !hot_teams[level].hot_team) {
3844     return 0;
3845   }
3846   KMP_DEBUG_ASSERT(level < max_level);
3847   kmp_team_t *team = hot_teams[level].hot_team;
3848   nth = hot_teams[level].hot_team_nth;
3849   n = nth - 1; // primary thread is not freed
3850   if (level < max_level - 1) {
3851     for (i = 0; i < nth; ++i) {
3852       kmp_info_t *th = team->t.t_threads[i];
3853       n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3854       if (i > 0 && th->th.th_hot_teams) {
3855         __kmp_free(th->th.th_hot_teams);
3856         th->th.th_hot_teams = NULL;
3857       }
3858     }
3859   }
3860   __kmp_free_team(root, team, NULL);
3861   return n;
3862 }
3863 #endif
3864 
// Resets a root thread and clears its root and hot teams.
3866 // Returns the number of __kmp_threads entries directly and indirectly freed.
3867 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3868   kmp_team_t *root_team = root->r.r_root_team;
3869   kmp_team_t *hot_team = root->r.r_hot_team;
3870   int n = hot_team->t.t_nproc;
3871   int i;
3872 
3873   KMP_DEBUG_ASSERT(!root->r.r_active);
3874 
3875   root->r.r_root_team = NULL;
3876   root->r.r_hot_team = NULL;
3877   // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3878   // before call to __kmp_free_team().
3879   __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3880 #if KMP_NESTED_HOT_TEAMS
3881   if (__kmp_hot_teams_max_level >
3882       0) { // need to free nested hot teams and their threads if any
3883     for (i = 0; i < hot_team->t.t_nproc; ++i) {
3884       kmp_info_t *th = hot_team->t.t_threads[i];
3885       if (__kmp_hot_teams_max_level > 1) {
3886         n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3887       }
3888       if (th->th.th_hot_teams) {
3889         __kmp_free(th->th.th_hot_teams);
3890         th->th.th_hot_teams = NULL;
3891       }
3892     }
3893   }
3894 #endif
3895   __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3896 
3897   // Before we can reap the thread, we need to make certain that all other
3898   // threads in the teams that had this root as ancestor have stopped trying to
3899   // steal tasks.
3900   if (__kmp_tasking_mode != tskm_immediate_exec) {
3901     __kmp_wait_to_unref_task_teams();
3902   }
3903 
3904 #if KMP_OS_WINDOWS
3905   /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3906   KA_TRACE(
3907       10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3908            "\n",
3909            (LPVOID) & (root->r.r_uber_thread->th),
3910            root->r.r_uber_thread->th.th_info.ds.ds_thread));
3911   __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3912 #endif /* KMP_OS_WINDOWS */
3913 
3914 #if OMPT_SUPPORT
3915   ompt_data_t *task_data;
3916   ompt_data_t *parallel_data;
3917   __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
3918                                 NULL);
3919   if (ompt_enabled.ompt_callback_implicit_task) {
3920     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3921         ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
3922   }
3923   if (ompt_enabled.ompt_callback_thread_end) {
3924     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3925         &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3926   }
3927 #endif
3928 
3929   TCW_4(__kmp_nth,
3930         __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3931   i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
3932   KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
3933                  " to %d\n",
3934                  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
3935                  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
3936   if (i == 1) {
3937     // need to free contention group structure
3938     KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
3939                      root->r.r_uber_thread->th.th_cg_roots->cg_root);
3940     KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
3941     __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
3942     root->r.r_uber_thread->th.th_cg_roots = NULL;
3943   }
3944   __kmp_reap_thread(root->r.r_uber_thread, 1);
3945 
  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
  // it instead of freeing it.
3948   root->r.r_uber_thread = NULL;
3949   /* mark root as no longer in use */
3950   root->r.r_begin = FALSE;
3951 
3952   return n;
3953 }
3954 
3955 void __kmp_unregister_root_current_thread(int gtid) {
3956   KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
  /* This lock should be OK, since unregister_root_current_thread is never
     called during an abort, only during a normal close. Furthermore, if you
     hold the forkjoin lock, you should never try to get the initz lock. */
3960   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3961   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
3962     KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
3963                   "exiting T#%d\n",
3964                   gtid));
3965     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3966     return;
3967   }
3968   kmp_root_t *root = __kmp_root[gtid];
3969 
3970   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3971   KMP_ASSERT(KMP_UBER_GTID(gtid));
3972   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3973   KMP_ASSERT(root->r.r_active == FALSE);
3974 
3975   KMP_MB();
3976 
3977   kmp_info_t *thread = __kmp_threads[gtid];
3978   kmp_team_t *team = thread->th.th_team;
3979   kmp_task_team_t *task_team = thread->th.th_task_team;
3980 
3981   // we need to wait for the proxy tasks before finishing the thread
3982   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
3983 #if OMPT_SUPPORT
3984     // the runtime is shutting down so we won't report any events
3985     thread->th.ompt_thread_info.state = ompt_state_undefined;
3986 #endif
3987     __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
3988   }
3989 
3990   __kmp_reset_root(gtid, root);
3991 
3992   KMP_MB();
3993   KC_TRACE(10,
3994            ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
3995 
3996   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3997 }
3998 
3999 #if KMP_OS_WINDOWS
/* __kmp_forkjoin_lock must already be held.
   Unregisters a root thread that is not the current thread. Returns the number
   of __kmp_threads entries freed as a result. */
4003 static int __kmp_unregister_root_other_thread(int gtid) {
4004   kmp_root_t *root = __kmp_root[gtid];
4005   int r;
4006 
4007   KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4008   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4009   KMP_ASSERT(KMP_UBER_GTID(gtid));
4010   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4011   KMP_ASSERT(root->r.r_active == FALSE);
4012 
4013   r = __kmp_reset_root(gtid, root);
4014   KC_TRACE(10,
4015            ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4016   return r;
4017 }
4018 #endif
4019 
4020 #if KMP_DEBUG
4021 void __kmp_task_info() {
4022 
4023   kmp_int32 gtid = __kmp_entry_gtid();
4024   kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4025   kmp_info_t *this_thr = __kmp_threads[gtid];
4026   kmp_team_t *steam = this_thr->th.th_serial_team;
4027   kmp_team_t *team = this_thr->th.th_team;
4028 
4029   __kmp_printf(
4030       "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4031       "ptask=%p\n",
4032       gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4033       team->t.t_implicit_task_taskdata[tid].td_parent);
4034 }
4035 #endif // KMP_DEBUG
4036 
4037 /* TODO optimize with one big memclr, take out what isn't needed, split
4038    responsibility to workers as much as possible, and delay initialization of
4039    features as much as possible  */
4040 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4041                                   int tid, int gtid) {
  /* this_thr->th.th_info.ds.ds_gtid is set up in
     kmp_allocate_thread/create_worker.
     this_thr->th.th_serial_team is set up in __kmp_allocate_thread */
4045   kmp_info_t *master = team->t.t_threads[0];
4046   KMP_DEBUG_ASSERT(this_thr != NULL);
4047   KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4048   KMP_DEBUG_ASSERT(team);
4049   KMP_DEBUG_ASSERT(team->t.t_threads);
4050   KMP_DEBUG_ASSERT(team->t.t_dispatch);
4051   KMP_DEBUG_ASSERT(master);
4052   KMP_DEBUG_ASSERT(master->th.th_root);
4053 
4054   KMP_MB();
4055 
4056   TCW_SYNC_PTR(this_thr->th.th_team, team);
4057 
4058   this_thr->th.th_info.ds.ds_tid = tid;
4059   this_thr->th.th_set_nproc = 0;
4060   if (__kmp_tasking_mode != tskm_immediate_exec)
4061     // When tasking is possible, threads are not safe to reap until they are
4062     // done tasking; this will be set when tasking code is exited in wait
4063     this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4064   else // no tasking --> always safe to reap
4065     this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4066   this_thr->th.th_set_proc_bind = proc_bind_default;
4067 #if KMP_AFFINITY_SUPPORTED
4068   this_thr->th.th_new_place = this_thr->th.th_current_place;
4069 #endif
4070   this_thr->th.th_root = master->th.th_root;
4071 
4072   /* setup the thread's cache of the team structure */
4073   this_thr->th.th_team_nproc = team->t.t_nproc;
4074   this_thr->th.th_team_master = master;
4075   this_thr->th.th_team_serialized = team->t.t_serialized;
4076   TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4077 
4078   KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4079 
4080   KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4081                 tid, gtid, this_thr, this_thr->th.th_current_task));
4082 
4083   __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4084                            team, tid, TRUE);
4085 
4086   KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4087                 tid, gtid, this_thr, this_thr->th.th_current_task));
4088   // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4089   // __kmp_initialize_team()?
4090 
4091   /* TODO no worksharing in speculative threads */
4092   this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4093 
4094   this_thr->th.th_local.this_construct = 0;
4095 
4096   if (!this_thr->th.th_pri_common) {
4097     this_thr->th.th_pri_common =
4098         (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4099     if (__kmp_storage_map) {
4100       __kmp_print_storage_map_gtid(
4101           gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4102           sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4103     }
4104     this_thr->th.th_pri_head = NULL;
4105   }
4106 
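  // Contention-group (CG) bookkeeping: each thread holds a reference to a CG
  // root node whose cg_nthreads field counts its members. A worker joining a
  // team drops its reference on its old CG node (freeing the node if it was
  // the last holder) and takes a reference on the primary thread's CG root;
  // the thread_limit ICV is then inherited from that root.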
4107   if (this_thr != master && // Primary thread's CG root is initialized elsewhere
4108       this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4109     // Make new thread's CG root same as primary thread's
4110     KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4111     kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4112     if (tmp) {
4113       // worker changes CG, need to check if old CG should be freed
4114       int i = tmp->cg_nthreads--;
4115       KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4116                      " on node %p of thread %p to %d\n",
4117                      this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4118       if (i == 1) {
4119         __kmp_free(tmp); // last thread left CG --> free it
4120       }
4121     }
4122     this_thr->th.th_cg_roots = master->th.th_cg_roots;
4123     // Increment new thread's CG root's counter to add the new thread
4124     this_thr->th.th_cg_roots->cg_nthreads++;
4125     KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4126                    " node %p of thread %p to %d\n",
4127                    this_thr, this_thr->th.th_cg_roots,
4128                    this_thr->th.th_cg_roots->cg_root,
4129                    this_thr->th.th_cg_roots->cg_nthreads));
4130     this_thr->th.th_current_task->td_icvs.thread_limit =
4131         this_thr->th.th_cg_roots->cg_thread_limit;
4132   }
4133 
4134   /* Initialize dynamic dispatch */
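  // Note: a serialized team (t_max_nproc == 1) only ever needs a single
  // dispatch buffer, while a real parallel team gets
  // __kmp_dispatch_num_buffers of them, presumably so that consecutive
  // dynamically scheduled loops can cycle through buffers without requiring a
  // barrier in between.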
4135   {
4136     volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4137     // Use team max_nproc since this will never change for the team.
4138     size_t disp_size =
4139         sizeof(dispatch_private_info_t) *
4140         (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4141     KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4142                   team->t.t_max_nproc));
4143     KMP_ASSERT(dispatch);
4144     KMP_DEBUG_ASSERT(team->t.t_dispatch);
4145     KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4146 
4147     dispatch->th_disp_index = 0;
4148     dispatch->th_doacross_buf_idx = 0;
4149     if (!dispatch->th_disp_buffer) {
4150       dispatch->th_disp_buffer =
4151           (dispatch_private_info_t *)__kmp_allocate(disp_size);
4152 
4153       if (__kmp_storage_map) {
4154         __kmp_print_storage_map_gtid(
4155             gtid, &dispatch->th_disp_buffer[0],
4156             &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4157                                           ? 1
4158                                           : __kmp_dispatch_num_buffers],
4159             disp_size,
4160             "th_%d.th_dispatch.th_disp_buffer "
4161             "(team_%d.t_dispatch[%d].th_disp_buffer)",
4162             gtid, team->t.t_id, gtid);
4163       }
4164     } else {
4165       memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4166     }
4167 
4168     dispatch->th_dispatch_pr_current = 0;
4169     dispatch->th_dispatch_sh_current = 0;
4170 
4171     dispatch->th_deo_fcn = 0; /* ORDERED     */
4172     dispatch->th_dxo_fcn = 0; /* END ORDERED */
4173   }
4174 
4175   this_thr->th.th_next_pool = NULL;
4176 
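  // The task-state memo stack records th_task_state per nested parallel level;
  // it starts with room for four levels (grown elsewhere if deeper nesting is
  // encountered) and is consulted later (e.g., in __kmp_allocate_team) when
  // seeding th_task_state for new threads in nested hot teams.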
4177   if (!this_thr->th.th_task_state_memo_stack) {
4178     size_t i;
4179     this_thr->th.th_task_state_memo_stack =
4180         (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4181     this_thr->th.th_task_state_top = 0;
4182     this_thr->th.th_task_state_stack_sz = 4;
4183     for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4184          ++i) // zero init the stack
4185       this_thr->th.th_task_state_memo_stack[i] = 0;
4186   }
4187 
4188   KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4189   KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4190 
4191   KMP_MB();
4192 }
4193 
/* Allocate a new thread for the requesting team. This is only called from
   within a forkjoin critical section. We will first try to get an available
   thread from the thread pool; if none is available, we will fork a new one,
   assuming we are able to create one. This should be assured, as the caller
   is expected to have checked for available capacity first. */
4199 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4200                                   int new_tid) {
4201   kmp_team_t *serial_team;
4202   kmp_info_t *new_thr;
4203   int new_gtid;
4204 
4205   KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4206   KMP_DEBUG_ASSERT(root && team);
4207 #if !KMP_NESTED_HOT_TEAMS
4208   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4209 #endif
4210   KMP_MB();
4211 
4212   /* first, try to get one from the thread pool */
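  // The pool is a LIFO list linked through th_next_pool; we take the head and,
  // if it happens to be the cached insertion point, invalidate that cache.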
4213   if (__kmp_thread_pool) {
4214     new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4215     __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4216     if (new_thr == __kmp_thread_pool_insert_pt) {
4217       __kmp_thread_pool_insert_pt = NULL;
4218     }
4219     TCW_4(new_thr->th.th_in_pool, FALSE);
4220     __kmp_suspend_initialize_thread(new_thr);
4221     __kmp_lock_suspend_mx(new_thr);
4222     if (new_thr->th.th_active_in_pool == TRUE) {
4223       KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4224       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4225       new_thr->th.th_active_in_pool = FALSE;
4226     }
4227     __kmp_unlock_suspend_mx(new_thr);
4228 
4229     KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4230                   __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4231     KMP_ASSERT(!new_thr->th.th_team);
4232     KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4233 
4234     /* setup the thread structure */
4235     __kmp_initialize_info(new_thr, team, new_tid,
4236                           new_thr->th.th_info.ds.ds_gtid);
4237     KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4238 
4239     TCW_4(__kmp_nth, __kmp_nth + 1);
4240 
4241     new_thr->th.th_task_state = 0;
4242     new_thr->th.th_task_state_top = 0;
4243     new_thr->th.th_task_state_stack_sz = 4;
4244 
4245 #ifdef KMP_ADJUST_BLOCKTIME
4246     /* Adjust blocktime back to zero if necessary */
4247     /* Middle initialization might not have occurred yet */
4248     if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4249       if (__kmp_nth > __kmp_avail_proc) {
4250         __kmp_zero_bt = TRUE;
4251       }
4252     }
4253 #endif /* KMP_ADJUST_BLOCKTIME */
4254 
4255 #if KMP_DEBUG
4256     // If thread entered pool via __kmp_free_thread, wait_flag should !=
4257     // KMP_BARRIER_PARENT_FLAG.
4258     int b;
4259     kmp_balign_t *balign = new_thr->th.th_bar;
4260     for (b = 0; b < bs_last_barrier; ++b)
4261       KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4262 #endif
4263 
4264     KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4265                   __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4266 
4267     KMP_MB();
4268     return new_thr;
4269   }
4270 
  /* no, so we'll fork a new one */
4272   KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4273   KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4274 
4275 #if KMP_USE_MONITOR
4276   // If this is the first worker thread the RTL is creating, then also
4277   // launch the monitor thread.  We try to do this as early as possible.
4278   if (!TCR_4(__kmp_init_monitor)) {
4279     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4280     if (!TCR_4(__kmp_init_monitor)) {
4281       KF_TRACE(10, ("before __kmp_create_monitor\n"));
4282       TCW_4(__kmp_init_monitor, 1);
4283       __kmp_create_monitor(&__kmp_monitor);
4284       KF_TRACE(10, ("after __kmp_create_monitor\n"));
4285 #if KMP_OS_WINDOWS
4286       // AC: wait until monitor has started. This is a fix for CQ232808.
4287       // The reason is that if the library is loaded/unloaded in a loop with
      // small (parallel) work in between, then there is a high probability
      // that the monitor thread starts only after the library has shut down.
      // At shutdown it is too late to cope with the problem, because when the
      // primary thread is in DllMain (process detach) the monitor has no
      // chance to start (it is blocked), and the primary thread has no means
      // to inform the monitor that the library has gone, because all the
      // memory the monitor can access is about to be released/reset.
4295       while (TCR_4(__kmp_init_monitor) < 2) {
4296         KMP_YIELD(TRUE);
4297       }
4298       KF_TRACE(10, ("after monitor thread has started\n"));
4299 #endif
4300     }
4301     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4302   }
4303 #endif
4304 
4305   KMP_MB();
4306 
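  // Pick the first free gtid. gtid 0 belongs to the initial root thread, and
  // gtids 1..__kmp_hidden_helper_threads_num are reserved for hidden helper
  // threads, so regular workers start searching after that range; while the
  // hidden helper threads themselves are being initialized, the search starts
  // at 1 so they land inside the reserved range.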
4307   {
4308     int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4309                              ? 1
4310                              : __kmp_hidden_helper_threads_num + 1;
4311 
4312     for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4313          ++new_gtid) {
4314       KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4315     }
4316 
4317     if (TCR_4(__kmp_init_hidden_helper_threads)) {
4318       KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4319     }
4320   }
4321 
4322   /* allocate space for it. */
4323   new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4324 
4325   TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4326 
4327 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4328   // suppress race conditions detection on synchronization flags in debug mode
4329   // this helps to analyze library internals eliminating false positives
4330   __itt_suppress_mark_range(
4331       __itt_suppress_range, __itt_suppress_threading_errors,
4332       &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4333   __itt_suppress_mark_range(
4334       __itt_suppress_range, __itt_suppress_threading_errors,
4335       &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4336 #if KMP_OS_WINDOWS
4337   __itt_suppress_mark_range(
4338       __itt_suppress_range, __itt_suppress_threading_errors,
4339       &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4340 #else
4341   __itt_suppress_mark_range(__itt_suppress_range,
4342                             __itt_suppress_threading_errors,
4343                             &new_thr->th.th_suspend_init_count,
4344                             sizeof(new_thr->th.th_suspend_init_count));
4345 #endif
4346   // TODO: check if we need to also suppress b_arrived flags
4347   __itt_suppress_mark_range(__itt_suppress_range,
4348                             __itt_suppress_threading_errors,
4349                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4350                             sizeof(new_thr->th.th_bar[0].bb.b_go));
4351   __itt_suppress_mark_range(__itt_suppress_range,
4352                             __itt_suppress_threading_errors,
4353                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4354                             sizeof(new_thr->th.th_bar[1].bb.b_go));
4355   __itt_suppress_mark_range(__itt_suppress_range,
4356                             __itt_suppress_threading_errors,
4357                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4358                             sizeof(new_thr->th.th_bar[2].bb.b_go));
4359 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4360   if (__kmp_storage_map) {
4361     __kmp_print_thread_storage_map(new_thr, new_gtid);
4362   }
4363 
4364   // add the reserve serialized team, initialized from the team's primary thread
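  // Every thread carries its own reserve serial team (seeded with ICVs taken
  // from the requesting team), presumably so that a serialized parallel region
  // encountered later does not have to allocate one on the fly.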
4365   {
4366     kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4367     KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4368     new_thr->th.th_serial_team = serial_team =
4369         (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4370 #if OMPT_SUPPORT
4371                                           ompt_data_none, // root parallel id
4372 #endif
4373                                           proc_bind_default, &r_icvs,
4374                                           0 USE_NESTED_HOT_ARG(NULL));
4375   }
4376   KMP_ASSERT(serial_team);
  serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not
  // for execution (it is unused for now).
4379   serial_team->t.t_threads[0] = new_thr;
4380   KF_TRACE(10,
4381            ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4382             new_thr));
4383 
4384   /* setup the thread structures */
4385   __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4386 
4387 #if USE_FAST_MEMORY
4388   __kmp_initialize_fast_memory(new_thr);
4389 #endif /* USE_FAST_MEMORY */
4390 
4391 #if KMP_USE_BGET
4392   KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4393   __kmp_initialize_bget(new_thr);
4394 #endif
4395 
4396   __kmp_init_random(new_thr); // Initialize random number generator
4397 
4398   /* Initialize these only once when thread is grabbed for a team allocation */
4399   KA_TRACE(20,
4400            ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4401             __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4402 
4403   int b;
4404   kmp_balign_t *balign = new_thr->th.th_bar;
4405   for (b = 0; b < bs_last_barrier; ++b) {
4406     balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4407     balign[b].bb.team = NULL;
4408     balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4409     balign[b].bb.use_oncore_barrier = 0;
4410   }
4411 
4412   new_thr->th.th_spin_here = FALSE;
4413   new_thr->th.th_next_waiting = 0;
4414 #if KMP_OS_UNIX
4415   new_thr->th.th_blocking = false;
4416 #endif
4417 
4418 #if KMP_AFFINITY_SUPPORTED
4419   new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4420   new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4421   new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4422   new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4423 #endif
4424   new_thr->th.th_def_allocator = __kmp_def_allocator;
4425   new_thr->th.th_prev_level = 0;
4426   new_thr->th.th_prev_num_threads = 1;
4427 
4428   TCW_4(new_thr->th.th_in_pool, FALSE);
4429   new_thr->th.th_active_in_pool = FALSE;
4430   TCW_4(new_thr->th.th_active, TRUE);
4431 
4432   /* adjust the global counters */
4433   __kmp_all_nth++;
4434   __kmp_nth++;
4435 
4436   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4437   // numbers of procs, and method #2 (keyed API call) for higher numbers.
4438   if (__kmp_adjust_gtid_mode) {
4439     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4440       if (TCR_4(__kmp_gtid_mode) != 2) {
4441         TCW_4(__kmp_gtid_mode, 2);
4442       }
4443     } else {
4444       if (TCR_4(__kmp_gtid_mode) != 1) {
4445         TCW_4(__kmp_gtid_mode, 1);
4446       }
4447     }
4448   }
4449 
4450 #ifdef KMP_ADJUST_BLOCKTIME
4451   /* Adjust blocktime back to zero if necessary       */
4452   /* Middle initialization might not have occurred yet */
4453   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4454     if (__kmp_nth > __kmp_avail_proc) {
4455       __kmp_zero_bt = TRUE;
4456     }
4457   }
4458 #endif /* KMP_ADJUST_BLOCKTIME */
4459 
4460   /* actually fork it and create the new worker thread */
4461   KF_TRACE(
4462       10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4463   __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4464   KF_TRACE(10,
4465            ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4466 
4467   KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4468                 new_gtid));
4469   KMP_MB();
4470   return new_thr;
4471 }
4472 
/* Reinitialize team for reuse.
   The hot team code calls this routine at every fork barrier, so the EPCC
   barrier tests are extremely sensitive to changes in it, especially writes to
   the team struct, which cause a cache invalidation in all threads.
   IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4478 static void __kmp_reinitialize_team(kmp_team_t *team,
4479                                     kmp_internal_control_t *new_icvs,
4480                                     ident_t *loc) {
4481   KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4482                 team->t.t_threads[0], team));
4483   KMP_DEBUG_ASSERT(team && new_icvs);
4484   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4485   KMP_CHECK_UPDATE(team->t.t_ident, loc);
4486 
4487   KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4488   // Copy ICVs to the primary thread's implicit taskdata
4489   __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4490   copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4491 
4492   KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4493                 team->t.t_threads[0], team));
4494 }
4495 
4496 /* Initialize the team data structure.
4497    This assumes the t_threads and t_max_nproc are already set.
4498    Also, we don't touch the arguments */
4499 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4500                                   kmp_internal_control_t *new_icvs,
4501                                   ident_t *loc) {
4502   KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4503 
4504   /* verify */
4505   KMP_DEBUG_ASSERT(team);
4506   KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4507   KMP_DEBUG_ASSERT(team->t.t_threads);
4508   KMP_MB();
4509 
4510   team->t.t_master_tid = 0; /* not needed */
4511   /* team->t.t_master_bar;        not needed */
4512   team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4513   team->t.t_nproc = new_nproc;
4514 
4515   /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
4516   team->t.t_next_pool = NULL;
4517   /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4518    * up hot team */
4519 
4520   TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4521   team->t.t_invoke = NULL; /* not needed */
4522 
4523   // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
4524   team->t.t_sched.sched = new_icvs->sched.sched;
4525 
4526 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4527   team->t.t_fp_control_saved = FALSE; /* not needed */
4528   team->t.t_x87_fpu_control_word = 0; /* not needed */
4529   team->t.t_mxcsr = 0; /* not needed */
4530 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4531 
4532   team->t.t_construct = 0;
4533 
4534   team->t.t_ordered.dt.t_value = 0;
4535   team->t.t_master_active = FALSE;
4536 
4537 #ifdef KMP_DEBUG
4538   team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4539 #endif
4540 #if KMP_OS_WINDOWS
4541   team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4542 #endif
4543 
4544   team->t.t_control_stack_top = NULL;
4545 
4546   __kmp_reinitialize_team(team, new_icvs, loc);
4547 
4548   KMP_MB();
4549   KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4550 }
4551 
4552 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
/* Sets the full mask for the thread and, if old_mask is non-NULL, saves the
   previous mask into it; no changes to internal structures. */
4554 static void
4555 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4556   if (KMP_AFFINITY_CAPABLE()) {
4557     int status;
4558     if (old_mask != NULL) {
4559       status = __kmp_get_system_affinity(old_mask, TRUE);
4560       int error = errno;
4561       if (status != 0) {
4562         __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4563                     __kmp_msg_null);
4564       }
4565     }
4566     __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4567   }
4568 }
4569 #endif
4570 
4571 #if KMP_AFFINITY_SUPPORTED
4572 
4573 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
// It calculates the worker + primary thread's partition based upon the parent
// thread's partition, and binds each worker to a place in its partition.
4576 // The primary thread's partition should already include its current binding.
4577 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4578   // Copy the primary thread's place partition to the team struct
4579   kmp_info_t *master_th = team->t.t_threads[0];
4580   KMP_DEBUG_ASSERT(master_th != NULL);
4581   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4582   int first_place = master_th->th.th_first_place;
4583   int last_place = master_th->th.th_last_place;
4584   int masters_place = master_th->th.th_current_place;
4585   team->t.t_first_place = first_place;
4586   team->t.t_last_place = last_place;
4587 
4588   KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4589                 "bound to place %d partition = [%d,%d]\n",
4590                 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4591                 team->t.t_id, masters_place, first_place, last_place));
4592 
4593   switch (proc_bind) {
4594 
4595   case proc_bind_default:
4596     // Serial teams might have the proc_bind policy set to proc_bind_default.
4597     // Not an issue -- we don't rebind primary thread for any proc_bind policy.
4598     KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4599     break;
4600 
4601   case proc_bind_primary: {
4602     int f;
4603     int n_th = team->t.t_nproc;
4604     for (f = 1; f < n_th; f++) {
4605       kmp_info_t *th = team->t.t_threads[f];
4606       KMP_DEBUG_ASSERT(th != NULL);
4607       th->th.th_first_place = first_place;
4608       th->th.th_last_place = last_place;
4609       th->th.th_new_place = masters_place;
4610       if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4611           team->t.t_display_affinity != 1) {
4612         team->t.t_display_affinity = 1;
4613       }
4614 
4615       KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
4616                      "partition = [%d,%d]\n",
4617                      __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4618                      f, masters_place, first_place, last_place));
4619     }
4620   } break;
4621 
4622   case proc_bind_close: {
4623     int f;
4624     int n_th = team->t.t_nproc;
4625     int n_places;
4626     if (first_place <= last_place) {
4627       n_places = last_place - first_place + 1;
4628     } else {
4629       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4630     }
4631     if (n_th <= n_places) {
4632       int place = masters_place;
4633       for (f = 1; f < n_th; f++) {
4634         kmp_info_t *th = team->t.t_threads[f];
4635         KMP_DEBUG_ASSERT(th != NULL);
4636 
4637         if (place == last_place) {
4638           place = first_place;
4639         } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4640           place = 0;
4641         } else {
4642           place++;
4643         }
4644         th->th.th_first_place = first_place;
4645         th->th.th_last_place = last_place;
4646         th->th.th_new_place = place;
4647         if (__kmp_display_affinity && place != th->th.th_current_place &&
4648             team->t.t_display_affinity != 1) {
4649           team->t.t_display_affinity = 1;
4650         }
4651 
4652         KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4653                        "partition = [%d,%d]\n",
4654                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4655                        team->t.t_id, f, place, first_place, last_place));
4656       }
4657     } else {
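      // More threads than places: distribute them in blocks. Each place gets
      // S = n_th / n_places threads, and the rem leftover threads are handed
      // out one extra per every `gap` places so the surplus is spread evenly.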
4658       int S, rem, gap, s_count;
4659       S = n_th / n_places;
4660       s_count = 0;
4661       rem = n_th - (S * n_places);
4662       gap = rem > 0 ? n_places / rem : n_places;
4663       int place = masters_place;
4664       int gap_ct = gap;
4665       for (f = 0; f < n_th; f++) {
4666         kmp_info_t *th = team->t.t_threads[f];
4667         KMP_DEBUG_ASSERT(th != NULL);
4668 
4669         th->th.th_first_place = first_place;
4670         th->th.th_last_place = last_place;
4671         th->th.th_new_place = place;
4672         if (__kmp_display_affinity && place != th->th.th_current_place &&
4673             team->t.t_display_affinity != 1) {
4674           team->t.t_display_affinity = 1;
4675         }
4676         s_count++;
4677 
4678         if ((s_count == S) && rem && (gap_ct == gap)) {
4679           // do nothing, add an extra thread to place on next iteration
4680         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4681           // we added an extra thread to this place; move to next place
4682           if (place == last_place) {
4683             place = first_place;
4684           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4685             place = 0;
4686           } else {
4687             place++;
4688           }
4689           s_count = 0;
4690           gap_ct = 1;
4691           rem--;
4692         } else if (s_count == S) { // place full; don't add extra
4693           if (place == last_place) {
4694             place = first_place;
4695           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4696             place = 0;
4697           } else {
4698             place++;
4699           }
4700           gap_ct++;
4701           s_count = 0;
4702         }
4703 
4704         KA_TRACE(100,
4705                  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4706                   "partition = [%d,%d]\n",
4707                   __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4708                   th->th.th_new_place, first_place, last_place));
4709       }
4710       KMP_DEBUG_ASSERT(place == masters_place);
4711     }
4712   } break;
4713 
4714   case proc_bind_spread: {
4715     int f;
4716     int n_th = team->t.t_nproc;
4717     int n_places;
4718     int thidx;
4719     if (first_place <= last_place) {
4720       n_places = last_place - first_place + 1;
4721     } else {
4722       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4723     }
4724     if (n_th <= n_places) {
4725       int place = -1;
4726 
4727       if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4728         int S = n_places / n_th;
4729         int s_count, rem, gap, gap_ct;
4730 
4731         place = masters_place;
4732         rem = n_places - n_th * S;
4733         gap = rem ? n_th / rem : 1;
4734         gap_ct = gap;
4735         thidx = n_th;
4736         if (update_master_only == 1)
4737           thidx = 1;
4738         for (f = 0; f < thidx; f++) {
4739           kmp_info_t *th = team->t.t_threads[f];
4740           KMP_DEBUG_ASSERT(th != NULL);
4741 
4742           th->th.th_first_place = place;
4743           th->th.th_new_place = place;
4744           if (__kmp_display_affinity && place != th->th.th_current_place &&
4745               team->t.t_display_affinity != 1) {
4746             team->t.t_display_affinity = 1;
4747           }
4748           s_count = 1;
4749           while (s_count < S) {
4750             if (place == last_place) {
4751               place = first_place;
4752             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4753               place = 0;
4754             } else {
4755               place++;
4756             }
4757             s_count++;
4758           }
4759           if (rem && (gap_ct == gap)) {
4760             if (place == last_place) {
4761               place = first_place;
4762             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4763               place = 0;
4764             } else {
4765               place++;
4766             }
4767             rem--;
4768             gap_ct = 0;
4769           }
4770           th->th.th_last_place = place;
4771           gap_ct++;
4772 
4773           if (place == last_place) {
4774             place = first_place;
4775           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4776             place = 0;
4777           } else {
4778             place++;
4779           }
4780 
4781           KA_TRACE(100,
4782                    ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4783                     "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4784                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4785                     f, th->th.th_new_place, th->th.th_first_place,
4786                     th->th.th_last_place, __kmp_affinity_num_masks));
4787         }
4788       } else {
        /* Given a uniform space of available computation places, we can create
           T partitions of roughly P/T places each and put each thread into the
           first place of its partition. */
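        // Illustration (hypothetical numbers): with n_places = 8 and n_th = 3,
        // spacing = 9 / 3 = 3.0, yielding partitions of roughly three places
        // each, e.g. [0,2], [3,5], [6,7] when the primary thread sits in place
        // 0; each thread is then bound to the first place of its partition.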
4792         double current = static_cast<double>(masters_place);
4793         double spacing =
4794             (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
4795         int first, last;
4796         kmp_info_t *th;
4797 
4798         thidx = n_th + 1;
4799         if (update_master_only == 1)
4800           thidx = 1;
4801         for (f = 0; f < thidx; f++) {
4802           first = static_cast<int>(current);
4803           last = static_cast<int>(current + spacing) - 1;
4804           KMP_DEBUG_ASSERT(last >= first);
4805           if (first >= n_places) {
4806             if (masters_place) {
4807               first -= n_places;
4808               last -= n_places;
4809               if (first == (masters_place + 1)) {
4810                 KMP_DEBUG_ASSERT(f == n_th);
4811                 first--;
4812               }
4813               if (last == masters_place) {
4814                 KMP_DEBUG_ASSERT(f == (n_th - 1));
4815                 last--;
4816               }
4817             } else {
4818               KMP_DEBUG_ASSERT(f == n_th);
4819               first = 0;
4820               last = 0;
4821             }
4822           }
4823           if (last >= n_places) {
4824             last = (n_places - 1);
4825           }
4826           place = first;
4827           current += spacing;
4828           if (f < n_th) {
4829             KMP_DEBUG_ASSERT(0 <= first);
4830             KMP_DEBUG_ASSERT(n_places > first);
4831             KMP_DEBUG_ASSERT(0 <= last);
4832             KMP_DEBUG_ASSERT(n_places > last);
4833             KMP_DEBUG_ASSERT(last_place >= first_place);
4834             th = team->t.t_threads[f];
4835             KMP_DEBUG_ASSERT(th);
4836             th->th.th_first_place = first;
4837             th->th.th_new_place = place;
4838             th->th.th_last_place = last;
4839             if (__kmp_display_affinity && place != th->th.th_current_place &&
4840                 team->t.t_display_affinity != 1) {
4841               team->t.t_display_affinity = 1;
4842             }
4843             KA_TRACE(100,
4844                      ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4845                       "partition = [%d,%d], spacing = %.4f\n",
4846                       __kmp_gtid_from_thread(team->t.t_threads[f]),
4847                       team->t.t_id, f, th->th.th_new_place,
4848                       th->th.th_first_place, th->th.th_last_place, spacing));
4849           }
4850         }
4851       }
4852       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4853     } else {
4854       int S, rem, gap, s_count;
4855       S = n_th / n_places;
4856       s_count = 0;
4857       rem = n_th - (S * n_places);
4858       gap = rem > 0 ? n_places / rem : n_places;
4859       int place = masters_place;
4860       int gap_ct = gap;
4861       thidx = n_th;
4862       if (update_master_only == 1)
4863         thidx = 1;
4864       for (f = 0; f < thidx; f++) {
4865         kmp_info_t *th = team->t.t_threads[f];
4866         KMP_DEBUG_ASSERT(th != NULL);
4867 
4868         th->th.th_first_place = place;
4869         th->th.th_last_place = place;
4870         th->th.th_new_place = place;
4871         if (__kmp_display_affinity && place != th->th.th_current_place &&
4872             team->t.t_display_affinity != 1) {
4873           team->t.t_display_affinity = 1;
4874         }
4875         s_count++;
4876 
4877         if ((s_count == S) && rem && (gap_ct == gap)) {
4878           // do nothing, add an extra thread to place on next iteration
4879         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4880           // we added an extra thread to this place; move on to next place
4881           if (place == last_place) {
4882             place = first_place;
4883           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4884             place = 0;
4885           } else {
4886             place++;
4887           }
4888           s_count = 0;
4889           gap_ct = 1;
4890           rem--;
4891         } else if (s_count == S) { // place is full; don't add extra thread
4892           if (place == last_place) {
4893             place = first_place;
4894           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4895             place = 0;
4896           } else {
4897             place++;
4898           }
4899           gap_ct++;
4900           s_count = 0;
4901         }
4902 
4903         KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4904                        "partition = [%d,%d]\n",
4905                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4906                        team->t.t_id, f, th->th.th_new_place,
4907                        th->th.th_first_place, th->th.th_last_place));
4908       }
4909       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4910     }
4911   } break;
4912 
4913   default:
4914     break;
4915   }
4916 
4917   KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4918 }
4919 
4920 #endif // KMP_AFFINITY_SUPPORTED
4921 
/* Allocate a new team data structure to use.  Take one off of the free pool if
   available. */
4924 kmp_team_t *
4925 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4926 #if OMPT_SUPPORT
4927                     ompt_data_t ompt_parallel_data,
4928 #endif
4929                     kmp_proc_bind_t new_proc_bind,
4930                     kmp_internal_control_t *new_icvs,
4931                     int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4932   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4933   int f;
4934   kmp_team_t *team;
4935   int use_hot_team = !root->r.r_active;
4936   int level = 0;
4937 
4938   KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4939   KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4940   KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4941   KMP_MB();
4942 
4943 #if KMP_NESTED_HOT_TEAMS
4944   kmp_hot_team_ptr_t *hot_teams;
4945   if (master) {
4946     team = master->th.th_team;
4947     level = team->t.t_active_level;
4948     if (master->th.th_teams_microtask) { // in teams construct?
4949       if (master->th.th_teams_size.nteams > 1 &&
4950           ( // #teams > 1
4951               team->t.t_pkfn ==
4952                   (microtask_t)__kmp_teams_master || // inner fork of the teams
4953               master->th.th_teams_level <
4954                   team->t.t_level)) { // or nested parallel inside the teams
4955         ++level; // not increment if #teams==1, or for outer fork of the teams;
4956         // increment otherwise
4957       }
4958     }
4959     hot_teams = master->th.th_hot_teams;
4960     if (level < __kmp_hot_teams_max_level && hot_teams &&
4961         hot_teams[level].hot_team) {
4962       // hot team has already been allocated for given level
4963       use_hot_team = 1;
4964     } else {
4965       use_hot_team = 0;
4966     }
4967   } else {
4968     // check we won't access uninitialized hot_teams, just in case
4969     KMP_DEBUG_ASSERT(new_nproc == 1);
4970   }
4971 #endif
4972   // Optimization to use a "hot" team
4973   if (use_hot_team && new_nproc > 1) {
4974     KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
4975 #if KMP_NESTED_HOT_TEAMS
4976     team = hot_teams[level].hot_team;
4977 #else
4978     team = root->r.r_hot_team;
4979 #endif
4980 #if KMP_DEBUG
4981     if (__kmp_tasking_mode != tskm_immediate_exec) {
4982       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
4983                     "task_team[1] = %p before reinit\n",
4984                     team->t.t_task_team[0], team->t.t_task_team[1]));
4985     }
4986 #endif
4987 
4988     // Has the number of threads changed?
4989     /* Let's assume the most common case is that the number of threads is
4990        unchanged, and put that case first. */
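    /* Three cases follow: the team is already the right size (reuse it as-is),
       it is too big (free or park the extra threads), or it is too small (pull
       in reserved threads and/or allocate new ones). */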
4991     if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4992       KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
4993       // This case can mean that omp_set_num_threads() was called and the hot
4994       // team size was already reduced, so we check the special flag
4995       if (team->t.t_size_changed == -1) {
4996         team->t.t_size_changed = 1;
4997       } else {
4998         KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
4999       }
5000 
5001       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5002       kmp_r_sched_t new_sched = new_icvs->sched;
5003       // set primary thread's schedule as new run-time schedule
5004       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5005 
5006       __kmp_reinitialize_team(team, new_icvs,
5007                               root->r.r_uber_thread->th.th_ident);
5008 
5009       KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5010                     team->t.t_threads[0], team));
5011       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5012 
5013 #if KMP_AFFINITY_SUPPORTED
5014       if ((team->t.t_size_changed == 0) &&
5015           (team->t.t_proc_bind == new_proc_bind)) {
5016         if (new_proc_bind == proc_bind_spread) {
5017           __kmp_partition_places(
5018               team, 1); // add flag to update only master for spread
5019         }
5020         KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5021                        "proc_bind = %d, partition = [%d,%d]\n",
5022                        team->t.t_id, new_proc_bind, team->t.t_first_place,
5023                        team->t.t_last_place));
5024       } else {
5025         KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5026         __kmp_partition_places(team);
5027       }
5028 #else
5029       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5030 #endif /* KMP_AFFINITY_SUPPORTED */
5031     } else if (team->t.t_nproc > new_nproc) {
5032       KA_TRACE(20,
5033                ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5034                 new_nproc));
5035 
5036       team->t.t_size_changed = 1;
5037 #if KMP_NESTED_HOT_TEAMS
5038       if (__kmp_hot_teams_mode == 0) {
        // AC: the saved number of threads should correspond to the team's value
        // in this mode; it can be bigger in mode 1, when the hot team keeps
        // threads in reserve.
5041         KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5042         hot_teams[level].hot_team_nth = new_nproc;
5043 #endif // KMP_NESTED_HOT_TEAMS
5044         /* release the extra threads we don't need any more */
5045         for (f = new_nproc; f < team->t.t_nproc; f++) {
5046           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5047           if (__kmp_tasking_mode != tskm_immediate_exec) {
5048             // When decreasing team size, threads no longer in the team should
5049             // unref task team.
5050             team->t.t_threads[f]->th.th_task_team = NULL;
5051           }
5052           __kmp_free_thread(team->t.t_threads[f]);
5053           team->t.t_threads[f] = NULL;
5054         }
5055 #if KMP_NESTED_HOT_TEAMS
5056       } // (__kmp_hot_teams_mode == 0)
5057       else {
        // When keeping extra threads in the team, switch them to wait on their
        // own b_go flag
5060         for (f = new_nproc; f < team->t.t_nproc; ++f) {
5061           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5062           kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5063           for (int b = 0; b < bs_last_barrier; ++b) {
5064             if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5065               balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5066             }
5067             KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5068           }
5069         }
5070       }
5071 #endif // KMP_NESTED_HOT_TEAMS
5072       team->t.t_nproc = new_nproc;
5073       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5074       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5075       __kmp_reinitialize_team(team, new_icvs,
5076                               root->r.r_uber_thread->th.th_ident);
5077 
5078       // Update remaining threads
5079       for (f = 0; f < new_nproc; ++f) {
5080         team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5081       }
5082 
5083       // restore the current task state of the primary thread: should be the
5084       // implicit task
5085       KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5086                     team->t.t_threads[0], team));
5087 
5088       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5089 
5090 #ifdef KMP_DEBUG
5091       for (f = 0; f < team->t.t_nproc; f++) {
5092         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5093                          team->t.t_threads[f]->th.th_team_nproc ==
5094                              team->t.t_nproc);
5095       }
5096 #endif
5097 
5098       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5099 #if KMP_AFFINITY_SUPPORTED
5100       __kmp_partition_places(team);
5101 #endif
5102     } else { // team->t.t_nproc < new_nproc
5103 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5104       kmp_affin_mask_t *old_mask;
5105       if (KMP_AFFINITY_CAPABLE()) {
5106         KMP_CPU_ALLOC(old_mask);
5107       }
5108 #endif
5109 
5110       KA_TRACE(20,
5111                ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5112                 new_nproc));
5113 
5114       team->t.t_size_changed = 1;
5115 
5116 #if KMP_NESTED_HOT_TEAMS
5117       int avail_threads = hot_teams[level].hot_team_nth;
5118       if (new_nproc < avail_threads)
5119         avail_threads = new_nproc;
5120       kmp_info_t **other_threads = team->t.t_threads;
5121       for (f = team->t.t_nproc; f < avail_threads; ++f) {
5122         // Adjust barrier data of reserved threads (if any) of the team
5123         // Other data will be set in __kmp_initialize_info() below.
5124         int b;
5125         kmp_balign_t *balign = other_threads[f]->th.th_bar;
5126         for (b = 0; b < bs_last_barrier; ++b) {
5127           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5128           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5129 #if USE_DEBUGGER
5130           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5131 #endif
5132         }
5133       }
5134       if (hot_teams[level].hot_team_nth >= new_nproc) {
5135         // we have all needed threads in reserve, no need to allocate any
        // this is only possible in mode 1; mode 0 cannot have reserved threads
5137         KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5138         team->t.t_nproc = new_nproc; // just get reserved threads involved
5139       } else {
5140         // we may have some threads in reserve, but not enough
5141         team->t.t_nproc =
5142             hot_teams[level]
5143                 .hot_team_nth; // get reserved threads involved if any
5144         hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5145 #endif // KMP_NESTED_HOT_TEAMS
5146         if (team->t.t_max_nproc < new_nproc) {
5147           /* reallocate larger arrays */
5148           __kmp_reallocate_team_arrays(team, new_nproc);
5149           __kmp_reinitialize_team(team, new_icvs, NULL);
5150         }
5151 
5152 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5153         /* Temporarily set full mask for primary thread before creation of
5154            workers. The reason is that workers inherit the affinity from the
           primary thread, so if a lot of workers are created on a single
5156            core quickly, they don't get a chance to set their own affinity for
5157            a long time. */
5158         __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5159 #endif
5160 
5161         /* allocate new threads for the hot team */
5162         for (f = team->t.t_nproc; f < new_nproc; f++) {
5163           kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5164           KMP_DEBUG_ASSERT(new_worker);
5165           team->t.t_threads[f] = new_worker;
5166 
5167           KA_TRACE(20,
5168                    ("__kmp_allocate_team: team %d init T#%d arrived: "
5169                     "join=%llu, plain=%llu\n",
5170                     team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5171                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5172                     team->t.t_bar[bs_plain_barrier].b_arrived));
5173 
5174           { // Initialize barrier data for new threads.
5175             int b;
5176             kmp_balign_t *balign = new_worker->th.th_bar;
5177             for (b = 0; b < bs_last_barrier; ++b) {
5178               balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5179               KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5180                                KMP_BARRIER_PARENT_FLAG);
5181 #if USE_DEBUGGER
5182               balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5183 #endif
5184             }
5185           }
5186         }
5187 
5188 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5189         if (KMP_AFFINITY_CAPABLE()) {
5190           /* Restore initial primary thread's affinity mask */
5191           __kmp_set_system_affinity(old_mask, TRUE);
5192           KMP_CPU_FREE(old_mask);
5193         }
5194 #endif
5195 #if KMP_NESTED_HOT_TEAMS
5196       } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5197 #endif // KMP_NESTED_HOT_TEAMS
      /* make sure everyone is synchronized */
5199       int old_nproc = team->t.t_nproc; // save old value and use to update only
5200       // new threads below
5201       __kmp_initialize_team(team, new_nproc, new_icvs,
5202                             root->r.r_uber_thread->th.th_ident);
5203 
5204       /* reinitialize the threads */
5205       KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5206       for (f = 0; f < team->t.t_nproc; ++f)
5207         __kmp_initialize_info(team->t.t_threads[f], team, f,
5208                               __kmp_gtid_from_tid(f, team));
5209 
5210       if (level) { // set th_task_state for new threads in nested hot team
5211         // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5212         // only need to set the th_task_state for the new threads. th_task_state
5213         // for primary thread will not be accurate until after this in
5214         // __kmp_fork_call(), so we look to the primary thread's memo_stack to
5215         // get the correct value.
5216         for (f = old_nproc; f < team->t.t_nproc; ++f)
5217           team->t.t_threads[f]->th.th_task_state =
5218               team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5219       } else { // set th_task_state for new threads in non-nested hot team
5220         // copy primary thread's state
5221         kmp_uint8 old_state = team->t.t_threads[0]->th.th_task_state;
5222         for (f = old_nproc; f < team->t.t_nproc; ++f)
5223           team->t.t_threads[f]->th.th_task_state = old_state;
5224       }
5225 
5226 #ifdef KMP_DEBUG
5227       for (f = 0; f < team->t.t_nproc; ++f) {
5228         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5229                          team->t.t_threads[f]->th.th_team_nproc ==
5230                              team->t.t_nproc);
5231       }
5232 #endif
5233 
5234       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5235 #if KMP_AFFINITY_SUPPORTED
5236       __kmp_partition_places(team);
5237 #endif
5238     } // Check changes in number of threads
5239 
5240     kmp_info_t *master = team->t.t_threads[0];
5241     if (master->th.th_teams_microtask) {
5242       for (f = 1; f < new_nproc; ++f) {
5243         // propagate teams construct specific info to workers
5244         kmp_info_t *thr = team->t.t_threads[f];
5245         thr->th.th_teams_microtask = master->th.th_teams_microtask;
5246         thr->th.th_teams_level = master->th.th_teams_level;
5247         thr->th.th_teams_size = master->th.th_teams_size;
5248       }
5249     }
5250 #if KMP_NESTED_HOT_TEAMS
5251     if (level) {
5252       // Sync barrier state for nested hot teams, not needed for outermost hot
5253       // team.
5254       for (f = 1; f < new_nproc; ++f) {
5255         kmp_info_t *thr = team->t.t_threads[f];
5256         int b;
5257         kmp_balign_t *balign = thr->th.th_bar;
5258         for (b = 0; b < bs_last_barrier; ++b) {
5259           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5260           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5261 #if USE_DEBUGGER
5262           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5263 #endif
5264         }
5265       }
5266     }
5267 #endif // KMP_NESTED_HOT_TEAMS
5268 
5269     /* reallocate space for arguments if necessary */
5270     __kmp_alloc_argv_entries(argc, team, TRUE);
5271     KMP_CHECK_UPDATE(team->t.t_argc, argc);
5272     // The hot team re-uses the previous task team,
5273     // if untouched during the previous release->gather phase.
5274 
5275     KF_TRACE(10, (" hot_team = %p\n", team));
5276 
5277 #if KMP_DEBUG
5278     if (__kmp_tasking_mode != tskm_immediate_exec) {
5279       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5280                     "task_team[1] = %p after reinit\n",
5281                     team->t.t_task_team[0], team->t.t_task_team[1]));
5282     }
5283 #endif
5284 
5285 #if OMPT_SUPPORT
5286     __ompt_team_assign_id(team, ompt_parallel_data);
5287 #endif
5288 
5289     KMP_MB();
5290 
5291     return team;
5292   }
5293 
5294   /* next, let's try to take one from the team pool */
5295   KMP_MB();
5296   for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5297     /* TODO: consider resizing undersized teams instead of reaping them, now
5298        that we have a resizing mechanism */
5299     if (team->t.t_max_nproc >= max_nproc) {
5300       /* take this team from the team pool */
5301       __kmp_team_pool = team->t.t_next_pool;
5302 
5303       /* setup the team for fresh use */
5304       __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5305 
5306       KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5307                     "task_team[1] %p to NULL\n",
5308                     &team->t.t_task_team[0], &team->t.t_task_team[1]));
5309       team->t.t_task_team[0] = NULL;
5310       team->t.t_task_team[1] = NULL;
5311 
5312       /* reallocate space for arguments if necessary */
5313       __kmp_alloc_argv_entries(argc, team, TRUE);
5314       KMP_CHECK_UPDATE(team->t.t_argc, argc);
5315 
5316       KA_TRACE(
5317           20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5318                team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5319       { // Initialize barrier data.
5320         int b;
5321         for (b = 0; b < bs_last_barrier; ++b) {
5322           team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5323 #if USE_DEBUGGER
5324           team->t.t_bar[b].b_master_arrived = 0;
5325           team->t.t_bar[b].b_team_arrived = 0;
5326 #endif
5327         }
5328       }
5329 
5330       team->t.t_proc_bind = new_proc_bind;
5331 
5332       KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5333                     team->t.t_id));
5334 
5335 #if OMPT_SUPPORT
5336       __ompt_team_assign_id(team, ompt_parallel_data);
5337 #endif
5338 
5339       KMP_MB();
5340 
5341       return team;
5342     }
5343 
5344     /* reap team if it is too small, then loop back and check the next one */
    // not sure if this is wise, but it will be redone during the hot-teams
    // rewrite.
5347     /* TODO: Use technique to find the right size hot-team, don't reap them */
5348     team = __kmp_reap_team(team);
5349     __kmp_team_pool = team;
5350   }
5351 
5352   /* nothing available in the pool, no matter, make a new team! */
5353   KMP_MB();
5354   team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5355 
5356   /* and set it up */
5357   team->t.t_max_nproc = max_nproc;
  /* NOTE: for some reason, allocating one big buffer and dividing it up seems
     to hurt performance significantly on the P4, so let's not use this. */
5360   __kmp_allocate_team_arrays(team, max_nproc);
5361 
5362   KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5363   __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5364 
5365   KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5366                 "%p to NULL\n",
5367                 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5368   team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5369   // memory, no need to duplicate
5370   team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5371   // memory, no need to duplicate
5372 
5373   if (__kmp_storage_map) {
5374     __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5375   }
5376 
5377   /* allocate space for arguments */
5378   __kmp_alloc_argv_entries(argc, team, FALSE);
5379   team->t.t_argc = argc;
5380 
5381   KA_TRACE(20,
5382            ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5383             team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5384   { // Initialize barrier data.
5385     int b;
5386     for (b = 0; b < bs_last_barrier; ++b) {
5387       team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5388 #if USE_DEBUGGER
5389       team->t.t_bar[b].b_master_arrived = 0;
5390       team->t.t_bar[b].b_team_arrived = 0;
5391 #endif
5392     }
5393   }
5394 
5395   team->t.t_proc_bind = new_proc_bind;
5396 
5397 #if OMPT_SUPPORT
5398   __ompt_team_assign_id(team, ompt_parallel_data);
5399   team->t.ompt_serialized_team_info = NULL;
5400 #endif
5401 
5402   KMP_MB();
5403 
5404   KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5405                 team->t.t_id));
5406 
5407   return team;
5408 }
5409 
5410 /* TODO implement hot-teams at all levels */
5411 /* TODO implement lazy thread release on demand (disband request) */
5412 
/* Free the team.  Return it to the team pool.  Release all the threads
 * associated with it. */
5415 void __kmp_free_team(kmp_root_t *root,
5416                      kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5417   int f;
5418   KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5419                 team->t.t_id));
5420 
5421   /* verify state */
5422   KMP_DEBUG_ASSERT(root);
5423   KMP_DEBUG_ASSERT(team);
5424   KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5425   KMP_DEBUG_ASSERT(team->t.t_threads);
5426 
5427   int use_hot_team = team == root->r.r_hot_team;
5428 #if KMP_NESTED_HOT_TEAMS
5429   int level;
5430   kmp_hot_team_ptr_t *hot_teams;
5431   if (master) {
5432     level = team->t.t_active_level - 1;
5433     if (master->th.th_teams_microtask) { // in teams construct?
5434       if (master->th.th_teams_size.nteams > 1) {
5435         ++level; // level was not increased in teams construct for
5436         // team_of_masters
5437       }
5438       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5439           master->th.th_teams_level == team->t.t_level) {
5440         ++level; // level was not increased in teams construct for
5441         // team_of_workers before the parallel
5442       } // team->t.t_level will be increased inside parallel
5443     }
5444     hot_teams = master->th.th_hot_teams;
5445     if (level < __kmp_hot_teams_max_level) {
5446       KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5447       use_hot_team = 1;
5448     }
5449   }
5450 #endif // KMP_NESTED_HOT_TEAMS
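  // At this point `level` indexes master->th.th_hot_teams[]; the team is
  // treated as a nested hot team only if it is the hot team recorded at that
  // level and the level is within __kmp_hot_teams_max_level.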
5451 
5452   /* team is done working */
5453   TCW_SYNC_PTR(team->t.t_pkfn,
5454                NULL); // Important for Debugging Support Library.
5455 #if KMP_OS_WINDOWS
5456   team->t.t_copyin_counter = 0; // init counter for possible reuse
5457 #endif
5458   // Do not reset pointer to parent team to NULL for hot teams.
5459 
5460   /* if we are non-hot team, release our threads */
5461   if (!use_hot_team) {
5462     if (__kmp_tasking_mode != tskm_immediate_exec) {
5463       // Wait for threads to reach reapable state
5464       for (f = 1; f < team->t.t_nproc; ++f) {
5465         KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5466         kmp_info_t *th = team->t.t_threads[f];
5467         volatile kmp_uint32 *state = &th->th.th_reap_state;
5468         while (*state != KMP_SAFE_TO_REAP) {
5469 #if KMP_OS_WINDOWS
          // On Windows, a thread can be killed at any time; check for that
5471           DWORD ecode;
5472           if (!__kmp_is_thread_alive(th, &ecode)) {
5473             *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5474             break;
5475           }
5476 #endif
5477           // first check if thread is sleeping
5478           kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5479           if (fl.is_sleeping())
5480             fl.resume(__kmp_gtid_from_thread(th));
5481           KMP_CPU_PAUSE();
5482         }
5483       }
5484 
5485       // Delete task teams
5486       int tt_idx;
5487       for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5488         kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5489         if (task_team != NULL) {
5490           for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5491             KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5492             team->t.t_threads[f]->th.th_task_team = NULL;
5493           }
5494           KA_TRACE(
5495               20,
5496               ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5497                __kmp_get_gtid(), task_team, team->t.t_id));
5498 #if KMP_NESTED_HOT_TEAMS
5499           __kmp_free_task_team(master, task_team);
5500 #endif
5501           team->t.t_task_team[tt_idx] = NULL;
5502         }
5503       }
5504     }
5505 
5506     // Reset pointer to parent team only for non-hot teams.
5507     team->t.t_parent = NULL;
5508     team->t.t_level = 0;
5509     team->t.t_active_level = 0;
5510 
5511     /* free the worker threads */
5512     for (f = 1; f < team->t.t_nproc; ++f) {
5513       KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5514       __kmp_free_thread(team->t.t_threads[f]);
5515       team->t.t_threads[f] = NULL;
5516     }
5517 
5518     /* put the team back in the team pool */
5519     /* TODO limit size of team pool, call reap_team if pool too large */
5520     team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5521     __kmp_team_pool = (volatile kmp_team_t *)team;
5522   } else { // Check if team was created for primary threads in teams construct
5523     // See if first worker is a CG root
5524     KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5525                      team->t.t_threads[1]->th.th_cg_roots);
5526     if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5527       // Clean up the CG root nodes on workers so that this team can be re-used
5528       for (f = 1; f < team->t.t_nproc; ++f) {
5529         kmp_info_t *thr = team->t.t_threads[f];
5530         KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5531                          thr->th.th_cg_roots->cg_root == thr);
5532         // Pop current CG root off list
5533         kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5534         thr->th.th_cg_roots = tmp->up;
5535         KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5536                        " up to node %p. cg_nthreads was %d\n",
5537                        thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5538         int i = tmp->cg_nthreads--;
5539         if (i == 1) {
5540           __kmp_free(tmp); // free CG if we are the last thread in it
5541         }
5542         // Restore current task's thread_limit from CG root
5543         if (thr->th.th_cg_roots)
5544           thr->th.th_current_task->td_icvs.thread_limit =
5545               thr->th.th_cg_roots->cg_thread_limit;
5546       }
5547     }
5548   }
5549 
5550   KMP_MB();
5551 }
5552 
5553 /* reap the team.  destroy it, reclaim all its resources and free its memory */
5554 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5555   kmp_team_t *next_pool = team->t.t_next_pool;
5556 
5557   KMP_DEBUG_ASSERT(team);
5558   KMP_DEBUG_ASSERT(team->t.t_dispatch);
5559   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5560   KMP_DEBUG_ASSERT(team->t.t_threads);
5561   KMP_DEBUG_ASSERT(team->t.t_argv);
5562 
5563   /* TODO clean the threads that are a part of this? */
5564 
5565   /* free stuff */
5566   __kmp_free_team_arrays(team);
5567   if (team->t.t_argv != &team->t.t_inline_argv[0])
5568     __kmp_free((void *)team->t.t_argv);
5569   __kmp_free(team);
5570 
5571   KMP_MB();
5572   return next_pool;
5573 }
5574 
5575 // Free the thread.  Don't reap it, just place it on the pool of available
5576 // threads.
5577 //
5578 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5579 // binding for the affinity mechanism to be useful.
5580 //
5581 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5582 // However, we want to avoid a potential performance problem by always
5583 // scanning through the list to find the correct point at which to insert
5584 // the thread (potential N**2 behavior).  To do this we keep track of the
5585 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5586 // With single-level parallelism, threads will always be added to the tail
5587 // of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
5588 // parallelism, all bets are off and we may need to scan through the entire
5589 // free list.
5590 //
5591 // This change also has a potentially large performance benefit, for some
5592 // applications.  Previously, as threads were freed from the hot team, they
5593 // would be placed back on the free list in inverse order.  If the hot team
// grew back to its original size, then the freed threads would be placed
5595 // back on the hot team in reverse order.  This could cause bad cache
5596 // locality problems on programs where the size of the hot team regularly
// grew and shrank.
5598 //
5599 // Now, for single-level parallelism, the OMP tid is always == gtid.
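//
// For example, if the pool currently holds T#2 -> T#3 -> T#5 and T#4 is being
// freed, the scan below starts at the cached insert point (or at the head of
// __kmp_thread_pool) and links T#4 between T#3 and T#5, then records T#4 as
// the new insert point.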
5600 void __kmp_free_thread(kmp_info_t *this_th) {
5601   int gtid;
5602   kmp_info_t **scan;
5603 
5604   KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5605                 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5606 
5607   KMP_DEBUG_ASSERT(this_th);
5608 
  // When moving the thread to the pool, switch it to wait on its own b_go flag
  // and mark its team as uninitialized (NULL team).
5611   int b;
5612   kmp_balign_t *balign = this_th->th.th_bar;
5613   for (b = 0; b < bs_last_barrier; ++b) {
5614     if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5615       balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5616     balign[b].bb.team = NULL;
5617     balign[b].bb.leaf_kids = 0;
5618   }
5619   this_th->th.th_task_state = 0;
5620   this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5621 
5622   /* put thread back on the free pool */
5623   TCW_PTR(this_th->th.th_team, NULL);
5624   TCW_PTR(this_th->th.th_root, NULL);
5625   TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5626 
5627   while (this_th->th.th_cg_roots) {
5628     this_th->th.th_cg_roots->cg_nthreads--;
5629     KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5630                    " %p of thread  %p to %d\n",
5631                    this_th, this_th->th.th_cg_roots,
5632                    this_th->th.th_cg_roots->cg_root,
5633                    this_th->th.th_cg_roots->cg_nthreads));
5634     kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5635     if (tmp->cg_root == this_th) { // Thread is a cg_root
5636       KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5637       KA_TRACE(
5638           5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5639       this_th->th.th_cg_roots = tmp->up;
5640       __kmp_free(tmp);
5641     } else { // Worker thread
5642       if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5643         __kmp_free(tmp);
5644       }
5645       this_th->th.th_cg_roots = NULL;
5646       break;
5647     }
5648   }
5649 
  /* If the implicit task assigned to this thread can be used by other threads,
   * multiple threads may share the task data and try to free it in
   * __kmp_reap_thread at exit. This duplicate use of the task data can happen
   * with higher probability when the hot team is disabled, but it can occur
   * even when the hot team is enabled. */
5655   __kmp_free_implicit_task(this_th);
5656   this_th->th.th_current_task = NULL;
5657 
5658   // If the __kmp_thread_pool_insert_pt is already past the new insert
5659   // point, then we need to re-scan the entire list.
5660   gtid = this_th->th.th_info.ds.ds_gtid;
5661   if (__kmp_thread_pool_insert_pt != NULL) {
5662     KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5663     if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5664       __kmp_thread_pool_insert_pt = NULL;
5665     }
5666   }
5667 
5668   // Scan down the list to find the place to insert the thread.
5669   // scan is the address of a link in the list, possibly the address of
5670   // __kmp_thread_pool itself.
5671   //
5672   // In the absence of nested parallelism, the for loop will have 0 iterations.
5673   if (__kmp_thread_pool_insert_pt != NULL) {
5674     scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5675   } else {
5676     scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5677   }
5678   for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5679        scan = &((*scan)->th.th_next_pool))
5680     ;
5681 
5682   // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5683   // to its address.
5684   TCW_PTR(this_th->th.th_next_pool, *scan);
5685   __kmp_thread_pool_insert_pt = *scan = this_th;
5686   KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5687                    (this_th->th.th_info.ds.ds_gtid <
5688                     this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5689   TCW_4(this_th->th.th_in_pool, TRUE);
5690   __kmp_suspend_initialize_thread(this_th);
5691   __kmp_lock_suspend_mx(this_th);
5692   if (this_th->th.th_active == TRUE) {
5693     KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5694     this_th->th.th_active_in_pool = TRUE;
5695   }
5696 #if KMP_DEBUG
5697   else {
5698     KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5699   }
5700 #endif
5701   __kmp_unlock_suspend_mx(this_th);
5702 
5703   TCW_4(__kmp_nth, __kmp_nth - 1);
5704 
5705 #ifdef KMP_ADJUST_BLOCKTIME
5706   /* Adjust blocktime back to user setting or default if necessary */
5707   /* Middle initialization might never have occurred                */
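  /* __kmp_zero_bt is presumably set elsewhere while the machine is
     oversubscribed; once __kmp_nth is back within __kmp_avail_proc (and the
     user did not set a blocktime explicitly), clear it so the normal
     blocktime applies again. */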
5708   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5709     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5710     if (__kmp_nth <= __kmp_avail_proc) {
5711       __kmp_zero_bt = FALSE;
5712     }
5713   }
5714 #endif /* KMP_ADJUST_BLOCKTIME */
5715 
5716   KMP_MB();
5717 }
5718 
5719 /* ------------------------------------------------------------------------ */
5720 
5721 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5722 #if OMP_PROFILING_SUPPORT
5723   ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
5724   // TODO: add a configuration option for time granularity
5725   if (ProfileTraceFile)
5726     llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
5727 #endif
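  // A usage sketch (assuming the build enables OMP_PROFILING_SUPPORT): running
  // with LIBOMPTARGET_PROFILE set, e.g. LIBOMPTARGET_PROFILE=profile.json,
  // turns on LLVM time-trace profiling for this thread at 500us granularity;
  // the env value presumably names the output trace file written when the
  // trace is finalized at the end of this function.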
5728 
5729   int gtid = this_thr->th.th_info.ds.ds_gtid;
5730   /*    void                 *stack_data;*/
5731   kmp_team_t **volatile pteam;
5732 
5733   KMP_MB();
5734   KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5735 
5736   if (__kmp_env_consistency_check) {
5737     this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5738   }
5739 
5740 #if OMPT_SUPPORT
5741   ompt_data_t *thread_data;
5742   if (ompt_enabled.enabled) {
5743     thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5744     *thread_data = ompt_data_none;
5745 
5746     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5747     this_thr->th.ompt_thread_info.wait_id = 0;
5748     this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5749     this_thr->th.ompt_thread_info.parallel_flags = 0;
5750     if (ompt_enabled.ompt_callback_thread_begin) {
5751       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5752           ompt_thread_worker, thread_data);
5753     }
5754     this_thr->th.ompt_thread_info.state = ompt_state_idle;
5755   }
5756 #endif
5757 
5758   /* This is the place where threads wait for work */
5759   while (!TCR_4(__kmp_global.g.g_done)) {
5760     KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5761     KMP_MB();
5762 
5763     /* wait for work to do */
5764     KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5765 
5766     /* No tid yet since not part of a team */
5767     __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5768 
5769 #if OMPT_SUPPORT
5770     if (ompt_enabled.enabled) {
5771       this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5772     }
5773 #endif
5774 
5775     pteam = &this_thr->th.th_team;
5776 
5777     /* have we been allocated? */
5778     if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5779       /* we were just woken up, so run our new task */
5780       if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5781         int rc;
5782         KA_TRACE(20,
5783                  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5784                   gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5785                   (*pteam)->t.t_pkfn));
5786 
5787         updateHWFPControl(*pteam);
5788 
5789 #if OMPT_SUPPORT
5790         if (ompt_enabled.enabled) {
5791           this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5792         }
5793 #endif
5794 
5795         rc = (*pteam)->t.t_invoke(gtid);
5796         KMP_ASSERT(rc);
5797 
5798         KMP_MB();
5799         KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5800                       gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5801                       (*pteam)->t.t_pkfn));
5802       }
5803 #if OMPT_SUPPORT
5804       if (ompt_enabled.enabled) {
5805         /* no frame set while outside task */
5806         __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
5807 
5808         this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5809       }
5810 #endif
5811       /* join barrier after parallel region */
5812       __kmp_join_barrier(gtid);
5813     }
5814   }
5815   TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5816 
5817 #if OMPT_SUPPORT
5818   if (ompt_enabled.ompt_callback_thread_end) {
5819     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5820   }
5821 #endif
5822 
5823   this_thr->th.th_task_team = NULL;
5824   /* run the destructors for the threadprivate data for this thread */
5825   __kmp_common_destroy_gtid(gtid);
5826 
5827   KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5828   KMP_MB();
5829 
5830 #if OMP_PROFILING_SUPPORT
5831   llvm::timeTraceProfilerFinishThread();
5832 #endif
5833   return this_thr;
5834 }
5835 
5836 /* ------------------------------------------------------------------------ */
5837 
5838 void __kmp_internal_end_dest(void *specific_gtid) {
5839   // Make sure no significant bits are lost
5840   int gtid;
5841   __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
5842 
5843   KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
  /* NOTE: the gtid is stored as gtid+1 in thread-local storage
   * because 0 is reserved for the nothing-stored case */
5846 
5847   __kmp_internal_end_thread(gtid);
5848 }
5849 
5850 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5851 
5852 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5853   __kmp_internal_end_atexit();
5854 }
5855 
5856 #endif
5857 
5858 /* [Windows] josh: when the atexit handler is called, there may still be more
5859    than one thread alive */
5860 void __kmp_internal_end_atexit(void) {
5861   KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5862   /* [Windows]
5863      josh: ideally, we want to completely shutdown the library in this atexit
5864      handler, but stat code that depends on thread specific data for gtid fails
5865      because that data becomes unavailable at some point during the shutdown, so
5866      we call __kmp_internal_end_thread instead. We should eventually remove the
5867      dependency on __kmp_get_specific_gtid in the stat code and use
5868      __kmp_internal_end_library to cleanly shutdown the library.
5869 
5870      // TODO: Can some of this comment about GVS be removed?
5871      I suspect that the offending stat code is executed when the calling thread
5872      tries to clean up a dead root thread's data structures, resulting in GVS
5873      code trying to close the GVS structures for that thread, but since the stat
5874      code uses __kmp_get_specific_gtid to get the gtid with the assumption that
     the calling thread is cleaning up itself instead of another thread, it
     gets confused. This happens because allowing a thread to unregister and
     clean up another thread is a recent modification for addressing an issue.
5878      Based on the current design (20050722), a thread may end up
5879      trying to unregister another thread only if thread death does not trigger
5880      the calling of __kmp_internal_end_thread.  For Linux* OS, there is the
5881      thread specific data destructor function to detect thread death. For
5882      Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
     is nothing.  Thus, the workaround is applicable only for the Windows
     static stat library. */
5885   __kmp_internal_end_library(-1);
5886 #if KMP_OS_WINDOWS
5887   __kmp_close_console();
5888 #endif
5889 }
5890 
5891 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5892   // It is assumed __kmp_forkjoin_lock is acquired.
5893 
5894   int gtid;
5895 
5896   KMP_DEBUG_ASSERT(thread != NULL);
5897 
5898   gtid = thread->th.th_info.ds.ds_gtid;
5899 
5900   if (!is_root) {
5901     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5902       /* Assume the threads are at the fork barrier here */
5903       KA_TRACE(
5904           20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5905                gtid));
5906       /* Need release fence here to prevent seg faults for tree forkjoin barrier
5907        * (GEH) */
5908       ANNOTATE_HAPPENS_BEFORE(thread);
5909       kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
5910                          thread);
5911       __kmp_release_64(&flag);
5912     }
5913 
5914     // Terminate OS thread.
5915     __kmp_reap_worker(thread);
5916 
5917     // The thread was killed asynchronously.  If it was actively
5918     // spinning in the thread pool, decrement the global count.
5919     //
5920     // There is a small timing hole here - if the worker thread was just waking
    // up after sleeping in the pool, had reset its th_active_in_pool flag but
5922     // not decremented the global counter __kmp_thread_pool_active_nth yet, then
5923     // the global counter might not get updated.
5924     //
5925     // Currently, this can only happen as the library is unloaded,
5926     // so there are no harmful side effects.
5927     if (thread->th.th_active_in_pool) {
5928       thread->th.th_active_in_pool = FALSE;
5929       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
5930       KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
5931     }
5932   }
5933 
5934   __kmp_free_implicit_task(thread);
5935 
5936 // Free the fast memory for tasking
5937 #if USE_FAST_MEMORY
5938   __kmp_free_fast_memory(thread);
5939 #endif /* USE_FAST_MEMORY */
5940 
5941   __kmp_suspend_uninitialize_thread(thread);
5942 
5943   KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5944   TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5945 
5946   --__kmp_all_nth;
5947   // __kmp_nth was decremented when thread is added to the pool.
5948 
5949 #ifdef KMP_ADJUST_BLOCKTIME
5950   /* Adjust blocktime back to user setting or default if necessary */
5951   /* Middle initialization might never have occurred                */
5952   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5953     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5954     if (__kmp_nth <= __kmp_avail_proc) {
5955       __kmp_zero_bt = FALSE;
5956     }
5957   }
5958 #endif /* KMP_ADJUST_BLOCKTIME */
5959 
5960   /* free the memory being used */
5961   if (__kmp_env_consistency_check) {
5962     if (thread->th.th_cons) {
5963       __kmp_free_cons_stack(thread->th.th_cons);
5964       thread->th.th_cons = NULL;
5965     }
5966   }
5967 
5968   if (thread->th.th_pri_common != NULL) {
5969     __kmp_free(thread->th.th_pri_common);
5970     thread->th.th_pri_common = NULL;
5971   }
5972 
5973   if (thread->th.th_task_state_memo_stack != NULL) {
5974     __kmp_free(thread->th.th_task_state_memo_stack);
5975     thread->th.th_task_state_memo_stack = NULL;
5976   }
5977 
5978 #if KMP_USE_BGET
5979   if (thread->th.th_local.bget_data != NULL) {
5980     __kmp_finalize_bget(thread);
5981   }
5982 #endif
5983 
5984 #if KMP_AFFINITY_SUPPORTED
5985   if (thread->th.th_affin_mask != NULL) {
5986     KMP_CPU_FREE(thread->th.th_affin_mask);
5987     thread->th.th_affin_mask = NULL;
5988   }
5989 #endif /* KMP_AFFINITY_SUPPORTED */
5990 
5991 #if KMP_USE_HIER_SCHED
5992   if (thread->th.th_hier_bar_data != NULL) {
5993     __kmp_free(thread->th.th_hier_bar_data);
5994     thread->th.th_hier_bar_data = NULL;
5995   }
5996 #endif
5997 
5998   __kmp_reap_team(thread->th.th_serial_team);
5999   thread->th.th_serial_team = NULL;
6000   __kmp_free(thread);
6001 
6002   KMP_MB();
6003 
6004 } // __kmp_reap_thread
6005 
6006 static void __kmp_internal_end(void) {
6007   int i;
6008 
6009   /* First, unregister the library */
6010   __kmp_unregister_library();
6011 
6012 #if KMP_OS_WINDOWS
6013   /* In Win static library, we can't tell when a root actually dies, so we
6014      reclaim the data structures for any root threads that have died but not
6015      unregistered themselves, in order to shut down cleanly.
6016      In Win dynamic library we also can't tell when a thread dies.  */
  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
                              // dead roots
6019 #endif
6020 
6021   for (i = 0; i < __kmp_threads_capacity; i++)
6022     if (__kmp_root[i])
6023       if (__kmp_root[i]->r.r_active)
6024         break;
6025   KMP_MB(); /* Flush all pending memory write invalidates.  */
6026   TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6027 
6028   if (i < __kmp_threads_capacity) {
6029 #if KMP_USE_MONITOR
6030     // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6031     KMP_MB(); /* Flush all pending memory write invalidates.  */
6032 
6033     // Need to check that monitor was initialized before reaping it. If we are
    // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6035     // __kmp_monitor will appear to contain valid data, but it is only valid in
6036     // the parent process, not the child.
6037     // New behavior (201008): instead of keying off of the flag
6038     // __kmp_init_parallel, the monitor thread creation is keyed off
6039     // of the new flag __kmp_init_monitor.
6040     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6041     if (TCR_4(__kmp_init_monitor)) {
6042       __kmp_reap_monitor(&__kmp_monitor);
6043       TCW_4(__kmp_init_monitor, 0);
6044     }
6045     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6046     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6047 #endif // KMP_USE_MONITOR
6048   } else {
6049 /* TODO move this to cleanup code */
6050 #ifdef KMP_DEBUG
6051     /* make sure that everything has properly ended */
6052     for (i = 0; i < __kmp_threads_capacity; i++) {
6053       if (__kmp_root[i]) {
6054         //                    KMP_ASSERT( ! KMP_UBER_GTID( i ) );         // AC:
6055         //                    there can be uber threads alive here
6056         KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6057       }
6058     }
6059 #endif
6060 
6061     KMP_MB();
6062 
6063     // Reap the worker threads.
6064     // This is valid for now, but be careful if threads are reaped sooner.
    while (__kmp_thread_pool != NULL) { // Loop thru all threads in the pool.
6066       // Get the next thread from the pool.
6067       kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6068       __kmp_thread_pool = thread->th.th_next_pool;
6069       // Reap it.
6070       KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6071       thread->th.th_next_pool = NULL;
6072       thread->th.th_in_pool = FALSE;
6073       __kmp_reap_thread(thread, 0);
6074     }
6075     __kmp_thread_pool_insert_pt = NULL;
6076 
6077     // Reap teams.
6078     while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
6079       // Get the next team from the pool.
6080       kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6081       __kmp_team_pool = team->t.t_next_pool;
6082       // Reap it.
6083       team->t.t_next_pool = NULL;
6084       __kmp_reap_team(team);
6085     }
6086 
6087     __kmp_reap_task_teams();
6088 
6089 #if KMP_OS_UNIX
6090     // Threads that are not reaped should not access any resources since they
6091     // are going to be deallocated soon, so the shutdown sequence should wait
6092     // until all threads either exit the final spin-waiting loop or begin
6093     // sleeping after the given blocktime.
6094     for (i = 0; i < __kmp_threads_capacity; i++) {
6095       kmp_info_t *thr = __kmp_threads[i];
6096       while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6097         KMP_CPU_PAUSE();
6098     }
6099 #endif
6100 
6101     for (i = 0; i < __kmp_threads_capacity; ++i) {
6102       // TBD: Add some checking...
6103       // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6104     }
6105 
6106     /* Make sure all threadprivate destructors get run by joining with all
6107        worker threads before resetting this flag */
6108     TCW_SYNC_4(__kmp_init_common, FALSE);
6109 
6110     KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6111     KMP_MB();
6112 
6113 #if KMP_USE_MONITOR
6114     // See note above: One of the possible fixes for CQ138434 / CQ140126
6115     //
6116     // FIXME: push both code fragments down and CSE them?
6117     // push them into __kmp_cleanup() ?
6118     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6119     if (TCR_4(__kmp_init_monitor)) {
6120       __kmp_reap_monitor(&__kmp_monitor);
6121       TCW_4(__kmp_init_monitor, 0);
6122     }
6123     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6124     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6125 #endif
6126   } /* else !__kmp_global.t_active */
6127   TCW_4(__kmp_init_gtid, FALSE);
6128   KMP_MB(); /* Flush all pending memory write invalidates.  */
6129 
6130   __kmp_cleanup();
6131 #if OMPT_SUPPORT
6132   ompt_fini();
6133 #endif
6134 }
6135 
6136 void __kmp_internal_end_library(int gtid_req) {
6137   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6138   /* this shouldn't be a race condition because __kmp_internal_end() is the
6139      only place to clear __kmp_serial_init */
6140   /* we'll check this later too, after we get the lock */
6141   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6142   // redundant, because the next check will work in any case.
6143   if (__kmp_global.g.g_abort) {
6144     KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6145     /* TODO abort? */
6146     return;
6147   }
6148   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6149     KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6150     return;
6151   }
6152 
6153   KMP_MB(); /* Flush all pending memory write invalidates.  */
6154   /* find out who we are and what we should do */
6155   {
6156     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6157     KA_TRACE(
6158         10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req));
6159     if (gtid == KMP_GTID_SHUTDOWN) {
6160       KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6161                     "already shutdown\n"));
6162       return;
6163     } else if (gtid == KMP_GTID_MONITOR) {
6164       KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6165                     "registered, or system shutdown\n"));
6166       return;
6167     } else if (gtid == KMP_GTID_DNE) {
6168       KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6169                     "shutdown\n"));
6170       /* we don't know who we are, but we may still shutdown the library */
6171     } else if (KMP_UBER_GTID(gtid)) {
6172       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6173       if (__kmp_root[gtid]->r.r_active) {
6174         __kmp_global.g.g_abort = -1;
6175         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6176         __kmp_unregister_library();
6177         KA_TRACE(10,
6178                  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6179                   gtid));
6180         return;
6181       } else {
6182         KA_TRACE(
6183             10,
6184             ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6185         __kmp_unregister_root_current_thread(gtid);
6186       }
6187     } else {
6188 /* worker threads may call this function through the atexit handler, if they
6189  * call exit() */
6190 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6191    TODO: do a thorough shutdown instead */
6192 #ifdef DUMP_DEBUG_ON_EXIT
6193       if (__kmp_debug_buf)
6194         __kmp_dump_debug_buffer();
6195 #endif
      // The unregister-library call was added here for the switch to shared
      // memory on Linux; without it, lots of stale files would be left in
      // /dev/shm. Clean up the shared memory file before exiting.
6199       __kmp_unregister_library();
6200       return;
6201     }
6202   }
6203   /* synchronize the termination process */
6204   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6205 
6206   /* have we already finished */
6207   if (__kmp_global.g.g_abort) {
6208     KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6209     /* TODO abort? */
6210     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6211     return;
6212   }
6213   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6214     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6215     return;
6216   }
6217 
6218   /* We need this lock to enforce mutex between this reading of
6219      __kmp_threads_capacity and the writing by __kmp_register_root.
6220      Alternatively, we can use a counter of roots that is atomically updated by
6221      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6222      __kmp_internal_end_*.  */
6223   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6224 
6225   /* now we can safely conduct the actual termination */
6226   __kmp_internal_end();
6227 
6228   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6229   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6230 
6231   KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6232 
6233 #ifdef DUMP_DEBUG_ON_EXIT
6234   if (__kmp_debug_buf)
6235     __kmp_dump_debug_buffer();
6236 #endif
6237 
6238 #if KMP_OS_WINDOWS
6239   __kmp_close_console();
6240 #endif
6241 
6242   __kmp_fini_allocator();
6243 
6244 } // __kmp_internal_end_library
6245 
6246 void __kmp_internal_end_thread(int gtid_req) {
6247   int i;
6248 
6249   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6250   /* this shouldn't be a race condition because __kmp_internal_end() is the
6251    * only place to clear __kmp_serial_init */
6252   /* we'll check this later too, after we get the lock */
6253   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6254   // redundant, because the next check will work in any case.
6255   if (__kmp_global.g.g_abort) {
6256     KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6257     /* TODO abort? */
6258     return;
6259   }
6260   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6261     KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6262     return;
6263   }
6264 
  // If the hidden helper team has been initialized, we need to deinit it
6266   if (TCR_4(__kmp_init_hidden_helper)) {
6267     TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
    // First release the hidden helper main thread to let it continue its work
6269     __kmp_hidden_helper_main_thread_release();
6270     // Wait until the hidden helper team has been destroyed
6271     __kmp_hidden_helper_threads_deinitz_wait();
6272   }
6273 
6274   KMP_MB(); /* Flush all pending memory write invalidates.  */
6275 
6276   /* find out who we are and what we should do */
6277   {
6278     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6279     KA_TRACE(10,
6280              ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req));
6281     if (gtid == KMP_GTID_SHUTDOWN) {
6282       KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6283                     "already shutdown\n"));
6284       return;
6285     } else if (gtid == KMP_GTID_MONITOR) {
6286       KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6287                     "registered, or system shutdown\n"));
6288       return;
6289     } else if (gtid == KMP_GTID_DNE) {
6290       KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6291                     "shutdown\n"));
6292       return;
6293       /* we don't know who we are */
6294     } else if (KMP_UBER_GTID(gtid)) {
6295       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6296       if (__kmp_root[gtid]->r.r_active) {
6297         __kmp_global.g.g_abort = -1;
6298         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6299         KA_TRACE(10,
6300                  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6301                   gtid));
6302         return;
6303       } else {
6304         KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6305                       gtid));
6306         __kmp_unregister_root_current_thread(gtid);
6307       }
6308     } else {
6309       /* just a worker thread, let's leave */
6310       KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6311 
6312       if (gtid >= 0) {
6313         __kmp_threads[gtid]->th.th_task_team = NULL;
6314       }
6315 
6316       KA_TRACE(10,
6317                ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6318                 gtid));
6319       return;
6320     }
6321   }
6322 #if KMP_DYNAMIC_LIB
6323   if (__kmp_pause_status != kmp_hard_paused)
  // AC: let's not shut down the dynamic library at the exit of an uber
  // thread; it is better to shut down later, in the library destructor.
6326   {
6327     KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6328     return;
6329   }
6330 #endif
6331   /* synchronize the termination process */
6332   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6333 
6334   /* have we already finished */
6335   if (__kmp_global.g.g_abort) {
6336     KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6337     /* TODO abort? */
6338     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6339     return;
6340   }
6341   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6342     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6343     return;
6344   }
6345 
6346   /* We need this lock to enforce mutex between this reading of
6347      __kmp_threads_capacity and the writing by __kmp_register_root.
6348      Alternatively, we can use a counter of roots that is atomically updated by
6349      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6350      __kmp_internal_end_*.  */
6351 
6352   /* should we finish the run-time?  are all siblings done? */
6353   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6354 
6355   for (i = 0; i < __kmp_threads_capacity; ++i) {
6356     if (KMP_UBER_GTID(i)) {
6357       KA_TRACE(
6358           10,
6359           ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6360       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6361       __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6362       return;
6363     }
6364   }
6365 
6366   /* now we can safely conduct the actual termination */
6367 
6368   __kmp_internal_end();
6369 
6370   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6371   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6372 
6373   KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6374 
6375 #ifdef DUMP_DEBUG_ON_EXIT
6376   if (__kmp_debug_buf)
6377     __kmp_dump_debug_buffer();
6378 #endif
6379 } // __kmp_internal_end_thread
6380 
6381 // -----------------------------------------------------------------------------
6382 // Library registration stuff.
6383 
6384 static long __kmp_registration_flag = 0;
6385 // Random value used to indicate library initialization.
6386 static char *__kmp_registration_str = NULL;
6387 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6388 
6389 static inline char *__kmp_reg_status_name() {
/* On RHEL 3u5, if linked statically, getpid() returns different values in
   each thread. If registration and unregistration happen in different threads
   (omp_misc_other_root_exit.cpp test case), the registered_lib_env env var
   cannot be found, because its name will contain a different pid. */
// macOS* complains about the name being too long with the additional getuid()
6395 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6396   return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6397                           (int)getuid());
6398 #else
6399   return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6400 #endif
} // __kmp_reg_status_name
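// For example, on Linux with the dynamic library, a process with pid 12345 run
// by uid 1000 would register under the name __KMP_REGISTERED_LIB_12345_1000
// (illustrative values).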
6402 
6403 void __kmp_register_library_startup(void) {
6404 
6405   char *name = __kmp_reg_status_name(); // Name of the environment variable.
6406   int done = 0;
6407   union {
6408     double dtime;
6409     long ltime;
6410   } time;
6411 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6412   __kmp_initialize_system_tick();
6413 #endif
6414   __kmp_read_system_time(&time.dtime);
6415   __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6416   __kmp_registration_str =
6417       __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6418                        __kmp_registration_flag, KMP_LIBRARY_FILE);
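  // The flag combines a recognizable 0xCAFE prefix with the low 16 bits of the
  // current time, and the registration string has the form
  // "<flag address>-<flag value>-<library file>", e.g.
  // "0x7ffd4b21a2c0-cafe1234-libomp.so" (illustrative values).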
6419 
6420   KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6421                 __kmp_registration_str));
6422 
6423   while (!done) {
6424 
6425     char *value = NULL; // Actual value of the environment variable.
6426 
6427 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6428     char *shm_name = __kmp_str_format("/%s", name);
6429     int shm_preexist = 0;
6430     char *data1;
6431     int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6432     if ((fd1 == -1) && (errno == EEXIST)) {
6433       // file didn't open because it already exists.
6434       // try opening existing file
6435       fd1 = shm_open(shm_name, O_RDWR, 0666);
6436       if (fd1 == -1) { // file didn't open
6437         // error out here
6438         __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0),
6439                     __kmp_msg_null);
6440       } else {
6441         // able to open existing file
6442         shm_preexist = 1;
6443       }
    } else if (fd1 == -1) {
      // SHM didn't open due to an error other than EEXIST; error out here.
6447       __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno),
6448                   __kmp_msg_null);
6449     }
6450     if (shm_preexist == 0) {
      // we created the SHM; now set its size
6452       if (ftruncate(fd1, SHM_SIZE) == -1) {
        // error occurred setting size;
6454         __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"),
6455                     KMP_ERR(errno), __kmp_msg_null);
6456       }
6457     }
6458     data1 =
6459         (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
6460     if (data1 == MAP_FAILED) {
6461       // failed to map shared memory
6462       __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno),
6463                   __kmp_msg_null);
6464     }
    if (shm_preexist == 0) { // write the registration string into the SHM
6466       KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6467     }
6468     // Read value from either what we just wrote or existing file.
6469     value = __kmp_str_format("%s", data1); // read value from SHM
6470     munmap(data1, SHM_SIZE);
6471     close(fd1);
6472 #else // Windows and unix with static library
    // Set environment variable, but do not overwrite if it already exists.
6474     __kmp_env_set(name, __kmp_registration_str, 0);
6475     // read value to see if it got set
6476     value = __kmp_env_get(name);
6477 #endif
6478 
6479     if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6480       done = 1; // Ok, environment variable set successfully, exit the loop.
6481     } else {
      // Oops. Write failed. Another copy of the OpenMP RTL is in memory.
      // Check whether it is alive or dead.
6484       int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6485       char *tail = value;
6486       char *flag_addr_str = NULL;
6487       char *flag_val_str = NULL;
6488       char const *file_name = NULL;
6489       __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6490       __kmp_str_split(tail, '-', &flag_val_str, &tail);
6491       file_name = tail;
6492       if (tail != NULL) {
6493         long *flag_addr = 0;
6494         unsigned long flag_val = 0;
6495         KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6496         KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6497         if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6498           // First, check whether environment-encoded address is mapped into
6499           // addr space.
6500           // If so, dereference it to see if it still has the right value.
6501           if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6502             neighbor = 1;
6503           } else {
6504             // If not, then we know the other copy of the library is no longer
6505             // running.
6506             neighbor = 2;
6507           }
6508         }
6509       }
6510       switch (neighbor) {
6511       case 0: // Cannot parse environment variable -- neighbor status unknown.
        // Assume it is an incompatible format from a future version of the
        // library, and assume the other library is alive.
6514         // WARN( ... ); // TODO: Issue a warning.
6515         file_name = "unknown library";
6516         KMP_FALLTHROUGH();
      // Attention! Falling through to the next case is intentional.
6518       case 1: { // Neighbor is alive.
6519         // Check it is allowed.
6520         char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6521         if (!__kmp_str_match_true(duplicate_ok)) {
6522           // That's not allowed. Issue fatal error.
6523           __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6524                       KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6525         }
6526         KMP_INTERNAL_FREE(duplicate_ok);
6527         __kmp_duplicate_library_ok = 1;
6528         done = 1; // Exit the loop.
6529       } break;
6530       case 2: { // Neighbor is dead.
6531 
6532 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
        // Remove the stale shared memory file and try to register again.
6534         shm_unlink(shm_name); // this removes file in /dev/shm
6535 #else
6536         // Clear the variable and try to register library again.
6537         __kmp_env_unset(name);
6538 #endif
6539       } break;
6540       default: {
6541         KMP_DEBUG_ASSERT(0);
6542       } break;
6543       }
6544     }
6545     KMP_INTERNAL_FREE((void *)value);
6546 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6547     KMP_INTERNAL_FREE((void *)shm_name);
6548 #endif
6549   } // while
6550   KMP_INTERNAL_FREE((void *)name);
6551 
6552 } // func __kmp_register_library_startup
6553 
6554 void __kmp_unregister_library(void) {
6555 
6556   char *name = __kmp_reg_status_name();
6557   char *value = NULL;
6558 
6559 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6560   char *shm_name = __kmp_str_format("/%s", name);
6561   int fd1 = shm_open(shm_name, O_RDONLY, 0666);
6562   if (fd1 == -1) {
6563     // file did not open. return.
6564     return;
6565   }
6566   char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6567   if (data1 != MAP_FAILED) {
6568     value = __kmp_str_format("%s", data1); // read value from SHM
6569     munmap(data1, SHM_SIZE);
6570   }
6571   close(fd1);
6572 #else
6573   value = __kmp_env_get(name);
6574 #endif
6575 
6576   KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6577   KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6578   if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6579 //  Ok, this is our variable. Delete it.
6580 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6581     shm_unlink(shm_name); // this removes file in /dev/shm
6582 #else
6583     __kmp_env_unset(name);
6584 #endif
6585   }
6586 
6587 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6588   KMP_INTERNAL_FREE(shm_name);
6589 #endif
6590 
6591   KMP_INTERNAL_FREE(__kmp_registration_str);
6592   KMP_INTERNAL_FREE(value);
6593   KMP_INTERNAL_FREE(name);
6594 
6595   __kmp_registration_flag = 0;
6596   __kmp_registration_str = NULL;
6597 
6598 } // __kmp_unregister_library
6599 
6600 // End of Library registration stuff.
6601 // -----------------------------------------------------------------------------
6602 
6603 #if KMP_MIC_SUPPORTED
6604 
6605 static void __kmp_check_mic_type() {
6606   kmp_cpuid_t cpuid_state = {0};
6607   kmp_cpuid_t *cs_p = &cpuid_state;
6608   __kmp_x86_cpuid(1, 0, cs_p);
6609   // We don't support mic1 at the moment
6610   if ((cs_p->eax & 0xff0) == 0xB10) {
6611     __kmp_mic_type = mic2;
6612   } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6613     __kmp_mic_type = mic3;
6614   } else {
6615     __kmp_mic_type = non_mic;
6616   }
6617 }
6618 
6619 #endif /* KMP_MIC_SUPPORTED */
6620 
6621 #if KMP_HAVE_UMWAIT
6622 static void __kmp_user_level_mwait_init() {
6623   struct kmp_cpuid buf;
6624   __kmp_x86_cpuid(7, 0, &buf);
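  // CPUID.(EAX=07H, ECX=0):ECX bit 5 is the WAITPKG feature flag
  // (UMONITOR/UMWAIT/TPAUSE).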
6625   __kmp_umwait_enabled = ((buf.ecx >> 5) & 1) && __kmp_user_level_mwait;
6626   KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
6627                 __kmp_umwait_enabled));
6628 }
6629 #elif KMP_HAVE_MWAIT
6630 #ifndef AT_INTELPHIUSERMWAIT
// Spurious, non-existent value that should always fail to return anything.
// Will be replaced with the correct value when it becomes known.
6633 #define AT_INTELPHIUSERMWAIT 10000
6634 #endif
// The getauxval() function is available in RHEL7 and SLES12. If the RTL is
// built on a system with an earlier OS, we'll use the following internal
// function when the entry is not found.
6638 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
6639 unsigned long getauxval(unsigned long) { return 0; }
6640 
6641 static void __kmp_user_level_mwait_init() {
  // When getauxval() and the correct value of AT_INTELPHIUSERMWAIT are
  // available, use them to find out whether user-level mwait is enabled.
  // Otherwise, forcibly set __kmp_mwait_enabled=TRUE on Intel MIC if the
  // environment variable KMP_USER_LEVEL_MWAIT was set to TRUE.
6646   if (__kmp_mic_type == mic3) {
6647     unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
6648     if ((res & 0x1) || __kmp_user_level_mwait) {
6649       __kmp_mwait_enabled = TRUE;
6650       if (__kmp_user_level_mwait) {
6651         KMP_INFORM(EnvMwaitWarn);
6652       }
6653     } else {
6654       __kmp_mwait_enabled = FALSE;
6655     }
6656   }
6657   KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
6658                 "__kmp_mwait_enabled = %d\n",
6659                 __kmp_mic_type, __kmp_mwait_enabled));
6660 }
6661 #endif /* KMP_HAVE_UMWAIT */
6662 
6663 static void __kmp_do_serial_initialize(void) {
6664   int i, gtid;
6665   size_t size;
6666 
6667   KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6668 
6669   KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6670   KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6671   KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6672   KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6673   KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6674 
6675 #if OMPT_SUPPORT
6676   ompt_pre_init();
6677 #endif
6678 
6679   __kmp_validate_locks();
6680 
6681   /* Initialize internal memory allocator */
6682   __kmp_init_allocator();
6683 
6684   /* Register the library startup via an environment variable and check to see
6685      whether another copy of the library is already registered. */
6686 
6687   __kmp_register_library_startup();
6688 
6689   /* TODO reinitialization of library */
6690   if (TCR_4(__kmp_global.g.g_done)) {
6691     KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6692   }
6693 
6694   __kmp_global.g.g_abort = 0;
6695   TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6696 
6697 /* initialize the locks */
6698 #if KMP_USE_ADAPTIVE_LOCKS
6699 #if KMP_DEBUG_ADAPTIVE_LOCKS
6700   __kmp_init_speculative_stats();
6701 #endif
6702 #endif
6703 #if KMP_STATS_ENABLED
6704   __kmp_stats_init();
6705 #endif
6706   __kmp_init_lock(&__kmp_global_lock);
6707   __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6708   __kmp_init_lock(&__kmp_debug_lock);
6709   __kmp_init_atomic_lock(&__kmp_atomic_lock);
6710   __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6711   __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6712   __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6713   __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6714   __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6715   __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6716   __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6717   __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6718   __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6719   __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6720   __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6721   __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6722   __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6723   __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6724 #if KMP_USE_MONITOR
6725   __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6726 #endif
6727   __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6728 
6729   /* conduct initialization and initial setup of configuration */
6730 
6731   __kmp_runtime_initialize();
6732 
6733 #if KMP_MIC_SUPPORTED
6734   __kmp_check_mic_type();
6735 #endif
6736 
6737 // Some global variable initialization moved here from kmp_env_initialize()
6738 #ifdef KMP_DEBUG
6739   kmp_diag = 0;
6740 #endif
6741   __kmp_abort_delay = 0;
6742 
6743   // From __kmp_init_dflt_team_nth()
6744   /* assume the entire machine will be used */
6745   __kmp_dflt_team_nth_ub = __kmp_xproc;
6746   if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6747     __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6748   }
6749   if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6750     __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6751   }
6752   __kmp_max_nth = __kmp_sys_max_nth;
6753   __kmp_cg_max_nth = __kmp_sys_max_nth;
6754   __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6755   if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6756     __kmp_teams_max_nth = __kmp_sys_max_nth;
6757   }
6758 
6759   // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6760   // part
6761   __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6762 #if KMP_USE_MONITOR
6763   __kmp_monitor_wakeups =
6764       KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6765   __kmp_bt_intervals =
6766       KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6767 #endif
6768   // From "KMP_LIBRARY" part of __kmp_env_initialize()
6769   __kmp_library = library_throughput;
6770   // From KMP_SCHEDULE initialization
6771   __kmp_static = kmp_sch_static_balanced;
// AC: do not use analytical here, because it is non-monotonic
6773 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6774 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6775 // need to repeat assignment
6776 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6777 // bit control and barrier method control parts
6778 #if KMP_FAST_REDUCTION_BARRIER
6779 #define kmp_reduction_barrier_gather_bb ((int)1)
6780 #define kmp_reduction_barrier_release_bb ((int)1)
6781 #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6782 #define kmp_reduction_barrier_release_pat bp_hyper_bar
6783 #endif // KMP_FAST_REDUCTION_BARRIER
6784   for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6785     __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6786     __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6787     __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6788     __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6789 #if KMP_FAST_REDUCTION_BARRIER
    if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only
      // (lin_64): hyper,1
6792       __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6793       __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6794       __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6795       __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6796     }
6797 #endif // KMP_FAST_REDUCTION_BARRIER
6798   }
6799 #if KMP_FAST_REDUCTION_BARRIER
6800 #undef kmp_reduction_barrier_release_pat
6801 #undef kmp_reduction_barrier_gather_pat
6802 #undef kmp_reduction_barrier_release_bb
6803 #undef kmp_reduction_barrier_gather_bb
6804 #endif // KMP_FAST_REDUCTION_BARRIER
6805 #if KMP_MIC_SUPPORTED
6806   if (__kmp_mic_type == mic2) { // KNC
6807     // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6808     __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6809     __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6810         1; // forkjoin release
6811     __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6812     __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6813   }
6814 #if KMP_FAST_REDUCTION_BARRIER
6815   if (__kmp_mic_type == mic2) { // KNC
6816     __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6817     __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6818   }
6819 #endif // KMP_FAST_REDUCTION_BARRIER
6820 #endif // KMP_MIC_SUPPORTED
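  // The branch bits set above encode the barrier branching factor as a power
  // of two: a value of b gives a fan-in/fan-out of 1 << b in the tree/hyper
  // barrier algorithms (see kmp_barrier.cpp), so, e.g., the KNC plain-barrier
  // gather value of 3 corresponds to gathering in groups of 8.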
6821 
6822 // From KMP_CHECKS initialization
6823 #ifdef KMP_DEBUG
6824   __kmp_env_checks = TRUE; /* development versions have the extra checks */
6825 #else
  __kmp_env_checks = FALSE; /* release versions do not have the extra checks */
6827 #endif
6828 
6829   // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6830   __kmp_foreign_tp = TRUE;
6831 
6832   __kmp_global.g.g_dynamic = FALSE;
6833   __kmp_global.g.g_dynamic_mode = dynamic_default;
6834 
6835   __kmp_env_initialize(NULL);
6836 
6837 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
6838   __kmp_user_level_mwait_init();
6839 #endif
6840 // Print all messages in message catalog for testing purposes.
6841 #ifdef KMP_DEBUG
6842   char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6843   if (__kmp_str_match_true(val)) {
6844     kmp_str_buf_t buffer;
6845     __kmp_str_buf_init(&buffer);
6846     __kmp_i18n_dump_catalog(&buffer);
6847     __kmp_printf("%s", buffer.str);
6848     __kmp_str_buf_free(&buffer);
6849   }
6850   __kmp_env_free(&val);
6851 #endif
6852 
6853   __kmp_threads_capacity =
6854       __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6855   // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6856   __kmp_tp_capacity = __kmp_default_tp_capacity(
6857       __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6858 
6859   // If the library is shut down properly, both pools must be NULL. Just in
6860   // case, set them to NULL -- some memory may leak, but subsequent code will
6861   // work even if pools are not freed.
6862   KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6863   KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6864   KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6865   __kmp_thread_pool = NULL;
6866   __kmp_thread_pool_insert_pt = NULL;
6867   __kmp_team_pool = NULL;
6868 
6869   /* Allocate all of the variable sized records */
6870   /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6871    * expandable */
6872   /* Since allocation is cache-aligned, just add extra padding at the end */
6873   size =
6874       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6875       CACHE_LINE;
6876   __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6877   __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6878                                sizeof(kmp_info_t *) * __kmp_threads_capacity);
6879 
6880   /* init thread counts */
6881   KMP_DEBUG_ASSERT(__kmp_all_nth ==
6882                    0); // Asserts fail if the library is reinitializing and
6883   KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
6884   __kmp_all_nth = 0;
6885   __kmp_nth = 0;
6886 
6887   /* setup the uber master thread and hierarchy */
6888   gtid = __kmp_register_root(TRUE);
6889   KA_TRACE(10, ("__kmp_do_serial_initialize  T#%d\n", gtid));
6890   KMP_ASSERT(KMP_UBER_GTID(gtid));
6891   KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6892 
6893   KMP_MB(); /* Flush all pending memory write invalidates.  */
6894 
6895   __kmp_common_initialize();
6896 
6897 #if KMP_OS_UNIX
6898   /* invoke the child fork handler */
6899   __kmp_register_atfork();
6900 #endif
6901 
6902 #if !KMP_DYNAMIC_LIB
6903   {
6904     /* Invoke the exit handler when the program finishes, only for static
6905        library. For dynamic library, we already have _fini and DllMain. */
6906     int rc = atexit(__kmp_internal_end_atexit);
6907     if (rc != 0) {
6908       __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6909                   __kmp_msg_null);
6910     }
6911   }
6912 #endif
6913 
6914 #if KMP_HANDLE_SIGNALS
6915 #if KMP_OS_UNIX
6916   /* NOTE: make sure that this is called before the user installs their own
     signal handlers so that the user handlers are called first. This way they
6918      can return false, not call our handler, avoid terminating the library, and
6919      continue execution where they left off. */
6920   __kmp_install_signals(FALSE);
6921 #endif /* KMP_OS_UNIX */
6922 #if KMP_OS_WINDOWS
6923   __kmp_install_signals(TRUE);
6924 #endif /* KMP_OS_WINDOWS */
6925 #endif
6926 
6927   /* we have finished the serial initialization */
6928   __kmp_init_counter++;
6929 
6930   __kmp_init_serial = TRUE;
6931 
6932   if (__kmp_settings) {
6933     __kmp_env_print();
6934   }
6935 
6936   if (__kmp_display_env || __kmp_display_env_verbose) {
6937     __kmp_env_print_2();
6938   }
6939 
6940 #if OMPT_SUPPORT
6941   ompt_post_init();
6942 #endif
6943 
6944   KMP_MB();
6945 
6946   KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
6947 }
6948 
6949 void __kmp_serial_initialize(void) {
6950   if (__kmp_init_serial) {
6951     return;
6952   }
6953   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6954   if (__kmp_init_serial) {
6955     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6956     return;
6957   }
6958   __kmp_do_serial_initialize();
6959   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6960 }
6961 
6962 static void __kmp_do_middle_initialize(void) {
6963   int i, j;
6964   int prev_dflt_team_nth;
6965 
6966   if (!__kmp_init_serial) {
6967     __kmp_do_serial_initialize();
6968   }
6969 
6970   KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
6971 
6972   // Save the previous value for the __kmp_dflt_team_nth so that
6973   // we can avoid some reinitialization if it hasn't changed.
6974   prev_dflt_team_nth = __kmp_dflt_team_nth;
6975 
6976 #if KMP_AFFINITY_SUPPORTED
6977   // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6978   // number of cores on the machine.
6979   __kmp_affinity_initialize();
6980 
6981   // Run through the __kmp_threads array and set the affinity mask
6982   // for each root thread that is currently registered with the RTL.
6983   for (i = 0; i < __kmp_threads_capacity; i++) {
6984     if (TCR_PTR(__kmp_threads[i]) != NULL) {
6985       __kmp_affinity_set_init_mask(i, TRUE);
6986     }
6987   }
6988 #endif /* KMP_AFFINITY_SUPPORTED */
6989 
6990   KMP_ASSERT(__kmp_xproc > 0);
6991   if (__kmp_avail_proc == 0) {
6992     __kmp_avail_proc = __kmp_xproc;
6993   }
6994 
6995   // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
6996   // correct them now
6997   j = 0;
6998   while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
6999     __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7000         __kmp_avail_proc;
7001     j++;
7002   }
7003 
7004   if (__kmp_dflt_team_nth == 0) {
7005 #ifdef KMP_DFLT_NTH_CORES
7006     // Default #threads = #cores
7007     __kmp_dflt_team_nth = __kmp_ncores;
7008     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7009                   "__kmp_ncores (%d)\n",
7010                   __kmp_dflt_team_nth));
7011 #else
7012     // Default #threads = #available OS procs
7013     __kmp_dflt_team_nth = __kmp_avail_proc;
7014     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7015                   "__kmp_avail_proc(%d)\n",
7016                   __kmp_dflt_team_nth));
7017 #endif /* KMP_DFLT_NTH_CORES */
7018   }
7019 
7020   if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7021     __kmp_dflt_team_nth = KMP_MIN_NTH;
7022   }
7023   if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7024     __kmp_dflt_team_nth = __kmp_sys_max_nth;
7025   }
7026 
7027   // There's no harm in continuing if the following check fails,
7028   // but it indicates an error in the previous logic.
7029   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7030 
7031   if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7032     // Run through the __kmp_threads array and set the num threads icv for each
7033     // root thread that is currently registered with the RTL (which has not
7034     // already explicitly set its nthreads-var with a call to
7035     // omp_set_num_threads()).
7036     for (i = 0; i < __kmp_threads_capacity; i++) {
7037       kmp_info_t *thread = __kmp_threads[i];
7038       if (thread == NULL)
7039         continue;
7040       if (thread->th.th_current_task->td_icvs.nproc != 0)
7041         continue;
7042 
7043       set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7044     }
7045   }
7046   KA_TRACE(
7047       20,
7048       ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7049        __kmp_dflt_team_nth));
7050 
7051 #ifdef KMP_ADJUST_BLOCKTIME
7052   /* Adjust blocktime to zero if necessary  now that __kmp_avail_proc is set */
7053   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7054     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7055     if (__kmp_nth > __kmp_avail_proc) {
7056       __kmp_zero_bt = TRUE;
7057     }
7058   }
7059 #endif /* KMP_ADJUST_BLOCKTIME */
7060 
7061   /* we have finished middle initialization */
7062   TCW_SYNC_4(__kmp_init_middle, TRUE);
7063 
7064   KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7065 }
7066 
7067 void __kmp_middle_initialize(void) {
7068   if (__kmp_init_middle) {
7069     return;
7070   }
7071   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7072   if (__kmp_init_middle) {
7073     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7074     return;
7075   }
7076   __kmp_do_middle_initialize();
7077   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7078 }
7079 
7080 void __kmp_parallel_initialize(void) {
7081   int gtid = __kmp_entry_gtid(); // this might be a new root
7082 
7083   /* synchronize parallel initialization (for sibling) */
7084   if (TCR_4(__kmp_init_parallel))
7085     return;
7086   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7087   if (TCR_4(__kmp_init_parallel)) {
7088     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7089     return;
7090   }
7091 
7092   /* TODO reinitialization after we have already shut down */
7093   if (TCR_4(__kmp_global.g.g_done)) {
7094     KA_TRACE(
7095         10,
7096         ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7097     __kmp_infinite_loop();
7098   }
7099 
7100   /* jc: The lock __kmp_initz_lock is already held, so calling
7101      __kmp_serial_initialize would cause a deadlock.  So we call
7102      __kmp_do_serial_initialize directly. */
7103   if (!__kmp_init_middle) {
7104     __kmp_do_middle_initialize();
7105   }
7106   __kmp_resume_if_hard_paused();
7107 
7108   /* begin initialization */
7109   KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7110   KMP_ASSERT(KMP_UBER_GTID(gtid));
7111 
7112 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7113   // Save the FP control regs.
7114   // Worker threads will set theirs to these values at thread startup.
7115   __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7116   __kmp_store_mxcsr(&__kmp_init_mxcsr);
7117   __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7118 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7119 
7120 #if KMP_OS_UNIX
7121 #if KMP_HANDLE_SIGNALS
7122   /*  must be after __kmp_serial_initialize  */
7123   __kmp_install_signals(TRUE);
7124 #endif
7125 #endif
7126 
7127   __kmp_suspend_initialize();
7128 
7129 #if defined(USE_LOAD_BALANCE)
7130   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7131     __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7132   }
7133 #else
7134   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7135     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7136   }
7137 #endif
7138 
7139   if (__kmp_version) {
7140     __kmp_print_version_2();
7141   }
7142 
7143   /* we have finished parallel initialization */
7144   TCW_SYNC_4(__kmp_init_parallel, TRUE);
7145 
7146   KMP_MB();
7147   KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7148 
7149   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7150 }
7151 
7152 void __kmp_hidden_helper_initialize() {
7153   if (TCR_4(__kmp_init_hidden_helper))
7154     return;
7155 
7156   // __kmp_parallel_initialize is required before we initialize hidden helper
7157   if (!TCR_4(__kmp_init_parallel))
7158     __kmp_parallel_initialize();
7159 
7160   // Double check. Note that this double check should not be placed before
  // __kmp_parallel_initialize as it would cause a deadlock.
7162   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7163   if (TCR_4(__kmp_init_hidden_helper)) {
7164     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7165     return;
7166   }
7167 
7168   // Set the count of hidden helper tasks to be executed to zero
7169   KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7170 
7171   // Set the global variable indicating that we're initializing hidden helper
7172   // team/threads
7173   TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7174 
7175   // Platform independent initialization
7176   __kmp_do_initialize_hidden_helper_threads();
7177 
7178   // Wait here for the finish of initialization of hidden helper teams
7179   __kmp_hidden_helper_threads_initz_wait();
7180 
7181   // We have finished hidden helper initialization
7182   TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7183 
7184   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7185 }
7186 
7187 /* ------------------------------------------------------------------------ */
7188 
7189 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7190                                    kmp_team_t *team) {
7191   kmp_disp_t *dispatch;
7192 
7193   KMP_MB();
7194 
7195   /* none of the threads have encountered any constructs, yet. */
7196   this_thr->th.th_local.this_construct = 0;
7197 #if KMP_CACHE_MANAGE
7198   KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7199 #endif /* KMP_CACHE_MANAGE */
7200   dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7201   KMP_DEBUG_ASSERT(dispatch);
7202   KMP_DEBUG_ASSERT(team->t.t_dispatch);
7203   // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7204   // this_thr->th.th_info.ds.ds_tid ] );
7205 
7206   dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7207   dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7208   if (__kmp_env_consistency_check)
7209     __kmp_push_parallel(gtid, team->t.t_ident);
7210 
7211   KMP_MB(); /* Flush all pending memory write invalidates.  */
7212 }
7213 
7214 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7215                                   kmp_team_t *team) {
7216   if (__kmp_env_consistency_check)
7217     __kmp_pop_parallel(gtid, team->t.t_ident);
7218 
7219   __kmp_finish_implicit_task(this_thr);
7220 }
7221 
7222 int __kmp_invoke_task_func(int gtid) {
7223   int rc;
7224   int tid = __kmp_tid_from_gtid(gtid);
7225   kmp_info_t *this_thr = __kmp_threads[gtid];
7226   kmp_team_t *team = this_thr->th.th_team;
7227 
7228   __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7229 #if USE_ITT_BUILD
7230   if (__itt_stack_caller_create_ptr) {
7231     // inform ittnotify about entering user's code
7232     if (team->t.t_stack_id != NULL) {
7233       __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id);
7234     } else {
7235       KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7236       __kmp_itt_stack_callee_enter(
7237           (__itt_caller)team->t.t_parent->t.t_stack_id);
7238     }
7239   }
7240 #endif /* USE_ITT_BUILD */
7241 #if INCLUDE_SSC_MARKS
7242   SSC_MARK_INVOKING();
7243 #endif
7244 
7245 #if OMPT_SUPPORT
7246   void *dummy;
7247   void **exit_frame_p;
7248   ompt_data_t *my_task_data;
7249   ompt_data_t *my_parallel_data;
7250   int ompt_team_size;
7251 
7252   if (ompt_enabled.enabled) {
7253     exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
7254                          .ompt_task_info.frame.exit_frame.ptr);
7255   } else {
7256     exit_frame_p = &dummy;
7257   }
7258 
7259   my_task_data =
7260       &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7261   my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7262   if (ompt_enabled.ompt_callback_implicit_task) {
7263     ompt_team_size = team->t.t_nproc;
7264     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7265         ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7266         __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7267     OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7268   }
7269 #endif
7270 
7271 #if KMP_STATS_ENABLED
7272   stats_state_e previous_state = KMP_GET_THREAD_STATE();
7273   if (previous_state == stats_state_e::TEAMS_REGION) {
7274     KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7275   } else {
7276     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7277   }
7278   KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7279 #endif
7280 
7281   rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7282                               tid, (int)team->t.t_argc, (void **)team->t.t_argv
7283 #if OMPT_SUPPORT
7284                               ,
7285                               exit_frame_p
7286 #endif
7287   );
7288 #if OMPT_SUPPORT
7289   *exit_frame_p = NULL;
7290   this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7291 #endif
7292 
7293 #if KMP_STATS_ENABLED
7294   if (previous_state == stats_state_e::TEAMS_REGION) {
7295     KMP_SET_THREAD_STATE(previous_state);
7296   }
7297   KMP_POP_PARTITIONED_TIMER();
7298 #endif
7299 
7300 #if USE_ITT_BUILD
7301   if (__itt_stack_caller_create_ptr) {
7302     // inform ittnotify about leaving user's code
7303     if (team->t.t_stack_id != NULL) {
7304       __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id);
7305     } else {
7306       KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7307       __kmp_itt_stack_callee_leave(
7308           (__itt_caller)team->t.t_parent->t.t_stack_id);
7309     }
7310   }
7311 #endif /* USE_ITT_BUILD */
7312   __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7313 
7314   return rc;
7315 }
7316 
7317 void __kmp_teams_master(int gtid) {
7318   // This routine is called by all primary threads in teams construct
7319   kmp_info_t *thr = __kmp_threads[gtid];
7320   kmp_team_t *team = thr->th.th_team;
7321   ident_t *loc = team->t.t_ident;
7322   thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7323   KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7324   KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7325   KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7326                 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7327 
7328   // This thread is a new CG root.  Set up the proper variables.
7329   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7330   tmp->cg_root = thr; // Make thr the CG root
7331   // Init to thread limit stored when league primary threads were forked
7332   tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7333   tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7334   KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7335                  " cg_nthreads to 1\n",
7336                  thr, tmp));
7337   tmp->up = thr->th.th_cg_roots;
7338   thr->th.th_cg_roots = tmp;
7339 
// Launch the league of teams now, but do not let the workers execute
// (they hang on the fork barrier until the next parallel region)
7342 #if INCLUDE_SSC_MARKS
7343   SSC_MARK_FORKING();
7344 #endif
7345   __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7346                   (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7347                   VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7348 #if INCLUDE_SSC_MARKS
7349   SSC_MARK_JOINING();
7350 #endif
7351   // If the team size was reduced from the limit, set it to the new size
7352   if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7353     thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7354   // AC: last parameter "1" eliminates join barrier which won't work because
7355   // worker threads are in a fork barrier waiting for more parallel regions
7356   __kmp_join_call(loc, gtid
7357 #if OMPT_SUPPORT
7358                   ,
7359                   fork_context_intel
7360 #endif
7361                   ,
7362                   1);
7363 }
7364 
7365 int __kmp_invoke_teams_master(int gtid) {
7366   kmp_info_t *this_thr = __kmp_threads[gtid];
7367   kmp_team_t *team = this_thr->th.th_team;
7368 #if KMP_DEBUG
7369   if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7370     KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7371                      (void *)__kmp_teams_master);
7372 #endif
7373   __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7374 #if OMPT_SUPPORT
7375   int tid = __kmp_tid_from_gtid(gtid);
7376   ompt_data_t *task_data =
7377       &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7378   ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7379   if (ompt_enabled.ompt_callback_implicit_task) {
7380     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7381         ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7382         ompt_task_initial);
7383     OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7384   }
7385 #endif
7386   __kmp_teams_master(gtid);
7387 #if OMPT_SUPPORT
7388   this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7389 #endif
7390   __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7391   return 1;
7392 }
7393 
/* This sets the requested number of threads for the next parallel region
   encountered by this team. Since this should be enclosed in the fork/join
   critical section, it should avoid race conditions with asymmetrical nested
   parallelism. */
7398 
7399 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7400   kmp_info_t *thr = __kmp_threads[gtid];
7401 
7402   if (num_threads > 0)
7403     thr->th.th_set_nproc = num_threads;
7404 }
7405 
7406 static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
7407                                     int num_threads) {
7408   KMP_DEBUG_ASSERT(thr);
7409   // Remember the number of threads for inner parallel regions
7410   if (!TCR_4(__kmp_init_middle))
7411     __kmp_middle_initialize(); // get internal globals calculated
7412   KMP_DEBUG_ASSERT(__kmp_avail_proc);
7413   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7414 
7415   if (num_threads == 0) {
7416     if (__kmp_teams_thread_limit > 0) {
7417       num_threads = __kmp_teams_thread_limit;
7418     } else {
7419       num_threads = __kmp_avail_proc / num_teams;
7420     }
    // adjust num_threads w/o warning as it is not a user setting
7422     // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7423     // no thread_limit clause specified -  do not change thread-limit-var ICV
7424     if (num_threads > __kmp_dflt_team_nth) {
7425       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7426     }
7427     if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7428       num_threads = thr->th.th_current_task->td_icvs.thread_limit;
    } // prevent the team size from exceeding thread-limit-var
7430     if (num_teams * num_threads > __kmp_teams_max_nth) {
7431       num_threads = __kmp_teams_max_nth / num_teams;
7432     }
7433     if (num_threads == 0) {
7434       num_threads = 1;
7435     }
7436   } else {
    // This thread will be the primary thread of the league's primary threads
7438     // Store new thread limit; old limit is saved in th_cg_roots list
7439     thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7440     // num_threads = min(num_threads, nthreads-var)
7441     if (num_threads > __kmp_dflt_team_nth) {
7442       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7443     }
7444     if (num_teams * num_threads > __kmp_teams_max_nth) {
7445       int new_threads = __kmp_teams_max_nth / num_teams;
7446       if (new_threads == 0) {
7447         new_threads = 1;
7448       }
7449       if (new_threads != num_threads) {
7450         if (!__kmp_reserve_warn) { // user asked for too many threads
7451           __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7452           __kmp_msg(kmp_ms_warning,
7453                     KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7454                     KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7455         }
7456       }
7457       num_threads = new_threads;
7458     }
7459   }
7460   thr->th.th_teams_size.nth = num_threads;
7461 }
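
// Illustration of the clamping above (hypothetical numbers): with
// __kmp_avail_proc = 64, num_teams = 4, no thread_limit clause
// (num_threads == 0) and __kmp_teams_thread_limit unset, the initial value is
// 64 / 4 = 16 threads per team; it is then clamped to nthreads-var, to the
// enclosing thread-limit-var, and to __kmp_teams_max_nth / num_teams before
// being stored in th_teams_size.nth.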
7462 
7463 /* this sets the requested number of teams for the teams region and/or
7464    the number of threads for the next parallel region encountered  */
7465 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7466                           int num_threads) {
7467   kmp_info_t *thr = __kmp_threads[gtid];
7468   KMP_DEBUG_ASSERT(num_teams >= 0);
7469   KMP_DEBUG_ASSERT(num_threads >= 0);
7470 
7471   if (num_teams == 0) {
7472     if (__kmp_nteams > 0) {
7473       num_teams = __kmp_nteams;
7474     } else {
7475       num_teams = 1; // default number of teams is 1.
7476     }
7477   }
  if (num_teams > __kmp_teams_max_nth) { // were too many teams requested?
7479     if (!__kmp_reserve_warn) {
7480       __kmp_reserve_warn = 1;
7481       __kmp_msg(kmp_ms_warning,
7482                 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7483                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7484     }
7485     num_teams = __kmp_teams_max_nth;
7486   }
7487   // Set number of teams (number of threads in the outer "parallel" of the
7488   // teams)
7489   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7490 
7491   __kmp_push_thread_limit(thr, num_teams, num_threads);
7492 }
7493 
7494 /* This sets the requested number of teams for the teams region and/or
7495    the number of threads for the next parallel region encountered  */
7496 void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
7497                              int num_teams_ub, int num_threads) {
7498   kmp_info_t *thr = __kmp_threads[gtid];
7499   KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
7500   KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
7501   KMP_DEBUG_ASSERT(num_threads >= 0);
7502 
7503   if (num_teams_lb > num_teams_ub) {
7504     __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
7505                 KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
7506   }
7507 
  int num_teams = 1; // default number of teams is 1.
7509 
7510   if (num_teams_lb == 0 && num_teams_ub > 0)
7511     num_teams_lb = num_teams_ub;
7512 
7513   if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
7514     num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
7515     if (num_teams > __kmp_teams_max_nth) {
7516       if (!__kmp_reserve_warn) {
7517         __kmp_reserve_warn = 1;
7518         __kmp_msg(kmp_ms_warning,
7519                   KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7520                   KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7521       }
7522       num_teams = __kmp_teams_max_nth;
7523     }
7524   } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
7525     num_teams = num_teams_ub;
7526   } else { // num_teams_lb <= num_teams <= num_teams_ub
7527     if (num_threads == 0) {
7528       if (num_teams_ub > __kmp_teams_max_nth) {
7529         num_teams = num_teams_lb;
7530       } else {
7531         num_teams = num_teams_ub;
7532       }
7533     } else {
7534       num_teams = (num_threads > __kmp_teams_max_nth)
7535                       ? num_teams
7536                       : __kmp_teams_max_nth / num_threads;
7537       if (num_teams < num_teams_lb) {
7538         num_teams = num_teams_lb;
7539       } else if (num_teams > num_teams_ub) {
7540         num_teams = num_teams_ub;
7541       }
7542     }
7543   }
7544   // Set number of teams (number of threads in the outer "parallel" of the
7545   // teams)
7546   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7547 
7548   __kmp_push_thread_limit(thr, num_teams, num_threads);
7549 }
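
// Illustration of the bounds handling above (hypothetical numbers): for
// num_teams(4:16) with no thread_limit clause (num_threads == 0) and
// __kmp_teams_max_nth >= 16, the upper bound 16 is chosen; with
// num_threads == 8 and __kmp_teams_max_nth == 64, num_teams becomes
// 64 / 8 = 8, which already lies within [4, 16].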
7550 
7551 // Set the proc_bind var to use in the following parallel region.
7552 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7553   kmp_info_t *thr = __kmp_threads[gtid];
7554   thr->th.th_set_proc_bind = proc_bind;
7555 }
7556 
7557 /* Launch the worker threads into the microtask. */
7558 
7559 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7560   kmp_info_t *this_thr = __kmp_threads[gtid];
7561 
7562 #ifdef KMP_DEBUG
7563   int f;
7564 #endif /* KMP_DEBUG */
7565 
7566   KMP_DEBUG_ASSERT(team);
7567   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7568   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7569   KMP_MB(); /* Flush all pending memory write invalidates.  */
7570 
7571   team->t.t_construct = 0; /* no single directives seen yet */
7572   team->t.t_ordered.dt.t_value =
7573       0; /* thread 0 enters the ordered section first */
7574 
7575   /* Reset the identifiers on the dispatch buffer */
7576   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7577   if (team->t.t_max_nproc > 1) {
7578     int i;
7579     for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7580       team->t.t_disp_buffer[i].buffer_index = i;
7581       team->t.t_disp_buffer[i].doacross_buf_idx = i;
7582     }
7583   } else {
7584     team->t.t_disp_buffer[0].buffer_index = 0;
7585     team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7586   }
7587 
7588   KMP_MB(); /* Flush all pending memory write invalidates.  */
7589   KMP_ASSERT(this_thr->th.th_team == team);
7590 
7591 #ifdef KMP_DEBUG
7592   for (f = 0; f < team->t.t_nproc; f++) {
7593     KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7594                      team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7595   }
7596 #endif /* KMP_DEBUG */
7597 
7598   /* release the worker threads so they may begin working */
7599   __kmp_fork_barrier(gtid, 0);
7600 }
7601 
7602 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7603   kmp_info_t *this_thr = __kmp_threads[gtid];
7604 
7605   KMP_DEBUG_ASSERT(team);
7606   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7607   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7608   KMP_MB(); /* Flush all pending memory write invalidates.  */
7609 
7610   /* Join barrier after fork */
7611 
7612 #ifdef KMP_DEBUG
7613   if (__kmp_threads[gtid] &&
7614       __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7615     __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7616                  __kmp_threads[gtid]);
7617     __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7618                  "team->t.t_nproc=%d\n",
7619                  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7620                  team->t.t_nproc);
7621     __kmp_print_structure();
7622   }
7623   KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7624                    __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7625 #endif /* KMP_DEBUG */
7626 
7627   __kmp_join_barrier(gtid); /* wait for everyone */
7628 #if OMPT_SUPPORT
7629   if (ompt_enabled.enabled &&
7630       this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7631     int ds_tid = this_thr->th.th_info.ds.ds_tid;
7632     ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7633     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7634 #if OMPT_OPTIONAL
7635     void *codeptr = NULL;
7636     if (KMP_MASTER_TID(ds_tid) &&
7637         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7638          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7639       codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7640 
7641     if (ompt_enabled.ompt_callback_sync_region_wait) {
7642       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7643           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7644           codeptr);
7645     }
7646     if (ompt_enabled.ompt_callback_sync_region) {
7647       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7648           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7649           codeptr);
7650     }
7651 #endif
7652     if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7653       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7654           ompt_scope_end, NULL, task_data, 0, ds_tid,
7655           ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7656     }
7657   }
7658 #endif
7659 
7660   KMP_MB(); /* Flush all pending memory write invalidates.  */
7661   KMP_ASSERT(this_thr->th.th_team == team);
7662 }
7663 
7664 /* ------------------------------------------------------------------------ */
7665 
7666 #ifdef USE_LOAD_BALANCE
7667 
// Return the number of worker threads actively spinning in the hot team, if
// we are at the outermost level of parallelism. Otherwise, return 0.
7670 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7671   int i;
7672   int retval;
7673   kmp_team_t *hot_team;
7674 
7675   if (root->r.r_active) {
7676     return 0;
7677   }
7678   hot_team = root->r.r_hot_team;
7679   if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7680     return hot_team->t.t_nproc - 1; // Don't count primary thread
7681   }
7682 
7683   // Skip the primary thread - it is accounted for elsewhere.
7684   retval = 0;
7685   for (i = 1; i < hot_team->t.t_nproc; i++) {
7686     if (hot_team->t.t_threads[i]->th.th_active) {
7687       retval++;
7688     }
7689   }
7690   return retval;
7691 }
7692 
7693 // Perform an automatic adjustment to the number of
7694 // threads used by the next parallel region.
7695 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7696   int retval;
7697   int pool_active;
7698   int hot_team_active;
7699   int team_curr_active;
7700   int system_active;
7701 
7702   KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7703                 set_nproc));
7704   KMP_DEBUG_ASSERT(root);
7705   KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7706                        ->th.th_current_task->td_icvs.dynamic == TRUE);
7707   KMP_DEBUG_ASSERT(set_nproc > 1);
7708 
7709   if (set_nproc == 1) {
7710     KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7711     return 1;
7712   }
7713 
7714   // Threads that are active in the thread pool, active in the hot team for this
7715   // particular root (if we are at the outer par level), and the currently
7716   // executing thread (to become the primary thread) are available to add to the
7717   // new team, but are currently contributing to the system load, and must be
7718   // accounted for.
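  // For example (hypothetical counts): 2 idle threads in the pool plus 7
  // active workers in the hot team plus this thread give
  // team_curr_active = 2 + 7 + 1 = 10.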
7719   pool_active = __kmp_thread_pool_active_nth;
7720   hot_team_active = __kmp_active_hot_team_nproc(root);
7721   team_curr_active = pool_active + hot_team_active + 1;
7722 
7723   // Check the system load.
7724   system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7725   KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7726                 "hot team active = %d\n",
7727                 system_active, pool_active, hot_team_active));
7728 
7729   if (system_active < 0) {
7730     // There was an error reading the necessary info from /proc, so use the
7731     // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7732     // = dynamic_thread_limit, we shouldn't wind up getting back here.
7733     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7734     KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7735 
7736     // Make this call behave like the thread limit algorithm.
7737     retval = __kmp_avail_proc - __kmp_nth +
7738              (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7739     if (retval > set_nproc) {
7740       retval = set_nproc;
7741     }
7742     if (retval < KMP_MIN_NTH) {
7743       retval = KMP_MIN_NTH;
7744     }
7745 
7746     KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7747                   retval));
7748     return retval;
7749   }
7750 
7751   // There is a slight delay in the load balance algorithm in detecting new
7752   // running procs. The real system load at this instant should be at least as
  // large as the #active omp threads that are available to add to the team.
7754   if (system_active < team_curr_active) {
7755     system_active = team_curr_active;
7756   }
7757   retval = __kmp_avail_proc - system_active + team_curr_active;
7758   if (retval > set_nproc) {
7759     retval = set_nproc;
7760   }
7761   if (retval < KMP_MIN_NTH) {
7762     retval = KMP_MIN_NTH;
7763   }
7764 
7765   KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7766   return retval;
7767 } // __kmp_load_balance_nproc()
7768 
7769 #endif /* USE_LOAD_BALANCE */
7770 
7771 /* ------------------------------------------------------------------------ */
7772 
7773 /* NOTE: this is called with the __kmp_init_lock held */
7774 void __kmp_cleanup(void) {
7775   int f;
7776 
7777   KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7778 
7779   if (TCR_4(__kmp_init_parallel)) {
7780 #if KMP_HANDLE_SIGNALS
7781     __kmp_remove_signals();
7782 #endif
7783     TCW_4(__kmp_init_parallel, FALSE);
7784   }
7785 
7786   if (TCR_4(__kmp_init_middle)) {
7787 #if KMP_AFFINITY_SUPPORTED
7788     __kmp_affinity_uninitialize();
7789 #endif /* KMP_AFFINITY_SUPPORTED */
7790     __kmp_cleanup_hierarchy();
7791     TCW_4(__kmp_init_middle, FALSE);
7792   }
7793 
7794   KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7795 
7796   if (__kmp_init_serial) {
7797     __kmp_runtime_destroy();
7798     __kmp_init_serial = FALSE;
7799   }
7800 
7801   __kmp_cleanup_threadprivate_caches();
7802 
7803   for (f = 0; f < __kmp_threads_capacity; f++) {
7804     if (__kmp_root[f] != NULL) {
7805       __kmp_free(__kmp_root[f]);
7806       __kmp_root[f] = NULL;
7807     }
7808   }
7809   __kmp_free(__kmp_threads);
  // __kmp_threads and __kmp_root were allocated at once as a single block, so
  // there is no need to free __kmp_root separately.
7812   __kmp_threads = NULL;
7813   __kmp_root = NULL;
7814   __kmp_threads_capacity = 0;
7815 
7816 #if KMP_USE_DYNAMIC_LOCK
7817   __kmp_cleanup_indirect_user_locks();
7818 #else
7819   __kmp_cleanup_user_locks();
7820 #endif
7821 
7822 #if KMP_AFFINITY_SUPPORTED
7823   KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7824   __kmp_cpuinfo_file = NULL;
7825 #endif /* KMP_AFFINITY_SUPPORTED */
7826 
7827 #if KMP_USE_ADAPTIVE_LOCKS
7828 #if KMP_DEBUG_ADAPTIVE_LOCKS
7829   __kmp_print_speculative_stats();
7830 #endif
7831 #endif
7832   KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7833   __kmp_nested_nth.nth = NULL;
7834   __kmp_nested_nth.size = 0;
7835   __kmp_nested_nth.used = 0;
7836   KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7837   __kmp_nested_proc_bind.bind_types = NULL;
7838   __kmp_nested_proc_bind.size = 0;
7839   __kmp_nested_proc_bind.used = 0;
7840   if (__kmp_affinity_format) {
7841     KMP_INTERNAL_FREE(__kmp_affinity_format);
7842     __kmp_affinity_format = NULL;
7843   }
7844 
7845   __kmp_i18n_catclose();
7846 
7847 #if KMP_USE_HIER_SCHED
7848   __kmp_hier_scheds.deallocate();
7849 #endif
7850 
7851 #if KMP_STATS_ENABLED
7852   __kmp_stats_fini();
7853 #endif
7854 
7855   KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7856 }
7857 
7858 /* ------------------------------------------------------------------------ */
7859 
7860 int __kmp_ignore_mppbeg(void) {
7861   char *env;
7862 
7863   if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7864     if (__kmp_str_match_false(env))
7865       return FALSE;
7866   }
  // By default __kmpc_begin() is a no-op.
7868   return TRUE;
7869 }
7870 
7871 int __kmp_ignore_mppend(void) {
7872   char *env;
7873 
7874   if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7875     if (__kmp_str_match_false(env))
7876       return FALSE;
7877   }
  // By default __kmpc_end() is a no-op.
7879   return TRUE;
7880 }
7881 
7882 void __kmp_internal_begin(void) {
7883   int gtid;
7884   kmp_root_t *root;
7885 
7886   /* this is a very important step as it will register new sibling threads
7887      and assign these new uber threads a new gtid */
7888   gtid = __kmp_entry_gtid();
7889   root = __kmp_threads[gtid]->th.th_root;
7890   KMP_ASSERT(KMP_UBER_GTID(gtid));
7891 
7892   if (root->r.r_begin)
7893     return;
7894   __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7895   if (root->r.r_begin) {
7896     __kmp_release_lock(&root->r.r_begin_lock, gtid);
7897     return;
7898   }
7899 
7900   root->r.r_begin = TRUE;
7901 
7902   __kmp_release_lock(&root->r.r_begin_lock, gtid);
7903 }
7904 
7905 /* ------------------------------------------------------------------------ */
7906 
7907 void __kmp_user_set_library(enum library_type arg) {
7908   int gtid;
7909   kmp_root_t *root;
7910   kmp_info_t *thread;
7911 
7912   /* first, make sure we are initialized so we can get our gtid */
7913 
7914   gtid = __kmp_entry_gtid();
7915   thread = __kmp_threads[gtid];
7916 
7917   root = thread->th.th_root;
7918 
7919   KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7920                 library_serial));
7921   if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7922                                   thread */
7923     KMP_WARNING(SetLibraryIncorrectCall);
7924     return;
7925   }
7926 
7927   switch (arg) {
7928   case library_serial:
7929     thread->th.th_set_nproc = 0;
7930     set__nproc(thread, 1);
7931     break;
7932   case library_turnaround:
7933     thread->th.th_set_nproc = 0;
7934     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7935                                            : __kmp_dflt_team_nth_ub);
7936     break;
7937   case library_throughput:
7938     thread->th.th_set_nproc = 0;
7939     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7940                                            : __kmp_dflt_team_nth_ub);
7941     break;
7942   default:
7943     KMP_FATAL(UnknownLibraryType, arg);
7944   }
7945 
7946   __kmp_aux_set_library(arg);
7947 }
7948 
7949 void __kmp_aux_set_stacksize(size_t arg) {
7950   if (!__kmp_init_serial)
7951     __kmp_serial_initialize();
7952 
7953 #if KMP_OS_DARWIN
7954   if (arg & (0x1000 - 1)) {
7955     arg &= ~(0x1000 - 1);
7956     if (arg + 0x1000) /* check for overflow if we round up */
7957       arg += 0x1000;
7958   }
7959 #endif
7960   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7961 
7962   /* only change the default stacksize before the first parallel region */
7963   if (!TCR_4(__kmp_init_parallel)) {
7964     size_t value = arg; /* argument is in bytes */
7965 
7966     if (value < __kmp_sys_min_stksize)
7967       value = __kmp_sys_min_stksize;
7968     else if (value > KMP_MAX_STKSIZE)
7969       value = KMP_MAX_STKSIZE;
7970 
7971     __kmp_stksize = value;
7972 
7973     __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
7974   }
7975 
7976   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7977 }
7978 
7979 /* set the behaviour of the runtime library */
7980 /* TODO this can cause some odd behaviour with sibling parallelism... */
7981 void __kmp_aux_set_library(enum library_type arg) {
7982   __kmp_library = arg;
7983 
7984   switch (__kmp_library) {
7985   case library_serial: {
7986     KMP_INFORM(LibraryIsSerial);
7987   } break;
7988   case library_turnaround:
7989     if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
7990       __kmp_use_yield = 2; // only yield when oversubscribed
7991     break;
7992   case library_throughput:
7993     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
7994       __kmp_dflt_blocktime = 200;
7995     break;
7996   default:
7997     KMP_FATAL(UnknownLibraryType, arg);
7998   }
7999 }
8000 
8001 /* Getting team information common for all team API */
8002 // Returns NULL if not in teams construct
8003 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
8004   kmp_info_t *thr = __kmp_entry_thread();
8005   teams_serialized = 0;
8006   if (thr->th.th_teams_microtask) {
8007     kmp_team_t *team = thr->th.th_team;
8008     int tlevel = thr->th.th_teams_level; // the level of the teams construct
8009     int ii = team->t.t_level;
8010     teams_serialized = team->t.t_serialized;
8011     int level = tlevel + 1;
8012     KMP_DEBUG_ASSERT(ii >= tlevel);
8013     while (ii > level) {
8014       for (teams_serialized = team->t.t_serialized;
8015            (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
8016       }
8017       if (team->t.t_serialized && (!teams_serialized)) {
8018         team = team->t.t_parent;
8019         continue;
8020       }
8021       if (ii > level) {
8022         team = team->t.t_parent;
8023         ii--;
8024       }
8025     }
8026     return team;
8027   }
8028   return NULL;
8029 }
8030 
8031 int __kmp_aux_get_team_num() {
8032   int serialized;
8033   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8034   if (team) {
8035     if (serialized > 1) {
8036       return 0; // teams region is serialized ( 1 team of 1 thread ).
8037     } else {
8038       return team->t.t_master_tid;
8039     }
8040   }
8041   return 0;
8042 }
8043 
8044 int __kmp_aux_get_num_teams() {
8045   int serialized;
8046   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8047   if (team) {
8048     if (serialized > 1) {
8049       return 1;
8050     } else {
8051       return team->t.t_parent->t.t_nproc;
8052     }
8053   }
8054   return 1;
8055 }
8056 
8057 /* ------------------------------------------------------------------------ */
8058 
8059 /*
8060  * Affinity Format Parser
8061  *
8062  * Field is in form of: %[[[0].]size]type
8063  * % and type are required (%% means print a literal '%')
8064  * type is either single char or long name surrounded by {},
8065  * e.g., N or {num_threads}
8066  * 0 => leading zeros
8067  * . => right justified when size is specified
8068  * by default output is left justified
8069  * size is the *minimum* field length
8070  * All other characters are printed as is
8071  *
8072  * Available field types:
 * t {team_num}          - omp_get_team_num()
 * T {num_teams}         - omp_get_num_teams()
 * L {nesting_level}     - omp_get_level()
 * n {thread_num}        - omp_get_thread_num()
 * N {num_threads}       - omp_get_num_threads()
 * a {ancestor_tnum}     - omp_get_ancestor_thread_num(omp_get_level()-1)
 * H {host}              - name of host machine
 * P {process_id}        - process id (integer)
 * i {native_thread_id}  - native thread identifier (integer)
 * A {thread_affinity}   - comma separated list of integers or integer ranges
 *                         (values of affinity mask)
8082  *
8083  * Implementation-specific field types can be added
8084  * If a type is unknown, print "undefined"
8085  */
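
/* Example (illustrative only; actual values depend on the system and thread):
   a format such as "host=%H tid=%0.4n aff=%A" might expand to something like
   "host=node01 tid=0002 aff=0-3" for thread 2 bound to cores 0-3. */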
8086 
8087 // Structure holding the short name, long name, and corresponding data type
8088 // for snprintf.  A table of these will represent the entire valid keyword
8089 // field types.
8090 typedef struct kmp_affinity_format_field_t {
8091   char short_name; // from spec e.g., L -> thread level
8092   const char *long_name; // from spec thread_level -> thread level
8093   char field_format; // data type for snprintf (typically 'd' or 's'
8094   // for integer or string)
8095 } kmp_affinity_format_field_t;
8096 
8097 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
8098 #if KMP_AFFINITY_SUPPORTED
8099     {'A', "thread_affinity", 's'},
8100 #endif
8101     {'t', "team_num", 'd'},
8102     {'T', "num_teams", 'd'},
8103     {'L', "nesting_level", 'd'},
8104     {'n', "thread_num", 'd'},
8105     {'N', "num_threads", 'd'},
8106     {'a', "ancestor_tnum", 'd'},
8107     {'H', "host", 's'},
8108     {'P', "process_id", 'd'},
8109     {'i', "native_thread_id", 'd'}};
8110 
// Return the number of characters it takes to hold the field
8112 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8113                                             const char **ptr,
8114                                             kmp_str_buf_t *field_buffer) {
8115   int rc, format_index, field_value;
8116   const char *width_left, *width_right;
8117   bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8118   static const int FORMAT_SIZE = 20;
8119   char format[FORMAT_SIZE] = {0};
8120   char absolute_short_name = 0;
8121 
8122   KMP_DEBUG_ASSERT(gtid >= 0);
8123   KMP_DEBUG_ASSERT(th);
8124   KMP_DEBUG_ASSERT(**ptr == '%');
8125   KMP_DEBUG_ASSERT(field_buffer);
8126 
8127   __kmp_str_buf_clear(field_buffer);
8128 
8129   // Skip the initial %
8130   (*ptr)++;
8131 
8132   // Check for %% first
8133   if (**ptr == '%') {
8134     __kmp_str_buf_cat(field_buffer, "%", 1);
8135     (*ptr)++; // skip over the second %
8136     return 1;
8137   }
8138 
8139   // Parse field modifiers if they are present
8140   pad_zeros = false;
8141   if (**ptr == '0') {
8142     pad_zeros = true;
8143     (*ptr)++; // skip over 0
8144   }
8145   right_justify = false;
8146   if (**ptr == '.') {
8147     right_justify = true;
8148     (*ptr)++; // skip over .
8149   }
8150   // Parse width of field: [width_left, width_right)
8151   width_left = width_right = NULL;
8152   if (**ptr >= '0' && **ptr <= '9') {
8153     width_left = *ptr;
8154     SKIP_DIGITS(*ptr);
8155     width_right = *ptr;
8156   }
8157 
8158   // Create the format for KMP_SNPRINTF based on flags parsed above
8159   format_index = 0;
8160   format[format_index++] = '%';
8161   if (!right_justify)
8162     format[format_index++] = '-';
8163   if (pad_zeros)
8164     format[format_index++] = '0';
8165   if (width_left && width_right) {
8166     int i = 0;
8167     // Only allow 8 digit number widths.
    // This also prevents overflowing the format variable.
8169     while (i < 8 && width_left < width_right) {
8170       format[format_index++] = *width_left;
8171       width_left++;
8172       i++;
8173     }
8174   }
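  // E.g. (derived from the rules above): "%0.4n" yields the snprintf format
  // "%04d" once the field type is appended, while "%4n" (left-justified by
  // default) yields "%-4d".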
8175 
8176   // Parse a name (long or short)
8177   // Canonicalize the name into absolute_short_name
8178   found_valid_name = false;
8179   parse_long_name = (**ptr == '{');
8180   if (parse_long_name)
8181     (*ptr)++; // skip initial left brace
8182   for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8183                              sizeof(__kmp_affinity_format_table[0]);
8184        ++i) {
8185     char short_name = __kmp_affinity_format_table[i].short_name;
8186     const char *long_name = __kmp_affinity_format_table[i].long_name;
8187     char field_format = __kmp_affinity_format_table[i].field_format;
8188     if (parse_long_name) {
8189       size_t length = KMP_STRLEN(long_name);
8190       if (strncmp(*ptr, long_name, length) == 0) {
8191         found_valid_name = true;
8192         (*ptr) += length; // skip the long name
8193       }
8194     } else if (**ptr == short_name) {
8195       found_valid_name = true;
8196       (*ptr)++; // skip the short name
8197     }
8198     if (found_valid_name) {
8199       format[format_index++] = field_format;
8200       format[format_index++] = '\0';
8201       absolute_short_name = short_name;
8202       break;
8203     }
8204   }
8205   if (parse_long_name) {
8206     if (**ptr != '}') {
8207       absolute_short_name = 0;
8208     } else {
8209       (*ptr)++; // skip over the right brace
8210     }
8211   }
8212 
8213   // Attempt to fill the buffer with the requested
8214   // value using snprintf within __kmp_str_buf_print()
8215   switch (absolute_short_name) {
8216   case 't':
8217     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8218     break;
8219   case 'T':
8220     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8221     break;
8222   case 'L':
8223     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8224     break;
8225   case 'n':
8226     rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8227     break;
8228   case 'H': {
8229     static const int BUFFER_SIZE = 256;
8230     char buf[BUFFER_SIZE];
8231     __kmp_expand_host_name(buf, BUFFER_SIZE);
8232     rc = __kmp_str_buf_print(field_buffer, format, buf);
8233   } break;
8234   case 'P':
8235     rc = __kmp_str_buf_print(field_buffer, format, getpid());
8236     break;
8237   case 'i':
8238     rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8239     break;
8240   case 'N':
8241     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8242     break;
8243   case 'a':
8244     field_value =
8245         __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8246     rc = __kmp_str_buf_print(field_buffer, format, field_value);
8247     break;
8248 #if KMP_AFFINITY_SUPPORTED
8249   case 'A': {
8250     kmp_str_buf_t buf;
8251     __kmp_str_buf_init(&buf);
8252     __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8253     rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8254     __kmp_str_buf_free(&buf);
8255   } break;
8256 #endif
8257   default:
    // According to the spec, if an implementation does not have info for a
    // field type, then "undefined" is printed
8260     rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8261     // Skip the field
8262     if (parse_long_name) {
8263       SKIP_TOKEN(*ptr);
8264       if (**ptr == '}')
8265         (*ptr)++;
8266     } else {
8267       (*ptr)++;
8268     }
8269   }
8270 
8271   KMP_ASSERT(format_index <= FORMAT_SIZE);
8272   return rc;
8273 }
8274 
8275 /*
8276  * Return number of characters needed to hold the affinity string
8277  * (not including null byte character)
8278  * The resultant string is printed to buffer, which the caller can then
8279  * handle afterwards
8280  */
8281 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8282                                   kmp_str_buf_t *buffer) {
8283   const char *parse_ptr;
8284   size_t retval;
8285   const kmp_info_t *th;
8286   kmp_str_buf_t field;
8287 
8288   KMP_DEBUG_ASSERT(buffer);
8289   KMP_DEBUG_ASSERT(gtid >= 0);
8290 
8291   __kmp_str_buf_init(&field);
8292   __kmp_str_buf_clear(buffer);
8293 
8294   th = __kmp_threads[gtid];
8295   retval = 0;
8296 
8297   // If format is NULL or zero-length string, then we use
8298   // affinity-format-var ICV
8299   parse_ptr = format;
8300   if (parse_ptr == NULL || *parse_ptr == '\0') {
8301     parse_ptr = __kmp_affinity_format;
8302   }
8303   KMP_DEBUG_ASSERT(parse_ptr);
8304 
8305   while (*parse_ptr != '\0') {
8306     // Parse a field
8307     if (*parse_ptr == '%') {
8308       // Put field in the buffer
8309       int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8310       __kmp_str_buf_catbuf(buffer, &field);
8311       retval += rc;
8312     } else {
8313       // Put literal character in buffer
8314       __kmp_str_buf_cat(buffer, parse_ptr, 1);
8315       retval++;
8316       parse_ptr++;
8317     }
8318   }
8319   __kmp_str_buf_free(&field);
8320   return retval;
8321 }
8322 
8323 // Displays the affinity string to stdout
8324 void __kmp_aux_display_affinity(int gtid, const char *format) {
8325   kmp_str_buf_t buf;
8326   __kmp_str_buf_init(&buf);
8327   __kmp_aux_capture_affinity(gtid, format, &buf);
8328   __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8329   __kmp_str_buf_free(&buf);
8330 }

/* ------------------------------------------------------------------------ */

void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
  int blocktime = arg; /* argument is in milliseconds */
#if KMP_USE_MONITOR
  int bt_intervals;
#endif
  kmp_int8 bt_set;

  __kmp_save_internal_controls(thread);

  /* Normalize and set blocktime for the teams */
  if (blocktime < KMP_MIN_BLOCKTIME)
    blocktime = KMP_MIN_BLOCKTIME;
  else if (blocktime > KMP_MAX_BLOCKTIME)
    blocktime = KMP_MAX_BLOCKTIME;

  set__blocktime_team(thread->th.th_team, tid, blocktime);
  set__blocktime_team(thread->th.th_serial_team, 0, blocktime);

#if KMP_USE_MONITOR
  /* Calculate and set blocktime intervals for the teams */
  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);

  set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
  set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
#endif

  /* Record that blocktime was explicitly set */
  bt_set = TRUE;

  set__bt_set_team(thread->th.th_team, tid, bt_set);
  set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
#if KMP_USE_MONITOR
  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
                "bt_intervals=%d, monitor_updates=%d\n",
                __kmp_gtid_from_tid(tid, thread->th.th_team),
                thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
                __kmp_monitor_wakeups));
#else
  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
                __kmp_gtid_from_tid(tid, thread->th.th_team),
                thread->th.th_team->t.t_id, tid, blocktime));
#endif
}
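
// Illustrative call path (an assumption, not part of the build): a setter such
// as the kmp_set_blocktime() extension is expected to resolve the calling
// thread and land here, with the millisecond value clamped to
// [KMP_MIN_BLOCKTIME, KMP_MAX_BLOCKTIME]:
//
//   int gtid = __kmp_entry_gtid();
//   __kmp_aux_set_blocktime(200 /* ms */, __kmp_thread_from_gtid(gtid),
//                           __kmp_tid_from_gtid(gtid));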

void __kmp_aux_set_defaults(char const *str, size_t len) {
  if (!__kmp_init_serial) {
    __kmp_serial_initialize();
  }
  __kmp_env_initialize(str);

  if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
    __kmp_env_print();
  }
} // __kmp_aux_set_defaults

/* ------------------------------------------------------------------------ */
/* internal fast reduction routines */

PACKED_REDUCTION_METHOD_T
__kmp_determine_reduction_method(
    ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
    void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
    kmp_critical_name *lck) {

  // Default reduction method: critical construct (lck != NULL, like in current
  // PAROPT).
  // If (reduce_data != NULL && reduce_func != NULL), the tree-reduction method
  // can be selected by the RTL.
  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
  // can be selected by the RTL.
  // Finally, it is up to the OpenMP RTL to decide which of the methods
  // generated by PAROPT to use.
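  //
  // Illustrative summary of the selection below (the exact cutoffs are
  // architecture- and OS-dependent, so treat this as a rough guide only):
  //
  //   team_size == 1                          -> empty_reduce_block
  //   tree method generated, team above the
  //     cutoff                                -> tree reduction w/ barrier
  //   atomic method generated, small team or
  //     few variables                         -> atomic_reduce_block
  //   otherwise                               -> critical_reduce_block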

  PACKED_REDUCTION_METHOD_T retval;

  int team_size;

  KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )

#define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
  ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
#define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))

  retval = critical_reduce_block;

  // An alternative way to get the team size (one dynamic dereference) is slower
  team_size = __kmp_get_team_num_threads(global_tid);
  if (team_size == 1) {

    retval = empty_reduce_block;

  } else {

    int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;

#if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 ||                   \
    KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64

#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
    KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD

    int teamsize_cutoff = 4;

#if KMP_MIC_SUPPORTED
    if (__kmp_mic_type != non_mic) {
      teamsize_cutoff = 8;
    }
#endif
    int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
    if (tree_available) {
      if (team_size <= teamsize_cutoff) {
        if (atomic_available) {
          retval = atomic_reduce_block;
        }
      } else {
        retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
      }
    } else if (atomic_available) {
      retval = atomic_reduce_block;
    }
#else
#error "Unknown or unsupported OS"
#endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
       // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD

#elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS

#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD

    // basic tuning

    if (atomic_available) {
      if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
        retval = atomic_reduce_block;
      }
    } // otherwise: use critical section

#elif KMP_OS_DARWIN

    int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
    if (atomic_available && (num_vars <= 3)) {
      retval = atomic_reduce_block;
    } else if (tree_available) {
      if ((reduce_size > (9 * sizeof(kmp_real64))) &&
          (reduce_size < (2000 * sizeof(kmp_real64)))) {
        retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
      }
    } // otherwise: use critical section

#else
#error "Unknown or unsupported OS"
#endif

#else
#error "Unknown or unsupported architecture"
#endif
  }

  // KMP_FORCE_REDUCTION

  // If the team is serialized (team_size == 1), ignore the forced reduction
  // method and stay with the unsynchronized method (empty_reduce_block)
  if (__kmp_force_reduction_method != reduction_method_not_defined &&
      team_size != 1) {

    PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;

    int atomic_available, tree_available;

    switch ((forced_retval = __kmp_force_reduction_method)) {
    case critical_reduce_block:
      KMP_ASSERT(lck); // lck should be != 0
      break;

    case atomic_reduce_block:
      atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
      if (!atomic_available) {
        KMP_WARNING(RedMethodNotSupported, "atomic");
        forced_retval = critical_reduce_block;
      }
      break;

    case tree_reduce_block:
      tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
      if (!tree_available) {
        KMP_WARNING(RedMethodNotSupported, "tree");
        forced_retval = critical_reduce_block;
      } else {
#if KMP_FAST_REDUCTION_BARRIER
        forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
#endif
      }
      break;

    default:
      KMP_ASSERT(0); // "unsupported method specified"
    }

    retval = forced_retval;
  }

  KA_TRACE(10, ("reduction method selected=%08x\n", retval));

#undef FAST_REDUCTION_TREE_METHOD_GENERATED
#undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED

  return (retval);
}
// This function is for testing the set/get/determine reduce method
kmp_int32 __kmp_get_reduce_method(void) {
  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
}
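
// Note: an assumption based on the PACKED_REDUCTION_METHOD_T packing in kmp.h,
// where the method enumerators occupy the upper bits and the barrier type the
// low byte; the ">> 8" above therefore strips the barrier type and returns
// only the reduction method id.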

// Soft pause sets up threads to ignore blocktime and just go to sleep.
// Spin-wait code checks __kmp_pause_status and reacts accordingly.
void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }

// Hard pause shuts down the runtime completely. Resume happens naturally when
// OpenMP is used subsequently.
void __kmp_hard_pause() {
  __kmp_pause_status = kmp_hard_paused;
  __kmp_internal_end_thread(-1);
}

// Soft resume sets __kmp_pause_status, and wakes up all threads.
void __kmp_resume_if_soft_paused() {
  if (__kmp_pause_status == kmp_soft_paused) {
    __kmp_pause_status = kmp_not_paused;

    for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
      kmp_info_t *thread = __kmp_threads[gtid];
      if (thread) { // Wake it if sleeping
        kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
                         thread);
        if (fl.is_sleeping())
          fl.resume(gtid);
        else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
          __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
        } else { // thread holds the lock and may sleep soon
          do { // until either the thread sleeps, or we can get the lock
            if (fl.is_sleeping()) {
              fl.resume(gtid);
              break;
            } else if (__kmp_try_suspend_mx(thread)) {
              __kmp_unlock_suspend_mx(thread);
              break;
            }
          } while (1);
        }
      }
    }
  }
}

// This function is called via __kmpc_pause_resource. Returns 0 if successful.
// TODO: add warning messages
int __kmp_pause_resource(kmp_pause_status_t level) {
  if (level == kmp_not_paused) { // requesting resume
    if (__kmp_pause_status == kmp_not_paused) {
      // error message about runtime not being paused, so can't resume
      return 1;
    } else {
      KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
                       __kmp_pause_status == kmp_hard_paused);
      __kmp_pause_status = kmp_not_paused;
      return 0;
    }
  } else if (level == kmp_soft_paused) { // requesting soft pause
    if (__kmp_pause_status != kmp_not_paused) {
      // error message about already being paused
      return 1;
    } else {
      __kmp_soft_pause();
      return 0;
    }
  } else if (level == kmp_hard_paused) { // requesting hard pause
    if (__kmp_pause_status != kmp_not_paused) {
      // error message about already being paused
      return 1;
    } else {
      __kmp_hard_pause();
      return 0;
    }
  } else {
    // error message about invalid level
    return 1;
  }
}
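
// Illustrative sketch (an assumption, not part of the build): the
// compiler-facing __kmpc_pause_resource() entry point is expected to forward
// its level here, mirroring omp_pause_resource() semantics:
//
//   if (__kmp_pause_resource(kmp_soft_paused) == 0) {
//     // runtime is soft-paused: idle threads sleep instead of spin-waiting
//   }
//   __kmp_pause_resource(kmp_not_paused); // explicitly request resume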

void __kmp_omp_display_env(int verbose) {
  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
  if (__kmp_init_serial == 0)
    __kmp_do_serial_initialize();
  __kmp_display_env_impl(!verbose, verbose);
  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
}
// Globals and functions for hidden helper tasks
kmp_info_t **__kmp_hidden_helper_threads;
kmp_info_t *__kmp_hidden_helper_main_thread;
kmp_int32 __kmp_hidden_helper_threads_num = 8;
std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
#if KMP_OS_LINUX
kmp_int32 __kmp_enable_hidden_helper = TRUE;
#else
kmp_int32 __kmp_enable_hidden_helper = FALSE;
#endif

namespace {
std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;

void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
  // This is an explicit synchronization on all hidden helper threads. It
  // covers the case where a regular thread pushes a hidden helper task to a
  // hidden helper thread that has not yet been awakened since being released
  // by the main thread after the team was created.
  KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
  while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
         __kmp_hidden_helper_threads_num)
    ;

  // If main thread, then wait for signal
  if (__kmpc_master(nullptr, *gtid)) {
    // First, unset the initial state and release the initial thread
    TCW_4(__kmp_init_hidden_helper_threads, FALSE);
    __kmp_hidden_helper_initz_release();
    __kmp_hidden_helper_main_thread_wait();
    // Now wake up all worker threads
    for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
      __kmp_hidden_helper_worker_thread_signal();
    }
  }
}
} // namespace

void __kmp_hidden_helper_threads_initz_routine() {
  // Create a new root for hidden helper team/threads
  const int gtid = __kmp_register_root(TRUE);
  __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
  __kmp_hidden_helper_threads = &__kmp_threads[gtid];
  __kmp_hidden_helper_main_thread->th.th_set_nproc =
      __kmp_hidden_helper_threads_num;

  KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);

  __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);

  // Set the initialization flag to FALSE
  TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);

  __kmp_hidden_helper_threads_deinitz_release();
}
